ehagey committed on
Commit 2322bf2 · verified · 1 Parent(s): 9ed09e4

Update app.py

Files changed (1):
  1. app.py +73 -89
app.py CHANGED
@@ -16,6 +16,8 @@ from anthropic import Anthropic
 import google.generativeai as genai
 import hmac
 import hashlib
+from uuid import uuid4
+from datetime import datetime

 load_dotenv()

@@ -32,16 +34,6 @@ if not os.path.exists(DATA_DIR):
 else:
     st.info(f"`{DATA_DIR}` directory already exists.")

-if os.path.exists(DATA_DIR):
-    files = os.listdir(DATA_DIR)
-    st.write(f"Contents of `{DATA_DIR}` directory:")
-    if files:
-        for file in files:
-            st.write(f"- {file}")
-    else:
-        st.write("The data directory is currently empty.")
-else:
-    st.error(f"`{DATA_DIR}` directory does not exist.")

 def initialize_session_state():
     if 'api_configured' not in st.session_state:
@@ -62,8 +54,7 @@ def initialize_session_state():
         st.session_state.last_evaluated_dataset = None

 def setup_api_clients():
-    initialize_session_state()
-
+    initialize_session_state()
     with st.sidebar:
         st.title("API Configuration")

@@ -76,20 +67,24 @@ def setup_api_clients():
         if st.button("Verify Credentials"):
             if (hmac.compare_digest(username, os.environ.get("STREAMLIT_USERNAME", "")) and
                 hmac.compare_digest(password, os.environ.get("STREAMLIT_PASSWORD", ""))):
-                st.session_state.togetherai_client = OpenAI(
-                    api_key=os.getenv('TOGETHERAI_API_KEY'),
-                    base_url="https://api.together.xyz/v1"
-                )
-                st.session_state.openai_client = OpenAI(
-                    api_key=os.getenv('OPENAI_API_KEY')
-                )
-                st.session_state.anthropic_client = Anthropic(
-                    api_key=os.getenv('ANTHROPIC_API_KEY')
-                )
-                genai.configure(api_key=os.environ["GEMINI_API_KEY"])
-
-                st.session_state.api_configured = True
-                st.success("Successfully configured the API clients with stored keys!")
+                try:
+                    st.session_state.togetherai_client = OpenAI(
+                        api_key=os.getenv('TOGETHERAI_API_KEY'),
+                        base_url="https://api.together.xyz/v1"
+                    )
+                    st.session_state.openai_client = OpenAI(
+                        api_key=os.getenv('OPENAI_API_KEY')
+                    )
+                    st.session_state.anthropic_client = Anthropic(
+                        api_key=os.getenv('ANTHROPIC_API_KEY')
+                    )
+                    genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+
+                    st.session_state.api_configured = True
+                    st.success("Successfully configured the API clients with stored keys!")
+                except Exception as e:
+                    st.error(f"Error initializing API clients: {str(e)}")
+                    st.session_state.api_configured = False
             else:
                 st.error("Invalid credentials. Please try again or use your own API keys.")
                 st.session_state.api_configured = False
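For context on the credential check above: `hmac.compare_digest` compares the two strings in constant time, so a rejected login takes the same time regardless of how many leading characters match, avoiding a timing side channel. A minimal standalone sketch of the same pattern, assuming the `STREAMLIT_USERNAME`/`STREAMLIT_PASSWORD` environment variables from the diff; the `credentials_match` helper is illustrative and not part of app.py:

```python
import hmac
import os

def credentials_match(username: str, password: str) -> bool:
    # Constant-time comparison of the submitted credentials against env vars.
    return (
        hmac.compare_digest(username, os.environ.get("STREAMLIT_USERNAME", ""))
        and hmac.compare_digest(password, os.environ.get("STREAMLIT_PASSWORD", ""))
    )

if __name__ == "__main__":
    os.environ.setdefault("STREAMLIT_USERNAME", "demo")
    os.environ.setdefault("STREAMLIT_PASSWORD", "demo-password")
    print(credentials_match("demo", "demo-password"))  # True
    print(credentials_match("demo", "wrong"))          # False
```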
@@ -120,6 +115,7 @@ def setup_api_clients():
             st.error(f"Error initializing API clients: {str(e)}")
             st.session_state.api_configured = False

+setup_api_clients()
 MAX_CONCURRENT_CALLS = 5
 semaphore = threading.Semaphore(MAX_CONCURRENT_CALLS)

@@ -145,7 +141,7 @@ def load_dataset_by_name(dataset_name, split="train"):
             }
             questions.append(question_dict)

-    st.write(f"Loaded {len(questions)} single-select questions from {dataset_name}")
+    st.write(f"Loaded {len(questions)} single-select questions from `{dataset_name}`")
     return questions

 @retry(
@@ -233,14 +229,13 @@ def process_single_evaluation(question, prompt_template, model_name, clients, la
         'options': ' | '.join(question['options']),
         'model_response': answer,
         'is_correct': is_correct,
-        'explanation': question['explanation']
+        'explanation': question['explanation'],
+        'timestamp': datetime.utcnow().isoformat()
     }
-
     with WRITE_LOCK:
         file_exists = os.path.isfile(RESULTS_FILE)
         with open(RESULTS_FILE, 'a', encoding='utf-8', newline='') as f:
-            writer = pd.DataFrame([result])
-            writer.to_csv(f, header=not file_exists, index=False)
+            pd.DataFrame([result]).to_csv(f, header=not file_exists, index=False)

     return result

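The write path above appends one row per evaluation while holding `WRITE_LOCK`, and writes the CSV header only when `RESULTS_FILE` does not exist yet, so worker threads cannot interleave partial rows or duplicate the header. A self-contained sketch of that pattern; the `append_result` helper, file name, and sample row are illustrative:

```python
import os
import threading

import pandas as pd

RESULTS_FILE = "results.csv"   # illustrative path
WRITE_LOCK = threading.Lock()

def append_result(result: dict) -> None:
    # Serialize writers and emit the header only on the first write.
    with WRITE_LOCK:
        file_exists = os.path.isfile(RESULTS_FILE)
        with open(RESULTS_FILE, 'a', encoding='utf-8', newline='') as f:
            pd.DataFrame([result]).to_csv(f, header=not file_exists, index=False)

append_result({"model": "demo-model", "is_correct": True})
```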
@@ -264,7 +259,6 @@ def process_evaluations_concurrently(questions, prompt_template, models_to_evalu
                     current_iteration += 1
                     progress_callback(current_iteration, total_iterations)
                     continue # Skip already completed evaluations
-                # Pass last_evaluated_dataset as an argument
                 future = executor.submit(
                     process_single_evaluation,
                     question,
@@ -283,42 +277,39 @@

     return results

-def main():
-
-    initialize_session_state()
-    setup_api_clients()
-
-    if not st.session_state.api_configured:
-        st.warning("Please configure API keys in the sidebar to proceed")
-        st.stop()

+def main():
     if 'all_results' not in st.session_state:
-        if os.path.exists(RESULTS_FILE):
-            existing_df = pd.read_csv(RESULTS_FILE)
-            all_results = {}
-            for _, row in existing_df.iterrows():
-                model = row['model']
-                result = row.to_dict()
-                if model not in all_results:
-                    all_results[model] = []
-                all_results[model].append(result)
-            st.session_state.all_results = all_results
-            st.session_state.last_evaluated_dataset = existing_df['dataset'].iloc[-1]
-        else:
-            st.session_state.all_results = {}
-            st.session_state.last_evaluated_dataset = None
-
-    if 'detailed_model' not in st.session_state:
-        st.session_state.detailed_model = None
-    if 'detailed_dataset' not in st.session_state:
-        st.session_state.detailed_dataset = None
-    if 'last_evaluated_dataset' not in st.session_state:
+        st.session_state.all_results = {}
+        st.session_state.last_evaluated_dataset = None
+        if os.path.exists(RESULTS_FILE):
+            existing_df = pd.read_csv(RESULTS_FILE)
+            all_results = {}
+            for _, row in existing_df.iterrows():
+                model = row['model']
+                result = row.to_dict()
+                if model not in all_results:
+                    all_results[model] = []
+                all_results[model].append(result)
+            st.session_state.all_results = all_results
+            st.session_state.last_evaluated_dataset = existing_df['dataset'].iloc[-1]
+            st.info(f"Loaded existing results from `{RESULTS_FILE}`.")
+        else:
+            st.session_state.all_results = {}
             st.session_state.last_evaluated_dataset = None
+            st.info(f"No existing results found. Ready to start fresh.")

     with st.sidebar:
         if st.button("Reset Results"):
             if os.path.exists(RESULTS_FILE):
                 os.remove(RESULTS_FILE)
+                for file in os.listdir(DATA_DIR):
+                    file_path = os.path.join(DATA_DIR, file)
+                    try:
+                        if os.path.isfile(file_path):
+                            os.unlink(file_path)
+                    except Exception as e:
+                        st.error(f"Error deleting file {file_path}: {e}")
             st.session_state.all_results = {}
             st.session_state.last_evaluated_dataset = None
             st.success("Results have been reset.")
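The new `main()` seeds `st.session_state` from any existing `RESULTS_FILE` by regrouping rows per model. The same regrouping can be reproduced with plain pandas outside Streamlit; a small sketch with illustrative sample rows standing in for the CSV contents:

```python
import pandas as pd

# Illustrative stand-in for pd.read_csv(RESULTS_FILE).
existing_df = pd.DataFrame([
    {"dataset": "demo-dataset", "model": "model-a", "is_correct": True},
    {"dataset": "demo-dataset", "model": "model-b", "is_correct": False},
    {"dataset": "demo-dataset", "model": "model-a", "is_correct": False},
])

# Group rows into {model: [row_dict, ...]}, mirroring how main() rebuilds
# st.session_state.all_results on startup.
all_results = {}
for _, row in existing_df.iterrows():
    all_results.setdefault(row["model"], []).append(row.to_dict())

last_evaluated_dataset = existing_df["dataset"].iloc[-1]
print(sorted(all_results), last_evaluated_dataset)
```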
@@ -333,14 +324,15 @@ def main():
             help="Choose the dataset to evaluate on"
         )
     with col2:
-        selected_model = st.multiselect(
+        selected_models = st.multiselect(
             "Select Model(s)",
             options=list(MODELS.keys()),
             default=[list(MODELS.keys())[0]],
             help="Choose one or more models to evaluate."
         )

-    models_to_evaluate = selected_model
+    models_to_evaluate = selected_models
+

     default_prompt = '''You are a medical AI assistant. Please answer the following multiple choice question.
 Question: {question}
365
  - Only the "answer" field will be used for evaluation
366
  - Ensure your response is in valid JSON format'''
367
 
 
368
  col1, col2 = st.columns([2, 1])
369
  with col1:
370
  prompt_template = st.text_area(
371
  "Customize Prompt Template",
372
  default_prompt,
373
  height=400,
374
- help="The below prompt is editable. Please feel free to edit it before your run."
375
  )
376
 
377
  with col2:
@@ -381,28 +374,34 @@ Important:
     - `{options}`: The multiple choice options
     """)

+
     with st.spinner("Loading dataset..."):
         questions = load_dataset_by_name(selected_dataset)
+
+
     subjects = sorted(list(set(q['subject_name'] for q in questions)))
     selected_subject = st.selectbox("Filter by subject", ["All"] + subjects)

     if selected_subject != "All":
         questions = [q for q in questions if q['subject_name'] == selected_subject]

-    num_questions = st.number_input("Number of questions to evaluate", 1, len(questions))
+
+    num_questions = st.number_input("Number of questions to evaluate", min_value=1, max_value=len(questions), value=1, step=1)
+

     if st.button("Start Evaluation"):
         with st.spinner("Starting evaluation..."):
             selected_questions = questions[:num_questions]

-            # Create a clients dictionary
+
             clients = {
                 "togetherai": st.session_state["togetherai_client"],
                 "openai": st.session_state["openai_client"],
                 "anthropic": st.session_state["anthropic_client"]
             }

-            last_evaluated_dataset = st.session_state.last_evaluated_dataset
+
+            last_evaluated_dataset = st.session_state.last_evaluated_dataset if st.session_state.last_evaluated_dataset else selected_dataset

             progress_container = st.container()
             progress_bar = progress_container.progress(0)
@@ -443,7 +442,6 @@ Important:
     if st.session_state.all_results:
         st.subheader("Evaluation Results")

-        model_metrics = {}
         for model_name, results in st.session_state.all_results.items():
             df = pd.DataFrame(results)
             metrics = {
@@ -454,7 +452,6 @@ Important:
         metrics_df = pd.DataFrame(model_metrics).T

         st.subheader("Model Performance Comparison")
-
         accuracy_chart = alt.Chart(
             metrics_df.reset_index().melt(id_vars=['index'], value_vars=['Accuracy'])
         ).mark_bar().encode(
@@ -473,7 +470,6 @@ Important:
         )

         st.altair_chart(accuracy_chart, use_container_width=True)
-
     if st.session_state.all_results:
         st.subheader("Detailed Results")

@@ -518,11 +514,11 @@ Important:

             col1, col2 = st.columns(2)
             with col1:
-                st.write("**Prompt Used:**")
-                st.code(result.get('prompt_sent', "N/A"))
+                st.write("**Model Response:**")
+                st.code(result.get('model_response', "N/A"))
             with col2:
-                st.write("**Raw Response:**")
-                st.code(result.get('raw_llm_response', "N/A"))
+                st.write("**Explanation:**")
+                st.code(result.get('explanation', "N/A"))

             col1, col2 = st.columns(2)
             with col1:
@@ -534,32 +530,20 @@ Important:
                 else:
                     st.error("Incorrect")

-                st.write("**Explanation:**", result['explanation'])
+                st.write("**Timestamp:**", result['timestamp'])
         else:
             st.info(f"No results available for {selected_model_details} on {selected_dataset_details}. Please run the evaluation first.")

     st.markdown("---")
+    st.subheader("Download Results")
     all_data = []
-
     for model_name, results in st.session_state.all_results.items():
-        for question_idx, result in enumerate(results):
-            row = {
-                'dataset': st.session_state.last_evaluated_dataset,
-                'model': model_name,
-                'question': result['question'],
-                'correct_answer': result['correct_answer'],
-                'subject': result['subject'],
-                'options': result['options'],
-                'model_response': result['model_response'],
-                'is_correct': result['is_correct'],
-                'explanation': result['explanation']
-            }
+        for result in results:
+            row = result.copy()
             all_data.append(row)

     complete_df = pd.DataFrame(all_data)
-
     csv = complete_df.to_csv(index=False)
-
     st.download_button(
         label="Download All Results as CSV",
         data=csv,
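At the end of the diff the per-model results are flattened into a single DataFrame and exposed through `st.download_button`; the hunk is cut off before the remaining keyword arguments of that call. A minimal sketch of the same flow, with illustrative sample rows and illustrative `file_name`/`mime` values:

```python
import pandas as pd
import streamlit as st

# Illustrative rows; in the app they come from st.session_state.all_results.
all_data = [
    {"model": "model-a", "question": "Q1", "is_correct": True},
    {"model": "model-b", "question": "Q1", "is_correct": False},
]

complete_df = pd.DataFrame(all_data)
csv = complete_df.to_csv(index=False)

st.download_button(
    label="Download All Results as CSV",
    data=csv,
    file_name="evaluation_results.csv",  # illustrative
    mime="text/csv",                     # illustrative
)
```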
 