Joschka Strueber committed on
Commit 35404bc · 1 Parent(s): 3eeaa4c

[Add, Fix] better warnings for missing models, better description

Files changed (1)
  1. app.py +13 -8
app.py CHANGED
@@ -3,13 +3,10 @@ import gradio as gr
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-import re
 from io import BytesIO
 from PIL import Image
 from datasets.exceptions import DatasetNotFoundError
 
-print(gr.__version__)
-
 from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
 from src.similarity import load_data_and_compute_similarities
 
@@ -82,15 +79,15 @@ def update_datasets_based_on_models(selected_models, current_dataset):
         )
     except DatasetNotFoundError as e:
         # Extract model name from error message
-        match = re.search(r"open-llm-leaderboard/([\w\-]+)", str(e))
-        model_name = match.group(1) if match else "Unknown Model"
+        model_name = e.args[0].split("'")[1]
+        model_name = model_name.split("/")[-1].replace("__", "/").replace("_details", "")
 
         # Display a shorter warning
         gr.Warning(f"Data for '{model_name}' is gated or unavailable.")
         return gr.update(choices=[], value=None)
 
 with gr.Blocks(title="LLM Similarity Analyzer") as demo:
-    gr.Markdown("## Model Similarity Comparison Tool \n\nAs Language Model (LM) capabilities advance, evaluating and supervising them at scale is getting harder for humans. There is hope that other language models can automate both these tasks, which we refer to as AI Oversight. We study how model similarity affects both aspects of AI oversight by proposing a probabilistic metric for LM similarity based on overlap in model mistakes. Using this metric, we first show that LLM-as-a-judge scores favor models similar to the judge, generalizing recent self-preference results. Then, we study training on LM annotations, and find complementary knowledge between the weak supervisor and strong student model plays a crucial role in gains from weak-to-strong generalization. As model capabilities increase, it becomes harder to find their mistakes, and we might defer more to AI oversight. However, we observe a concerning trend -- model mistakes are becoming more similar with increasing capabilities, pointing to risks from correlated failures. Our work underscores the importance of reporting and correcting for model similarity, especially in the emerging paradigm of AI oversight. ")
+    gr.Markdown("## Model Similarity Comparison Tool")
 
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
@@ -116,8 +113,6 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
116
  info="Search and select multiple models"
117
  )
118
 
119
- gr.Markdown("* For the probabilistic Kappa_p metric self-similarity is only 1, if the model predicts a single option with 100% confidence.")
120
-
121
  model_dropdown.change(
122
  fn=update_datasets_based_on_models,
123
  inputs=[model_dropdown, dataset_dropdown],
@@ -137,11 +132,21 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
137
  outputs=heatmap
138
  )
139
 
 
 
140
  clear_btn = gr.Button("Clear Selection")
141
  clear_btn.click(
142
  lambda: [[], None, None],
143
  outputs=[model_dropdown, dataset_dropdown, heatmap]
144
  )
145
 
 
 
 
 
 
 
 
 
146
  if __name__ == "__main__":
147
  demo.launch(ssr_mode=False)
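Note on the changed extraction logic in update_datasets_based_on_models: the split-based parsing assumes the gated dataset id is quoted in the DatasetNotFoundError message and follows the app's org__model_details naming. A minimal standalone sketch of that behavior, with an assumed example message (the real text is produced by the datasets library at runtime):

# Sketch of the new model-name extraction; the error text is an assumed example.
def extract_model_name(error: Exception) -> str:
    # Take the dataset id between the first pair of single quotes in the message.
    dataset_id = error.args[0].split("'")[1]
    # Drop the org prefix of the details repo, undo the "__" escaping,
    # and strip the "_details" suffix to recover the model id.
    return dataset_id.split("/")[-1].replace("__", "/").replace("_details", "")

example = Exception(
    "Dataset 'open-llm-leaderboard/meta-llama__Llama-3.1-8B-Instruct_details' "
    "is a gated dataset on the Hub."
)
print(extract_model_name(example))  # -> meta-llama/Llama-3.1-8B-Instruct

One trade-off of this approach: if the message ever lacks quotes, indexing with [1] raises an IndexError, whereas the removed regex fell back to "Unknown Model".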
 
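The relocated footnote on Kappa_p self-similarity can be sanity-checked numerically. This is only an illustration of the remark, not the metric implementation in src/similarity: if two answers are drawn independently from the same per-question probability vector p, they agree with probability sum(p_i^2), which reaches 1.0 only when all probability mass sits on a single option.

import numpy as np

# Illustration of the footnote (assumed reading, not the app's Kappa_p code):
# chance that two independent samples from the same answer distribution agree.
def self_agreement(p: np.ndarray) -> float:
    return float(np.sum(p ** 2))

confident = np.array([1.0, 0.0, 0.0, 0.0])   # one option with 100% confidence
hedged = np.array([0.7, 0.2, 0.05, 0.05])    # probability mass spread out

print(self_agreement(confident))  # 1.0
print(self_agreement(hedged))     # 0.535 -> self-similarity below 1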