Joschka Strueber committed on
Commit 35404bc · 1 Parent(s): 3eeaa4c

[Add, Fix] better warnings for missing models, better description

Files changed (1)
  1. app.py +13 -8
app.py CHANGED
@@ -3,13 +3,10 @@ import gradio as gr
 import numpy as np
 import matplotlib.pyplot as plt
 import seaborn as sns
-import re
 from io import BytesIO
 from PIL import Image
 from datasets.exceptions import DatasetNotFoundError
 
-print(gr.__version__)
-
 from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
 from src.similarity import load_data_and_compute_similarities
 
@@ -82,15 +79,15 @@ def update_datasets_based_on_models(selected_models, current_dataset):
         )
     except DatasetNotFoundError as e:
         # Extract model name from error message
-        match = re.search(r"open-llm-leaderboard/([\w\-]+)", str(e))
-        model_name = match.group(1) if match else "Unknown Model"
+        model_name = e.args[0].split("'")[1]
+        model_name = model_name.split("/")[-1].replace("__", "/").replace("_details", "")
 
         # Display a shorter warning
         gr.Warning(f"Data for '{model_name}' is gated or unavailable.")
         return gr.update(choices=[], value=None)
 
 with gr.Blocks(title="LLM Similarity Analyzer") as demo:
-    gr.Markdown("## Model Similarity Comparison Tool \n\nAs Language Model (LM) capabilities advance, evaluating and supervising them at scale is getting harder for humans. There is hope that other language models can automate both these tasks, which we refer to as AI Oversight. We study how model similarity affects both aspects of AI oversight by proposing a probabilistic metric for LM similarity based on overlap in model mistakes. Using this metric, we first show that LLM-as-a-judge scores favor models similar to the judge, generalizing recent self-preference results. Then, we study training on LM annotations, and find complementary knowledge between the weak supervisor and strong student model plays a crucial role in gains from weak-to-strong generalization. As model capabilities increase, it becomes harder to find their mistakes, and we might defer more to AI oversight. However, we observe a concerning trend -- model mistakes are becoming more similar with increasing capabilities, pointing to risks from correlated failures. Our work underscores the importance of reporting and correcting for model similarity, especially in the emerging paradigm of AI oversight. ")
+    gr.Markdown("## Model Similarity Comparison Tool")
 
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
@@ -116,8 +113,6 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
116
  info="Search and select multiple models"
117
  )
118
 
119
- gr.Markdown("* For the probabilistic Kappa_p metric self-similarity is only 1, if the model predicts a single option with 100% confidence.")
120
-
121
  model_dropdown.change(
122
  fn=update_datasets_based_on_models,
123
  inputs=[model_dropdown, dataset_dropdown],
@@ -137,11 +132,21 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
137
  outputs=heatmap
138
  )
139
 
 
 
140
  clear_btn = gr.Button("Clear Selection")
141
  clear_btn.click(
142
  lambda: [[], None, None],
143
  outputs=[model_dropdown, dataset_dropdown, heatmap]
144
  )
145
 
 
 
 
 
 
 
 
 
146
  if __name__ == "__main__":
147
  demo.launch(ssr_mode=False)
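Note on the changed extraction logic in update_datasets_based_on_models: the split-based parsing assumes the gated dataset id is quoted in the DatasetNotFoundError message and follows the app's org__model_details naming. A minimal standalone sketch of that behavior, with an assumed example message (the real text is produced by the datasets library at runtime):

# Sketch of the new model-name extraction; the error text is an assumed example.
def extract_model_name(error: Exception) -> str:
    # Take the dataset id between the first pair of single quotes in the message.
    dataset_id = error.args[0].split("'")[1]
    # Drop the org prefix of the details repo, undo the "__" escaping,
    # and strip the "_details" suffix to recover the model id.
    return dataset_id.split("/")[-1].replace("__", "/").replace("_details", "")

example = Exception(
    "Dataset 'open-llm-leaderboard/meta-llama__Llama-3.1-8B-Instruct_details' "
    "is a gated dataset on the Hub."
)
print(extract_model_name(example))  # -> meta-llama/Llama-3.1-8B-Instruct

One trade-off of this approach: if the message ever lacks quotes, indexing with [1] raises an IndexError, whereas the removed regex fell back to "Unknown Model".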
 
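The relocated footnote on Kappa_p self-similarity can be sanity-checked numerically. This is only an illustration of the remark, not the metric implementation in src/similarity: if two answers are drawn independently from the same per-question probability vector p, they agree with probability sum(p_i^2), which reaches 1.0 only when all probability mass sits on a single option.

import numpy as np

# Illustration of the footnote (assumed reading, not the app's Kappa_p code):
# chance that two independent samples from the same answer distribution agree.
def self_agreement(p: np.ndarray) -> float:
    return float(np.sum(p ** 2))

confident = np.array([1.0, 0.0, 0.0, 0.0])   # one option with 100% confidence
hedged = np.array([0.7, 0.2, 0.05, 0.05])    # probability mass spread out

print(self_agreement(confident))  # 1.0
print(self_agreement(hedged))     # 0.535 -> self-similarity below 1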