import gradio as gr
import src.app_util as app_util
from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets
links_markdown = """
[📄 Paper](https://arxiv.org/abs/2502.04313) |
[☯ Homepage](https://model-similarity.github.io/) |
[🐱 Code](https://github.com/model-similarity/lm-similarity) |
[🐍 pip install lm-sim](https://pypi.org/project/lm-sim/) |
[🤗 Data](https://huggingface.co/datasets/bethgelab/lm-similarity)
"""
model_init = ["HuggingFaceTB/SmolLM2-1.7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct", "microsoft/phi-4", "Qwen/Qwen2.5-14B-Instruct-1M", "meta-llama/Llama-3.3-70B-Instruct"]
dataset_init = "mmlu_pro"
metric_init = "CAPA"
# Create Gradio interface
with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
gr.Markdown("# Model Similarity Comparison Tool")
gr.Markdown(links_markdown)
gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=get_leaderboard_datasets(model_init),
            label="Select Dataset",
            value=dataset_init,
            filterable=True,
            interactive=True,
            allow_custom_value=False,
            info="Open LLM Leaderboard v2 benchmark datasets"
        )
        metric_dropdown = gr.Dropdown(
            choices=["CAPA", "CAPA (det.)", "Error Consistency"],
            label="Select Metric",
            value=metric_init,
            info="Select a similarity metric to compute"
        )

    model_dropdown = gr.Dropdown(
        choices=get_leaderboard_models_cached(),
        label="Select Models",
        value=model_init,
        multiselect=True,
        filterable=True,
        allow_custom_value=False,
        info="Search and select multiple models"
    )
    # Refresh the dataset choices whenever the model selection changes.
    model_dropdown.change(
        fn=app_util.update_datasets_based_on_models,
        inputs=[model_dropdown, dataset_dropdown],
        outputs=dataset_dropdown
    )

    generate_btn = gr.Button("Generate Heatmap", variant="primary")
    heatmap = gr.Image(
        value=app_util.create_heatmap(model_init, dataset_init, metric_init),
        label="Similarity Heatmap",
        elem_classes="image_container",
        visible=True
    )
    # Validate the selection first, then compute and display the heatmap.
    generate_btn.click(
        fn=app_util.validate_inputs,
        inputs=[model_dropdown, dataset_dropdown],
        queue=False
    ).then(
        fn=app_util.create_heatmap,
        inputs=[model_dropdown, dataset_dropdown, metric_dropdown],
        outputs=heatmap
    )
gr.Markdown("\* Self-similarity is only 1.0 for CAPA if the model predicts a single option with 100% confidence for each question. If the model is uncertain, the self-similarity will be lower.")
    clear_btn = gr.Button("Clear Selection")
    clear_btn.click(
        lambda: [[], None, None],
        outputs=[model_dropdown, dataset_dropdown, heatmap]
    )

    gr.Markdown("## Information")
    metric_info_markdown = r"""
We propose Chance Adjusted Probabilistic Agreement (CAPA, or κ_p), a novel similarity metric that adjusts for the agreement expected by chance given the models' accuracies.
Using CAPA, we find:
1. LLM-as-a-judge scores are biased towards more similar models, even when controlling for the model's capability.
2. The gain from training strong models on the annotations of weak supervisors (weak-to-strong generalization) is higher when the two models are more different.
3. Concerningly, model errors are getting more correlated as capabilities increase.
    """
    gr.Markdown(metric_info_markdown)
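    # Shape of the metric (hedged sketch, not the paper's exact definition):
    #   kappa_p = (c_obs - c_exp) / (1 - c_exp)
    # where c_obs is the models' average probabilistic agreement per question and
    # c_exp is the agreement expected by chance given their accuracies. See the
    # illustrative _chance_adjusted_agreement_sketch at the top of this file, and
    # the paper / lm-sim package for the actual definition.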
    with gr.Row():
        gr.Image(
            value="data/table_capa.png",
            label="Comparison of different similarity metrics for multiple-choice questions",
            elem_classes="image_container",
            interactive=False
        )
        gr.Markdown("""
- **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
  - Some datasets are not multiple-choice; for these, the metrics are not applicable. \n
- **Models**: Open LLM Leaderboard models \n
  - Every model's evaluation data is gated on Hugging Face, so access has to be requested. \n
  - We requested access for the most popular models, but some may be missing. \n
  - Notably, loading data is not possible for some meta-llama and gemma models.
- **Metrics**: CAPA (probabilistic), CAPA (deterministic), Error Consistency""")
if __name__ == "__main__":
demo.launch(ssr_mode=False) |