import gradio as gr

import src.app_util as app_util
from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets


links_markdown = """
[📄 Paper](https://arxiv.org/abs/2502.04313)   |  
[☯ Homepage](https://model-similarity.github.io/)   |  
[🐱 Code](https://github.com/model-similarity/lm-similarity)   |  
[🐍 pip install lm-sim](https://pypi.org/project/lm-sim/)   |  
[🤗 Data](https://huggingface.co/datasets/bethgelab/lm-similarity)
"""

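# Default selection shown when the demo first loads.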
model_init = ["HuggingFaceTB/SmolLM2-1.7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct", "microsoft/phi-4", "Qwen/Qwen2.5-14B-Instruct-1M", "meta-llama/Llama-3.3-70B-Instruct"]
dataset_init = "mmlu_pro"
metric_init = "CAPA"


# Create Gradio interface
with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
    gr.Markdown("# Model Similarity Comparison Tool")
    gr.Markdown(links_markdown)
    gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=get_leaderboard_datasets(model_init),
            label="Select Dataset",
            value=dataset_init,
            filterable=True,
            interactive=True,
            allow_custom_value=False,
            info="Open LLM Leaderboard v2 benchmark datasets"
        )
        metric_dropdown = gr.Dropdown(
            choices=["CAPA", "CAPA (det.)", "Error Consistency"],
            label="Select Metric",
            value=metric_init,
            info="Select a similarity metric to compute"
        )

    model_dropdown = gr.Dropdown(
        choices=get_leaderboard_models_cached(),
        label="Select Models",
        value=model_init,
        multiselect=True,
        filterable=True,
        allow_custom_value=False,
        info="Search and select multiple models"
    )

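    # Keep the dataset choices consistent with the currently selected models.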
    model_dropdown.change(
        fn=app_util.update_datasets_based_on_models,
        inputs=[model_dropdown, dataset_dropdown],
        outputs=dataset_dropdown
    )
    
    generate_btn = gr.Button("Generate Heatmap", variant="primary")
    heatmap = gr.Image(value=app_util.create_heatmap(model_init, dataset_init, metric_init), label="Similarity Heatmap", elem_classes="image_container", visible=True)

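    # Validate the current model/dataset selection first, then chain the heatmap generation.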
    generate_btn.click(
        fn=app_util.validate_inputs,
        inputs=[model_dropdown, dataset_dropdown],
        queue=False
    ).then(
        fn=app_util.create_heatmap,
        inputs=[model_dropdown, dataset_dropdown, metric_dropdown],
        outputs=heatmap
    )
    
    gr.Markdown("\* Self-similarity is only 1.0 for CAPA if the model predicts a single option with 100% confidence for each question. If the model is uncertain, the self-similarity will be lower.")

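    # Reset all three components: no models selected, no dataset, no heatmap.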
    clear_btn = gr.Button("Clear Selection")
    clear_btn.click(
        lambda: [[], None, None],
        outputs=[model_dropdown, dataset_dropdown, heatmap]
    )

    gr.Markdown("## Information")
    metric_info_markdown = r"""
We propose Chance Adjusted Probabilistic Agreement (CAPA, or κ_p), a novel metric of model similarity that adjusts for chance agreement due to accuracy.

Using CAPA, we find:

1. LLM-as-a-judge scores are biased towards models that are more similar to the judge, even after controlling for the model's capability.
2. Gains from training strong models on annotations of weak supervisors (weak-to-strong generalization) are higher when the two models are more different.
3. Concerningly, model errors are getting more correlated as capabilities increase.
"""
    gr.Markdown(metric_info_markdown)
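    # Background sketch (an assumption for orientation; the exact terms are defined in the
    # paper and the lm-sim package, not here): like Cohen's kappa and error consistency,
    # CAPA takes a chance-corrected form,
    #     kappa_p = (c_obs - c_exp) / (1 - c_exp),
    # where c_obs is the observed probabilistic agreement between the two models'
    # per-question answer distributions and c_exp is the agreement expected by chance
    # given each model's accuracy.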
    with gr.Row():
        gr.Image(value="data/table_capa.png", label="Comparison of different similarity metrics for multiple-choice questions", elem_classes="image_container", interactive=False)
    gr.Markdown("""
- **Datasets**: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
    - Some datasets are not multiple-choice - for these, the metrics are not applicable. \n
- **Models**: Open LLM Leaderboard models \n
    - Every model evaluation is gated on Hugging Face and access has to be requested. \n
    - We requested access for the most popular models, but some may be missing. \n
    - Notably, loading data is not possible for some meta-llama and gemma models.
- **Metrics**: CAPA (probabilistic), CAPA (deterministic), Error Consistency""")

if __name__ == "__main__":
    demo.launch(ssr_mode=False)