Spaces:

bethgelab
/

lm-similarity

Running

lm-similarity / app.py

Joschka Strueber

[Fix] change default models

b8a1ed4 14 days ago

4.74 kB

	import gradio as gr

	import src.app_util as app_util
	from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets


	links_markdown = """
	[📄 Paper](https://arxiv.org/abs/2502.04313)   \|
	[☯ Homepage](https://model-similarity.github.io/)   \|
	[🐱 Code](https://github.com/model-similarity/lm-similarity)   \|
	[🐍 pip install lm-sim](https://pypi.org/project/lm-sim/)   \|
	[🤗 Data](https://huggingface.co/datasets/bethgelab/lm-similarity)
	"""

	model_init = ["HuggingFaceTB/SmolLM2-1.7B-Instruct", "meta-llama/Llama-3.1-8B-Instruct", "microsoft/phi-4", "Qwen/Qwen2.5-14B-Instruct-1M", "meta-llama/Llama-3.3-70B-Instruct"]
	dataset_init = "mmlu_pro"
	metric_init = "CAPA"


	# Create Gradio interface
	with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
	gr.Markdown("# Model Similarity Comparison Tool")
	gr.Markdown(links_markdown)
	gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')

	with gr.Row():
	dataset_dropdown = gr.Dropdown(
	choices=get_leaderboard_datasets(model_init),
	label="Select Dataset",
	value=dataset_init,
	filterable=True,
	interactive=True,
	allow_custom_value=False,
	info="Open LLM Leaderboard v2 benchmark datasets"
	)
	metric_dropdown = gr.Dropdown(
	choices=["CAPA", "CAPA (det.)", "Error Consistency"],
	label="Select Metric",
	value=metric_init,
	info="Select a similarity metric to compute"
	)

	model_dropdown = gr.Dropdown(
	choices=get_leaderboard_models_cached(),
	label="Select Models",
	value=model_init,
	multiselect=True,
	filterable=True,
	allow_custom_value=False,
	info="Search and select multiple models"
	)

	model_dropdown.change(
	fn=app_util.update_datasets_based_on_models,
	inputs=[model_dropdown, dataset_dropdown],
	outputs=dataset_dropdown
	)

	generate_btn = gr.Button("Generate Heatmap", variant="primary")
	heatmap = gr.Image(value=app_util.create_heatmap(model_init, dataset_init, metric_init), label="Similarity Heatmap", elem_classes="image_container", visible=True)

	generate_btn.click(
	fn=app_util.validate_inputs,
	inputs=[model_dropdown, dataset_dropdown],
	queue=False
	).then(
	fn=app_util.create_heatmap,
	inputs=[model_dropdown, dataset_dropdown, metric_dropdown],
	outputs=heatmap
	)

	gr.Markdown("\* Self-similarity is only 1.0 for CAPA if the model predicts a single option with 100% confidence for each question. If the model is uncertain, the self-similarity will be lower.")

	clear_btn = gr.Button("Clear Selection")
	clear_btn.click(
	lambda: [[], None, None],
	outputs=[model_dropdown, dataset_dropdown, heatmap]
	)

	gr.Markdown("## Information")
	metric_info_markdown = r"""
	We propose Chance Adjusted Probabilistic Agreement (CAPA, or κ_p), a novel metric for model similarity which adjusts for chance agreement due to accuracy.

	Using CAPA, we find:

	1. LLM-as-a-judge scores are biased towards more similar models controlling for the model's capability.
	2. Gain from training strong models on annotations of weak supervisors (weak-to-strong generalization) is higher when the two models are more different.
	3. Concerningly, model errors are getting more correlated as capabilities increase.
	"""
	gr.Markdown(metric_info_markdown)
	with gr.Row():
	gr.Image(value="data/table_capa.png", label="Comparison of different similarity metrics for multiple-choice questions", elem_classes="image_container", interactive=False)
	gr.Markdown("""
	- Datasets: [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/) benchmark datasets \n
	- Some datasets are not multiple-choice - for these, the metrics are not applicable. \n
	- Models: Open LLM Leaderboard models \n
	- Every model evaluation is gated on Hugging Face and access has to be requested. \n
	- We requested access for the most popular models, but some may be missing. \n
	- Notably, loading data is not possible for some meta-llama and gemma models.
	- Metrics: CAPA (probabilistic), CAPA (deterministic), Error Consistency""")

	if __name__ == "__main__":
	demo.launch(ssr_mode=False)