Joschka Strueber committed
Commit a48b15f · 1 Parent(s): 32f9617

[Add] load models and datasets from hub, compute similarities

Files changed (3):
  1. app.py +27 -8
  2. src/dataloading.py +43 -28
  3. src/similarity.py +41 -12
app.py CHANGED
@@ -59,9 +59,31 @@ def validate_inputs(selected_models, selected_dataset):
         raise gr.Error("Please select at least one model!")
     if not selected_dataset:
         raise gr.Error("Please select a dataset!")
+
+
+def update_datasets_based_on_models(selected_models, current_dataset):
+    # Get available datasets for selected models
+    available_datasets = get_leaderboard_datasets(selected_models) if selected_models else []
+
+    # Check if current dataset is still valid
+    valid_dataset = current_dataset if current_dataset in available_datasets else None
+
+    return gr.Dropdown.update(
+        choices=available_datasets,
+        value=valid_dataset
+    )
 
 with gr.Blocks(title="LLM Similarity Analyzer") as demo:
     gr.Markdown("## Model Similarity Comparison Tool")
+
+    model_dropdown = gr.Dropdown(
+        choices=get_leaderboard_models_cached(),
+        label="Select Models",
+        multiselect=True,
+        filterable=True,
+        allow_custom_value=False,
+        info="Search and select multiple models"
+    )
 
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
@@ -76,14 +98,11 @@ with gr.Blocks(title="LLM Similarity Analyzer") as demo:
             label="Select Metric",
            info="Select a similarity metric to compute"
        )
-
-        model_dropdown = gr.Dropdown(
-            choices=get_leaderboard_models_cached(),
-            label="Select Models",
-            multiselect=True,
-            filterable=True,
-            allow_custom_value=False,
-            info="Search and select multiple models"
+
+        model_dropdown.change(
+            fn=update_datasets_based_on_models,
+            inputs=[model_dropdown, dataset_dropdown],
+            outputs=dataset_dropdown
        )
 
     generate_btn = gr.Button("Generate Heatmap", variant="primary")
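
Note: the `update_datasets_based_on_models` callback above uses `gr.Dropdown.update`, which exists in Gradio 3.x; in Gradio 4.x the per-component `update` helpers were removed in favour of the generic `gr.update()`. Below is a minimal, version-agnostic sketch of the same dropdown-refresh pattern, assuming the `src.dataloading` helpers from this commit; the standalone demo is illustrative and not part of the commit.

    import gradio as gr

    from src.dataloading import get_leaderboard_models_cached, get_leaderboard_datasets


    def update_datasets_based_on_models(selected_models, current_dataset):
        # Recompute the datasets shared by all currently selected models
        available = get_leaderboard_datasets(selected_models) if selected_models else []
        # Keep the current choice only if it is still offered
        value = current_dataset if current_dataset in available else None
        # gr.update(...) works on both Gradio 3.x and 4.x
        return gr.update(choices=available, value=value)


    with gr.Blocks() as demo:
        model_dropdown = gr.Dropdown(choices=get_leaderboard_models_cached(), label="Select Models", multiselect=True)
        dataset_dropdown = gr.Dropdown(choices=[], label="Select Dataset")
        model_dropdown.change(fn=update_datasets_based_on_models, inputs=[model_dropdown, dataset_dropdown], outputs=dataset_dropdown)

    demo.launch()
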
src/dataloading.py CHANGED
@@ -1,33 +1,27 @@
 import datasets
 import numpy as np
-
 from huggingface_hub import HfApi
+
 from functools import lru_cache
 
 
 def get_leaderboard_models():
-    #api = HfApi()
+    api = HfApi()
 
     # List all datasets in the open-llm-leaderboard organization
-    #datasets = api.list_datasets(author="open-llm-leaderboard")
+    datasets = api.list_datasets(author="open-llm-leaderboard")
 
     models = []
-    #for dataset in datasets:
-    #    if dataset.id.endswith("-details"):
-    #        # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
-    #        model_part = dataset.id.split("/")[-1].replace("-details", "")
-    #        provider, model = model_part.split("__", 1)
-    #        models.append(f"{provider}/{model}")
+    for dataset in datasets:
+        if dataset.id.endswith("-details"):
+            # Format: "open-llm-leaderboard/<provider>__<model_name>-details"
+            model_part = dataset.id.split("/")[-1].replace("-details", "")
+            if "__" in model_part:
+                provider, model = model_part.split("__", 1)
+                models.append(f"{provider}/{model}")
+            else:
+                models.append(model_part)
 
-    # Example models
-    models = [
-        "meta_llama/Llama-3.2-1B-Instruct",
-        "meta_llama/Llama-3.2-3B-Instruct",
-        "meta_llama/Llama-3.1-8B-Instruct",
-        "meta_llama/Llama-3.1-70B-Instruct",
-        "meta_llama/Llama-3.3-70B-Instruct",
-    ]
-
     return sorted(models)
 
 
@@ -37,15 +31,27 @@ def get_leaderboard_models_cached():
     return get_leaderboard_models()
 
 
-def get_leaderboard_datasets():
-    return [
-        "ai2_arc",
-        "hellaswag",
-        "mmlu_pro",
-        "truthful_qa",
-        "winogrande",
-        "gsm8k"
-    ]
+def get_leaderboard_datasets(model_ids):
+    if model_ids is None:
+        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'ifeval', 'math_algebra_hard', 'math_counting_and_prob_hard', 'math_geometry_hard', 'math_intermediate_algebra_hard', 'math_num_theory_hard', 'math_prealgebra_hard', 'math_precalculus_hard', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation']
+
+    # Map each model to its corresponding leaderboard version
+    leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]
+
+    model_datasets = {}
+
+    for model_id in leaderboard_model_ids:
+        # Retrieve the list of available configuration names
+        config_names = datasets.get_dataset_config_names(model_id)
+        dataset_names = [name.split("__leaderboard_")[-1] for name in config_names]
+        model_datasets[model_id] = set(dataset_names)
+
+    # Compute the intersection of datasets across all models
+    if model_datasets:
+        common_datasets = set.intersection(*model_datasets.values())
+
+    return sorted(common_datasets)
+
 
 def filter_labels(doc):
     labels = []
@@ -85,4 +91,13 @@ def load_run_data(model_name, dataset_name):
         log_probs = []
         labels = []
 
-    return log_probs, labels
+    return log_probs, labels
+
+
+if __name__ == "__main__":
+    model_ids = [
+        'Qwen/Qwen2.5-7B-Instruct'
+    ]
+
+    datasets = get_leaderboard_datasets(model_ids)
+    print(datasets)
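
Note: `get_leaderboard_datasets` now issues one `datasets.get_dataset_config_names` call per selected model every time the selection changes, so it is a natural candidate for the same `lru_cache` treatment that `get_leaderboard_models_cached` already gets. A minimal sketch, assuming the function above; the wrapper name and the tuple normalisation are illustrative and not part of this commit.

    from functools import lru_cache

    from src.dataloading import get_leaderboard_datasets


    @lru_cache(maxsize=32)
    def _cached_datasets(model_ids_key):
        # lru_cache needs hashable arguments, so the model list arrives as a tuple
        return get_leaderboard_datasets(list(model_ids_key))


    def get_leaderboard_datasets_cached(model_ids):
        if not model_ids:
            # Fall back to the static default list returned for None
            return get_leaderboard_datasets(None)
        # Sort so that the same selection always maps to the same cache key
        return _cached_datasets(tuple(sorted(model_ids)))
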
src/similarity.py CHANGED
@@ -1,15 +1,28 @@
+import numpy as np
+
 from src.dataloading import load_run_data
-from lmsim.metrics import Kappa_p
-import random
+from lmsim.metrics import Metric, Kappa_p, EC
 
 
-def compute_similarity(selected_model_a, selected_model_b, selected_dataset):
-    """
-    probs_a, gt_a = load_run_data(selected_model_a, selected_dataset)
-    probs_b, gt_b = load_run_data(selected_model_b, selected_dataset)
+
+def load_data_and_compute_similarities(models, dataset, metric_name):
+    # Load data
+    probs = []
+    gts = []
+    for model in models:
+        model_probs, model_gt = load_run_data(model, dataset)
+        probs.append(model_probs)
+        gts.append(model_gt)
 
-    assert len(probs_a) == len(probs_b), f"Models must have the same number of responses: {len(probs_a)} != {len(probs_b)}"
+    # Compute pairwise similarities
+    similarities = compute_pairwise_similarities(probs, gts, metric_name)
+    return similarities
+
+
+def compute_similarity(metric: Metric, probs_a: list[np.array], gt_a: list[int], probs_b: list[np.array], gt_b: list[int]) -> float:
+    # Check that the models have the same number of responses
+    assert len(probs_a) == len(probs_b), f"Models must have the same number of responses: {len(probs_a)} != {len(probs_b)}"
 
     # Only keep responses where the ground truth is the same
     output_a = []
     output_b = []
@@ -21,10 +34,26 @@ def compute_similarity(selected_model_a, selected_model_b, selected_dataset):
             gt.append(gt_a[i])
 
     # Placeholder similarity value
-    kappa_p = Kappa_p()
-    similarity = kappa_p.compute_k(output_a, output_b, gt)
-    """
+    similarity = metric.compute_k(output_a, output_b, gt)
 
-    similarity = random.random()
+    return similarity
+
+
+def compute_pairwise_similarities(metric_name: str, probs: list[list[np.array]], gts: list[list[int]]) -> np.array:
+    # Select chosen metric
+    if metric_name == "Kappa_p (prob.)":
+        metric = Kappa_p()
+    elif metric_name == "Kappa_p (det.)":
+        metric = Kappa_p()
+    elif metric_name == "Error Consistency":
+        metric = EC()
+    else:
+        raise ValueError(f"Invalid metric: {metric_name}")
 
-    return similarity
+
+    similarities = np.zeros((len(probs), len(probs)))
+    for i in range(len(probs)):
+        for j in range(i, len(probs)):
+            similarities[i, j] = compute_similarity(metric, probs[i], gts[i], probs[j], gts[j])
+            similarities[j, i] = similarities[i, j]
+    return similarities
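
Note: taken together, the new functions build a full similarity matrix from leaderboard data. Below is a minimal usage sketch, assuming the `lmsim` package is installed and that `load_run_data` returns per-sample probability arrays and ground-truth labels as in this repo; the model and dataset names are illustrative. It calls `compute_pairwise_similarities` with keyword arguments, since that signature takes `metric_name` first whereas `load_data_and_compute_similarities` passes it last.

    import numpy as np

    from src.dataloading import load_run_data
    from src.similarity import compute_pairwise_similarities

    # Illustrative choices; any models/dataset offered by the app's dropdowns work
    models = ["meta-llama/Llama-3.1-8B-Instruct", "Qwen/Qwen2.5-7B-Instruct"]
    dataset = "bbh_navigate"

    probs, gts = [], []
    for model in models:
        model_probs, model_gt = load_run_data(model, dataset)
        probs.append(model_probs)
        gts.append(model_gt)

    # Keyword arguments avoid depending on the positional order
    similarities = compute_pairwise_similarities(metric_name="Kappa_p (prob.)", probs=probs, gts=gts)
    print(np.round(similarities, 3))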