Joschka Strueber committed on
Commit
c608f7f
·
1 Parent(s): 3dfa66b

[Ref, Fix] indentation error in answer key selection, longer explanation in demo, exclusion of broken dataset

Browse files
Files changed (3) hide show
  1. app.py +1 -1
  2. src/dataloading.py +3 -1
  3. src/utils.py +6 -4
app.py CHANGED
@@ -69,7 +69,7 @@ with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo
69
  outputs=heatmap
70
  )
71
 
72
- gr.Markdown("\* Self-similarity is only 1.0 for the probabilistic Kappa_p metric if the model predicts a single option with 100% confidence for each question.")
73
 
74
  clear_btn = gr.Button("Clear Selection")
75
  clear_btn.click(
 
69
  outputs=heatmap
70
  )
71
 
72
+ gr.Markdown("\* Self-similarity is only 1.0 for CAPA if the model predicts a single option with 100% confidence for each question. If the model is uncertain, the self-similarity will be lower.")
73
 
74
  clear_btn = gr.Button("Clear Selection")
75
  clear_btn.click(
src/dataloading.py CHANGED
@@ -88,7 +88,7 @@ def get_leaderboard_datasets(model_ids):
88
  common_datasets = set.intersection(*model_datasets.values())
89
 
90
  # Filter datasets that are not MCQ or currently do not work
91
- ignore = ["math_", "ifeval"]
92
  discard = []
93
  for dataset in common_datasets:
94
  for ignore_data in ignore:
@@ -132,6 +132,8 @@ def filter_labels(dataset_name, doc):
132
  labels.append(1)
133
  elif test_target.isdigit():
134
  labels = [int(d[target_key]) for d in doc]
 
 
135
 
136
  return labels
137
 
 
88
  common_datasets = set.intersection(*model_datasets.values())
89
 
90
  # Filter datasets that are not MCQ or currently do not work
91
+ ignore = ["bbh_temporal_sequences", "math_", "ifeval"]
92
  discard = []
93
  for dataset in common_datasets:
94
  for ignore_data in ignore:
 
132
  labels.append(1)
133
  elif test_target.isdigit():
134
  labels = [int(d[target_key]) for d in doc]
135
+
136
+ print(f"Number of labels: {len(labels)}")
137
 
138
  return labels
139
 
src/utils.py CHANGED
@@ -18,7 +18,9 @@ def opt_in_pars_to_index(s):
18
  raise ValueError("Invalid format")
19
 
20
  def get_test_target(doc):
21
- if "target" in doc:
22
- return doc["target"], "target"
23
- elif "answer" in doc:
24
- return doc["answer"], "answer"
 
 
 
18
  raise ValueError("Invalid format")
19
 
20
  def get_test_target(doc):
21
+ if "target" in doc:
22
+ return doc["target"], "target"
23
+ elif "answer" in doc:
24
+ return doc["answer"], "answer"
25
+ else:
26
+ return "", ""