Spaces:
Running
Running
Joschka Strueber
committed on
Commit
·
0d09d9a
1
Parent(s):
5623280
[Ref] switch to KaTeX CSS in HTML
Browse files
app.py
CHANGED
@@ -78,17 +78,27 @@ with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo
|
|
78 |
)
|
79 |
|
80 |
gr.Markdown("## Information")
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
85 |
|
86 |
-
<
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
with gr.Row():
|
93 |
gr.Image(value="data/table_capa.png", label="Comparison of different similarity metrics for multiple-choice questions", elem_classes="image_container", interactive=False)
|
94 |
gr.Markdown("""
|
|
|
78 |
)
|
79 |
|
80 |
gr.Markdown("## Information")
|
81 |
+
metric_info_html = r"""
|
82 |
+
<!-- Include KaTeX CSS for styling -->
|
83 |
+
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-vZTGXXFDvM1R7zDKx2g5N5S4FcoFdTJuFTz1Xj2A2/J1j4fGmS7a6hLQ6ZPfF1sk" crossorigin="anonymous">
|
84 |
+
<!-- Include KaTeX and its auto-render extension -->
|
85 |
+
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-6R6ckgSpF6yXUHg9+KJGXN9I+ik5U9dviDuzhSxrtk4AUaGr8/8Qovm6N9fl/hkz" crossorigin="anonymous"></script>
|
86 |
+
<script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-mll67QQ8ErU7t8/QqU3m0Cq56E7i2xUeFYSv6O9V3CRjNdqPzqxK9z6gS9GQFj8D" crossorigin="anonymous"
|
87 |
+
onload="renderMathInElement(document.body);"></script>
|
88 |
|
89 |
+
<div>
|
90 |
+
<p>
|
91 |
+
We propose Chance Adjusted Probabilistic Agreement ($\operatorname{CAPA}$, or $\kappa_p$), a novel metric
|
92 |
+
for model similarity which adjusts for chance agreement due to accuracy. Using CAPA, we find:
|
93 |
+
</p>
|
94 |
+
<ol>
|
95 |
+
<li>LLM-as-a-judge scores are biased towards more similar models controlling for the model's capability.</li>
|
96 |
+
<li>Gain from training strong models on annotations of weak supervisors (weak-to-strong generalization) is higher when the two models are more different.</li>
|
97 |
+
<li>Concerningly, model errors are getting more correlated as capabilities increase.</li>
|
98 |
+
</ol>
|
99 |
+
</div>
|
100 |
+
"""
|
101 |
+
gr.HTML(value=metric_info_html)
|
102 |
with gr.Row():
|
103 |
gr.Image(value="data/table_capa.png", label="Comparison of different similarity metrics for multiple-choice questions", elem_classes="image_container", interactive=False)
|
104 |
gr.Markdown("""
|