Generate perplexity and kld scores
- scores/Dolphin3.0-R1-Mistral-24B-IQ3_M.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-IQ3_S.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-IQ4_NL.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q3_K_L.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q3_K_M.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q3_K_S.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q4_K_M.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q4_K_S.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q5_K_M.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q6_K.log +42 -0
- scores/Dolphin3.0-R1-Mistral-24B-Q8_0.log +42 -0
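The statistics in these logs follow the format printed by llama.cpp's `llama-perplexity` tool when it is run with `--kl-divergence` against logits previously saved from the unquantized model with `--kl-divergence-base`. As a rough illustration only, the sketch below shows how a batch of such logs might be produced; the binary location, the base-logits file name, and the Python driver itself are assumptions and are not part of this commit.

```python
import subprocess
from pathlib import Path

# Assumed paths -- adjust to the local llama.cpp build and model files.
LLAMA_PERPLEXITY = "./llama-perplexity"        # llama.cpp perplexity binary (assumed location)
BASE_LOGITS = "Dolphin3.0-R1-Mistral-24B.kld"  # logits saved from the unquantized model (assumed name)
QUANTS = ["IQ3_M", "IQ3_S", "IQ4_NL", "Q3_K_L", "Q3_K_M", "Q3_K_S",
          "Q4_K_M", "Q4_K_S", "Q5_K_M", "Q6_K", "Q8_0"]

# One-time step (not shown in these logs): save the base model's logits, e.g.
#   ./llama-perplexity -m Dolphin3.0-R1-Mistral-24B.gguf -f eval.txt \
#       --kl-divergence-base Dolphin3.0-R1-Mistral-24B.kld

Path("scores").mkdir(exist_ok=True)

for quant in QUANTS:
    model = f"Dolphin3.0-R1-Mistral-24B-{quant}.gguf"
    log_path = Path("scores") / f"Dolphin3.0-R1-Mistral-24B-{quant}.log"
    # Score the quantized model's token distributions against the saved base logits.
    result = subprocess.run(
        [LLAMA_PERPLEXITY, "-m", model,
         "--kl-divergence-base", BASE_LOGITS, "--kl-divergence"],
        capture_output=True, text=True, check=True,
    )
    # llama-perplexity splits its output between stdout and stderr; keep both.
    log_path.write_text(result.stdout + result.stderr, encoding="utf-8")
```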
scores/Dolphin3.0-R1-Mistral-24B-IQ3_M.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 26.172864 ± 0.256006
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.01%
+Mean ln(PPL(Q)/PPL(base)) : 0.114031 ± 0.001391
+Mean PPL(Q)/PPL(base) : 1.120786 ± 0.001559
+Mean PPL(Q)-PPL(base) : 2.820631 ± 0.048526
+
+====== KL divergence statistics ======
+Mean KLD: 0.078326 ± 0.000305
+Maximum KLD: 7.825869
+99.9% KLD: 1.127171
+99.0% KLD: 0.520127
+99.0% KLD: 0.520127
+Median KLD: 0.038346
+10.0% KLD: 0.000902
+5.0% KLD: 0.000207
+1.0% KLD: 0.000001
+Minimum KLD: -0.000505
+
+====== Token probability statistics ======
+Mean Δp: -0.107 ± 0.018 %
+Maximum Δp: 86.241%
+99.9% Δp: 39.725%
+99.0% Δp: 22.115%
+95.0% Δp: 9.997%
+90.0% Δp: 5.175%
+75.0% Δp: 0.626%
+Median Δp: -0.000%
+25.0% Δp: -0.677%
+10.0% Δp: -5.458%
+5.0% Δp: -10.553%
+1.0% Δp: -23.853%
+0.1% Δp: -43.880%
+Minimum Δp: -97.767%
+RMS Δp : 6.900 ± 0.035 %
+Same top p: 88.021 ± 0.083 %
+
+llama_perf_context_print: load time = 82098.99 ms
+llama_perf_context_print: prompt eval time = 1718829.57 ms / 304128 tokens ( 5.65 ms per token, 176.94 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 1775127.52 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-IQ3_S.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 26.143038 ± 0.254443
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 98.95%
+Mean ln(PPL(Q)/PPL(base)) : 0.112890 ± 0.001420
+Mean PPL(Q)/PPL(base) : 1.119509 ± 0.001590
+Mean PPL(Q)-PPL(base) : 2.790806 ± 0.048102
+
+====== KL divergence statistics ======
+Mean KLD: 0.082486 ± 0.000324
+Maximum KLD: 8.223216
+99.9% KLD: 1.163796
+99.0% KLD: 0.554553
+99.0% KLD: 0.554553
+Median KLD: 0.040744
+10.0% KLD: 0.000979
+5.0% KLD: 0.000241
+1.0% KLD: 0.000008
+Minimum KLD: -0.000550
+
+====== Token probability statistics ======
+Mean Δp: -0.290 ± 0.018 %
+Maximum Δp: 83.668%
+99.9% Δp: 39.492%
+99.0% Δp: 22.349%
+95.0% Δp: 10.000%
+90.0% Δp: 4.974%
+75.0% Δp: 0.538%
+Median Δp: -0.001%
+25.0% Δp: -0.805%
+10.0% Δp: -5.962%
+5.0% Δp: -11.458%
+1.0% Δp: -25.491%
+0.1% Δp: -47.606%
+Minimum Δp: -94.438%
+RMS Δp : 7.163 ± 0.036 %
+Same top p: 87.635 ± 0.085 %
+
+llama_perf_context_print: load time = 80439.51 ms
+llama_perf_context_print: prompt eval time = 1705180.19 ms / 304128 tokens ( 5.61 ms per token, 178.36 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 1852832.45 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-IQ4_NL.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 23.491557 ± 0.221754
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.65%
+Mean ln(PPL(Q)/PPL(base)) : 0.005948 ± 0.000786
+Mean PPL(Q)/PPL(base) : 1.005966 ± 0.000791
+Mean PPL(Q)-PPL(base) : 0.139324 ± 0.018431
+
+====== KL divergence statistics ======
+Mean KLD: 0.025519 ± 0.000102
+Maximum KLD: 2.251977
+99.9% KLD: 0.379603
+99.0% KLD: 0.171840
+99.0% KLD: 0.171840
+Median KLD: 0.012414
+10.0% KLD: 0.000339
+5.0% KLD: 0.000079
+1.0% KLD: -0.000015
+Minimum KLD: -0.000684
+
+====== Token probability statistics ======
+Mean Δp: -0.210 ± 0.010 %
+Maximum Δp: 56.116%
+99.9% Δp: 23.081%
+99.0% Δp: 12.242%
+95.0% Δp: 5.439%
+90.0% Δp: 2.734%
+75.0% Δp: 0.282%
+Median Δp: -0.000%
+25.0% Δp: -0.493%
+10.0% Δp: -3.448%
+5.0% Δp: -6.500%
+1.0% Δp: -14.293%
+0.1% Δp: -27.100%
+Minimum Δp: -67.311%
+RMS Δp : 4.044 ± 0.022 %
+Same top p: 92.678 ± 0.067 %
+
+llama_perf_context_print: load time = 103365.78 ms
+llama_perf_context_print: prompt eval time = 1896464.64 ms / 304128 tokens ( 6.24 ms per token, 160.37 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 1956813.06 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q3_K_L.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 24.550639 ± 0.235362
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.30%
+Mean ln(PPL(Q)/PPL(base)) : 0.050045 ± 0.001135
+Mean PPL(Q)/PPL(base) : 1.051319 ± 0.001194
+Mean PPL(Q)-PPL(base) : 1.198406 ± 0.030666
+
+====== KL divergence statistics ======
+Mean KLD: 0.053089 ± 0.000213
+Maximum KLD: 4.851488
+99.9% KLD: 0.770215
+99.0% KLD: 0.362664
+99.0% KLD: 0.362664
+Median KLD: 0.025806
+10.0% KLD: 0.000577
+5.0% KLD: 0.000120
+1.0% KLD: -0.000018
+Minimum KLD: -0.000444
+
+====== Token probability statistics ======
+Mean Δp: -0.115 ± 0.015 %
+Maximum Δp: 82.459%
+99.9% Δp: 33.908%
+99.0% Δp: 18.275%
+95.0% Δp: 8.313%
+90.0% Δp: 4.302%
+75.0% Δp: 0.536%
+Median Δp: -0.000%
+25.0% Δp: -0.557%
+10.0% Δp: -4.600%
+5.0% Δp: -8.973%
+1.0% Δp: -20.298%
+0.1% Δp: -38.592%
+Minimum Δp: -79.558%
+RMS Δp : 5.840 ± 0.030 %
+Same top p: 89.673 ± 0.078 %
+
+llama_perf_context_print: load time = 95100.93 ms
+llama_perf_context_print: prompt eval time = 1924525.06 ms / 304128 tokens ( 6.33 ms per token, 158.03 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 2042841.27 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q3_K_M.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 24.806925 ± 0.237610
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.16%
+Mean ln(PPL(Q)/PPL(base)) : 0.060430 ± 0.001239
+Mean PPL(Q)/PPL(base) : 1.062293 ± 0.001316
+Mean PPL(Q)-PPL(base) : 1.454692 ± 0.034094
+
+====== KL divergence statistics ======
+Mean KLD: 0.063828 ± 0.000254
+Maximum KLD: 4.249581
+99.9% KLD: 0.920882
+99.0% KLD: 0.436885
+99.0% KLD: 0.436885
+Median KLD: 0.031026
+10.0% KLD: 0.000750
+5.0% KLD: 0.000177
+1.0% KLD: 0.000000
+Minimum KLD: -0.000455
+
+====== Token probability statistics ======
+Mean Δp: -0.257 ± 0.016 %
+Maximum Δp: 73.489%
+99.9% Δp: 36.078%
+99.0% Δp: 19.828%
+95.0% Δp: 8.789%
+90.0% Δp: 4.486%
+75.0% Δp: 0.496%
+Median Δp: -0.000%
+25.0% Δp: -0.687%
+10.0% Δp: -5.257%
+5.0% Δp: -10.230%
+1.0% Δp: -22.714%
+0.1% Δp: -43.074%
+Minimum Δp: -90.990%
+RMS Δp : 6.411 ± 0.033 %
+Same top p: 88.802 ± 0.081 %
+
+llama_perf_context_print: load time = 87933.48 ms
+llama_perf_context_print: prompt eval time = 1845252.27 ms / 304128 tokens ( 6.07 ms per token, 164.82 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 1979075.22 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q3_K_S.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 39.396539 ± 0.409082
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 94.50%
+Mean ln(PPL(Q)/PPL(base)) : 0.522985 ± 0.003415
+Mean PPL(Q)/PPL(base) : 1.687057 ± 0.005762
+Mean PPL(Q)-PPL(base) : 16.044307 ± 0.213016
+
+====== KL divergence statistics ======
+Mean KLD: 0.448856 ± 0.001660
+Maximum KLD: 11.228906
+99.9% KLD: 5.432167
+99.0% KLD: 3.097294
+99.0% KLD: 3.097294
+Median KLD: 0.213731
+10.0% KLD: 0.005922
+5.0% KLD: 0.001612
+1.0% KLD: 0.000166
+Minimum KLD: -0.000425
+
+====== Token probability statistics ======
+Mean Δp: -2.648 ± 0.043 %
+Maximum Δp: 96.523%
+99.9% Δp: 66.438%
+99.0% Δp: 40.723%
+95.0% Δp: 18.234%
+90.0% Δp: 8.706%
+75.0% Δp: 0.595%
+Median Δp: -0.023%
+25.0% Δp: -2.797%
+10.0% Δp: -17.572%
+5.0% Δp: -33.515%
+1.0% Δp: -71.845%
+0.1% Δp: -94.533%
+Minimum Δp: -99.575%
+RMS Δp : 16.773 ± 0.071 %
+Same top p: 74.363 ± 0.112 %
+
+llama_perf_context_print: load time = 79800.46 ms
+llama_perf_context_print: prompt eval time = 1798108.56 ms / 304128 tokens ( 5.91 ms per token, 169.14 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 1960563.16 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q4_K_M.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 24.940671 ± 0.241913
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.60%
+Mean ln(PPL(Q)/PPL(base)) : 0.065807 ± 0.000886
+Mean PPL(Q)/PPL(base) : 1.068021 ± 0.000947
+Mean PPL(Q)-PPL(base) : 1.588438 ± 0.029452
+
+====== KL divergence statistics ======
+Mean KLD: 0.030244 ± 0.000124
+Maximum KLD: 2.276901
+99.9% KLD: 0.467041
+99.0% KLD: 0.202787
+99.0% KLD: 0.202787
+Median KLD: 0.014692
+10.0% KLD: 0.000262
+5.0% KLD: 0.000030
+1.0% KLD: -0.000070
+Minimum KLD: -0.000565
+
+====== Token probability statistics ======
+Mean Δp: 0.252 ± 0.011 %
+Maximum Δp: 56.165%
+99.9% Δp: 26.367%
+99.0% Δp: 14.890%
+95.0% Δp: 7.165%
+90.0% Δp: 3.916%
+75.0% Δp: 0.648%
+Median Δp: 0.000%
+25.0% Δp: -0.249%
+10.0% Δp: -2.747%
+5.0% Δp: -5.791%
+1.0% Δp: -14.128%
+0.1% Δp: -27.579%
+Minimum Δp: -73.858%
+RMS Δp : 4.403 ± 0.023 %
+Same top p: 92.309 ± 0.068 %
+
+llama_perf_context_print: load time = 288598.22 ms
+llama_perf_context_print: prompt eval time = 1999513.57 ms / 304128 tokens ( 6.57 ms per token, 152.10 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 2559785.48 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q4_K_S.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 23.315818 ± 0.220552
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.50%
+Mean ln(PPL(Q)/PPL(base)) : -0.001561 ± 0.000949
+Mean PPL(Q)/PPL(base) : 0.998441 ± 0.000948
+Mean PPL(Q)-PPL(base) : -0.036415 ± 0.022157
+
+====== KL divergence statistics ======
+Mean KLD: 0.037680 ± 0.000153
+Maximum KLD: 5.026946
+99.9% KLD: 0.544697
+99.0% KLD: 0.251806
+99.0% KLD: 0.251806
+Median KLD: 0.018222
+10.0% KLD: 0.000471
+5.0% KLD: 0.000107
+1.0% KLD: -0.000012
+Minimum KLD: -0.000554
+
+====== Token probability statistics ======
+Mean Δp: 0.151 ± 0.012 %
+Maximum Δp: 80.430%
+99.9% Δp: 29.435%
+99.0% Δp: 16.228%
+95.0% Δp: 7.580%
+90.0% Δp: 4.031%
+75.0% Δp: 0.584%
+Median Δp: 0.000%
+25.0% Δp: -0.350%
+10.0% Δp: -3.338%
+5.0% Δp: -6.778%
+1.0% Δp: -15.802%
+0.1% Δp: -30.165%
+Minimum Δp: -63.405%
+RMS Δp : 4.833 ± 0.025 %
+Same top p: 91.410 ± 0.072 %
+
+llama_perf_context_print: load time = 103826.14 ms
+llama_perf_context_print: prompt eval time = 1941845.24 ms / 304128 tokens ( 6.38 ms per token, 156.62 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 2633698.11 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q5_K_M.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 24.304562 ± 0.233045
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.74%
+Mean ln(PPL(Q)/PPL(base)) : 0.039971 ± 0.000693
+Mean PPL(Q)/PPL(base) : 1.040781 ± 0.000721
+Mean PPL(Q)-PPL(base) : 0.952329 ± 0.020290
+
+====== KL divergence statistics ======
+Mean KLD: 0.019001 ± 0.000084
+Maximum KLD: 2.667997
+99.9% KLD: 0.307100
+99.0% KLD: 0.131595
+99.0% KLD: 0.131595
+Median KLD: 0.009005
+10.0% KLD: 0.000235
+5.0% KLD: 0.000046
+1.0% KLD: -0.000033
+Minimum KLD: -0.000534
+
+====== Token probability statistics ======
+Mean Δp: -0.033 ± 0.009 %
+Maximum Δp: 53.497%
+99.9% Δp: 21.400%
+99.0% Δp: 11.091%
+95.0% Δp: 4.986%
+90.0% Δp: 2.607%
+75.0% Δp: 0.323%
+Median Δp: -0.000%
+25.0% Δp: -0.333%
+10.0% Δp: -2.621%
+5.0% Δp: -5.142%
+1.0% Δp: -11.765%
+0.1% Δp: -23.329%
+Minimum Δp: -88.396%
+RMS Δp : 3.498 ± 0.021 %
+Same top p: 93.797 ± 0.062 %
+
+llama_perf_context_print: load time = 128672.98 ms
+llama_perf_context_print: prompt eval time = 2206637.86 ms / 304128 tokens ( 7.26 ms per token, 137.82 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 2358244.53 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q6_K.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 24.415348 ± 0.234385
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.79%
+Mean ln(PPL(Q)/PPL(base)) : 0.044519 ± 0.000634
+Mean PPL(Q)/PPL(base) : 1.045525 ± 0.000663
+Mean PPL(Q)-PPL(base) : 1.063116 ± 0.020031
+
+====== KL divergence statistics ======
+Mean KLD: 0.015503 ± 0.000075
+Maximum KLD: 3.272221
+99.9% KLD: 0.267841
+99.0% KLD: 0.107995
+99.0% KLD: 0.107995
+Median KLD: 0.007291
+10.0% KLD: 0.000184
+5.0% KLD: 0.000033
+1.0% KLD: -0.000041
+Minimum KLD: -0.000471
+
+====== Token probability statistics ======
+Mean Δp: -0.008 ± 0.008 %
+Maximum Δp: 80.786%
+99.9% Δp: 18.971%
+99.0% Δp: 10.037%
+95.0% Δp: 4.585%
+90.0% Δp: 2.431%
+75.0% Δp: 0.317%
+Median Δp: -0.000%
+25.0% Δp: -0.289%
+10.0% Δp: -2.327%
+5.0% Δp: -4.593%
+1.0% Δp: -10.622%
+0.1% Δp: -22.631%
+Minimum Δp: -95.612%
+RMS Δp : 3.201 ± 0.022 %
+Same top p: 94.448 ± 0.059 %
+
+llama_perf_context_print: load time = 134148.25 ms
+llama_perf_context_print: prompt eval time = 2502976.05 ms / 304128 tokens ( 8.23 ms per token, 121.51 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 3009280.06 ms / 304129 tokens
scores/Dolphin3.0-R1-Mistral-24B-Q8_0.log
ADDED
@@ -0,0 +1,42 @@
+====== Perplexity statistics ======
+Mean PPL(Q) : 23.519878 ± 0.223215
+Mean PPL(base) : 23.352232 ± 0.220841
+Cor(ln(PPL(Q)), ln(PPL(base))): 99.96%
+Mean ln(PPL(Q)/PPL(base)) : 0.007153 ± 0.000271
+Mean PPL(Q)/PPL(base) : 1.007179 ± 0.000273
+Mean PPL(Q)-PPL(base) : 0.167646 ± 0.006738
+
+====== KL divergence statistics ======
+Mean KLD: 0.002318 ± 0.000013
+Maximum KLD: 0.848010
+99.9% KLD: 0.043823
+99.0% KLD: 0.015801
+99.0% KLD: 0.015801
+Median KLD: 0.001166
+10.0% KLD: 0.000025
+5.0% KLD: -0.000001
+1.0% KLD: -0.000063
+Minimum KLD: -0.000401
+
+====== Token probability statistics ======
+Mean Δp: 0.006 ± 0.003 %
+Maximum Δp: 26.906%
+99.9% Δp: 7.636%
+99.0% Δp: 4.056%
+95.0% Δp: 1.919%
+90.0% Δp: 0.994%
+75.0% Δp: 0.127%
+Median Δp: -0.000%
+25.0% Δp: -0.119%
+10.0% Δp: -0.958%
+5.0% Δp: -1.871%
+1.0% Δp: -4.090%
+0.1% Δp: -7.817%
+Minimum Δp: -20.640%
+RMS Δp : 1.266 ± 0.008 %
+Same top p: 97.778 ± 0.038 %
+
+llama_perf_context_print: load time = 3238.55 ms
+llama_perf_context_print: prompt eval time = 2842839.16 ms / 304128 tokens ( 9.35 ms per token, 106.98 tokens per second)
+llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+llama_perf_context_print: total time = 2975924.16 ms / 304129 tokens
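For a side-by-side view of the quantizations, the logs above can be reduced to a small table. The snippet below is a minimal parsing sketch based only on the log format shown in this commit; the choice of metrics (PPL(Q), PPL ratio, mean KLD, top-token agreement) and the sort order are illustrative, not part of the commit.

```python
import glob
import re

# Metric labels to pull out of each log, exactly as they appear above.
WANTED = ["Mean PPL(Q)", "Mean PPL(Q)/PPL(base)", "Mean KLD", "Same top p"]

def parse_log(path):
    """Return {label: value} for the metrics of interest in one score log."""
    stats = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            if ":" not in line:
                continue
            label, rest = line.split(":", 1)
            label = " ".join(label.split())  # collapse any aligned spacing
            if label in WANTED and rest.strip():
                first = rest.split()[0].rstrip("%")  # value before the ± / % suffix
                try:
                    stats[label] = float(first)
                except ValueError:
                    pass
    return stats

def quant_name(path):
    """'scores/Dolphin3.0-R1-Mistral-24B-IQ3_M.log' -> 'IQ3_M'."""
    m = re.search(r"Dolphin3\.0-R1-Mistral-24B-(.+)\.log$", path)
    return m.group(1) if m else path

rows = {quant_name(p): parse_log(p) for p in sorted(glob.glob("scores/*.log"))}

print(f"{'quant':<8} {'PPL(Q)':>10} {'PPL ratio':>10} {'mean KLD':>10} {'same top':>9}")
for quant, s in sorted(rows.items(), key=lambda kv: kv[1].get("Mean KLD", float("inf"))):
    print(f"{quant:<8} {s.get('Mean PPL(Q)', float('nan')):>10.4f} "
          f"{s.get('Mean PPL(Q)/PPL(base)', float('nan')):>10.4f} "
          f"{s.get('Mean KLD', float('nan')):>10.6f} "
          f"{s.get('Same top p', float('nan')):>8.2f}%")
```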