diff --git "a/scores/Llama-Guard-3-8B-Q6_K.log" "b/scores/Llama-Guard-3-8B-Q6_K.log" --- "a/scores/Llama-Guard-3-8B-Q6_K.log" +++ "b/scores/Llama-Guard-3-8B-Q6_K.log" @@ -1,2142 +1,3 @@ -build: 4730 (fe163d5b) with cc (GCC) 11.4.1 20230605 (Red Hat 11.4.1-2) for x86_64-amazon-linux -llama_model_load_from_file_impl: using device CUDA0 (Tesla T4) - 14812 MiB free -llama_model_loader: loaded meta data with 36 key-value pairs and 292 tensors from ./Llama-Guard-3-8B-Q6_K.gguf (version GGUF V3 (latest)) -llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output. -llama_model_loader: - kv 0: general.architecture str = llama -llama_model_loader: - kv 1: general.type str = model -llama_model_loader: - kv 2: general.name str = Llama Guard 3 8B -llama_model_loader: - kv 3: general.basename str = Llama-Guard-3 -llama_model_loader: - kv 4: general.size_label str = 8B -llama_model_loader: - kv 5: general.license str = llama3.1 -llama_model_loader: - kv 6: general.base_model.count u32 = 1 -llama_model_loader: - kv 7: general.base_model.0.name str = Meta Llama 3.1 8B -llama_model_loader: - kv 8: general.base_model.0.organization str = Meta Llama -llama_model_loader: - kv 9: general.base_model.0.repo_url str = https://huggingface.co/meta-llama/Met... -llama_model_loader: - kv 10: general.tags arr[str,6] = ["facebook", "meta", "pytorch", "llam... -llama_model_loader: - kv 11: general.languages arr[str,1] = ["en"] -llama_model_loader: - kv 12: llama.block_count u32 = 32 -llama_model_loader: - kv 13: llama.context_length u32 = 131072 -llama_model_loader: - kv 14: llama.embedding_length u32 = 4096 -llama_model_loader: - kv 15: llama.feed_forward_length u32 = 14336 -llama_model_loader: - kv 16: llama.attention.head_count u32 = 32 -llama_model_loader: - kv 17: llama.attention.head_count_kv u32 = 8 -llama_model_loader: - kv 18: llama.rope.freq_base f32 = 500000.000000 -llama_model_loader: - kv 19: llama.attention.layer_norm_rms_epsilon f32 = 0.000010 -llama_model_loader: - kv 20: llama.vocab_size u32 = 128256 -llama_model_loader: - kv 21: llama.rope.dimension_count u32 = 128 -llama_model_loader: - kv 22: tokenizer.ggml.model str = gpt2 -llama_model_loader: - kv 23: tokenizer.ggml.pre str = smaug-bpe -llama_model_loader: - kv 24: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ... -llama_model_loader: - kv 25: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ... -llama_model_loader: - kv 26: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "... -llama_model_loader: - kv 27: tokenizer.ggml.bos_token_id u32 = 128000 -llama_model_loader: - kv 28: tokenizer.ggml.eos_token_id u32 = 128009 -llama_model_loader: - kv 29: tokenizer.chat_template str = {% if messages|length % 2 == 0 %}{% s... -llama_model_loader: - kv 30: general.quantization_version u32 = 2 -llama_model_loader: - kv 31: general.file_type u32 = 18 -llama_model_loader: - kv 32: quantize.imatrix.file str = ./imatrix/imatrix-Llama-Guard-3-8B-sm... -llama_model_loader: - kv 33: quantize.imatrix.dataset str = ../datasets/calibration_eur_small.txt -llama_model_loader: - kv 34: quantize.imatrix.entries_count i32 = 224 -llama_model_loader: - kv 35: quantize.imatrix.chunks_count i32 = 722 -llama_model_loader: - type f32: 66 tensors -llama_model_loader: - type q6_K: 226 tensors -print_info: file format = GGUF V3 (latest) -print_info: file type = Q6_K -print_info: file size = 6.14 GiB (6.56 BPW) -init_tokenizer: initializing tokenizer for type 2 -load: control token: 128254 '<|reserved_special_token_246|>' is not marked as EOG -load: control token: 128249 '<|reserved_special_token_241|>' is not marked as EOG -load: control token: 128246 '<|reserved_special_token_238|>' is not marked as EOG -load: control token: 128243 '<|reserved_special_token_235|>' is not marked as EOG -load: control token: 128242 '<|reserved_special_token_234|>' is not marked as EOG -load: control token: 128241 '<|reserved_special_token_233|>' is not marked as EOG -load: control token: 128240 '<|reserved_special_token_232|>' is not marked as EOG -load: control token: 128235 '<|reserved_special_token_227|>' is not marked as EOG -load: control token: 128231 '<|reserved_special_token_223|>' is not marked as EOG -load: control token: 128230 '<|reserved_special_token_222|>' is not marked as EOG -load: control token: 128228 '<|reserved_special_token_220|>' is not marked as EOG -load: control token: 128225 '<|reserved_special_token_217|>' is not marked as EOG -load: control token: 128218 '<|reserved_special_token_210|>' is not marked as EOG -load: control token: 128214 '<|reserved_special_token_206|>' is not marked as EOG -load: control token: 128213 '<|reserved_special_token_205|>' is not marked as EOG -load: control token: 128207 '<|reserved_special_token_199|>' is not marked as EOG -load: control token: 128206 '<|reserved_special_token_198|>' is not marked as EOG -load: control token: 128204 '<|reserved_special_token_196|>' is not marked as EOG -load: control token: 128200 '<|reserved_special_token_192|>' is not marked as EOG -load: control token: 128199 '<|reserved_special_token_191|>' is not marked as EOG -load: control token: 128198 '<|reserved_special_token_190|>' is not marked as EOG -load: control token: 128196 '<|reserved_special_token_188|>' is not marked as EOG -load: control token: 128194 '<|reserved_special_token_186|>' is not marked as EOG -load: control token: 128193 '<|reserved_special_token_185|>' is not marked as EOG -load: control token: 128188 '<|reserved_special_token_180|>' is not marked as EOG -load: control token: 128187 '<|reserved_special_token_179|>' is not marked as EOG -load: control token: 128185 '<|reserved_special_token_177|>' is not marked as EOG -load: control token: 128184 '<|reserved_special_token_176|>' is not marked as EOG -load: control token: 128180 '<|reserved_special_token_172|>' is not marked as EOG -load: control token: 128179 '<|reserved_special_token_171|>' is not marked as EOG -load: control token: 128178 '<|reserved_special_token_170|>' is not marked as EOG -load: control token: 128177 '<|reserved_special_token_169|>' is not marked as EOG -load: control token: 128176 '<|reserved_special_token_168|>' is not marked as EOG -load: control token: 128175 '<|reserved_special_token_167|>' is not marked as EOG -load: control token: 128171 '<|reserved_special_token_163|>' is not marked as EOG -load: control token: 128170 '<|reserved_special_token_162|>' is not marked as EOG -load: control token: 128169 '<|reserved_special_token_161|>' is not marked as EOG -load: control token: 128168 '<|reserved_special_token_160|>' is not marked as EOG -load: control token: 128165 '<|reserved_special_token_157|>' is not marked as EOG -load: control token: 128162 '<|reserved_special_token_154|>' is not marked as EOG -load: control token: 128158 '<|reserved_special_token_150|>' is not marked as EOG -load: control token: 128156 '<|reserved_special_token_148|>' is not marked as EOG -load: control token: 128155 '<|reserved_special_token_147|>' is not marked as EOG -load: control token: 128154 '<|reserved_special_token_146|>' is not marked as EOG -load: control token: 128151 '<|reserved_special_token_143|>' is not marked as EOG -load: control token: 128149 '<|reserved_special_token_141|>' is not marked as EOG -load: control token: 128147 '<|reserved_special_token_139|>' is not marked as EOG -load: control token: 128146 '<|reserved_special_token_138|>' is not marked as EOG -load: control token: 128144 '<|reserved_special_token_136|>' is not marked as EOG -load: control token: 128142 '<|reserved_special_token_134|>' is not marked as EOG -load: control token: 128141 '<|reserved_special_token_133|>' is not marked as EOG -load: control token: 128138 '<|reserved_special_token_130|>' is not marked as EOG -load: control token: 128136 '<|reserved_special_token_128|>' is not marked as EOG -load: control token: 128135 '<|reserved_special_token_127|>' is not marked as EOG -load: control token: 128134 '<|reserved_special_token_126|>' is not marked as EOG -load: control token: 128133 '<|reserved_special_token_125|>' is not marked as EOG -load: control token: 128131 '<|reserved_special_token_123|>' is not marked as EOG -load: control token: 128128 '<|reserved_special_token_120|>' is not marked as EOG -load: control token: 128124 '<|reserved_special_token_116|>' is not marked as EOG -load: control token: 128123 '<|reserved_special_token_115|>' is not marked as EOG -load: control token: 128122 '<|reserved_special_token_114|>' is not marked as EOG -load: control token: 128119 '<|reserved_special_token_111|>' is not marked as EOG -load: control token: 128115 '<|reserved_special_token_107|>' is not marked as EOG -load: control token: 128112 '<|reserved_special_token_104|>' is not marked as EOG -load: control token: 128110 '<|reserved_special_token_102|>' is not marked as EOG -load: control token: 128109 '<|reserved_special_token_101|>' is not marked as EOG -load: control token: 128108 '<|reserved_special_token_100|>' is not marked as EOG -load: control token: 128106 '<|reserved_special_token_98|>' is not marked as EOG -load: control token: 128103 '<|reserved_special_token_95|>' is not marked as EOG -load: control token: 128102 '<|reserved_special_token_94|>' is not marked as EOG -load: control token: 128101 '<|reserved_special_token_93|>' is not marked as EOG -load: control token: 128097 '<|reserved_special_token_89|>' is not marked as EOG -load: control token: 128091 '<|reserved_special_token_83|>' is not marked as EOG -load: control token: 128090 '<|reserved_special_token_82|>' is not marked as EOG -load: control token: 128089 '<|reserved_special_token_81|>' is not marked as EOG -load: control token: 128087 '<|reserved_special_token_79|>' is not marked as EOG -load: control token: 128085 '<|reserved_special_token_77|>' is not marked as EOG -load: control token: 128081 '<|reserved_special_token_73|>' is not marked as EOG -load: control token: 128078 '<|reserved_special_token_70|>' is not marked as EOG -load: control token: 128076 '<|reserved_special_token_68|>' is not marked as EOG -load: control token: 128075 '<|reserved_special_token_67|>' is not marked as EOG -load: control token: 128073 '<|reserved_special_token_65|>' is not marked as EOG -load: control token: 128068 '<|reserved_special_token_60|>' is not marked as EOG -load: control token: 128067 '<|reserved_special_token_59|>' is not marked as EOG -load: control token: 128065 '<|reserved_special_token_57|>' is not marked as EOG -load: control token: 128063 '<|reserved_special_token_55|>' is not marked as EOG -load: control token: 128062 '<|reserved_special_token_54|>' is not marked as EOG -load: control token: 128060 '<|reserved_special_token_52|>' is not marked as EOG -load: control token: 128059 '<|reserved_special_token_51|>' is not marked as EOG -load: control token: 128057 '<|reserved_special_token_49|>' is not marked as EOG -load: control token: 128054 '<|reserved_special_token_46|>' is not marked as EOG -load: control token: 128046 '<|reserved_special_token_38|>' is not marked as EOG -load: control token: 128045 '<|reserved_special_token_37|>' is not marked as EOG -load: control token: 128044 '<|reserved_special_token_36|>' is not marked as EOG -load: control token: 128043 '<|reserved_special_token_35|>' is not marked as EOG -load: control token: 128038 '<|reserved_special_token_30|>' is not marked as EOG -load: control token: 128036 '<|reserved_special_token_28|>' is not marked as EOG -load: control token: 128035 '<|reserved_special_token_27|>' is not marked as EOG -load: control token: 128032 '<|reserved_special_token_24|>' is not marked as EOG -load: control token: 128028 '<|reserved_special_token_20|>' is not marked as EOG -load: control token: 128027 '<|reserved_special_token_19|>' is not marked as EOG -load: control token: 128024 '<|reserved_special_token_16|>' is not marked as EOG -load: control token: 128023 '<|reserved_special_token_15|>' is not marked as EOG -load: control token: 128022 '<|reserved_special_token_14|>' is not marked as EOG -load: control token: 128021 '<|reserved_special_token_13|>' is not marked as EOG -load: control token: 128018 '<|reserved_special_token_10|>' is not marked as EOG -load: control token: 128016 '<|reserved_special_token_8|>' is not marked as EOG -load: control token: 128015 '<|reserved_special_token_7|>' is not marked as EOG -load: control token: 128013 '<|reserved_special_token_5|>' is not marked as EOG -load: control token: 128011 '<|reserved_special_token_3|>' is not marked as EOG -load: control token: 128005 '<|reserved_special_token_2|>' is not marked as EOG -load: control token: 128004 '<|finetune_right_pad_id|>' is not marked as EOG -load: control token: 128002 '<|reserved_special_token_0|>' is not marked as EOG -load: control token: 128252 '<|reserved_special_token_244|>' is not marked as EOG -load: control token: 128190 '<|reserved_special_token_182|>' is not marked as EOG -load: control token: 128183 '<|reserved_special_token_175|>' is not marked as EOG -load: control token: 128137 '<|reserved_special_token_129|>' is not marked as EOG -load: control token: 128182 '<|reserved_special_token_174|>' is not marked as EOG -load: control token: 128040 '<|reserved_special_token_32|>' is not marked as EOG -load: control token: 128048 '<|reserved_special_token_40|>' is not marked as EOG -load: control token: 128092 '<|reserved_special_token_84|>' is not marked as EOG -load: control token: 128215 '<|reserved_special_token_207|>' is not marked as EOG -load: control token: 128107 '<|reserved_special_token_99|>' is not marked as EOG -load: control token: 128208 '<|reserved_special_token_200|>' is not marked as EOG -load: control token: 128145 '<|reserved_special_token_137|>' is not marked as EOG -load: control token: 128031 '<|reserved_special_token_23|>' is not marked as EOG -load: control token: 128129 '<|reserved_special_token_121|>' is not marked as EOG -load: control token: 128201 '<|reserved_special_token_193|>' is not marked as EOG -load: control token: 128074 '<|reserved_special_token_66|>' is not marked as EOG -load: control token: 128095 '<|reserved_special_token_87|>' is not marked as EOG -load: control token: 128186 '<|reserved_special_token_178|>' is not marked as EOG -load: control token: 128143 '<|reserved_special_token_135|>' is not marked as EOG -load: control token: 128229 '<|reserved_special_token_221|>' is not marked as EOG -load: control token: 128007 '<|end_header_id|>' is not marked as EOG -load: control token: 128055 '<|reserved_special_token_47|>' is not marked as EOG -load: control token: 128056 '<|reserved_special_token_48|>' is not marked as EOG -load: control token: 128061 '<|reserved_special_token_53|>' is not marked as EOG -load: control token: 128153 '<|reserved_special_token_145|>' is not marked as EOG -load: control token: 128152 '<|reserved_special_token_144|>' is not marked as EOG -load: control token: 128212 '<|reserved_special_token_204|>' is not marked as EOG -load: control token: 128172 '<|reserved_special_token_164|>' is not marked as EOG -load: control token: 128160 '<|reserved_special_token_152|>' is not marked as EOG -load: control token: 128041 '<|reserved_special_token_33|>' is not marked as EOG -load: control token: 128181 '<|reserved_special_token_173|>' is not marked as EOG -load: control token: 128094 '<|reserved_special_token_86|>' is not marked as EOG -load: control token: 128118 '<|reserved_special_token_110|>' is not marked as EOG -load: control token: 128236 '<|reserved_special_token_228|>' is not marked as EOG -load: control token: 128148 '<|reserved_special_token_140|>' is not marked as EOG -load: control token: 128042 '<|reserved_special_token_34|>' is not marked as EOG -load: control token: 128139 '<|reserved_special_token_131|>' is not marked as EOG -load: control token: 128173 '<|reserved_special_token_165|>' is not marked as EOG -load: control token: 128239 '<|reserved_special_token_231|>' is not marked as EOG -load: control token: 128157 '<|reserved_special_token_149|>' is not marked as EOG -load: control token: 128052 '<|reserved_special_token_44|>' is not marked as EOG -load: control token: 128026 '<|reserved_special_token_18|>' is not marked as EOG -load: control token: 128003 '<|reserved_special_token_1|>' is not marked as EOG -load: control token: 128019 '<|reserved_special_token_11|>' is not marked as EOG -load: control token: 128116 '<|reserved_special_token_108|>' is not marked as EOG -load: control token: 128161 '<|reserved_special_token_153|>' is not marked as EOG -load: control token: 128226 '<|reserved_special_token_218|>' is not marked as EOG -load: control token: 128159 '<|reserved_special_token_151|>' is not marked as EOG -load: control token: 128012 '<|reserved_special_token_4|>' is not marked as EOG -load: control token: 128088 '<|reserved_special_token_80|>' is not marked as EOG -load: control token: 128163 '<|reserved_special_token_155|>' is not marked as EOG -load: control token: 128001 '<|end_of_text|>' is not marked as EOG -load: control token: 128113 '<|reserved_special_token_105|>' is not marked as EOG -load: control token: 128250 '<|reserved_special_token_242|>' is not marked as EOG -load: control token: 128125 '<|reserved_special_token_117|>' is not marked as EOG -load: control token: 128053 '<|reserved_special_token_45|>' is not marked as EOG -load: control token: 128224 '<|reserved_special_token_216|>' is not marked as EOG -load: control token: 128247 '<|reserved_special_token_239|>' is not marked as EOG -load: control token: 128251 '<|reserved_special_token_243|>' is not marked as EOG -load: control token: 128216 '<|reserved_special_token_208|>' is not marked as EOG -load: control token: 128006 '<|start_header_id|>' is not marked as EOG -load: control token: 128211 '<|reserved_special_token_203|>' is not marked as EOG -load: control token: 128077 '<|reserved_special_token_69|>' is not marked as EOG -load: control token: 128237 '<|reserved_special_token_229|>' is not marked as EOG -load: control token: 128086 '<|reserved_special_token_78|>' is not marked as EOG -load: control token: 128227 '<|reserved_special_token_219|>' is not marked as EOG -load: control token: 128058 '<|reserved_special_token_50|>' is not marked as EOG -load: control token: 128100 '<|reserved_special_token_92|>' is not marked as EOG -load: control token: 128209 '<|reserved_special_token_201|>' is not marked as EOG -load: control token: 128084 '<|reserved_special_token_76|>' is not marked as EOG -load: control token: 128071 '<|reserved_special_token_63|>' is not marked as EOG -load: control token: 128070 '<|reserved_special_token_62|>' is not marked as EOG -load: control token: 128049 '<|reserved_special_token_41|>' is not marked as EOG -load: control token: 128197 '<|reserved_special_token_189|>' is not marked as EOG -load: control token: 128072 '<|reserved_special_token_64|>' is not marked as EOG -load: control token: 128000 '<|begin_of_text|>' is not marked as EOG -load: control token: 128223 '<|reserved_special_token_215|>' is not marked as EOG -load: control token: 128217 '<|reserved_special_token_209|>' is not marked as EOG -load: control token: 128111 '<|reserved_special_token_103|>' is not marked as EOG -load: control token: 128203 '<|reserved_special_token_195|>' is not marked as EOG -load: control token: 128051 '<|reserved_special_token_43|>' is not marked as EOG -load: control token: 128030 '<|reserved_special_token_22|>' is not marked as EOG -load: control token: 128117 '<|reserved_special_token_109|>' is not marked as EOG -load: control token: 128010 '<|python_tag|>' is not marked as EOG -load: control token: 128238 '<|reserved_special_token_230|>' is not marked as EOG -load: control token: 128255 '<|reserved_special_token_247|>' is not marked as EOG -load: control token: 128202 '<|reserved_special_token_194|>' is not marked as EOG -load: control token: 128132 '<|reserved_special_token_124|>' is not marked as EOG -load: control token: 128248 '<|reserved_special_token_240|>' is not marked as EOG -load: control token: 128167 '<|reserved_special_token_159|>' is not marked as EOG -load: control token: 128127 '<|reserved_special_token_119|>' is not marked as EOG -load: control token: 128105 '<|reserved_special_token_97|>' is not marked as EOG -load: control token: 128039 '<|reserved_special_token_31|>' is not marked as EOG -load: control token: 128232 '<|reserved_special_token_224|>' is not marked as EOG -load: control token: 128166 '<|reserved_special_token_158|>' is not marked as EOG -load: control token: 128130 '<|reserved_special_token_122|>' is not marked as EOG -load: control token: 128114 '<|reserved_special_token_106|>' is not marked as EOG -load: control token: 128234 '<|reserved_special_token_226|>' is not marked as EOG -load: control token: 128191 '<|reserved_special_token_183|>' is not marked as EOG -load: control token: 128064 '<|reserved_special_token_56|>' is not marked as EOG -load: control token: 128140 '<|reserved_special_token_132|>' is not marked as EOG -load: control token: 128096 '<|reserved_special_token_88|>' is not marked as EOG -load: control token: 128098 '<|reserved_special_token_90|>' is not marked as EOG -load: control token: 128192 '<|reserved_special_token_184|>' is not marked as EOG -load: control token: 128093 '<|reserved_special_token_85|>' is not marked as EOG -load: control token: 128150 '<|reserved_special_token_142|>' is not marked as EOG -load: control token: 128222 '<|reserved_special_token_214|>' is not marked as EOG -load: control token: 128233 '<|reserved_special_token_225|>' is not marked as EOG -load: control token: 128220 '<|reserved_special_token_212|>' is not marked as EOG -load: control token: 128034 '<|reserved_special_token_26|>' is not marked as EOG -load: control token: 128033 '<|reserved_special_token_25|>' is not marked as EOG -load: control token: 128253 '<|reserved_special_token_245|>' is not marked as EOG -load: control token: 128195 '<|reserved_special_token_187|>' is not marked as EOG -load: control token: 128099 '<|reserved_special_token_91|>' is not marked as EOG -load: control token: 128189 '<|reserved_special_token_181|>' is not marked as EOG -load: control token: 128210 '<|reserved_special_token_202|>' is not marked as EOG -load: control token: 128174 '<|reserved_special_token_166|>' is not marked as EOG -load: control token: 128083 '<|reserved_special_token_75|>' is not marked as EOG -load: control token: 128080 '<|reserved_special_token_72|>' is not marked as EOG -load: control token: 128104 '<|reserved_special_token_96|>' is not marked as EOG -load: control token: 128082 '<|reserved_special_token_74|>' is not marked as EOG -load: control token: 128219 '<|reserved_special_token_211|>' is not marked as EOG -load: control token: 128017 '<|reserved_special_token_9|>' is not marked as EOG -load: control token: 128050 '<|reserved_special_token_42|>' is not marked as EOG -load: control token: 128205 '<|reserved_special_token_197|>' is not marked as EOG -load: control token: 128047 '<|reserved_special_token_39|>' is not marked as EOG -load: control token: 128164 '<|reserved_special_token_156|>' is not marked as EOG -load: control token: 128020 '<|reserved_special_token_12|>' is not marked as EOG -load: control token: 128069 '<|reserved_special_token_61|>' is not marked as EOG -load: control token: 128245 '<|reserved_special_token_237|>' is not marked as EOG -load: control token: 128121 '<|reserved_special_token_113|>' is not marked as EOG -load: control token: 128079 '<|reserved_special_token_71|>' is not marked as EOG -load: control token: 128037 '<|reserved_special_token_29|>' is not marked as EOG -load: control token: 128244 '<|reserved_special_token_236|>' is not marked as EOG -load: control token: 128029 '<|reserved_special_token_21|>' is not marked as EOG -load: control token: 128221 '<|reserved_special_token_213|>' is not marked as EOG -load: control token: 128066 '<|reserved_special_token_58|>' is not marked as EOG -load: control token: 128120 '<|reserved_special_token_112|>' is not marked as EOG -load: control token: 128014 '<|reserved_special_token_6|>' is not marked as EOG -load: control token: 128025 '<|reserved_special_token_17|>' is not marked as EOG -load: control token: 128126 '<|reserved_special_token_118|>' is not marked as EOG -load: special tokens cache size = 256 -load: token to piece cache size = 0.7999 MB -print_info: arch = llama -print_info: vocab_only = 0 -print_info: n_ctx_train = 131072 -print_info: n_embd = 4096 -print_info: n_layer = 32 -print_info: n_head = 32 -print_info: n_head_kv = 8 -print_info: n_rot = 128 -print_info: n_swa = 0 -print_info: n_embd_head_k = 128 -print_info: n_embd_head_v = 128 -print_info: n_gqa = 4 -print_info: n_embd_k_gqa = 1024 -print_info: n_embd_v_gqa = 1024 -print_info: f_norm_eps = 0.0e+00 -print_info: f_norm_rms_eps = 1.0e-05 -print_info: f_clamp_kqv = 0.0e+00 -print_info: f_max_alibi_bias = 0.0e+00 -print_info: f_logit_scale = 0.0e+00 -print_info: n_ff = 14336 -print_info: n_expert = 0 -print_info: n_expert_used = 0 -print_info: causal attn = 1 -print_info: pooling type = 0 -print_info: rope type = 0 -print_info: rope scaling = linear -print_info: freq_base_train = 500000.0 -print_info: freq_scale_train = 1 -print_info: n_ctx_orig_yarn = 131072 -print_info: rope_finetuned = unknown -print_info: ssm_d_conv = 0 -print_info: ssm_d_inner = 0 -print_info: ssm_d_state = 0 -print_info: ssm_dt_rank = 0 -print_info: ssm_dt_b_c_rms = 0 -print_info: model type = 8B -print_info: model params = 8.03 B -print_info: general.name = Llama Guard 3 8B -print_info: vocab type = BPE -print_info: n_vocab = 128256 -print_info: n_merges = 280147 -print_info: BOS token = 128000 '<|begin_of_text|>' -print_info: EOS token = 128009 '<|eot_id|>' -print_info: EOT token = 128009 '<|eot_id|>' -print_info: EOM token = 128008 '<|eom_id|>' -print_info: LF token = 198 'Ċ' -print_info: EOG token = 128008 '<|eom_id|>' -print_info: EOG token = 128009 '<|eot_id|>' -print_info: max token length = 256 -load_tensors: loading model tensors, this can take a while... (mmap = true) -load_tensors: layer 0 assigned to device CPU -load_tensors: layer 1 assigned to device CPU -load_tensors: layer 2 assigned to device CPU -load_tensors: layer 3 assigned to device CPU -load_tensors: layer 4 assigned to device CPU -load_tensors: layer 5 assigned to device CPU -load_tensors: layer 6 assigned to device CPU -load_tensors: layer 7 assigned to device CPU -load_tensors: layer 8 assigned to device CPU -load_tensors: layer 9 assigned to device CPU -load_tensors: layer 10 assigned to device CPU -load_tensors: layer 11 assigned to device CPU -load_tensors: layer 12 assigned to device CPU -load_tensors: layer 13 assigned to device CPU -load_tensors: layer 14 assigned to device CPU -load_tensors: layer 15 assigned to device CPU -load_tensors: layer 16 assigned to device CPU -load_tensors: layer 17 assigned to device CPU -load_tensors: layer 18 assigned to device CPU -load_tensors: layer 19 assigned to device CPU -load_tensors: layer 20 assigned to device CUDA0 -load_tensors: layer 21 assigned to device CUDA0 -load_tensors: layer 22 assigned to device CUDA0 -load_tensors: layer 23 assigned to device CUDA0 -load_tensors: layer 24 assigned to device CUDA0 -load_tensors: layer 25 assigned to device CUDA0 -load_tensors: layer 26 assigned to device CUDA0 -load_tensors: layer 27 assigned to device CUDA0 -load_tensors: layer 28 assigned to device CUDA0 -load_tensors: layer 29 assigned to device CUDA0 -load_tensors: layer 30 assigned to device CUDA0 -load_tensors: layer 31 assigned to device CUDA0 -load_tensors: layer 32 assigned to device CPU -load_tensors: tensor 'token_embd.weight' (q6_K) (and 202 others) cannot be used with preferred buffer type CPU_AARCH64, using CPU instead -load_tensors: offloading 12 repeating layers to GPU -load_tensors: offloaded 12/33 layers to GPU -load_tensors: CPU_Mapped model buffer size = 4235.09 MiB -load_tensors: CUDA0 model buffer size = 2047.88 MiB -......................................................................................... -llama_init_from_model: n_seq_max = 1 -llama_init_from_model: n_ctx = 512 -llama_init_from_model: n_ctx_per_seq = 512 -llama_init_from_model: n_batch = 512 -llama_init_from_model: n_ubatch = 512 -llama_init_from_model: flash_attn = 1 -llama_init_from_model: freq_base = 500000.0 -llama_init_from_model: freq_scale = 1 -llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized -llama_kv_cache_init: kv_size = 512, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1 -llama_kv_cache_init: layer 0: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 1: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 2: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 3: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 4: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 5: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 6: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 7: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 8: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 9: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 10: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 11: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 12: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 13: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 14: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 15: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 16: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 17: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 18: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 19: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 20: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 21: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 22: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 23: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 24: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 25: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 26: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 27: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 28: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 29: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 30: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: layer 31: n_embd_k_gqa = 1024, n_embd_v_gqa = 1024 -llama_kv_cache_init: CPU KV buffer size = 40.00 MiB -llama_kv_cache_init: CUDA0 KV buffer size = 24.00 MiB -llama_init_from_model: KV self size = 64.00 MiB, K (f16): 32.00 MiB, V (f16): 32.00 MiB -llama_init_from_model: CPU output buffer size = 0.49 MiB -llama_init_from_model: CUDA0 compute buffer size = 669.48 MiB -llama_init_from_model: CUDA_Host compute buffer size = 9.01 MiB -llama_init_from_model: graph nodes = 903 -llama_init_from_model: graph splits = 225 (with bs=512), 3 (with bs=1) -common_init_from_params: setting dry_penalty_last_n to ctx_size = 512 -common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable) - -system_info: n_threads = 8 (n_threads_batch = 8) / 16 | CUDA : ARCHS = 750 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | AVX512 = 1 | AVX512_VNNI = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 | -kl_divergence: 1.67 seconds per pass - ETA 15.73 minutes - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 1 3.9818 ± 0.5154 0.00641 ± 0.00667 0.00387 ± 0.00086 1.664 ± 0.194 % 98.824 ± 0.677 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 2 4.9173 ± 0.4968 0.00420 ± 0.00399 0.00350 ± 0.00045 1.576 ± 0.117 % 98.431 ± 0.551 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 3 4.4354 ± 0.3601 0.01579 ± 0.00470 0.00879 ± 0.00089 3.498 ± 0.395 % 96.601 ± 0.656 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 4 4.9884 ± 0.3494 0.01391 ± 0.00379 0.00749 ± 0.00067 3.172 ± 0.329 % 96.471 ± 0.578 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 5 5.3579 ± 0.3272 0.01114 ± 0.00319 0.00662 ± 0.00054 2.927 ± 0.286 % 96.078 ± 0.544 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 6 5.7059 ± 0.3176 0.01009 ± 0.00285 0.00623 ± 0.00046 2.794 ± 0.251 % 96.209 ± 0.488 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 7 6.0860 ± 0.3193 0.01057 ± 0.00285 0.00588 ± 0.00039 2.674 ± 0.226 % 96.303 ± 0.447 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 8 6.6052 ± 0.3376 0.00899 ± 0.00258 0.00557 ± 0.00035 2.543 ± 0.208 % 96.275 ± 0.419 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 9 7.1490 ± 0.3491 0.00794 ± 0.00238 0.00533 ± 0.00031 2.437 ± 0.193 % 96.383 ± 0.390 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 10 7.3364 ± 0.3381 0.00739 ± 0.00219 0.00508 ± 0.00028 2.354 ± 0.180 % 96.549 ± 0.362 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 11 7.4902 ± 0.3311 0.00666 ± 0.00205 0.00499 ± 0.00026 2.301 ± 0.168 % 96.435 ± 0.350 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 12 7.6215 ± 0.3254 0.00684 ± 0.00195 0.00498 ± 0.00024 2.281 ± 0.156 % 96.471 ± 0.334 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 13 7.9883 ± 0.3332 0.00605 ± 0.00184 0.00493 ± 0.00023 2.257 ± 0.150 % 96.501 ± 0.319 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 14 7.7075 ± 0.3089 0.00551 ± 0.00175 0.00483 ± 0.00021 2.216 ± 0.142 % 96.555 ± 0.305 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 15 7.6430 ± 0.2952 0.00516 ± 0.00169 0.00475 ± 0.00020 2.194 ± 0.134 % 96.575 ± 0.294 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 16 7.4549 ± 0.2778 0.00465 ± 0.00162 0.00466 ± 0.00019 2.175 ± 0.127 % 96.544 ± 0.286 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 17 7.4188 ± 0.2696 0.00407 ± 0.00156 0.00456 ± 0.00018 2.137 ± 0.122 % 96.609 ± 0.275 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 18 7.5292 ± 0.2676 0.00384 ± 0.00150 0.00456 ± 0.00017 2.133 ± 0.117 % 96.645 ± 0.266 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 19 7.3102 ± 0.2529 0.00444 ± 0.00146 0.00459 ± 0.00017 2.152 ± 0.110 % 96.698 ± 0.257 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 20 7.1947 ± 0.2405 0.00430 ± 0.00142 0.00456 ± 0.00016 2.147 ± 0.106 % 96.569 ± 0.255 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 21 7.1835 ± 0.2344 0.00400 ± 0.00137 0.00451 ± 0.00015 2.122 ± 0.102 % 96.601 ± 0.248 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 22 6.9237 ± 0.2188 0.00394 ± 0.00133 0.00450 ± 0.00015 2.125 ± 0.098 % 96.613 ± 0.242 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 23 6.6744 ± 0.2054 0.00396 ± 0.00130 0.00446 ± 0.00014 2.107 ± 0.095 % 96.641 ± 0.235 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 24 6.5344 ± 0.1965 0.00379 ± 0.00127 0.00448 ± 0.00014 2.109 ± 0.091 % 96.667 ± 0.229 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 25 6.3534 ± 0.1857 0.00431 ± 0.00125 0.00455 ± 0.00014 2.135 ± 0.090 % 96.612 ± 0.227 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 26 6.2309 ± 0.1771 0.00434 ± 0.00123 0.00470 ± 0.00014 2.191 ± 0.089 % 96.637 ± 0.221 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 27 6.1687 ± 0.1717 0.00444 ± 0.00120 0.00462 ± 0.00014 2.166 ± 0.086 % 96.688 ± 0.216 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 28 6.1032 ± 0.1658 0.00429 ± 0.00119 0.00469 ± 0.00014 2.174 ± 0.085 % 96.653 ± 0.213 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 29 6.1665 ± 0.1647 0.00439 ± 0.00117 0.00468 ± 0.00013 2.160 ± 0.083 % 96.646 ± 0.209 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 30 6.1651 ± 0.1618 0.00416 ± 0.00115 0.00470 ± 0.00013 2.144 ± 0.081 % 96.627 ± 0.206 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 31 6.1694 ± 0.1594 0.00424 ± 0.00113 0.00468 ± 0.00013 2.132 ± 0.079 % 96.610 ± 0.204 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 32 6.1361 ± 0.1555 0.00434 ± 0.00111 0.00464 ± 0.00013 2.119 ± 0.077 % 96.630 ± 0.200 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 33 6.1669 ± 0.1539 0.00461 ± 0.00111 0.00463 ± 0.00012 2.107 ± 0.075 % 96.625 ± 0.197 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 34 6.2126 ± 0.1528 0.00457 ± 0.00110 0.00463 ± 0.00012 2.096 ± 0.074 % 96.574 ± 0.195 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 35 6.2491 ± 0.1516 0.00452 ± 0.00108 0.00461 ± 0.00012 2.087 ± 0.072 % 96.605 ± 0.192 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 36 6.3326 ± 0.1519 0.00462 ± 0.00106 0.00462 ± 0.00011 2.079 ± 0.070 % 96.580 ± 0.190 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 37 6.3112 ± 0.1492 0.00433 ± 0.00104 0.00459 ± 0.00011 2.073 ± 0.069 % 96.577 ± 0.187 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 38 6.3412 ± 0.1485 0.00445 ± 0.00102 0.00455 ± 0.00011 2.057 ± 0.067 % 96.563 ± 0.185 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 39 6.3433 ± 0.1464 0.00440 ± 0.00101 0.00453 ± 0.00011 2.054 ± 0.066 % 96.531 ± 0.184 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 40 6.3707 ± 0.1450 0.00458 ± 0.00099 0.00449 ± 0.00010 2.039 ± 0.065 % 96.539 ± 0.181 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 41 6.3722 ± 0.1432 0.00447 ± 0.00097 0.00447 ± 0.00010 2.035 ± 0.064 % 96.576 ± 0.178 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 42 6.3165 ± 0.1396 0.00434 ± 0.00096 0.00444 ± 0.00010 2.034 ± 0.062 % 96.573 ± 0.176 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 43 6.3518 ± 0.1389 0.00430 ± 0.00095 0.00443 ± 0.00010 2.025 ± 0.061 % 96.580 ± 0.174 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 44 6.3077 ± 0.1362 0.00445 ± 0.00094 0.00446 ± 0.00011 2.029 ± 0.060 % 96.622 ± 0.171 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 45 6.4014 ± 0.1372 0.00435 ± 0.00093 0.00443 ± 0.00010 2.018 ± 0.059 % 96.627 ± 0.169 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 46 6.4125 ± 0.1360 0.00412 ± 0.00092 0.00440 ± 0.00010 2.009 ± 0.058 % 96.650 ± 0.166 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 47 6.4061 ± 0.1343 0.00404 ± 0.00091 0.00438 ± 0.00010 2.002 ± 0.057 % 96.654 ± 0.164 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 48 6.3842 ± 0.1324 0.00389 ± 0.00090 0.00439 ± 0.00010 2.001 ± 0.056 % 96.642 ± 0.163 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 49 6.3695 ± 0.1306 0.00388 ± 0.00089 0.00438 ± 0.00010 2.001 ± 0.055 % 96.647 ± 0.161 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 50 6.4305 ± 0.1309 0.00375 ± 0.00088 0.00438 ± 0.00009 1.992 ± 0.054 % 96.612 ± 0.160 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 51 6.4950 ± 0.1312 0.00359 ± 0.00087 0.00436 ± 0.00009 1.982 ± 0.054 % 96.617 ± 0.159 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 52 6.4647 ± 0.1290 0.00357 ± 0.00086 0.00433 ± 0.00009 1.974 ± 0.053 % 96.644 ± 0.156 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 53 6.5613 ± 0.1299 0.00386 ± 0.00085 0.00433 ± 0.00009 1.964 ± 0.052 % 96.596 ± 0.156 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 54 6.5743 ± 0.1292 0.00377 ± 0.00084 0.00432 ± 0.00009 1.961 ± 0.051 % 96.630 ± 0.154 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 55 6.5900 ± 0.1283 0.00363 ± 0.00084 0.00431 ± 0.00009 1.956 ± 0.051 % 96.649 ± 0.152 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 56 6.6397 ± 0.1284 0.00342 ± 0.00083 0.00430 ± 0.00008 1.948 ± 0.050 % 96.646 ± 0.151 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 57 6.6677 ± 0.1281 0.00340 ± 0.00082 0.00430 ± 0.00008 1.945 ± 0.049 % 96.670 ± 0.149 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 58 6.6856 ± 0.1273 0.00348 ± 0.00081 0.00430 ± 0.00008 1.947 ± 0.049 % 96.687 ± 0.147 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 59 6.6409 ± 0.1251 0.00333 ± 0.00080 0.00428 ± 0.00008 1.943 ± 0.048 % 96.663 ± 0.146 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 60 6.6904 ± 0.1251 0.00344 ± 0.00080 0.00429 ± 0.00008 1.942 ± 0.047 % 96.621 ± 0.146 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 61 6.7488 ± 0.1253 0.00334 ± 0.00079 0.00428 ± 0.00008 1.934 ± 0.047 % 96.580 ± 0.146 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 62 6.7972 ± 0.1252 0.00324 ± 0.00078 0.00426 ± 0.00008 1.926 ± 0.046 % 96.565 ± 0.145 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 63 6.8582 ± 0.1258 0.00321 ± 0.00078 0.00426 ± 0.00008 1.923 ± 0.046 % 96.552 ± 0.144 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 64 6.9203 ± 0.1262 0.00328 ± 0.00077 0.00425 ± 0.00008 1.917 ± 0.045 % 96.556 ± 0.143 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 65 6.9055 ± 0.1248 0.00315 ± 0.00077 0.00423 ± 0.00007 1.914 ± 0.044 % 96.549 ± 0.142 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 66 6.9060 ± 0.1236 0.00313 ± 0.00076 0.00421 ± 0.00007 1.907 ± 0.044 % 96.536 ± 0.141 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 67 6.8983 ± 0.1225 0.00317 ± 0.00075 0.00420 ± 0.00007 1.903 ± 0.043 % 96.553 ± 0.140 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 68 6.9265 ± 0.1223 0.00313 ± 0.00074 0.00419 ± 0.00007 1.897 ± 0.043 % 96.540 ± 0.139 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 69 6.9692 ± 0.1222 0.00315 ± 0.00074 0.00418 ± 0.00007 1.892 ± 0.042 % 96.516 ± 0.138 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 70 6.9953 ± 0.1219 0.00324 ± 0.00074 0.00421 ± 0.00007 1.892 ± 0.042 % 96.476 ± 0.138 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 71 6.9839 ± 0.1209 0.00320 ± 0.00073 0.00419 ± 0.00007 1.889 ± 0.041 % 96.482 ± 0.137 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 72 6.9569 ± 0.1197 0.00323 ± 0.00072 0.00419 ± 0.00007 1.886 ± 0.041 % 96.503 ± 0.136 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 73 6.9336 ± 0.1184 0.00337 ± 0.00072 0.00421 ± 0.00007 1.894 ± 0.041 % 96.465 ± 0.135 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 74 6.9471 ± 0.1178 0.00341 ± 0.00072 0.00424 ± 0.00007 1.899 ± 0.041 % 96.428 ± 0.135 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 75 6.8836 ± 0.1159 0.00336 ± 0.00072 0.00430 ± 0.00007 1.911 ± 0.041 % 96.403 ± 0.135 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 76 6.8233 ± 0.1138 0.00379 ± 0.00072 0.00450 ± 0.00007 1.974 ± 0.042 % 96.373 ± 0.134 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 77 6.7820 ± 0.1122 0.00377 ± 0.00072 0.00447 ± 0.00007 1.968 ± 0.041 % 96.379 ± 0.133 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 78 6.7879 ± 0.1115 0.00354 ± 0.00071 0.00450 ± 0.00007 1.971 ± 0.041 % 96.340 ± 0.133 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 79 6.7992 ± 0.1111 0.00355 ± 0.00071 0.00449 ± 0.00007 1.973 ± 0.040 % 96.351 ± 0.132 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 80 6.8099 ± 0.1105 0.00349 ± 0.00070 0.00447 ± 0.00007 1.968 ± 0.040 % 96.363 ± 0.131 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 81 6.8059 ± 0.1096 0.00359 ± 0.00070 0.00448 ± 0.00007 1.965 ± 0.040 % 96.354 ± 0.130 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 82 6.8233 ± 0.1092 0.00361 ± 0.00069 0.00447 ± 0.00007 1.965 ± 0.039 % 96.337 ± 0.130 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 83 6.8392 ± 0.1089 0.00366 ± 0.00069 0.00447 ± 0.00007 1.966 ± 0.039 % 96.334 ± 0.129 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 84 6.8325 ± 0.1080 0.00366 ± 0.00068 0.00446 ± 0.00007 1.961 ± 0.039 % 96.349 ± 0.128 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 85 6.8276 ± 0.1072 0.00355 ± 0.00068 0.00444 ± 0.00007 1.956 ± 0.038 % 96.355 ± 0.127 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 86 6.8186 ± 0.1063 0.00345 ± 0.00067 0.00444 ± 0.00007 1.952 ± 0.038 % 96.375 ± 0.126 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 87 6.8945 ± 0.1072 0.00337 ± 0.00067 0.00443 ± 0.00007 1.947 ± 0.038 % 96.385 ± 0.125 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 88 6.8915 ± 0.1064 0.00343 ± 0.00066 0.00441 ± 0.00007 1.943 ± 0.037 % 96.404 ± 0.124 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 89 6.9123 ± 0.1061 0.00333 ± 0.00066 0.00443 ± 0.00007 1.947 ± 0.038 % 96.391 ± 0.124 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 90 6.9187 ± 0.1056 0.00325 ± 0.00065 0.00442 ± 0.00007 1.944 ± 0.037 % 96.370 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 91 6.9151 ± 0.1049 0.00331 ± 0.00065 0.00441 ± 0.00006 1.939 ± 0.037 % 96.367 ± 0.123 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 92 6.9125 ± 0.1043 0.00332 ± 0.00064 0.00440 ± 0.00006 1.938 ± 0.037 % 96.377 ± 0.122 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 93 6.9015 ± 0.1034 0.00323 ± 0.00064 0.00440 ± 0.00006 1.935 ± 0.036 % 96.378 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 94 6.9088 ± 0.1029 0.00322 ± 0.00063 0.00439 ± 0.00006 1.935 ± 0.036 % 96.379 ± 0.121 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 95 6.8995 ± 0.1021 0.00311 ± 0.00063 0.00439 ± 0.00006 1.935 ± 0.036 % 96.376 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 96 6.9241 ± 0.1021 0.00322 ± 0.00063 0.00439 ± 0.00006 1.932 ± 0.036 % 96.356 ± 0.120 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 97 6.9398 ± 0.1019 0.00318 ± 0.00062 0.00437 ± 0.00006 1.926 ± 0.035 % 96.382 ± 0.119 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 98 6.9453 ± 0.1016 0.00325 ± 0.00062 0.00436 ± 0.00006 1.922 ± 0.035 % 96.391 ± 0.118 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 99 6.9431 ± 0.1011 0.00316 ± 0.00062 0.00438 ± 0.00006 1.928 ± 0.035 % 96.395 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 100 6.9313 ± 0.1004 0.00317 ± 0.00062 0.00437 ± 0.00006 1.925 ± 0.034 % 96.396 ± 0.117 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 101 6.9302 ± 0.1000 0.00312 ± 0.00061 0.00439 ± 0.00006 1.926 ± 0.034 % 96.381 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 102 6.9555 ± 0.1000 0.00306 ± 0.00061 0.00438 ± 0.00006 1.923 ± 0.034 % 96.382 ± 0.116 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 103 6.9826 ± 0.0999 0.00309 ± 0.00061 0.00437 ± 0.00006 1.921 ± 0.034 % 96.394 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 104 7.0297 ± 0.1004 0.00303 ± 0.00060 0.00437 ± 0.00006 1.919 ± 0.033 % 96.384 ± 0.115 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 105 7.0196 ± 0.0997 0.00300 ± 0.00060 0.00437 ± 0.00006 1.916 ± 0.033 % 96.396 ± 0.114 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 106 7.0748 ± 0.1003 0.00303 ± 0.00060 0.00436 ± 0.00006 1.911 ± 0.033 % 96.404 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 107 7.0937 ± 0.1001 0.00298 ± 0.00059 0.00435 ± 0.00006 1.907 ± 0.033 % 96.394 ± 0.113 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 108 7.1030 ± 0.0998 0.00299 ± 0.00059 0.00434 ± 0.00006 1.903 ± 0.033 % 96.402 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 109 7.1393 ± 0.1001 0.00300 ± 0.00059 0.00433 ± 0.00006 1.900 ± 0.032 % 96.399 ± 0.112 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 110 7.1799 ± 0.1004 0.00283 ± 0.00058 0.00431 ± 0.00006 1.896 ± 0.032 % 96.399 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 111 7.1960 ± 0.1003 0.00284 ± 0.00058 0.00430 ± 0.00006 1.892 ± 0.032 % 96.411 ± 0.111 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 112 7.1651 ± 0.0994 0.00292 ± 0.00058 0.00429 ± 0.00006 1.891 ± 0.032 % 96.404 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 113 7.1515 ± 0.0987 0.00284 ± 0.00058 0.00429 ± 0.00005 1.893 ± 0.031 % 96.412 ± 0.110 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 114 7.1457 ± 0.0982 0.00282 ± 0.00057 0.00429 ± 0.00005 1.896 ± 0.031 % 96.412 ± 0.109 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 115 7.1233 ± 0.0975 0.00287 ± 0.00058 0.00429 ± 0.00005 1.894 ± 0.031 % 96.426 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 116 7.1204 ± 0.0970 0.00283 ± 0.00058 0.00430 ± 0.00005 1.900 ± 0.031 % 96.410 ± 0.108 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 117 7.1094 ± 0.0965 0.00280 ± 0.00058 0.00430 ± 0.00005 1.897 ± 0.031 % 96.427 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 118 7.0923 ± 0.0958 0.00275 ± 0.00057 0.00430 ± 0.00005 1.907 ± 0.031 % 96.414 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 119 7.0745 ± 0.0951 0.00268 ± 0.00057 0.00429 ± 0.00005 1.905 ± 0.031 % 96.411 ± 0.107 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 120 7.0522 ± 0.0943 0.00267 ± 0.00057 0.00430 ± 0.00005 1.904 ± 0.031 % 96.418 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 121 7.0319 ± 0.0936 0.00273 ± 0.00057 0.00431 ± 0.00005 1.913 ± 0.031 % 96.422 ± 0.106 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 122 7.0079 ± 0.0928 0.00267 ± 0.00056 0.00430 ± 0.00005 1.910 ± 0.031 % 96.426 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 123 6.9783 ± 0.0920 0.00276 ± 0.00056 0.00430 ± 0.00005 1.914 ± 0.031 % 96.426 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 124 6.9338 ± 0.0910 0.00273 ± 0.00056 0.00430 ± 0.00005 1.913 ± 0.031 % 96.417 ± 0.105 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 125 6.8902 ± 0.0899 0.00270 ± 0.00055 0.00429 ± 0.00005 1.911 ± 0.030 % 96.427 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 126 6.8558 ± 0.0890 0.00268 ± 0.00055 0.00431 ± 0.00005 1.917 ± 0.030 % 96.424 ± 0.104 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 127 6.8271 ± 0.0882 0.00272 ± 0.00055 0.00431 ± 0.00005 1.915 ± 0.030 % 96.409 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 128 6.8280 ± 0.0879 0.00277 ± 0.00055 0.00431 ± 0.00005 1.914 ± 0.030 % 96.394 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 129 6.8239 ± 0.0874 0.00273 ± 0.00054 0.00430 ± 0.00005 1.913 ± 0.030 % 96.398 ± 0.103 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 130 6.8279 ± 0.0871 0.00276 ± 0.00054 0.00430 ± 0.00005 1.912 ± 0.030 % 96.407 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 131 6.8286 ± 0.0868 0.00278 ± 0.00054 0.00430 ± 0.00005 1.909 ± 0.029 % 96.402 ± 0.102 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 132 6.8107 ± 0.0862 0.00272 ± 0.00054 0.00428 ± 0.00005 1.907 ± 0.029 % 96.408 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 133 6.7890 ± 0.0855 0.00267 ± 0.00053 0.00428 ± 0.00005 1.907 ± 0.029 % 96.415 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 134 6.7978 ± 0.0853 0.00266 ± 0.00053 0.00427 ± 0.00005 1.904 ± 0.029 % 96.403 ± 0.101 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 135 6.7927 ± 0.0850 0.00281 ± 0.00053 0.00429 ± 0.00005 1.905 ± 0.029 % 96.401 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 136 6.7998 ± 0.0848 0.00275 ± 0.00053 0.00429 ± 0.00005 1.903 ± 0.029 % 96.404 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 137 6.8098 ± 0.0846 0.00272 ± 0.00053 0.00429 ± 0.00005 1.902 ± 0.028 % 96.399 ± 0.100 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 138 6.8348 ± 0.0847 0.00263 ± 0.00053 0.00429 ± 0.00005 1.899 ± 0.028 % 96.402 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 139 6.8564 ± 0.0848 0.00258 ± 0.00053 0.00429 ± 0.00005 1.896 ± 0.028 % 96.403 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 140 6.8386 ± 0.0843 0.00260 ± 0.00053 0.00430 ± 0.00005 1.899 ± 0.028 % 96.381 ± 0.099 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 141 6.8046 ± 0.0835 0.00267 ± 0.00052 0.00432 ± 0.00005 1.907 ± 0.028 % 96.387 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 142 6.7771 ± 0.0828 0.00266 ± 0.00052 0.00431 ± 0.00005 1.904 ± 0.028 % 96.390 ± 0.098 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 143 6.7341 ± 0.0818 0.00263 ± 0.00052 0.00429 ± 0.00005 1.903 ± 0.028 % 96.405 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 144 6.7026 ± 0.0811 0.00262 ± 0.00052 0.00428 ± 0.00005 1.901 ± 0.028 % 96.416 ± 0.097 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 145 6.6584 ± 0.0801 0.00259 ± 0.00051 0.00427 ± 0.00005 1.900 ± 0.027 % 96.433 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 146 6.6297 ± 0.0794 0.00261 ± 0.00051 0.00427 ± 0.00005 1.900 ± 0.027 % 96.425 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 147 6.6055 ± 0.0788 0.00262 ± 0.00051 0.00428 ± 0.00005 1.918 ± 0.032 % 96.433 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 148 6.5866 ± 0.0783 0.00261 ± 0.00051 0.00427 ± 0.00005 1.920 ± 0.032 % 96.420 ± 0.096 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 149 6.5669 ± 0.0777 0.00259 ± 0.00051 0.00426 ± 0.00005 1.920 ± 0.032 % 96.431 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 150 6.5493 ± 0.0771 0.00257 ± 0.00050 0.00426 ± 0.00005 1.919 ± 0.031 % 96.418 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 151 6.5194 ± 0.0764 0.00261 ± 0.00050 0.00426 ± 0.00005 1.923 ± 0.031 % 96.429 ± 0.095 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 152 6.4943 ± 0.0758 0.00262 ± 0.00050 0.00425 ± 0.00005 1.920 ± 0.031 % 96.424 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 153 6.4692 ± 0.0751 0.00259 ± 0.00050 0.00424 ± 0.00005 1.920 ± 0.031 % 96.427 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 154 6.4378 ± 0.0744 0.00254 ± 0.00050 0.00424 ± 0.00005 1.920 ± 0.031 % 96.417 ± 0.094 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 155 6.4177 ± 0.0738 0.00254 ± 0.00049 0.00423 ± 0.00005 1.919 ± 0.031 % 96.420 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 156 6.4027 ± 0.0733 0.00252 ± 0.00049 0.00423 ± 0.00005 1.919 ± 0.031 % 96.403 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 157 6.3781 ± 0.0727 0.00248 ± 0.00049 0.00422 ± 0.00005 1.918 ± 0.030 % 96.391 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 158 6.3736 ± 0.0724 0.00255 ± 0.00049 0.00422 ± 0.00005 1.917 ± 0.030 % 96.394 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 159 6.3530 ± 0.0719 0.00258 ± 0.00048 0.00421 ± 0.00005 1.915 ± 0.030 % 96.392 ± 0.093 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 160 6.3419 ± 0.0715 0.00261 ± 0.00048 0.00420 ± 0.00005 1.915 ± 0.030 % 96.402 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 161 6.3560 ± 0.0716 0.00268 ± 0.00049 0.00421 ± 0.00005 1.915 ± 0.030 % 96.395 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 162 6.3563 ± 0.0713 0.00265 ± 0.00049 0.00420 ± 0.00005 1.912 ± 0.030 % 96.386 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 163 6.3747 ± 0.0714 0.00264 ± 0.00048 0.00420 ± 0.00005 1.910 ± 0.029 % 96.389 ± 0.092 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 164 6.3826 ± 0.0712 0.00266 ± 0.00048 0.00419 ± 0.00005 1.908 ± 0.029 % 96.387 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 165 6.4094 ± 0.0714 0.00264 ± 0.00048 0.00419 ± 0.00005 1.907 ± 0.029 % 96.395 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 166 6.4402 ± 0.0716 0.00264 ± 0.00048 0.00418 ± 0.00005 1.904 ± 0.029 % 96.386 ± 0.091 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 167 6.4587 ± 0.0716 0.00258 ± 0.00048 0.00418 ± 0.00004 1.902 ± 0.029 % 96.402 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 168 6.4970 ± 0.0720 0.00262 ± 0.00048 0.00418 ± 0.00004 1.900 ± 0.029 % 96.394 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 169 6.5145 ± 0.0720 0.00262 ± 0.00048 0.00418 ± 0.00004 1.900 ± 0.029 % 96.392 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 170 6.5445 ± 0.0722 0.00262 ± 0.00047 0.00417 ± 0.00004 1.898 ± 0.029 % 96.385 ± 0.090 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 171 6.5772 ± 0.0726 0.00261 ± 0.00047 0.00417 ± 0.00004 1.895 ± 0.028 % 96.379 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 172 6.5870 ± 0.0725 0.00255 ± 0.00047 0.00417 ± 0.00004 1.894 ± 0.028 % 96.377 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 173 6.5932 ± 0.0724 0.00251 ± 0.00047 0.00417 ± 0.00004 1.894 ± 0.028 % 96.364 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 174 6.5913 ± 0.0721 0.00251 ± 0.00047 0.00416 ± 0.00004 1.893 ± 0.028 % 96.358 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 175 6.6125 ± 0.0722 0.00249 ± 0.00047 0.00417 ± 0.00004 1.891 ± 0.028 % 96.356 ± 0.089 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 176 6.6204 ± 0.0721 0.00243 ± 0.00047 0.00418 ± 0.00004 1.893 ± 0.028 % 96.357 ± 0.088 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 177 6.6257 ± 0.0721 0.00244 ± 0.00046 0.00418 ± 0.00004 1.893 ± 0.028 % 96.360 ± 0.088 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 178 6.6322 ± 0.0720 0.00246 ± 0.00046 0.00418 ± 0.00004 1.894 ± 0.028 % 96.358 ± 0.088 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 179 6.6310 ± 0.0718 0.00245 ± 0.00046 0.00418 ± 0.00004 1.896 ± 0.027 % 96.354 ± 0.088 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 180 6.6437 ± 0.0717 0.00241 ± 0.00046 0.00418 ± 0.00004 1.895 ± 0.027 % 96.344 ± 0.088 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 181 6.6472 ± 0.0716 0.00243 ± 0.00046 0.00417 ± 0.00004 1.894 ± 0.027 % 96.351 ± 0.087 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 182 6.6564 ± 0.0715 0.00247 ± 0.00046 0.00417 ± 0.00004 1.894 ± 0.027 % 96.352 ± 0.087 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 183 6.6800 ± 0.0716 0.00239 ± 0.00046 0.00417 ± 0.00004 1.893 ± 0.027 % 96.353 ± 0.087 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 184 6.6932 ± 0.0715 0.00244 ± 0.00046 0.00417 ± 0.00004 1.892 ± 0.027 % 96.358 ± 0.086 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 185 6.7020 ± 0.0714 0.00242 ± 0.00045 0.00417 ± 0.00004 1.890 ± 0.027 % 96.365 ± 0.086 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 186 6.7065 ± 0.0713 0.00241 ± 0.00045 0.00416 ± 0.00004 1.889 ± 0.027 % 96.369 ± 0.086 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 187 6.7251 ± 0.0714 0.00240 ± 0.00045 0.00416 ± 0.00004 1.887 ± 0.027 % 96.353 ± 0.086 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 188 6.7400 ± 0.0713 0.00240 ± 0.00045 0.00416 ± 0.00004 1.885 ± 0.026 % 96.348 ± 0.086 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 189 6.7508 ± 0.0713 0.00242 ± 0.00045 0.00415 ± 0.00004 1.883 ± 0.026 % 96.348 ± 0.085 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 190 6.7617 ± 0.0713 0.00240 ± 0.00045 0.00415 ± 0.00004 1.881 ± 0.026 % 96.347 ± 0.085 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 191 6.7568 ± 0.0710 0.00248 ± 0.00044 0.00415 ± 0.00004 1.880 ± 0.026 % 96.352 ± 0.085 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 192 6.7501 ± 0.0707 0.00246 ± 0.00044 0.00414 ± 0.00004 1.878 ± 0.026 % 96.360 ± 0.085 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 193 6.7385 ± 0.0704 0.00243 ± 0.00045 0.00414 ± 0.00004 1.878 ± 0.026 % 96.369 ± 0.084 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 194 6.7322 ± 0.0702 0.00243 ± 0.00044 0.00414 ± 0.00004 1.878 ± 0.026 % 96.370 ± 0.084 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 195 6.7626 ± 0.0704 0.00241 ± 0.00044 0.00414 ± 0.00004 1.875 ± 0.026 % 96.366 ± 0.084 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 196 6.7584 ± 0.0702 0.00240 ± 0.00044 0.00415 ± 0.00004 1.880 ± 0.026 % 96.365 ± 0.084 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 197 6.7593 ± 0.0700 0.00241 ± 0.00044 0.00415 ± 0.00004 1.881 ± 0.026 % 96.359 ± 0.084 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 198 6.7510 ± 0.0697 0.00241 ± 0.00044 0.00415 ± 0.00004 1.880 ± 0.025 % 96.356 ± 0.083 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 199 6.7428 ± 0.0694 0.00242 ± 0.00044 0.00414 ± 0.00004 1.881 ± 0.025 % 96.364 ± 0.083 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 200 6.7259 ± 0.0690 0.00239 ± 0.00044 0.00415 ± 0.00004 1.881 ± 0.025 % 96.359 ± 0.083 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 201 6.7014 ± 0.0685 0.00245 ± 0.00044 0.00414 ± 0.00004 1.880 ± 0.025 % 96.363 ± 0.083 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 202 6.6944 ± 0.0683 0.00246 ± 0.00044 0.00414 ± 0.00004 1.879 ± 0.025 % 96.358 ± 0.083 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 203 6.6577 ± 0.0677 0.00247 ± 0.00044 0.00415 ± 0.00004 1.891 ± 0.025 % 96.357 ± 0.082 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 204 6.6515 ± 0.0675 0.00245 ± 0.00044 0.00415 ± 0.00004 1.890 ± 0.025 % 96.359 ± 0.082 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 205 6.6442 ± 0.0672 0.00243 ± 0.00043 0.00415 ± 0.00004 1.889 ± 0.025 % 96.362 ± 0.082 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 206 6.6332 ± 0.0669 0.00239 ± 0.00043 0.00415 ± 0.00004 1.889 ± 0.025 % 96.372 ± 0.082 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 207 6.6232 ± 0.0666 0.00238 ± 0.00043 0.00414 ± 0.00004 1.890 ± 0.025 % 96.368 ± 0.081 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 208 6.6289 ± 0.0666 0.00237 ± 0.00043 0.00414 ± 0.00004 1.889 ± 0.025 % 96.374 ± 0.081 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 209 6.6338 ± 0.0665 0.00234 ± 0.00043 0.00414 ± 0.00004 1.888 ± 0.025 % 96.377 ± 0.081 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 210 6.6322 ± 0.0663 0.00238 ± 0.00043 0.00414 ± 0.00004 1.886 ± 0.024 % 96.387 ± 0.081 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 211 6.6166 ± 0.0660 0.00239 ± 0.00043 0.00414 ± 0.00004 1.886 ± 0.024 % 96.380 ± 0.081 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 212 6.5923 ± 0.0655 0.00242 ± 0.00043 0.00413 ± 0.00004 1.886 ± 0.024 % 96.391 ± 0.080 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 213 6.5821 ± 0.0652 0.00244 ± 0.00043 0.00413 ± 0.00004 1.885 ± 0.024 % 96.397 ± 0.080 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 214 6.5848 ± 0.0651 0.00245 ± 0.00042 0.00412 ± 0.00004 1.884 ± 0.024 % 96.403 ± 0.080 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 215 6.5735 ± 0.0648 0.00242 ± 0.00042 0.00412 ± 0.00004 1.884 ± 0.024 % 96.407 ± 0.079 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 216 6.5751 ± 0.0647 0.00249 ± 0.00042 0.00412 ± 0.00004 1.883 ± 0.024 % 96.405 ± 0.079 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 217 6.5565 ± 0.0643 0.00253 ± 0.00042 0.00412 ± 0.00004 1.883 ± 0.024 % 96.407 ± 0.079 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 218 6.5437 ± 0.0640 0.00254 ± 0.00042 0.00412 ± 0.00004 1.883 ± 0.024 % 96.402 ± 0.079 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 219 6.5378 ± 0.0638 0.00254 ± 0.00042 0.00412 ± 0.00004 1.884 ± 0.024 % 96.399 ± 0.079 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 220 6.5317 ± 0.0636 0.00253 ± 0.00042 0.00412 ± 0.00004 1.884 ± 0.024 % 96.399 ± 0.079 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 221 6.5150 ± 0.0632 0.00250 ± 0.00042 0.00412 ± 0.00004 1.883 ± 0.023 % 96.400 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 222 6.5015 ± 0.0629 0.00250 ± 0.00042 0.00411 ± 0.00004 1.883 ± 0.023 % 96.405 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 223 6.4890 ± 0.0626 0.00254 ± 0.00041 0.00412 ± 0.00004 1.883 ± 0.023 % 96.409 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 224 6.4818 ± 0.0624 0.00257 ± 0.00041 0.00411 ± 0.00004 1.881 ± 0.023 % 96.401 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 225 6.4843 ± 0.0623 0.00256 ± 0.00041 0.00411 ± 0.00004 1.880 ± 0.023 % 96.392 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 226 6.4768 ± 0.0620 0.00253 ± 0.00041 0.00411 ± 0.00004 1.880 ± 0.023 % 96.382 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 227 6.4373 ± 0.0615 0.00269 ± 0.00042 0.00421 ± 0.00005 1.946 ± 0.034 % 96.381 ± 0.078 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 228 6.4327 ± 0.0613 0.00270 ± 0.00041 0.00421 ± 0.00005 1.946 ± 0.034 % 96.383 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 229 6.4236 ± 0.0610 0.00269 ± 0.00041 0.00421 ± 0.00005 1.943 ± 0.034 % 96.385 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 230 6.4106 ± 0.0608 0.00268 ± 0.00041 0.00420 ± 0.00005 1.942 ± 0.033 % 96.390 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 231 6.4114 ± 0.0607 0.00263 ± 0.00041 0.00420 ± 0.00005 1.943 ± 0.033 % 96.391 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 232 6.4157 ± 0.0606 0.00255 ± 0.00041 0.00420 ± 0.00005 1.941 ± 0.033 % 96.383 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 233 6.4244 ± 0.0606 0.00254 ± 0.00041 0.00419 ± 0.00005 1.939 ± 0.033 % 96.376 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 234 6.4225 ± 0.0604 0.00254 ± 0.00041 0.00419 ± 0.00005 1.940 ± 0.033 % 96.370 ± 0.077 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 235 6.4376 ± 0.0605 0.00251 ± 0.00041 0.00420 ± 0.00005 1.941 ± 0.033 % 96.374 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 236 6.4404 ± 0.0604 0.00244 ± 0.00041 0.00420 ± 0.00005 1.941 ± 0.033 % 96.374 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 237 6.4543 ± 0.0605 0.00240 ± 0.00041 0.00420 ± 0.00005 1.940 ± 0.033 % 96.365 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 238 6.4662 ± 0.0605 0.00242 ± 0.00041 0.00420 ± 0.00005 1.938 ± 0.032 % 96.365 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 239 6.4737 ± 0.0605 0.00241 ± 0.00041 0.00420 ± 0.00005 1.938 ± 0.032 % 96.356 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 240 6.4775 ± 0.0604 0.00239 ± 0.00040 0.00420 ± 0.00005 1.936 ± 0.032 % 96.359 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 241 6.4834 ± 0.0603 0.00241 ± 0.00040 0.00420 ± 0.00005 1.935 ± 0.032 % 96.352 ± 0.076 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 242 6.4971 ± 0.0603 0.00238 ± 0.00040 0.00419 ± 0.00005 1.934 ± 0.032 % 96.356 ± 0.075 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 243 6.5001 ± 0.0602 0.00234 ± 0.00040 0.00419 ± 0.00005 1.932 ± 0.032 % 96.354 ± 0.075 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 244 6.5172 ± 0.0603 0.00236 ± 0.00040 0.00419 ± 0.00005 1.931 ± 0.032 % 96.361 ± 0.075 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 245 6.5363 ± 0.0604 0.00231 ± 0.00040 0.00418 ± 0.00005 1.929 ± 0.032 % 96.368 ± 0.075 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 246 6.5400 ± 0.0603 0.00234 ± 0.00040 0.00418 ± 0.00005 1.928 ± 0.032 % 96.372 ± 0.075 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 247 6.5399 ± 0.0602 0.00233 ± 0.00040 0.00419 ± 0.00005 1.928 ± 0.031 % 96.375 ± 0.074 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 248 6.5492 ± 0.0602 0.00233 ± 0.00040 0.00419 ± 0.00005 1.927 ± 0.031 % 96.376 ± 0.074 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 249 6.5399 ± 0.0600 0.00231 ± 0.00040 0.00419 ± 0.00005 1.927 ± 0.031 % 96.378 ± 0.074 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 250 6.5163 ± 0.0596 0.00234 ± 0.00040 0.00419 ± 0.00005 1.928 ± 0.031 % 96.387 ± 0.074 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 251 6.5045 ± 0.0594 0.00225 ± 0.00040 0.00420 ± 0.00005 1.934 ± 0.031 % 96.382 ± 0.074 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 252 6.4874 ± 0.0591 0.00224 ± 0.00039 0.00420 ± 0.00005 1.934 ± 0.031 % 96.387 ± 0.074 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 253 6.4786 ± 0.0589 0.00223 ± 0.00039 0.00419 ± 0.00005 1.933 ± 0.031 % 96.392 ± 0.073 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 254 6.4784 ± 0.0587 0.00222 ± 0.00039 0.00419 ± 0.00005 1.933 ± 0.031 % 96.393 ± 0.073 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 255 6.4849 ± 0.0587 0.00219 ± 0.00039 0.00419 ± 0.00005 1.932 ± 0.031 % 96.398 ± 0.073 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 256 6.4843 ± 0.0586 0.00216 ± 0.00039 0.00419 ± 0.00005 1.931 ± 0.031 % 96.396 ± 0.073 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 257 6.4861 ± 0.0585 0.00216 ± 0.00039 0.00419 ± 0.00005 1.930 ± 0.030 % 96.403 ± 0.073 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 258 6.4831 ± 0.0584 0.00216 ± 0.00039 0.00419 ± 0.00005 1.929 ± 0.030 % 96.413 ± 0.073 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 259 6.4804 ± 0.0582 0.00216 ± 0.00039 0.00419 ± 0.00005 1.928 ± 0.030 % 96.413 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 260 6.4733 ± 0.0580 0.00216 ± 0.00039 0.00419 ± 0.00005 1.929 ± 0.030 % 96.415 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 261 6.4592 ± 0.0577 0.00216 ± 0.00039 0.00419 ± 0.00005 1.930 ± 0.030 % 96.415 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 262 6.4495 ± 0.0575 0.00214 ± 0.00039 0.00418 ± 0.00005 1.929 ± 0.030 % 96.412 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 263 6.4453 ± 0.0574 0.00214 ± 0.00038 0.00418 ± 0.00005 1.928 ± 0.030 % 96.408 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 264 6.4326 ± 0.0571 0.00214 ± 0.00038 0.00418 ± 0.00005 1.927 ± 0.030 % 96.411 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 265 6.4322 ± 0.0570 0.00211 ± 0.00038 0.00417 ± 0.00005 1.926 ± 0.030 % 96.414 ± 0.072 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 266 6.4180 ± 0.0567 0.00209 ± 0.00038 0.00417 ± 0.00005 1.925 ± 0.030 % 96.419 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 267 6.4142 ± 0.0566 0.00209 ± 0.00038 0.00417 ± 0.00005 1.924 ± 0.029 % 96.421 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 268 6.4075 ± 0.0564 0.00209 ± 0.00038 0.00416 ± 0.00005 1.923 ± 0.029 % 96.415 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 269 6.4032 ± 0.0562 0.00208 ± 0.00038 0.00416 ± 0.00005 1.923 ± 0.029 % 96.411 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 270 6.3946 ± 0.0560 0.00208 ± 0.00038 0.00417 ± 0.00005 1.922 ± 0.029 % 96.420 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 271 6.3907 ± 0.0559 0.00205 ± 0.00038 0.00416 ± 0.00005 1.922 ± 0.029 % 96.424 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 272 6.3882 ± 0.0557 0.00206 ± 0.00038 0.00417 ± 0.00005 1.923 ± 0.029 % 96.420 ± 0.071 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 273 6.3667 ± 0.0554 0.00211 ± 0.00038 0.00417 ± 0.00005 1.926 ± 0.029 % 96.422 ± 0.070 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 274 6.3535 ± 0.0552 0.00214 ± 0.00038 0.00418 ± 0.00005 1.928 ± 0.029 % 96.431 ± 0.070 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 275 6.3581 ± 0.0551 0.00216 ± 0.00038 0.00418 ± 0.00005 1.926 ± 0.029 % 96.431 ± 0.070 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 276 6.3570 ± 0.0550 0.00217 ± 0.00038 0.00419 ± 0.00005 1.940 ± 0.029 % 96.428 ± 0.070 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 277 6.3447 ± 0.0548 0.00217 ± 0.00037 0.00419 ± 0.00005 1.940 ± 0.029 % 96.434 ± 0.070 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 278 6.3391 ± 0.0547 0.00222 ± 0.00037 0.00420 ± 0.00005 1.946 ± 0.029 % 96.435 ± 0.070 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 279 6.3412 ± 0.0546 0.00223 ± 0.00037 0.00420 ± 0.00005 1.947 ± 0.029 % 96.440 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 280 6.3516 ± 0.0546 0.00223 ± 0.00037 0.00420 ± 0.00005 1.945 ± 0.029 % 96.443 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 281 6.3603 ± 0.0547 0.00221 ± 0.00037 0.00420 ± 0.00005 1.944 ± 0.029 % 96.440 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 282 6.3714 ± 0.0547 0.00215 ± 0.00037 0.00420 ± 0.00005 1.945 ± 0.029 % 96.432 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 283 6.3773 ± 0.0547 0.00214 ± 0.00037 0.00420 ± 0.00005 1.945 ± 0.029 % 96.437 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 284 6.3810 ± 0.0546 0.00212 ± 0.00037 0.00421 ± 0.00005 1.946 ± 0.029 % 96.432 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 285 6.3963 ± 0.0547 0.00213 ± 0.00037 0.00421 ± 0.00005 1.945 ± 0.029 % 96.438 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 286 6.3979 ± 0.0546 0.00213 ± 0.00037 0.00420 ± 0.00005 1.944 ± 0.029 % 96.432 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 287 6.4058 ± 0.0546 0.00211 ± 0.00037 0.00420 ± 0.00004 1.943 ± 0.029 % 96.425 ± 0.069 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 288 6.3982 ± 0.0544 0.00212 ± 0.00037 0.00420 ± 0.00004 1.942 ± 0.028 % 96.427 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 289 6.3930 ± 0.0542 0.00211 ± 0.00037 0.00420 ± 0.00004 1.941 ± 0.028 % 96.423 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 290 6.3939 ± 0.0541 0.00212 ± 0.00037 0.00419 ± 0.00004 1.941 ± 0.028 % 96.414 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 291 6.3952 ± 0.0541 0.00208 ± 0.00037 0.00420 ± 0.00004 1.944 ± 0.028 % 96.417 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 292 6.3885 ± 0.0539 0.00207 ± 0.00037 0.00420 ± 0.00004 1.945 ± 0.028 % 96.420 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 293 6.3902 ± 0.0538 0.00208 ± 0.00036 0.00419 ± 0.00004 1.944 ± 0.028 % 96.422 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 294 6.3959 ± 0.0538 0.00209 ± 0.00036 0.00419 ± 0.00004 1.943 ± 0.028 % 96.419 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 295 6.3960 ± 0.0536 0.00203 ± 0.00036 0.00419 ± 0.00004 1.941 ± 0.028 % 96.415 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 296 6.3988 ± 0.0536 0.00202 ± 0.00036 0.00419 ± 0.00004 1.941 ± 0.028 % 96.423 ± 0.068 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 297 6.3976 ± 0.0534 0.00204 ± 0.00036 0.00419 ± 0.00004 1.939 ± 0.028 % 96.423 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 298 6.3952 ± 0.0533 0.00206 ± 0.00036 0.00418 ± 0.00004 1.938 ± 0.028 % 96.418 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 299 6.3990 ± 0.0532 0.00207 ± 0.00036 0.00418 ± 0.00004 1.937 ± 0.028 % 96.414 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 300 6.4017 ± 0.0532 0.00208 ± 0.00036 0.00418 ± 0.00004 1.936 ± 0.028 % 96.416 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 301 6.3980 ± 0.0530 0.00209 ± 0.00036 0.00417 ± 0.00004 1.934 ± 0.028 % 96.417 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 302 6.3923 ± 0.0529 0.00213 ± 0.00036 0.00417 ± 0.00004 1.936 ± 0.028 % 96.423 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 303 6.3952 ± 0.0528 0.00212 ± 0.00036 0.00419 ± 0.00005 1.944 ± 0.029 % 96.415 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 304 6.3862 ± 0.0526 0.00213 ± 0.00036 0.00419 ± 0.00004 1.943 ± 0.028 % 96.419 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 305 6.3845 ± 0.0525 0.00212 ± 0.00036 0.00419 ± 0.00004 1.943 ± 0.028 % 96.418 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 306 6.3955 ± 0.0525 0.00210 ± 0.00036 0.00419 ± 0.00004 1.942 ± 0.028 % 96.410 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 307 6.4019 ± 0.0525 0.00211 ± 0.00035 0.00418 ± 0.00004 1.941 ± 0.028 % 96.405 ± 0.067 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 308 6.4020 ± 0.0524 0.00206 ± 0.00035 0.00418 ± 0.00004 1.940 ± 0.028 % 96.409 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 309 6.4088 ± 0.0524 0.00206 ± 0.00035 0.00418 ± 0.00004 1.939 ± 0.028 % 96.411 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 310 6.4023 ± 0.0523 0.00203 ± 0.00035 0.00419 ± 0.00004 1.940 ± 0.028 % 96.414 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 311 6.4027 ± 0.0522 0.00202 ± 0.00035 0.00419 ± 0.00004 1.940 ± 0.028 % 96.415 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 312 6.4110 ± 0.0522 0.00204 ± 0.00035 0.00419 ± 0.00004 1.941 ± 0.028 % 96.418 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 313 6.4239 ± 0.0522 0.00203 ± 0.00035 0.00419 ± 0.00004 1.939 ± 0.028 % 96.414 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 314 6.4406 ± 0.0524 0.00201 ± 0.00035 0.00419 ± 0.00004 1.937 ± 0.028 % 96.414 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 315 6.4462 ± 0.0523 0.00201 ± 0.00035 0.00419 ± 0.00004 1.937 ± 0.028 % 96.413 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 316 6.4550 ± 0.0523 0.00202 ± 0.00035 0.00418 ± 0.00004 1.936 ± 0.028 % 96.410 ± 0.066 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 317 6.4522 ± 0.0522 0.00204 ± 0.00035 0.00418 ± 0.00004 1.935 ± 0.027 % 96.414 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 318 6.4550 ± 0.0522 0.00206 ± 0.00035 0.00418 ± 0.00004 1.934 ± 0.027 % 96.410 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 319 6.4485 ± 0.0520 0.00204 ± 0.00035 0.00417 ± 0.00004 1.933 ± 0.027 % 96.416 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 320 6.4419 ± 0.0519 0.00204 ± 0.00035 0.00417 ± 0.00004 1.932 ± 0.027 % 96.420 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 321 6.4423 ± 0.0518 0.00204 ± 0.00035 0.00417 ± 0.00004 1.932 ± 0.027 % 96.420 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 322 6.4410 ± 0.0517 0.00205 ± 0.00035 0.00417 ± 0.00004 1.931 ± 0.027 % 96.422 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 323 6.4354 ± 0.0516 0.00204 ± 0.00034 0.00416 ± 0.00004 1.931 ± 0.027 % 96.428 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 324 6.4394 ± 0.0515 0.00204 ± 0.00034 0.00416 ± 0.00004 1.930 ± 0.027 % 96.427 ± 0.065 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 325 6.4312 ± 0.0514 0.00206 ± 0.00034 0.00416 ± 0.00004 1.929 ± 0.027 % 96.433 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 326 6.4341 ± 0.0513 0.00205 ± 0.00034 0.00415 ± 0.00004 1.929 ± 0.027 % 96.437 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 327 6.4357 ± 0.0513 0.00205 ± 0.00034 0.00415 ± 0.00004 1.927 ± 0.027 % 96.435 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 328 6.4314 ± 0.0512 0.00207 ± 0.00034 0.00415 ± 0.00004 1.927 ± 0.027 % 96.430 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 329 6.4259 ± 0.0510 0.00209 ± 0.00034 0.00415 ± 0.00004 1.927 ± 0.027 % 96.432 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 330 6.4132 ± 0.0508 0.00210 ± 0.00034 0.00414 ± 0.00004 1.927 ± 0.027 % 96.436 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 331 6.4184 ± 0.0508 0.00213 ± 0.00034 0.00414 ± 0.00004 1.926 ± 0.027 % 96.439 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 332 6.4170 ± 0.0507 0.00210 ± 0.00034 0.00414 ± 0.00004 1.925 ± 0.026 % 96.429 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 333 6.4138 ± 0.0506 0.00211 ± 0.00034 0.00414 ± 0.00004 1.924 ± 0.026 % 96.429 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 334 6.4110 ± 0.0504 0.00211 ± 0.00034 0.00414 ± 0.00004 1.926 ± 0.026 % 96.433 ± 0.064 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 335 6.3990 ± 0.0503 0.00208 ± 0.00034 0.00414 ± 0.00004 1.925 ± 0.026 % 96.434 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 336 6.3974 ± 0.0502 0.00209 ± 0.00034 0.00414 ± 0.00004 1.925 ± 0.026 % 96.436 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 337 6.3908 ± 0.0500 0.00212 ± 0.00034 0.00414 ± 0.00004 1.925 ± 0.026 % 96.433 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 338 6.3870 ± 0.0499 0.00213 ± 0.00034 0.00413 ± 0.00004 1.924 ± 0.026 % 96.426 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 339 6.3829 ± 0.0498 0.00214 ± 0.00034 0.00413 ± 0.00004 1.923 ± 0.026 % 96.434 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 340 6.3862 ± 0.0497 0.00213 ± 0.00033 0.00413 ± 0.00004 1.922 ± 0.026 % 96.427 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 341 6.3859 ± 0.0496 0.00210 ± 0.00033 0.00413 ± 0.00004 1.921 ± 0.026 % 96.425 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 342 6.3889 ± 0.0496 0.00210 ± 0.00033 0.00412 ± 0.00004 1.920 ± 0.026 % 96.420 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 343 6.3962 ± 0.0496 0.00209 ± 0.00033 0.00412 ± 0.00004 1.919 ± 0.026 % 96.418 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 344 6.4061 ± 0.0496 0.00207 ± 0.00033 0.00412 ± 0.00004 1.918 ± 0.026 % 96.422 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 345 6.4084 ± 0.0495 0.00205 ± 0.00033 0.00412 ± 0.00004 1.917 ± 0.026 % 96.419 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 346 6.4095 ± 0.0495 0.00207 ± 0.00033 0.00412 ± 0.00004 1.917 ± 0.026 % 96.420 ± 0.063 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 347 6.4110 ± 0.0494 0.00207 ± 0.00033 0.00412 ± 0.00004 1.917 ± 0.025 % 96.419 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 348 6.4175 ± 0.0494 0.00208 ± 0.00033 0.00411 ± 0.00004 1.916 ± 0.025 % 96.423 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 349 6.4214 ± 0.0493 0.00206 ± 0.00033 0.00411 ± 0.00004 1.915 ± 0.025 % 96.427 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 350 6.4233 ± 0.0493 0.00203 ± 0.00033 0.00411 ± 0.00004 1.915 ± 0.025 % 96.429 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 351 6.4246 ± 0.0493 0.00204 ± 0.00033 0.00412 ± 0.00004 1.916 ± 0.025 % 96.425 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 352 6.4286 ± 0.0492 0.00202 ± 0.00033 0.00412 ± 0.00004 1.916 ± 0.025 % 96.416 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 353 6.4451 ± 0.0493 0.00201 ± 0.00033 0.00412 ± 0.00004 1.915 ± 0.025 % 96.419 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 354 6.4612 ± 0.0494 0.00200 ± 0.00033 0.00412 ± 0.00004 1.913 ± 0.025 % 96.411 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 355 6.4766 ± 0.0495 0.00198 ± 0.00033 0.00411 ± 0.00004 1.912 ± 0.025 % 96.412 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 356 6.4861 ± 0.0495 0.00200 ± 0.00033 0.00411 ± 0.00004 1.911 ± 0.025 % 96.419 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 357 6.5020 ± 0.0496 0.00201 ± 0.00033 0.00411 ± 0.00004 1.910 ± 0.025 % 96.417 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 358 6.5146 ± 0.0496 0.00200 ± 0.00033 0.00411 ± 0.00004 1.908 ± 0.025 % 96.410 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 359 6.5302 ± 0.0497 0.00200 ± 0.00033 0.00411 ± 0.00004 1.908 ± 0.025 % 96.407 ± 0.062 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 360 6.5397 ± 0.0498 0.00198 ± 0.00032 0.00411 ± 0.00004 1.906 ± 0.025 % 96.402 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 361 6.5440 ± 0.0497 0.00203 ± 0.00032 0.00411 ± 0.00004 1.907 ± 0.025 % 96.403 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 362 6.5548 ± 0.0497 0.00204 ± 0.00032 0.00411 ± 0.00004 1.906 ± 0.025 % 96.395 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 363 6.5605 ± 0.0497 0.00201 ± 0.00032 0.00411 ± 0.00004 1.905 ± 0.025 % 96.397 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 364 6.5590 ± 0.0496 0.00202 ± 0.00032 0.00411 ± 0.00004 1.906 ± 0.025 % 96.399 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 365 6.5676 ± 0.0497 0.00202 ± 0.00032 0.00411 ± 0.00004 1.905 ± 0.024 % 96.390 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 366 6.5808 ± 0.0497 0.00201 ± 0.00032 0.00411 ± 0.00004 1.904 ± 0.024 % 96.390 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 367 6.5888 ± 0.0498 0.00201 ± 0.00032 0.00411 ± 0.00004 1.903 ± 0.024 % 96.396 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 368 6.5956 ± 0.0497 0.00200 ± 0.00032 0.00411 ± 0.00004 1.902 ± 0.024 % 96.391 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 369 6.6007 ± 0.0497 0.00201 ± 0.00032 0.00411 ± 0.00004 1.901 ± 0.024 % 96.396 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 370 6.6108 ± 0.0498 0.00198 ± 0.00032 0.00410 ± 0.00004 1.899 ± 0.024 % 96.396 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 371 6.6241 ± 0.0498 0.00198 ± 0.00032 0.00410 ± 0.00004 1.899 ± 0.024 % 96.396 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 372 6.6367 ± 0.0498 0.00198 ± 0.00032 0.00410 ± 0.00004 1.898 ± 0.024 % 96.392 ± 0.061 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 373 6.6375 ± 0.0498 0.00197 ± 0.00032 0.00410 ± 0.00004 1.897 ± 0.024 % 96.393 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 374 6.6349 ± 0.0497 0.00198 ± 0.00032 0.00410 ± 0.00004 1.897 ± 0.024 % 96.391 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 375 6.6303 ± 0.0496 0.00196 ± 0.00032 0.00409 ± 0.00004 1.895 ± 0.024 % 96.383 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 376 6.6344 ± 0.0496 0.00196 ± 0.00032 0.00409 ± 0.00004 1.894 ± 0.024 % 96.380 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 377 6.6486 ± 0.0497 0.00193 ± 0.00032 0.00409 ± 0.00004 1.893 ± 0.024 % 96.379 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 378 6.6617 ± 0.0497 0.00190 ± 0.00032 0.00409 ± 0.00004 1.892 ± 0.024 % 96.381 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 379 6.6608 ± 0.0497 0.00190 ± 0.00032 0.00409 ± 0.00004 1.892 ± 0.024 % 96.376 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 380 6.6587 ± 0.0496 0.00194 ± 0.00032 0.00409 ± 0.00004 1.893 ± 0.024 % 96.375 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 381 6.6522 ± 0.0494 0.00195 ± 0.00032 0.00409 ± 0.00004 1.892 ± 0.024 % 96.376 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 382 6.6546 ± 0.0494 0.00193 ± 0.00032 0.00409 ± 0.00004 1.891 ± 0.024 % 96.376 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 383 6.6610 ± 0.0494 0.00192 ± 0.00031 0.00409 ± 0.00004 1.890 ± 0.024 % 96.380 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 384 6.6639 ± 0.0493 0.00193 ± 0.00031 0.00408 ± 0.00004 1.889 ± 0.024 % 96.383 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 385 6.6668 ± 0.0493 0.00191 ± 0.00031 0.00408 ± 0.00004 1.889 ± 0.023 % 96.380 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 386 6.6692 ± 0.0492 0.00190 ± 0.00031 0.00408 ± 0.00004 1.888 ± 0.023 % 96.380 ± 0.060 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 387 6.6739 ± 0.0492 0.00189 ± 0.00031 0.00408 ± 0.00004 1.887 ± 0.023 % 96.380 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 388 6.6785 ± 0.0492 0.00189 ± 0.00031 0.00408 ± 0.00004 1.886 ± 0.023 % 96.380 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 389 6.6810 ± 0.0492 0.00190 ± 0.00031 0.00408 ± 0.00004 1.886 ± 0.023 % 96.379 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 390 6.6623 ± 0.0490 0.00191 ± 0.00031 0.00410 ± 0.00004 1.903 ± 0.023 % 96.378 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 391 6.6467 ± 0.0488 0.00198 ± 0.00031 0.00414 ± 0.00004 1.920 ± 0.024 % 96.378 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 392 6.6390 ± 0.0486 0.00199 ± 0.00031 0.00414 ± 0.00004 1.921 ± 0.024 % 96.376 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 393 6.6428 ± 0.0486 0.00200 ± 0.00031 0.00415 ± 0.00004 1.921 ± 0.023 % 96.378 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 394 6.6421 ± 0.0485 0.00201 ± 0.00031 0.00415 ± 0.00004 1.921 ± 0.023 % 96.375 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 395 6.6386 ± 0.0485 0.00202 ± 0.00031 0.00415 ± 0.00004 1.920 ± 0.023 % 96.374 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 396 6.6381 ± 0.0484 0.00201 ± 0.00031 0.00416 ± 0.00004 1.921 ± 0.023 % 96.373 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 397 6.6492 ± 0.0484 0.00200 ± 0.00031 0.00416 ± 0.00004 1.920 ± 0.023 % 96.372 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 398 6.6448 ± 0.0483 0.00199 ± 0.00031 0.00416 ± 0.00004 1.920 ± 0.023 % 96.370 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 399 6.6398 ± 0.0482 0.00199 ± 0.00031 0.00416 ± 0.00004 1.921 ± 0.023 % 96.369 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 400 6.6483 ± 0.0482 0.00196 ± 0.00031 0.00416 ± 0.00004 1.921 ± 0.023 % 96.369 ± 0.059 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 401 6.6467 ± 0.0481 0.00196 ± 0.00031 0.00416 ± 0.00004 1.920 ± 0.023 % 96.371 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 402 6.6375 ± 0.0480 0.00198 ± 0.00031 0.00416 ± 0.00004 1.922 ± 0.023 % 96.373 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 403 6.6353 ± 0.0479 0.00199 ± 0.00031 0.00417 ± 0.00004 1.924 ± 0.023 % 96.372 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 404 6.6338 ± 0.0478 0.00201 ± 0.00031 0.00417 ± 0.00004 1.924 ± 0.023 % 96.374 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 405 6.6344 ± 0.0478 0.00202 ± 0.00031 0.00418 ± 0.00004 1.924 ± 0.023 % 96.374 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 406 6.6326 ± 0.0477 0.00203 ± 0.00031 0.00419 ± 0.00004 1.933 ± 0.023 % 96.369 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 407 6.6322 ± 0.0476 0.00204 ± 0.00031 0.00419 ± 0.00004 1.934 ± 0.023 % 96.367 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 408 6.6268 ± 0.0475 0.00205 ± 0.00031 0.00419 ± 0.00004 1.934 ± 0.023 % 96.370 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 409 6.6305 ± 0.0475 0.00208 ± 0.00031 0.00419 ± 0.00004 1.934 ± 0.023 % 96.362 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 410 6.6207 ± 0.0474 0.00207 ± 0.00031 0.00420 ± 0.00004 1.939 ± 0.023 % 96.364 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 411 6.6178 ± 0.0473 0.00209 ± 0.00030 0.00421 ± 0.00004 1.942 ± 0.023 % 96.362 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 412 6.6062 ± 0.0471 0.00213 ± 0.00031 0.00422 ± 0.00004 1.946 ± 0.023 % 96.360 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 413 6.6040 ± 0.0471 0.00214 ± 0.00030 0.00423 ± 0.00004 1.946 ± 0.023 % 96.359 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 414 6.6103 ± 0.0471 0.00213 ± 0.00030 0.00423 ± 0.00004 1.946 ± 0.023 % 96.362 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 415 6.6153 ± 0.0470 0.00213 ± 0.00030 0.00423 ± 0.00004 1.946 ± 0.023 % 96.364 ± 0.058 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 416 6.6158 ± 0.0470 0.00215 ± 0.00030 0.00423 ± 0.00004 1.945 ± 0.023 % 96.365 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 417 6.6095 ± 0.0469 0.00214 ± 0.00030 0.00424 ± 0.00004 1.949 ± 0.023 % 96.367 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 418 6.6116 ± 0.0468 0.00214 ± 0.00030 0.00425 ± 0.00004 1.952 ± 0.023 % 96.369 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 419 6.6037 ± 0.0467 0.00217 ± 0.00030 0.00428 ± 0.00004 1.968 ± 0.024 % 96.370 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 420 6.6029 ± 0.0467 0.00220 ± 0.00030 0.00429 ± 0.00004 1.969 ± 0.024 % 96.365 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 421 6.6058 ± 0.0466 0.00220 ± 0.00030 0.00429 ± 0.00004 1.969 ± 0.024 % 96.366 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 422 6.6022 ± 0.0465 0.00220 ± 0.00030 0.00429 ± 0.00004 1.969 ± 0.024 % 96.364 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 423 6.6052 ± 0.0465 0.00219 ± 0.00030 0.00429 ± 0.00004 1.969 ± 0.024 % 96.363 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 424 6.5993 ± 0.0464 0.00218 ± 0.00030 0.00430 ± 0.00004 1.971 ± 0.023 % 96.363 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 425 6.5864 ± 0.0462 0.00217 ± 0.00030 0.00430 ± 0.00004 1.971 ± 0.023 % 96.365 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 426 6.5875 ± 0.0462 0.00217 ± 0.00030 0.00430 ± 0.00004 1.972 ± 0.023 % 96.367 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 427 6.5858 ± 0.0461 0.00220 ± 0.00030 0.00430 ± 0.00004 1.973 ± 0.023 % 96.369 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 428 6.5819 ± 0.0461 0.00224 ± 0.00030 0.00430 ± 0.00004 1.976 ± 0.023 % 96.370 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 429 6.5737 ± 0.0459 0.00225 ± 0.00030 0.00432 ± 0.00004 1.983 ± 0.023 % 96.372 ± 0.057 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 430 6.5728 ± 0.0459 0.00228 ± 0.00030 0.00433 ± 0.00004 1.985 ± 0.023 % 96.371 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 431 6.5651 ± 0.0458 0.00228 ± 0.00030 0.00433 ± 0.00004 1.984 ± 0.023 % 96.371 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 432 6.5606 ± 0.0457 0.00222 ± 0.00030 0.00434 ± 0.00004 1.988 ± 0.023 % 96.370 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 433 6.5594 ± 0.0456 0.00225 ± 0.00030 0.00434 ± 0.00004 1.988 ± 0.023 % 96.373 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 434 6.5597 ± 0.0456 0.00225 ± 0.00030 0.00434 ± 0.00004 1.988 ± 0.023 % 96.375 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 435 6.5474 ± 0.0454 0.00228 ± 0.00030 0.00435 ± 0.00004 1.991 ± 0.023 % 96.374 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 436 6.5492 ± 0.0454 0.00228 ± 0.00030 0.00435 ± 0.00004 1.991 ± 0.023 % 96.376 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 437 6.5455 ± 0.0453 0.00228 ± 0.00030 0.00435 ± 0.00004 1.990 ± 0.023 % 96.381 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 438 6.5426 ± 0.0452 0.00227 ± 0.00030 0.00435 ± 0.00004 1.990 ± 0.023 % 96.383 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 439 6.5418 ± 0.0451 0.00228 ± 0.00030 0.00435 ± 0.00004 1.992 ± 0.023 % 96.382 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 440 6.5411 ± 0.0451 0.00229 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.023 % 96.379 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 441 6.5429 ± 0.0451 0.00230 ± 0.00030 0.00435 ± 0.00004 1.992 ± 0.023 % 96.382 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 442 6.5442 ± 0.0450 0.00231 ± 0.00030 0.00435 ± 0.00004 1.991 ± 0.023 % 96.379 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 443 6.5582 ± 0.0451 0.00232 ± 0.00030 0.00436 ± 0.00004 1.991 ± 0.023 % 96.380 ± 0.056 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 444 6.5596 ± 0.0451 0.00233 ± 0.00030 0.00436 ± 0.00004 1.990 ± 0.023 % 96.382 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 445 6.5574 ± 0.0450 0.00231 ± 0.00030 0.00436 ± 0.00004 1.990 ± 0.023 % 96.385 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 446 6.5555 ± 0.0449 0.00234 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.023 % 96.385 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 447 6.5539 ± 0.0449 0.00234 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.023 % 96.386 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 448 6.5587 ± 0.0449 0.00232 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.022 % 96.387 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 449 6.5581 ± 0.0448 0.00231 ± 0.00030 0.00436 ± 0.00004 1.993 ± 0.022 % 96.388 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 450 6.5576 ± 0.0448 0.00231 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.022 % 96.390 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 451 6.5585 ± 0.0447 0.00228 ± 0.00030 0.00436 ± 0.00004 1.993 ± 0.022 % 96.387 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 452 6.5650 ± 0.0447 0.00227 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.022 % 96.386 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 453 6.5690 ± 0.0447 0.00227 ± 0.00030 0.00436 ± 0.00004 1.992 ± 0.022 % 96.387 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 454 6.5693 ± 0.0447 0.00225 ± 0.00030 0.00436 ± 0.00004 1.991 ± 0.022 % 96.388 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 455 6.5719 ± 0.0446 0.00221 ± 0.00030 0.00436 ± 0.00004 1.990 ± 0.022 % 96.384 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 456 6.5667 ± 0.0445 0.00221 ± 0.00030 0.00436 ± 0.00004 1.991 ± 0.022 % 96.384 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 457 6.5688 ± 0.0445 0.00223 ± 0.00030 0.00436 ± 0.00004 1.993 ± 0.022 % 96.375 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 458 6.5596 ± 0.0444 0.00225 ± 0.00030 0.00436 ± 0.00004 1.995 ± 0.022 % 96.374 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 459 6.5644 ± 0.0444 0.00227 ± 0.00030 0.00436 ± 0.00004 1.995 ± 0.022 % 96.373 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 460 6.5719 ± 0.0444 0.00226 ± 0.00029 0.00436 ± 0.00004 1.994 ± 0.022 % 96.372 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 461 6.5688 ± 0.0443 0.00226 ± 0.00029 0.00436 ± 0.00004 1.994 ± 0.022 % 96.373 ± 0.055 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 462 6.5677 ± 0.0443 0.00226 ± 0.00029 0.00437 ± 0.00004 1.997 ± 0.022 % 96.375 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 463 6.5583 ± 0.0442 0.00226 ± 0.00029 0.00437 ± 0.00004 1.997 ± 0.022 % 96.377 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 464 6.5618 ± 0.0441 0.00228 ± 0.00029 0.00437 ± 0.00004 1.996 ± 0.022 % 96.373 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 465 6.5755 ± 0.0442 0.00228 ± 0.00029 0.00436 ± 0.00004 1.995 ± 0.022 % 96.375 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 466 6.5814 ± 0.0442 0.00231 ± 0.00029 0.00437 ± 0.00004 1.995 ± 0.022 % 96.376 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 467 6.5804 ± 0.0442 0.00231 ± 0.00029 0.00437 ± 0.00004 1.997 ± 0.022 % 96.376 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 468 6.5825 ± 0.0442 0.00232 ± 0.00029 0.00437 ± 0.00004 1.997 ± 0.022 % 96.375 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 469 6.5809 ± 0.0441 0.00232 ± 0.00029 0.00437 ± 0.00004 1.997 ± 0.022 % 96.373 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 470 6.5804 ± 0.0440 0.00230 ± 0.00029 0.00436 ± 0.00004 1.997 ± 0.022 % 96.368 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 471 6.5780 ± 0.0440 0.00229 ± 0.00029 0.00437 ± 0.00004 1.997 ± 0.022 % 96.367 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 472 6.5730 ± 0.0439 0.00230 ± 0.00029 0.00436 ± 0.00003 1.996 ± 0.022 % 96.367 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 473 6.5682 ± 0.0438 0.00231 ± 0.00029 0.00436 ± 0.00003 1.995 ± 0.022 % 96.364 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 474 6.5656 ± 0.0437 0.00232 ± 0.00029 0.00436 ± 0.00003 1.995 ± 0.022 % 96.366 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 475 6.5648 ± 0.0437 0.00231 ± 0.00029 0.00436 ± 0.00003 1.996 ± 0.022 % 96.364 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 476 6.5641 ± 0.0436 0.00230 ± 0.00029 0.00436 ± 0.00003 1.995 ± 0.022 % 96.367 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 477 6.5578 ± 0.0435 0.00230 ± 0.00029 0.00435 ± 0.00003 1.994 ± 0.021 % 96.366 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 478 6.5552 ± 0.0435 0.00229 ± 0.00029 0.00435 ± 0.00003 1.994 ± 0.021 % 96.368 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 479 6.5514 ± 0.0434 0.00230 ± 0.00029 0.00435 ± 0.00003 1.993 ± 0.021 % 96.364 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 480 6.5523 ± 0.0434 0.00229 ± 0.00029 0.00435 ± 0.00003 1.993 ± 0.021 % 96.361 ± 0.054 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 481 6.5539 ± 0.0433 0.00229 ± 0.00029 0.00435 ± 0.00003 1.993 ± 0.021 % 96.361 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 482 6.5573 ± 0.0433 0.00230 ± 0.00029 0.00435 ± 0.00003 1.993 ± 0.021 % 96.365 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 483 6.5512 ± 0.0432 0.00229 ± 0.00029 0.00436 ± 0.00003 1.994 ± 0.021 % 96.365 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 484 6.5526 ± 0.0432 0.00230 ± 0.00029 0.00435 ± 0.00003 1.993 ± 0.021 % 96.364 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 485 6.5494 ± 0.0431 0.00228 ± 0.00029 0.00435 ± 0.00003 1.994 ± 0.021 % 96.367 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 486 6.5530 ± 0.0431 0.00227 ± 0.00029 0.00436 ± 0.00003 1.993 ± 0.021 % 96.370 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 487 6.5585 ± 0.0431 0.00226 ± 0.00029 0.00436 ± 0.00003 1.993 ± 0.021 % 96.372 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 488 6.5596 ± 0.0431 0.00226 ± 0.00029 0.00436 ± 0.00003 1.993 ± 0.021 % 96.370 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 489 6.5600 ± 0.0431 0.00226 ± 0.00028 0.00436 ± 0.00003 1.993 ± 0.021 % 96.373 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 490 6.5637 ± 0.0430 0.00226 ± 0.00028 0.00436 ± 0.00003 1.992 ± 0.021 % 96.366 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 491 6.5622 ± 0.0430 0.00226 ± 0.00028 0.00435 ± 0.00003 1.991 ± 0.021 % 96.368 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 492 6.5637 ± 0.0430 0.00228 ± 0.00028 0.00435 ± 0.00003 1.991 ± 0.021 % 96.367 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 493 6.5697 ± 0.0430 0.00228 ± 0.00028 0.00435 ± 0.00003 1.990 ± 0.021 % 96.361 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 494 6.5715 ± 0.0429 0.00230 ± 0.00028 0.00435 ± 0.00003 1.990 ± 0.021 % 96.361 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 495 6.5687 ± 0.0429 0.00230 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.021 % 96.357 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 496 6.5661 ± 0.0428 0.00233 ± 0.00028 0.00435 ± 0.00003 1.988 ± 0.021 % 96.355 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 497 6.5660 ± 0.0428 0.00231 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.021 % 96.351 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 498 6.5646 ± 0.0427 0.00231 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.021 % 96.351 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 499 6.5650 ± 0.0427 0.00232 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.021 % 96.351 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 500 6.5616 ± 0.0426 0.00231 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.021 % 96.352 ± 0.053 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 501 6.5571 ± 0.0425 0.00230 ± 0.00028 0.00435 ± 0.00003 1.988 ± 0.021 % 96.355 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 502 6.5591 ± 0.0425 0.00228 ± 0.00028 0.00435 ± 0.00003 1.988 ± 0.021 % 96.357 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 503 6.5615 ± 0.0425 0.00227 ± 0.00028 0.00435 ± 0.00003 1.987 ± 0.021 % 96.361 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 504 6.5596 ± 0.0424 0.00235 ± 0.00028 0.00435 ± 0.00003 1.987 ± 0.021 % 96.364 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 505 6.5545 ± 0.0424 0.00236 ± 0.00028 0.00434 ± 0.00003 1.986 ± 0.021 % 96.363 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 506 6.5573 ± 0.0423 0.00237 ± 0.00028 0.00435 ± 0.00003 1.987 ± 0.021 % 96.366 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 507 6.5605 ± 0.0423 0.00239 ± 0.00028 0.00435 ± 0.00003 1.986 ± 0.021 % 96.368 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 508 6.5658 ± 0.0423 0.00239 ± 0.00028 0.00435 ± 0.00003 1.987 ± 0.021 % 96.369 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 509 6.5625 ± 0.0423 0.00239 ± 0.00028 0.00434 ± 0.00003 1.986 ± 0.020 % 96.373 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 510 6.5627 ± 0.0422 0.00238 ± 0.00028 0.00434 ± 0.00003 1.986 ± 0.020 % 96.376 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 511 6.5567 ± 0.0421 0.00233 ± 0.00028 0.00435 ± 0.00003 1.990 ± 0.020 % 96.375 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 512 6.5590 ± 0.0421 0.00234 ± 0.00028 0.00435 ± 0.00003 1.990 ± 0.020 % 96.376 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 513 6.5581 ± 0.0421 0.00232 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.020 % 96.376 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 514 6.5590 ± 0.0420 0.00234 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.020 % 96.374 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 515 6.5586 ± 0.0420 0.00233 ± 0.00028 0.00435 ± 0.00003 1.990 ± 0.020 % 96.374 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 516 6.5620 ± 0.0420 0.00235 ± 0.00028 0.00435 ± 0.00003 1.989 ± 0.020 % 96.373 ± 0.052 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 517 6.5640 ± 0.0420 0.00233 ± 0.00028 0.00435 ± 0.00003 1.988 ± 0.020 % 96.373 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 518 6.5595 ± 0.0419 0.00234 ± 0.00028 0.00435 ± 0.00003 1.988 ± 0.020 % 96.373 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 519 6.5610 ± 0.0419 0.00235 ± 0.00028 0.00436 ± 0.00003 1.991 ± 0.020 % 96.370 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 520 6.5658 ± 0.0419 0.00236 ± 0.00028 0.00436 ± 0.00003 1.991 ± 0.020 % 96.369 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 521 6.5673 ± 0.0418 0.00235 ± 0.00028 0.00436 ± 0.00003 1.991 ± 0.020 % 96.366 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 522 6.5759 ± 0.0419 0.00236 ± 0.00028 0.00436 ± 0.00003 1.991 ± 0.020 % 96.368 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 523 6.5743 ± 0.0418 0.00237 ± 0.00028 0.00436 ± 0.00003 1.991 ± 0.020 % 96.369 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 524 6.5699 ± 0.0418 0.00238 ± 0.00028 0.00436 ± 0.00003 1.991 ± 0.020 % 96.366 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 525 6.5718 ± 0.0417 0.00237 ± 0.00028 0.00436 �� 0.00003 1.990 ± 0.020 % 96.365 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 526 6.5740 ± 0.0417 0.00236 ± 0.00028 0.00436 ± 0.00003 1.990 ± 0.020 % 96.360 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 527 6.5776 ± 0.0417 0.00236 ± 0.00028 0.00436 ± 0.00003 1.989 ± 0.020 % 96.361 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 528 6.5748 ± 0.0416 0.00236 ± 0.00028 0.00436 ± 0.00003 1.989 ± 0.020 % 96.361 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 529 6.5665 ± 0.0415 0.00236 ± 0.00028 0.00436 ± 0.00003 1.989 ± 0.020 % 96.359 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 530 6.5590 ± 0.0414 0.00236 ± 0.00028 0.00435 ± 0.00003 1.988 ± 0.020 % 96.360 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 531 6.5658 ± 0.0414 0.00234 ± 0.00028 0.00435 ± 0.00003 1.987 ± 0.020 % 96.361 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 532 6.5599 ± 0.0414 0.00233 ± 0.00028 0.00435 ± 0.00003 1.986 ± 0.020 % 96.363 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 533 6.5551 ± 0.0413 0.00233 ± 0.00028 0.00434 ± 0.00003 1.985 ± 0.020 % 96.365 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 534 6.5410 ± 0.0412 0.00234 ± 0.00028 0.00434 ± 0.00003 1.984 ± 0.020 % 96.370 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 535 6.5343 ± 0.0411 0.00232 ± 0.00028 0.00434 ± 0.00003 1.983 ± 0.020 % 96.373 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 536 6.5339 ± 0.0410 0.00229 ± 0.00028 0.00434 ± 0.00003 1.983 ± 0.020 % 96.373 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 537 6.5369 ± 0.0410 0.00229 ± 0.00028 0.00434 ± 0.00003 1.983 ± 0.020 % 96.375 ± 0.051 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 538 6.5405 ± 0.0410 0.00229 ± 0.00028 0.00433 ± 0.00003 1.982 ± 0.020 % 96.377 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 539 6.5405 ± 0.0410 0.00231 ± 0.00027 0.00433 ± 0.00003 1.982 ± 0.020 % 96.377 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 540 6.5463 ± 0.0410 0.00230 ± 0.00027 0.00433 ± 0.00003 1.981 ± 0.020 % 96.376 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 541 6.5485 ± 0.0409 0.00231 ± 0.00027 0.00433 ± 0.00003 1.980 ± 0.020 % 96.374 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 542 6.5542 ± 0.0409 0.00230 ± 0.00027 0.00433 ± 0.00003 1.979 ± 0.020 % 96.374 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 543 6.5609 ± 0.0410 0.00228 ± 0.00027 0.00434 ± 0.00003 1.981 ± 0.020 % 96.372 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 544 6.5684 ± 0.0410 0.00230 ± 0.00027 0.00433 ± 0.00003 1.981 ± 0.020 % 96.374 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 545 6.5679 ± 0.0409 0.00232 ± 0.00027 0.00433 ± 0.00003 1.980 ± 0.020 % 96.374 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 546 6.5758 ± 0.0410 0.00234 ± 0.00027 0.00433 ± 0.00003 1.980 ± 0.020 % 96.373 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 547 6.5795 ± 0.0410 0.00233 ± 0.00027 0.00433 ± 0.00003 1.978 ± 0.019 % 96.372 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 548 6.5621 ± 0.0408 0.00236 ± 0.00027 0.00438 ± 0.00003 2.002 ± 0.020 % 96.370 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 549 6.5583 ± 0.0407 0.00234 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.020 % 96.371 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 550 6.5509 ± 0.0406 0.00233 ± 0.00027 0.00438 ± 0.00003 2.003 ± 0.020 % 96.375 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 551 6.5512 ± 0.0406 0.00232 ± 0.00027 0.00438 ± 0.00003 2.002 ± 0.020 % 96.373 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 552 6.5541 ± 0.0406 0.00231 ± 0.00027 0.00438 ± 0.00003 2.002 ± 0.020 % 96.375 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 553 6.5590 ± 0.0406 0.00230 ± 0.00027 0.00438 ± 0.00003 2.002 ± 0.020 % 96.374 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 554 6.5603 ± 0.0406 0.00230 ± 0.00027 0.00438 ± 0.00003 2.002 ± 0.020 % 96.376 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 555 6.5588 ± 0.0405 0.00230 ± 0.00027 0.00438 ± 0.00003 2.002 ± 0.020 % 96.374 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 556 6.5575 ± 0.0405 0.00229 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.020 % 96.377 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 557 6.5507 ± 0.0404 0.00230 ± 0.00027 0.00438 ± 0.00003 2.005 ± 0.020 % 96.384 ± 0.050 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 558 6.5525 ± 0.0404 0.00230 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.020 % 96.387 ± 0.049 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 559 6.5547 ± 0.0404 0.00229 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.020 % 96.386 ± 0.049 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 560 6.5583 ± 0.0403 0.00231 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.020 % 96.384 ± 0.049 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 561 6.5634 ± 0.0403 0.00233 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.020 % 96.387 ± 0.049 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 562 6.5741 ± 0.0404 0.00234 ± 0.00027 0.00438 ± 0.00003 2.003 ± 0.019 % 96.385 ± 0.049 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 563 6.5691 ± 0.0403 0.00231 ± 0.00027 0.00438 ± 0.00003 2.004 ± 0.019 % 96.388 ± 0.049 % - -chunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p - 564 6.5701 ± 0.0403 0.00231 ± 0.00027 0.00438 ± 0.00003 2.003 ± 0.019 % 96.391 ± 0.049 % - ====== Perplexity statistics ====== Mean PPL(Q) : 6.570115 ± 0.040324 Mean PPL(base) : 6.554978 ± 0.040159 @@ -2174,8 +35,3 @@ Median Δp: 0.000% Minimum Δp: -72.891% RMS Δp : 2.003 ± 0.019 % Same top p: 96.391 ± 0.049 % - -llama_perf_context_print: load time = 50262.02 ms -llama_perf_context_print: prompt eval time = 794113.95 ms / 288768 tokens ( 2.75 ms per token, 363.64 tokens per second) -llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second) -llama_perf_context_print: total time = 907443.74 ms / 288769 tokens