{ "best_metric": 0.6124338624338624, "best_model_checkpoint": "vivit-b-16x2-kinetics400-ft-76388\\checkpoint-4773", "epoch": 49.01109090909091, "eval_steps": 500, "global_step": 5500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018181818181818182, "grad_norm": 5.520795822143555, "learning_rate": 9.09090909090909e-07, "loss": 1.0986, "step": 10 }, { "epoch": 0.0036363636363636364, "grad_norm": 10.853554725646973, "learning_rate": 1.818181818181818e-06, "loss": 1.1002, "step": 20 }, { "epoch": 0.005454545454545455, "grad_norm": 6.47132682800293, "learning_rate": 2.7272727272727272e-06, "loss": 1.1019, "step": 30 }, { "epoch": 0.007272727272727273, "grad_norm": 6.264883041381836, "learning_rate": 3.636363636363636e-06, "loss": 1.0819, "step": 40 }, { "epoch": 0.00909090909090909, "grad_norm": 7.561264991760254, "learning_rate": 4.5454545454545455e-06, "loss": 1.0847, "step": 50 }, { "epoch": 0.01090909090909091, "grad_norm": 4.925744533538818, "learning_rate": 5.4545454545454545e-06, "loss": 1.097, "step": 60 }, { "epoch": 0.012727272727272728, "grad_norm": 13.454752922058105, "learning_rate": 6.363636363636363e-06, "loss": 1.12, "step": 70 }, { "epoch": 0.014545454545454545, "grad_norm": 2.357506036758423, "learning_rate": 7.272727272727272e-06, "loss": 1.0849, "step": 80 }, { "epoch": 0.016363636363636365, "grad_norm": 6.968355655670166, "learning_rate": 8.181818181818183e-06, "loss": 1.0887, "step": 90 }, { "epoch": 0.01818181818181818, "grad_norm": 5.487667083740234, "learning_rate": 9.090909090909091e-06, "loss": 1.1272, "step": 100 }, { "epoch": 0.02, "grad_norm": 6.389057159423828, "learning_rate": 1e-05, "loss": 1.1083, "step": 110 }, { "epoch": 0.020181818181818183, "eval_accuracy": 0.33465608465608465, "eval_loss": 1.1111931800842285, "eval_runtime": 148.2258, "eval_samples_per_second": 5.1, "eval_steps_per_second": 0.641, "step": 111 }, { "epoch": 1.0016363636363637, "grad_norm": 5.377053260803223, "learning_rate": 1.0909090909090909e-05, "loss": 1.0977, "step": 120 }, { "epoch": 1.0034545454545454, "grad_norm": 6.0129170417785645, "learning_rate": 1.1818181818181819e-05, "loss": 1.1504, "step": 130 }, { "epoch": 1.0052727272727273, "grad_norm": 7.416619777679443, "learning_rate": 1.2727272727272727e-05, "loss": 1.072, "step": 140 }, { "epoch": 1.007090909090909, "grad_norm": 4.585965156555176, "learning_rate": 1.3636363636363637e-05, "loss": 1.0834, "step": 150 }, { "epoch": 1.008909090909091, "grad_norm": 9.934368133544922, "learning_rate": 1.4545454545454545e-05, "loss": 1.1117, "step": 160 }, { "epoch": 1.0107272727272727, "grad_norm": 7.8346381187438965, "learning_rate": 1.5454545454545454e-05, "loss": 1.1222, "step": 170 }, { "epoch": 1.0125454545454546, "grad_norm": 4.306520462036133, "learning_rate": 1.6363636363636366e-05, "loss": 1.0952, "step": 180 }, { "epoch": 1.0143636363636364, "grad_norm": 7.647275924682617, "learning_rate": 1.7272727272727274e-05, "loss": 1.1341, "step": 190 }, { "epoch": 1.016181818181818, "grad_norm": 5.643113613128662, "learning_rate": 1.8181818181818182e-05, "loss": 1.0344, "step": 200 }, { "epoch": 1.018, "grad_norm": 6.641160011291504, "learning_rate": 1.9090909090909094e-05, "loss": 1.1053, "step": 210 }, { "epoch": 1.0198181818181817, "grad_norm": 7.255860805511475, "learning_rate": 2e-05, "loss": 1.0789, "step": 220 }, { "epoch": 1.020181818181818, "eval_accuracy": 0.42592592592592593, "eval_loss": 1.0576354265213013, "eval_runtime": 149.5382, "eval_samples_per_second": 5.056, "eval_steps_per_second": 0.635, "step": 222 }, { "epoch": 2.0014545454545454, "grad_norm": 6.65429162979126, "learning_rate": 2.090909090909091e-05, "loss": 1.091, "step": 230 }, { "epoch": 2.0032727272727273, "grad_norm": 3.869105815887451, "learning_rate": 2.1818181818181818e-05, "loss": 1.0371, "step": 240 }, { "epoch": 2.0050909090909093, "grad_norm": 8.261011123657227, "learning_rate": 2.272727272727273e-05, "loss": 1.0778, "step": 250 }, { "epoch": 2.0069090909090908, "grad_norm": 8.152657508850098, "learning_rate": 2.3636363636363637e-05, "loss": 1.0327, "step": 260 }, { "epoch": 2.0087272727272727, "grad_norm": 6.811939239501953, "learning_rate": 2.4545454545454545e-05, "loss": 1.094, "step": 270 }, { "epoch": 2.0105454545454546, "grad_norm": 4.7158942222595215, "learning_rate": 2.5454545454545454e-05, "loss": 1.1247, "step": 280 }, { "epoch": 2.0123636363636366, "grad_norm": 4.587483882904053, "learning_rate": 2.636363636363636e-05, "loss": 1.1225, "step": 290 }, { "epoch": 2.014181818181818, "grad_norm": 3.0075366497039795, "learning_rate": 2.7272727272727273e-05, "loss": 1.1186, "step": 300 }, { "epoch": 2.016, "grad_norm": 6.268433094024658, "learning_rate": 2.818181818181818e-05, "loss": 1.1272, "step": 310 }, { "epoch": 2.017818181818182, "grad_norm": 8.382287979125977, "learning_rate": 2.909090909090909e-05, "loss": 1.087, "step": 320 }, { "epoch": 2.0196363636363635, "grad_norm": 4.22382926940918, "learning_rate": 3e-05, "loss": 1.0767, "step": 330 }, { "epoch": 2.0201818181818183, "eval_accuracy": 0.4246031746031746, "eval_loss": 1.0863010883331299, "eval_runtime": 153.7648, "eval_samples_per_second": 4.917, "eval_steps_per_second": 0.618, "step": 333 }, { "epoch": 3.001272727272727, "grad_norm": 5.407969951629639, "learning_rate": 3.090909090909091e-05, "loss": 1.0193, "step": 340 }, { "epoch": 3.003090909090909, "grad_norm": 9.856935501098633, "learning_rate": 3.181818181818182e-05, "loss": 1.0889, "step": 350 }, { "epoch": 3.004909090909091, "grad_norm": 9.323800086975098, "learning_rate": 3.272727272727273e-05, "loss": 1.0824, "step": 360 }, { "epoch": 3.006727272727273, "grad_norm": 13.862144470214844, "learning_rate": 3.3636363636363636e-05, "loss": 1.1013, "step": 370 }, { "epoch": 3.0085454545454544, "grad_norm": 7.345916748046875, "learning_rate": 3.454545454545455e-05, "loss": 1.0071, "step": 380 }, { "epoch": 3.0103636363636364, "grad_norm": 3.808772087097168, "learning_rate": 3.545454545454546e-05, "loss": 1.0615, "step": 390 }, { "epoch": 3.0121818181818183, "grad_norm": 5.705800533294678, "learning_rate": 3.6363636363636364e-05, "loss": 1.0663, "step": 400 }, { "epoch": 3.014, "grad_norm": 9.873875617980957, "learning_rate": 3.7272727272727276e-05, "loss": 1.0465, "step": 410 }, { "epoch": 3.0158181818181817, "grad_norm": 2.8121228218078613, "learning_rate": 3.818181818181819e-05, "loss": 1.1368, "step": 420 }, { "epoch": 3.0176363636363637, "grad_norm": 10.855435371398926, "learning_rate": 3.909090909090909e-05, "loss": 1.1093, "step": 430 }, { "epoch": 3.0194545454545456, "grad_norm": 4.415251731872559, "learning_rate": 4e-05, "loss": 1.1114, "step": 440 }, { "epoch": 3.0201818181818183, "eval_accuracy": 0.37037037037037035, "eval_loss": 1.1061029434204102, "eval_runtime": 149.7962, "eval_samples_per_second": 5.047, "eval_steps_per_second": 0.634, "step": 444 }, { "epoch": 4.001090909090909, "grad_norm": 2.8874096870422363, "learning_rate": 4.0909090909090915e-05, "loss": 1.159, "step": 450 }, { "epoch": 4.002909090909091, "grad_norm": 15.52538776397705, "learning_rate": 4.181818181818182e-05, "loss": 1.0661, "step": 460 }, { "epoch": 4.004727272727273, "grad_norm": 8.193879127502441, "learning_rate": 4.2727272727272724e-05, "loss": 1.0345, "step": 470 }, { "epoch": 4.006545454545455, "grad_norm": 8.885357856750488, "learning_rate": 4.3636363636363636e-05, "loss": 1.01, "step": 480 }, { "epoch": 4.008363636363637, "grad_norm": 9.789897918701172, "learning_rate": 4.454545454545455e-05, "loss": 1.1213, "step": 490 }, { "epoch": 4.0101818181818185, "grad_norm": 11.676102638244629, "learning_rate": 4.545454545454546e-05, "loss": 1.111, "step": 500 }, { "epoch": 4.012, "grad_norm": 6.093684196472168, "learning_rate": 4.636363636363636e-05, "loss": 1.2359, "step": 510 }, { "epoch": 4.0138181818181815, "grad_norm": 8.584346771240234, "learning_rate": 4.7272727272727275e-05, "loss": 1.0954, "step": 520 }, { "epoch": 4.0156363636363634, "grad_norm": 5.42387580871582, "learning_rate": 4.8181818181818186e-05, "loss": 1.0584, "step": 530 }, { "epoch": 4.017454545454545, "grad_norm": 9.476812362670898, "learning_rate": 4.909090909090909e-05, "loss": 1.069, "step": 540 }, { "epoch": 4.019272727272727, "grad_norm": 8.928860664367676, "learning_rate": 5e-05, "loss": 1.0832, "step": 550 }, { "epoch": 4.020181818181818, "eval_accuracy": 0.4193121693121693, "eval_loss": 1.053645372390747, "eval_runtime": 148.4018, "eval_samples_per_second": 5.094, "eval_steps_per_second": 0.64, "step": 555 }, { "epoch": 5.000909090909091, "grad_norm": 3.7041380405426025, "learning_rate": 4.98989898989899e-05, "loss": 1.0869, "step": 560 }, { "epoch": 5.002727272727273, "grad_norm": 9.805282592773438, "learning_rate": 4.97979797979798e-05, "loss": 1.0385, "step": 570 }, { "epoch": 5.004545454545455, "grad_norm": 7.139551162719727, "learning_rate": 4.9696969696969694e-05, "loss": 1.1415, "step": 580 }, { "epoch": 5.006363636363637, "grad_norm": 5.1117682456970215, "learning_rate": 4.9595959595959594e-05, "loss": 1.0708, "step": 590 }, { "epoch": 5.008181818181818, "grad_norm": 5.95142126083374, "learning_rate": 4.94949494949495e-05, "loss": 1.1131, "step": 600 }, { "epoch": 5.01, "grad_norm": 4.644528388977051, "learning_rate": 4.93939393939394e-05, "loss": 1.0531, "step": 610 }, { "epoch": 5.011818181818182, "grad_norm": 5.309731483459473, "learning_rate": 4.92929292929293e-05, "loss": 1.0557, "step": 620 }, { "epoch": 5.013636363636364, "grad_norm": 2.6945815086364746, "learning_rate": 4.919191919191919e-05, "loss": 1.0923, "step": 630 }, { "epoch": 5.015454545454546, "grad_norm": 6.136772155761719, "learning_rate": 4.909090909090909e-05, "loss": 1.0675, "step": 640 }, { "epoch": 5.0172727272727276, "grad_norm": 3.571549415588379, "learning_rate": 4.898989898989899e-05, "loss": 1.0437, "step": 650 }, { "epoch": 5.0190909090909095, "grad_norm": 3.366074323654175, "learning_rate": 4.888888888888889e-05, "loss": 1.0622, "step": 660 }, { "epoch": 5.020181818181818, "eval_accuracy": 0.4576719576719577, "eval_loss": 1.0719990730285645, "eval_runtime": 148.7515, "eval_samples_per_second": 5.082, "eval_steps_per_second": 0.639, "step": 666 }, { "epoch": 6.000727272727273, "grad_norm": 8.250535011291504, "learning_rate": 4.878787878787879e-05, "loss": 0.9758, "step": 670 }, { "epoch": 6.002545454545454, "grad_norm": 3.9611146450042725, "learning_rate": 4.868686868686869e-05, "loss": 1.1303, "step": 680 }, { "epoch": 6.004363636363636, "grad_norm": 6.4248504638671875, "learning_rate": 4.858585858585859e-05, "loss": 0.9938, "step": 690 }, { "epoch": 6.006181818181818, "grad_norm": 6.866003513336182, "learning_rate": 4.848484848484849e-05, "loss": 0.9641, "step": 700 }, { "epoch": 6.008, "grad_norm": 11.4672269821167, "learning_rate": 4.838383838383839e-05, "loss": 1.0336, "step": 710 }, { "epoch": 6.009818181818182, "grad_norm": 16.061107635498047, "learning_rate": 4.828282828282829e-05, "loss": 1.0764, "step": 720 }, { "epoch": 6.011636363636364, "grad_norm": 4.682388782501221, "learning_rate": 4.8181818181818186e-05, "loss": 1.0844, "step": 730 }, { "epoch": 6.013454545454546, "grad_norm": 7.241218566894531, "learning_rate": 4.808080808080808e-05, "loss": 1.0297, "step": 740 }, { "epoch": 6.015272727272727, "grad_norm": 7.628449440002441, "learning_rate": 4.797979797979798e-05, "loss": 1.1384, "step": 750 }, { "epoch": 6.017090909090909, "grad_norm": 6.4355878829956055, "learning_rate": 4.787878787878788e-05, "loss": 1.0197, "step": 760 }, { "epoch": 6.018909090909091, "grad_norm": 4.381698131561279, "learning_rate": 4.7777777777777784e-05, "loss": 1.0874, "step": 770 }, { "epoch": 6.020181818181818, "eval_accuracy": 0.4708994708994709, "eval_loss": 1.0304068326950073, "eval_runtime": 148.4186, "eval_samples_per_second": 5.094, "eval_steps_per_second": 0.64, "step": 777 }, { "epoch": 7.000545454545454, "grad_norm": 8.861906051635742, "learning_rate": 4.7676767676767684e-05, "loss": 1.0022, "step": 780 }, { "epoch": 7.002363636363636, "grad_norm": 5.51254940032959, "learning_rate": 4.7575757575757576e-05, "loss": 1.0364, "step": 790 }, { "epoch": 7.004181818181818, "grad_norm": 7.339147090911865, "learning_rate": 4.7474747474747476e-05, "loss": 1.0236, "step": 800 }, { "epoch": 7.006, "grad_norm": 5.729611396789551, "learning_rate": 4.7373737373737375e-05, "loss": 1.0856, "step": 810 }, { "epoch": 7.007818181818182, "grad_norm": 11.61246109008789, "learning_rate": 4.7272727272727275e-05, "loss": 1.0058, "step": 820 }, { "epoch": 7.009636363636363, "grad_norm": 7.409841060638428, "learning_rate": 4.7171717171717174e-05, "loss": 0.9701, "step": 830 }, { "epoch": 7.011454545454545, "grad_norm": 9.707467079162598, "learning_rate": 4.7070707070707074e-05, "loss": 0.9544, "step": 840 }, { "epoch": 7.013272727272727, "grad_norm": 4.250189781188965, "learning_rate": 4.696969696969697e-05, "loss": 1.0981, "step": 850 }, { "epoch": 7.015090909090909, "grad_norm": 5.50283670425415, "learning_rate": 4.686868686868687e-05, "loss": 1.0752, "step": 860 }, { "epoch": 7.016909090909091, "grad_norm": 10.252998352050781, "learning_rate": 4.676767676767677e-05, "loss": 1.0213, "step": 870 }, { "epoch": 7.018727272727273, "grad_norm": 9.177423477172852, "learning_rate": 4.666666666666667e-05, "loss": 0.9742, "step": 880 }, { "epoch": 7.020181818181818, "eval_accuracy": 0.45105820105820105, "eval_loss": 1.0340412855148315, "eval_runtime": 148.8072, "eval_samples_per_second": 5.08, "eval_steps_per_second": 0.638, "step": 888 }, { "epoch": 8.000363636363636, "grad_norm": 7.860219478607178, "learning_rate": 4.656565656565657e-05, "loss": 0.9891, "step": 890 }, { "epoch": 8.002181818181818, "grad_norm": 11.450358390808105, "learning_rate": 4.6464646464646464e-05, "loss": 0.966, "step": 900 }, { "epoch": 8.004, "grad_norm": 7.019264221191406, "learning_rate": 4.636363636363636e-05, "loss": 1.0707, "step": 910 }, { "epoch": 8.005818181818182, "grad_norm": 6.124785900115967, "learning_rate": 4.626262626262626e-05, "loss": 0.8674, "step": 920 }, { "epoch": 8.007636363636363, "grad_norm": 5.532207012176514, "learning_rate": 4.616161616161616e-05, "loss": 1.1394, "step": 930 }, { "epoch": 8.009454545454545, "grad_norm": 2.409095287322998, "learning_rate": 4.606060606060607e-05, "loss": 1.0774, "step": 940 }, { "epoch": 8.011272727272727, "grad_norm": 5.863190174102783, "learning_rate": 4.595959595959596e-05, "loss": 1.0996, "step": 950 }, { "epoch": 8.01309090909091, "grad_norm": 5.189179420471191, "learning_rate": 4.585858585858586e-05, "loss": 1.0476, "step": 960 }, { "epoch": 8.014909090909091, "grad_norm": 6.774593353271484, "learning_rate": 4.575757575757576e-05, "loss": 1.0299, "step": 970 }, { "epoch": 8.016727272727273, "grad_norm": 9.079130172729492, "learning_rate": 4.565656565656566e-05, "loss": 1.0262, "step": 980 }, { "epoch": 8.018545454545455, "grad_norm": 6.1006760597229, "learning_rate": 4.555555555555556e-05, "loss": 0.9848, "step": 990 }, { "epoch": 8.020181818181818, "eval_accuracy": 0.4669312169312169, "eval_loss": 1.036673903465271, "eval_runtime": 146.7468, "eval_samples_per_second": 5.152, "eval_steps_per_second": 0.647, "step": 999 }, { "epoch": 9.000181818181819, "grad_norm": 6.422980308532715, "learning_rate": 4.545454545454546e-05, "loss": 1.1866, "step": 1000 }, { "epoch": 9.002, "grad_norm": 5.095742702484131, "learning_rate": 4.535353535353535e-05, "loss": 1.0583, "step": 1010 }, { "epoch": 9.003818181818183, "grad_norm": 6.119103908538818, "learning_rate": 4.525252525252526e-05, "loss": 1.0929, "step": 1020 }, { "epoch": 9.005636363636363, "grad_norm": 4.790987968444824, "learning_rate": 4.515151515151516e-05, "loss": 1.0684, "step": 1030 }, { "epoch": 9.007454545454545, "grad_norm": 5.97383975982666, "learning_rate": 4.5050505050505056e-05, "loss": 1.0431, "step": 1040 }, { "epoch": 9.009272727272727, "grad_norm": 3.85739803314209, "learning_rate": 4.494949494949495e-05, "loss": 0.934, "step": 1050 }, { "epoch": 9.011090909090909, "grad_norm": 3.361600875854492, "learning_rate": 4.484848484848485e-05, "loss": 0.9329, "step": 1060 }, { "epoch": 9.01290909090909, "grad_norm": 7.449879169464111, "learning_rate": 4.474747474747475e-05, "loss": 1.0629, "step": 1070 }, { "epoch": 9.014727272727272, "grad_norm": 7.561132431030273, "learning_rate": 4.464646464646465e-05, "loss": 1.058, "step": 1080 }, { "epoch": 9.016545454545454, "grad_norm": 12.090225219726562, "learning_rate": 4.454545454545455e-05, "loss": 1.1276, "step": 1090 }, { "epoch": 9.018363636363636, "grad_norm": 5.2706708908081055, "learning_rate": 4.4444444444444447e-05, "loss": 0.9764, "step": 1100 }, { "epoch": 9.020181818181818, "grad_norm": 14.38776969909668, "learning_rate": 4.4343434343434346e-05, "loss": 1.12, "step": 1110 }, { "epoch": 9.020181818181818, "eval_accuracy": 0.4193121693121693, "eval_loss": 1.026868224143982, "eval_runtime": 149.067, "eval_samples_per_second": 5.072, "eval_steps_per_second": 0.637, "step": 1110 }, { "epoch": 10.001818181818182, "grad_norm": 4.231562614440918, "learning_rate": 4.4242424242424246e-05, "loss": 1.0368, "step": 1120 }, { "epoch": 10.003636363636364, "grad_norm": 6.167897701263428, "learning_rate": 4.4141414141414145e-05, "loss": 1.0565, "step": 1130 }, { "epoch": 10.005454545454546, "grad_norm": 5.133641719818115, "learning_rate": 4.4040404040404044e-05, "loss": 1.0345, "step": 1140 }, { "epoch": 10.007272727272728, "grad_norm": 4.132250785827637, "learning_rate": 4.3939393939393944e-05, "loss": 0.9971, "step": 1150 }, { "epoch": 10.00909090909091, "grad_norm": 4.209298610687256, "learning_rate": 4.383838383838384e-05, "loss": 1.0099, "step": 1160 }, { "epoch": 10.010909090909092, "grad_norm": 6.258595943450928, "learning_rate": 4.3737373737373736e-05, "loss": 1.0037, "step": 1170 }, { "epoch": 10.012727272727274, "grad_norm": 9.644258499145508, "learning_rate": 4.3636363636363636e-05, "loss": 0.9807, "step": 1180 }, { "epoch": 10.014545454545454, "grad_norm": 6.8741912841796875, "learning_rate": 4.3535353535353535e-05, "loss": 0.9534, "step": 1190 }, { "epoch": 10.016363636363636, "grad_norm": 4.356405735015869, "learning_rate": 4.343434343434344e-05, "loss": 1.0285, "step": 1200 }, { "epoch": 10.018181818181818, "grad_norm": 4.656445503234863, "learning_rate": 4.3333333333333334e-05, "loss": 1.0168, "step": 1210 }, { "epoch": 10.02, "grad_norm": 6.54704475402832, "learning_rate": 4.3232323232323234e-05, "loss": 1.0484, "step": 1220 }, { "epoch": 10.020181818181818, "eval_accuracy": 0.45105820105820105, "eval_loss": 1.0105139017105103, "eval_runtime": 146.4072, "eval_samples_per_second": 5.164, "eval_steps_per_second": 0.649, "step": 1221 }, { "epoch": 11.001636363636363, "grad_norm": 3.6094846725463867, "learning_rate": 4.313131313131313e-05, "loss": 0.9776, "step": 1230 }, { "epoch": 11.003454545454545, "grad_norm": 4.734577178955078, "learning_rate": 4.303030303030303e-05, "loss": 0.971, "step": 1240 }, { "epoch": 11.005272727272727, "grad_norm": 4.031460285186768, "learning_rate": 4.292929292929293e-05, "loss": 0.9972, "step": 1250 }, { "epoch": 11.007090909090909, "grad_norm": 11.494680404663086, "learning_rate": 4.282828282828283e-05, "loss": 0.9916, "step": 1260 }, { "epoch": 11.008909090909091, "grad_norm": 6.08104944229126, "learning_rate": 4.2727272727272724e-05, "loss": 1.0135, "step": 1270 }, { "epoch": 11.010727272727273, "grad_norm": 4.234505653381348, "learning_rate": 4.262626262626263e-05, "loss": 0.9287, "step": 1280 }, { "epoch": 11.012545454545455, "grad_norm": 6.952130317687988, "learning_rate": 4.252525252525253e-05, "loss": 0.9476, "step": 1290 }, { "epoch": 11.014363636363637, "grad_norm": 7.385903358459473, "learning_rate": 4.242424242424243e-05, "loss": 0.9915, "step": 1300 }, { "epoch": 11.016181818181819, "grad_norm": 6.446252822875977, "learning_rate": 4.232323232323233e-05, "loss": 1.0603, "step": 1310 }, { "epoch": 11.018, "grad_norm": 5.797521591186523, "learning_rate": 4.222222222222222e-05, "loss": 1.1218, "step": 1320 }, { "epoch": 11.019818181818183, "grad_norm": 6.516293048858643, "learning_rate": 4.212121212121212e-05, "loss": 0.9445, "step": 1330 }, { "epoch": 11.020181818181818, "eval_accuracy": 0.4880952380952381, "eval_loss": 1.0051753520965576, "eval_runtime": 152.1617, "eval_samples_per_second": 4.968, "eval_steps_per_second": 0.624, "step": 1332 }, { "epoch": 12.001454545454546, "grad_norm": 7.457939624786377, "learning_rate": 4.202020202020202e-05, "loss": 1.0431, "step": 1340 }, { "epoch": 12.003272727272726, "grad_norm": 6.6593217849731445, "learning_rate": 4.191919191919192e-05, "loss": 1.0327, "step": 1350 }, { "epoch": 12.005090909090908, "grad_norm": 4.458651542663574, "learning_rate": 4.181818181818182e-05, "loss": 0.9756, "step": 1360 }, { "epoch": 12.00690909090909, "grad_norm": 5.859511852264404, "learning_rate": 4.171717171717172e-05, "loss": 1.039, "step": 1370 }, { "epoch": 12.008727272727272, "grad_norm": 9.763144493103027, "learning_rate": 4.161616161616162e-05, "loss": 1.0068, "step": 1380 }, { "epoch": 12.010545454545454, "grad_norm": 4.887930870056152, "learning_rate": 4.151515151515152e-05, "loss": 0.9022, "step": 1390 }, { "epoch": 12.012363636363636, "grad_norm": 6.166642665863037, "learning_rate": 4.141414141414142e-05, "loss": 0.9438, "step": 1400 }, { "epoch": 12.014181818181818, "grad_norm": 13.519564628601074, "learning_rate": 4.131313131313132e-05, "loss": 1.05, "step": 1410 }, { "epoch": 12.016, "grad_norm": 8.109920501708984, "learning_rate": 4.1212121212121216e-05, "loss": 0.9257, "step": 1420 }, { "epoch": 12.017818181818182, "grad_norm": 6.31077241897583, "learning_rate": 4.111111111111111e-05, "loss": 0.8967, "step": 1430 }, { "epoch": 12.019636363636364, "grad_norm": 8.095528602600098, "learning_rate": 4.101010101010101e-05, "loss": 1.032, "step": 1440 }, { "epoch": 12.020181818181818, "eval_accuracy": 0.4523809523809524, "eval_loss": 1.0365078449249268, "eval_runtime": 148.6389, "eval_samples_per_second": 5.086, "eval_steps_per_second": 0.639, "step": 1443 }, { "epoch": 13.001272727272728, "grad_norm": 4.897077560424805, "learning_rate": 4.0909090909090915e-05, "loss": 0.954, "step": 1450 }, { "epoch": 13.00309090909091, "grad_norm": 5.455181121826172, "learning_rate": 4.0808080808080814e-05, "loss": 0.9739, "step": 1460 }, { "epoch": 13.004909090909091, "grad_norm": 6.275787830352783, "learning_rate": 4.070707070707071e-05, "loss": 1.0453, "step": 1470 }, { "epoch": 13.006727272727273, "grad_norm": 4.212862491607666, "learning_rate": 4.0606060606060606e-05, "loss": 1.1058, "step": 1480 }, { "epoch": 13.008545454545455, "grad_norm": 3.4009814262390137, "learning_rate": 4.0505050505050506e-05, "loss": 1.0064, "step": 1490 }, { "epoch": 13.010363636363637, "grad_norm": 5.143698692321777, "learning_rate": 4.0404040404040405e-05, "loss": 0.9478, "step": 1500 }, { "epoch": 13.012181818181817, "grad_norm": 5.192939758300781, "learning_rate": 4.0303030303030305e-05, "loss": 0.9567, "step": 1510 }, { "epoch": 13.014, "grad_norm": 4.058863162994385, "learning_rate": 4.0202020202020204e-05, "loss": 0.9702, "step": 1520 }, { "epoch": 13.015818181818181, "grad_norm": 4.788520336151123, "learning_rate": 4.01010101010101e-05, "loss": 0.8961, "step": 1530 }, { "epoch": 13.017636363636363, "grad_norm": 8.213604927062988, "learning_rate": 4e-05, "loss": 1.0142, "step": 1540 }, { "epoch": 13.019454545454545, "grad_norm": 4.203430652618408, "learning_rate": 3.98989898989899e-05, "loss": 0.987, "step": 1550 }, { "epoch": 13.020181818181818, "eval_accuracy": 0.5105820105820106, "eval_loss": 1.001942753791809, "eval_runtime": 146.3151, "eval_samples_per_second": 5.167, "eval_steps_per_second": 0.649, "step": 1554 }, { "epoch": 14.001090909090909, "grad_norm": 5.082915782928467, "learning_rate": 3.97979797979798e-05, "loss": 0.9439, "step": 1560 }, { "epoch": 14.00290909090909, "grad_norm": 5.0494256019592285, "learning_rate": 3.96969696969697e-05, "loss": 0.918, "step": 1570 }, { "epoch": 14.004727272727273, "grad_norm": 5.370521068572998, "learning_rate": 3.9595959595959594e-05, "loss": 1.0225, "step": 1580 }, { "epoch": 14.006545454545455, "grad_norm": 8.859224319458008, "learning_rate": 3.9494949494949494e-05, "loss": 0.9746, "step": 1590 }, { "epoch": 14.008363636363637, "grad_norm": 6.678452491760254, "learning_rate": 3.939393939393939e-05, "loss": 1.0394, "step": 1600 }, { "epoch": 14.010181818181819, "grad_norm": 9.348724365234375, "learning_rate": 3.929292929292929e-05, "loss": 1.0252, "step": 1610 }, { "epoch": 14.012, "grad_norm": 9.378219604492188, "learning_rate": 3.91919191919192e-05, "loss": 1.0869, "step": 1620 }, { "epoch": 14.013818181818182, "grad_norm": 7.069434642791748, "learning_rate": 3.909090909090909e-05, "loss": 0.8797, "step": 1630 }, { "epoch": 14.015636363636364, "grad_norm": 4.11262845993042, "learning_rate": 3.898989898989899e-05, "loss": 0.9745, "step": 1640 }, { "epoch": 14.017454545454546, "grad_norm": 4.320169448852539, "learning_rate": 3.888888888888889e-05, "loss": 1.0043, "step": 1650 }, { "epoch": 14.019272727272726, "grad_norm": 6.942636489868164, "learning_rate": 3.878787878787879e-05, "loss": 1.0797, "step": 1660 }, { "epoch": 14.020181818181818, "eval_accuracy": 0.4656084656084656, "eval_loss": 1.0128320455551147, "eval_runtime": 150.7976, "eval_samples_per_second": 5.013, "eval_steps_per_second": 0.63, "step": 1665 }, { "epoch": 15.00090909090909, "grad_norm": 5.204150199890137, "learning_rate": 3.868686868686869e-05, "loss": 0.9386, "step": 1670 }, { "epoch": 15.002727272727272, "grad_norm": 5.085378646850586, "learning_rate": 3.858585858585859e-05, "loss": 1.0595, "step": 1680 }, { "epoch": 15.004545454545454, "grad_norm": 9.999809265136719, "learning_rate": 3.848484848484848e-05, "loss": 1.0415, "step": 1690 }, { "epoch": 15.006363636363636, "grad_norm": 2.8342270851135254, "learning_rate": 3.838383838383838e-05, "loss": 0.9376, "step": 1700 }, { "epoch": 15.008181818181818, "grad_norm": 6.779669761657715, "learning_rate": 3.828282828282829e-05, "loss": 0.971, "step": 1710 }, { "epoch": 15.01, "grad_norm": 8.835999488830566, "learning_rate": 3.818181818181819e-05, "loss": 0.9513, "step": 1720 }, { "epoch": 15.011818181818182, "grad_norm": 7.420837879180908, "learning_rate": 3.8080808080808087e-05, "loss": 0.9304, "step": 1730 }, { "epoch": 15.013636363636364, "grad_norm": 4.993125915527344, "learning_rate": 3.797979797979798e-05, "loss": 0.9871, "step": 1740 }, { "epoch": 15.015454545454546, "grad_norm": 8.37247371673584, "learning_rate": 3.787878787878788e-05, "loss": 0.9466, "step": 1750 }, { "epoch": 15.017272727272728, "grad_norm": 7.0074663162231445, "learning_rate": 3.777777777777778e-05, "loss": 0.8956, "step": 1760 }, { "epoch": 15.01909090909091, "grad_norm": 9.646268844604492, "learning_rate": 3.767676767676768e-05, "loss": 0.9196, "step": 1770 }, { "epoch": 15.020181818181818, "eval_accuracy": 0.5013227513227513, "eval_loss": 1.0431362390518188, "eval_runtime": 147.3058, "eval_samples_per_second": 5.132, "eval_steps_per_second": 0.645, "step": 1776 }, { "epoch": 16.00072727272727, "grad_norm": 7.494339942932129, "learning_rate": 3.757575757575758e-05, "loss": 0.9682, "step": 1780 }, { "epoch": 16.002545454545455, "grad_norm": 10.007247924804688, "learning_rate": 3.747474747474748e-05, "loss": 1.0289, "step": 1790 }, { "epoch": 16.004363636363635, "grad_norm": 5.851564884185791, "learning_rate": 3.7373737373737376e-05, "loss": 0.9664, "step": 1800 }, { "epoch": 16.00618181818182, "grad_norm": 7.351463794708252, "learning_rate": 3.7272727272727276e-05, "loss": 0.9844, "step": 1810 }, { "epoch": 16.008, "grad_norm": 5.768230438232422, "learning_rate": 3.7171717171717175e-05, "loss": 1.0384, "step": 1820 }, { "epoch": 16.009818181818183, "grad_norm": 6.959085941314697, "learning_rate": 3.7070707070707075e-05, "loss": 0.9584, "step": 1830 }, { "epoch": 16.011636363636363, "grad_norm": 8.661413192749023, "learning_rate": 3.6969696969696974e-05, "loss": 1.001, "step": 1840 }, { "epoch": 16.013454545454547, "grad_norm": 4.314298152923584, "learning_rate": 3.686868686868687e-05, "loss": 0.9738, "step": 1850 }, { "epoch": 16.015272727272727, "grad_norm": 6.815008163452148, "learning_rate": 3.6767676767676766e-05, "loss": 0.9342, "step": 1860 }, { "epoch": 16.01709090909091, "grad_norm": 10.533506393432617, "learning_rate": 3.6666666666666666e-05, "loss": 0.9174, "step": 1870 }, { "epoch": 16.01890909090909, "grad_norm": 13.072206497192383, "learning_rate": 3.656565656565657e-05, "loss": 1.0727, "step": 1880 }, { "epoch": 16.02018181818182, "eval_accuracy": 0.5343915343915344, "eval_loss": 1.001629114151001, "eval_runtime": 149.4378, "eval_samples_per_second": 5.059, "eval_steps_per_second": 0.636, "step": 1887 }, { "epoch": 17.000545454545456, "grad_norm": 9.216334342956543, "learning_rate": 3.6464646464646465e-05, "loss": 0.9356, "step": 1890 }, { "epoch": 17.002363636363636, "grad_norm": 12.995749473571777, "learning_rate": 3.6363636363636364e-05, "loss": 1.0007, "step": 1900 }, { "epoch": 17.004181818181817, "grad_norm": 10.831470489501953, "learning_rate": 3.6262626262626264e-05, "loss": 1.0075, "step": 1910 }, { "epoch": 17.006, "grad_norm": 6.675658226013184, "learning_rate": 3.616161616161616e-05, "loss": 1.0298, "step": 1920 }, { "epoch": 17.00781818181818, "grad_norm": 8.875894546508789, "learning_rate": 3.606060606060606e-05, "loss": 0.8819, "step": 1930 }, { "epoch": 17.009636363636364, "grad_norm": 3.7380897998809814, "learning_rate": 3.595959595959596e-05, "loss": 1.0307, "step": 1940 }, { "epoch": 17.011454545454544, "grad_norm": 5.976683139801025, "learning_rate": 3.5858585858585855e-05, "loss": 0.9766, "step": 1950 }, { "epoch": 17.013272727272728, "grad_norm": 4.93250846862793, "learning_rate": 3.575757575757576e-05, "loss": 0.9317, "step": 1960 }, { "epoch": 17.015090909090908, "grad_norm": 4.194036960601807, "learning_rate": 3.565656565656566e-05, "loss": 0.9038, "step": 1970 }, { "epoch": 17.016909090909092, "grad_norm": 7.000123977661133, "learning_rate": 3.555555555555556e-05, "loss": 0.9665, "step": 1980 }, { "epoch": 17.018727272727272, "grad_norm": 9.221128463745117, "learning_rate": 3.545454545454546e-05, "loss": 0.9481, "step": 1990 }, { "epoch": 17.02018181818182, "eval_accuracy": 0.5264550264550265, "eval_loss": 0.9982506632804871, "eval_runtime": 149.0509, "eval_samples_per_second": 5.072, "eval_steps_per_second": 0.637, "step": 1998 }, { "epoch": 18.000363636363637, "grad_norm": 5.551551818847656, "learning_rate": 3.535353535353535e-05, "loss": 0.9912, "step": 2000 }, { "epoch": 18.002181818181818, "grad_norm": 8.715899467468262, "learning_rate": 3.525252525252525e-05, "loss": 1.0457, "step": 2010 }, { "epoch": 18.004, "grad_norm": 11.81405258178711, "learning_rate": 3.515151515151515e-05, "loss": 0.9465, "step": 2020 }, { "epoch": 18.00581818181818, "grad_norm": 5.644428730010986, "learning_rate": 3.505050505050505e-05, "loss": 1.0424, "step": 2030 }, { "epoch": 18.007636363636365, "grad_norm": 5.840436935424805, "learning_rate": 3.494949494949495e-05, "loss": 0.9281, "step": 2040 }, { "epoch": 18.009454545454545, "grad_norm": 6.476779460906982, "learning_rate": 3.484848484848485e-05, "loss": 0.9568, "step": 2050 }, { "epoch": 18.011272727272726, "grad_norm": 7.143520355224609, "learning_rate": 3.474747474747475e-05, "loss": 0.947, "step": 2060 }, { "epoch": 18.01309090909091, "grad_norm": 6.404818058013916, "learning_rate": 3.464646464646465e-05, "loss": 0.8925, "step": 2070 }, { "epoch": 18.01490909090909, "grad_norm": 8.545957565307617, "learning_rate": 3.454545454545455e-05, "loss": 0.9917, "step": 2080 }, { "epoch": 18.016727272727273, "grad_norm": 6.831506729125977, "learning_rate": 3.444444444444445e-05, "loss": 0.8973, "step": 2090 }, { "epoch": 18.018545454545453, "grad_norm": 7.68071985244751, "learning_rate": 3.434343434343435e-05, "loss": 0.9034, "step": 2100 }, { "epoch": 18.02018181818182, "eval_accuracy": 0.5013227513227513, "eval_loss": 1.0221006870269775, "eval_runtime": 144.419, "eval_samples_per_second": 5.235, "eval_steps_per_second": 0.658, "step": 2109 }, { "epoch": 19.00018181818182, "grad_norm": 7.987392902374268, "learning_rate": 3.424242424242424e-05, "loss": 1.057, "step": 2110 }, { "epoch": 19.002, "grad_norm": 5.622376918792725, "learning_rate": 3.414141414141414e-05, "loss": 0.9968, "step": 2120 }, { "epoch": 19.003818181818183, "grad_norm": 4.796281814575195, "learning_rate": 3.4040404040404045e-05, "loss": 0.9697, "step": 2130 }, { "epoch": 19.005636363636363, "grad_norm": 6.3387908935546875, "learning_rate": 3.3939393939393945e-05, "loss": 0.9431, "step": 2140 }, { "epoch": 19.007454545454546, "grad_norm": 8.202577590942383, "learning_rate": 3.3838383838383844e-05, "loss": 1.0198, "step": 2150 }, { "epoch": 19.009272727272727, "grad_norm": 12.490442276000977, "learning_rate": 3.373737373737374e-05, "loss": 0.9727, "step": 2160 }, { "epoch": 19.01109090909091, "grad_norm": 9.358773231506348, "learning_rate": 3.3636363636363636e-05, "loss": 0.9366, "step": 2170 }, { "epoch": 19.01290909090909, "grad_norm": 6.616375923156738, "learning_rate": 3.3535353535353536e-05, "loss": 1.0193, "step": 2180 }, { "epoch": 19.014727272727274, "grad_norm": 5.817307949066162, "learning_rate": 3.3434343434343435e-05, "loss": 1.0252, "step": 2190 }, { "epoch": 19.016545454545454, "grad_norm": 6.090243339538574, "learning_rate": 3.3333333333333335e-05, "loss": 0.9156, "step": 2200 }, { "epoch": 19.018363636363638, "grad_norm": 7.476328372955322, "learning_rate": 3.3232323232323234e-05, "loss": 0.917, "step": 2210 }, { "epoch": 19.02018181818182, "grad_norm": 10.545258522033691, "learning_rate": 3.3131313131313134e-05, "loss": 0.8569, "step": 2220 }, { "epoch": 19.02018181818182, "eval_accuracy": 0.5264550264550265, "eval_loss": 0.9825001955032349, "eval_runtime": 151.5161, "eval_samples_per_second": 4.99, "eval_steps_per_second": 0.627, "step": 2220 }, { "epoch": 20.00181818181818, "grad_norm": 4.553400039672852, "learning_rate": 3.303030303030303e-05, "loss": 1.0637, "step": 2230 }, { "epoch": 20.003636363636364, "grad_norm": 4.245593070983887, "learning_rate": 3.292929292929293e-05, "loss": 0.954, "step": 2240 }, { "epoch": 20.005454545454544, "grad_norm": 6.2897210121154785, "learning_rate": 3.282828282828283e-05, "loss": 1.0215, "step": 2250 }, { "epoch": 20.007272727272728, "grad_norm": 8.956347465515137, "learning_rate": 3.272727272727273e-05, "loss": 0.8645, "step": 2260 }, { "epoch": 20.009090909090908, "grad_norm": 8.199272155761719, "learning_rate": 3.2626262626262624e-05, "loss": 0.9678, "step": 2270 }, { "epoch": 20.01090909090909, "grad_norm": 4.4115681648254395, "learning_rate": 3.2525252525252524e-05, "loss": 0.9508, "step": 2280 }, { "epoch": 20.012727272727272, "grad_norm": 5.1302714347839355, "learning_rate": 3.2424242424242423e-05, "loss": 0.8871, "step": 2290 }, { "epoch": 20.014545454545456, "grad_norm": 5.004090785980225, "learning_rate": 3.232323232323233e-05, "loss": 0.887, "step": 2300 }, { "epoch": 20.016363636363636, "grad_norm": 13.2914400100708, "learning_rate": 3.222222222222223e-05, "loss": 0.9051, "step": 2310 }, { "epoch": 20.01818181818182, "grad_norm": 9.987497329711914, "learning_rate": 3.212121212121212e-05, "loss": 0.9421, "step": 2320 }, { "epoch": 20.02, "grad_norm": 12.255473136901855, "learning_rate": 3.202020202020202e-05, "loss": 0.9256, "step": 2330 }, { "epoch": 20.02018181818182, "eval_accuracy": 0.5396825396825397, "eval_loss": 0.9677516222000122, "eval_runtime": 148.6642, "eval_samples_per_second": 5.085, "eval_steps_per_second": 0.639, "step": 2331 }, { "epoch": 21.001636363636365, "grad_norm": 5.143698692321777, "learning_rate": 3.191919191919192e-05, "loss": 0.9137, "step": 2340 }, { "epoch": 21.003454545454545, "grad_norm": 7.78878927230835, "learning_rate": 3.181818181818182e-05, "loss": 0.9939, "step": 2350 }, { "epoch": 21.00527272727273, "grad_norm": 6.338476657867432, "learning_rate": 3.171717171717172e-05, "loss": 0.8976, "step": 2360 }, { "epoch": 21.00709090909091, "grad_norm": 6.025676250457764, "learning_rate": 3.161616161616161e-05, "loss": 0.9612, "step": 2370 }, { "epoch": 21.00890909090909, "grad_norm": 6.251794338226318, "learning_rate": 3.151515151515151e-05, "loss": 0.9476, "step": 2380 }, { "epoch": 21.010727272727273, "grad_norm": 7.739345550537109, "learning_rate": 3.141414141414142e-05, "loss": 0.9162, "step": 2390 }, { "epoch": 21.012545454545453, "grad_norm": 6.820745944976807, "learning_rate": 3.131313131313132e-05, "loss": 0.8919, "step": 2400 }, { "epoch": 21.014363636363637, "grad_norm": 7.606575012207031, "learning_rate": 3.121212121212122e-05, "loss": 0.9543, "step": 2410 }, { "epoch": 21.016181818181817, "grad_norm": 5.678408145904541, "learning_rate": 3.111111111111111e-05, "loss": 0.9386, "step": 2420 }, { "epoch": 21.018, "grad_norm": 6.3955206871032715, "learning_rate": 3.101010101010101e-05, "loss": 0.9258, "step": 2430 }, { "epoch": 21.01981818181818, "grad_norm": 6.42524528503418, "learning_rate": 3.090909090909091e-05, "loss": 1.0311, "step": 2440 }, { "epoch": 21.02018181818182, "eval_accuracy": 0.5105820105820106, "eval_loss": 0.9574453234672546, "eval_runtime": 145.6583, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.652, "step": 2442 }, { "epoch": 22.001454545454546, "grad_norm": 5.330824375152588, "learning_rate": 3.080808080808081e-05, "loss": 0.9399, "step": 2450 }, { "epoch": 22.003272727272726, "grad_norm": 4.4678192138671875, "learning_rate": 3.070707070707071e-05, "loss": 0.7764, "step": 2460 }, { "epoch": 22.00509090909091, "grad_norm": 6.995047092437744, "learning_rate": 3.060606060606061e-05, "loss": 0.9542, "step": 2470 }, { "epoch": 22.00690909090909, "grad_norm": 9.586326599121094, "learning_rate": 3.050505050505051e-05, "loss": 1.0099, "step": 2480 }, { "epoch": 22.008727272727274, "grad_norm": 6.698061466217041, "learning_rate": 3.0404040404040406e-05, "loss": 0.9805, "step": 2490 }, { "epoch": 22.010545454545454, "grad_norm": 9.36473274230957, "learning_rate": 3.0303030303030306e-05, "loss": 0.8718, "step": 2500 }, { "epoch": 22.012363636363638, "grad_norm": 3.693310260772705, "learning_rate": 3.0202020202020205e-05, "loss": 0.9963, "step": 2510 }, { "epoch": 22.014181818181818, "grad_norm": 10.124568939208984, "learning_rate": 3.01010101010101e-05, "loss": 0.8962, "step": 2520 }, { "epoch": 22.016, "grad_norm": 5.137106895446777, "learning_rate": 3e-05, "loss": 0.9473, "step": 2530 }, { "epoch": 22.017818181818182, "grad_norm": 5.98214054107666, "learning_rate": 2.98989898989899e-05, "loss": 0.8896, "step": 2540 }, { "epoch": 22.019636363636362, "grad_norm": 10.731734275817871, "learning_rate": 2.9797979797979796e-05, "loss": 0.8651, "step": 2550 }, { "epoch": 22.02018181818182, "eval_accuracy": 0.49867724867724866, "eval_loss": 1.0047566890716553, "eval_runtime": 158.3545, "eval_samples_per_second": 4.774, "eval_steps_per_second": 0.6, "step": 2553 }, { "epoch": 23.001272727272728, "grad_norm": 5.062934398651123, "learning_rate": 2.96969696969697e-05, "loss": 0.9075, "step": 2560 }, { "epoch": 23.003090909090908, "grad_norm": 9.365435600280762, "learning_rate": 2.95959595959596e-05, "loss": 0.9693, "step": 2570 }, { "epoch": 23.00490909090909, "grad_norm": 8.738523483276367, "learning_rate": 2.9494949494949498e-05, "loss": 0.891, "step": 2580 }, { "epoch": 23.00672727272727, "grad_norm": 9.805744171142578, "learning_rate": 2.9393939393939394e-05, "loss": 0.8262, "step": 2590 }, { "epoch": 23.008545454545455, "grad_norm": 9.829437255859375, "learning_rate": 2.9292929292929294e-05, "loss": 0.9529, "step": 2600 }, { "epoch": 23.010363636363635, "grad_norm": 8.564093589782715, "learning_rate": 2.9191919191919193e-05, "loss": 0.9536, "step": 2610 }, { "epoch": 23.01218181818182, "grad_norm": 7.534347057342529, "learning_rate": 2.909090909090909e-05, "loss": 0.9625, "step": 2620 }, { "epoch": 23.014, "grad_norm": 5.0384039878845215, "learning_rate": 2.898989898989899e-05, "loss": 0.9672, "step": 2630 }, { "epoch": 23.015818181818183, "grad_norm": 3.9534647464752197, "learning_rate": 2.8888888888888888e-05, "loss": 1.0424, "step": 2640 }, { "epoch": 23.017636363636363, "grad_norm": 4.473760604858398, "learning_rate": 2.878787878787879e-05, "loss": 0.8712, "step": 2650 }, { "epoch": 23.019454545454547, "grad_norm": 7.2100348472595215, "learning_rate": 2.868686868686869e-05, "loss": 0.9384, "step": 2660 }, { "epoch": 23.02018181818182, "eval_accuracy": 0.5224867724867724, "eval_loss": 0.9716671109199524, "eval_runtime": 154.7299, "eval_samples_per_second": 4.886, "eval_steps_per_second": 0.614, "step": 2664 }, { "epoch": 24.00109090909091, "grad_norm": 13.06188678741455, "learning_rate": 2.8585858585858587e-05, "loss": 0.9131, "step": 2670 }, { "epoch": 24.002909090909093, "grad_norm": 13.319742202758789, "learning_rate": 2.8484848484848486e-05, "loss": 1.0963, "step": 2680 }, { "epoch": 24.004727272727273, "grad_norm": 8.775079727172852, "learning_rate": 2.8383838383838386e-05, "loss": 0.8914, "step": 2690 }, { "epoch": 24.006545454545453, "grad_norm": 7.000812530517578, "learning_rate": 2.8282828282828282e-05, "loss": 0.8983, "step": 2700 }, { "epoch": 24.008363636363637, "grad_norm": 5.142882823944092, "learning_rate": 2.818181818181818e-05, "loss": 0.9501, "step": 2710 }, { "epoch": 24.010181818181817, "grad_norm": 7.493001937866211, "learning_rate": 2.808080808080808e-05, "loss": 0.802, "step": 2720 }, { "epoch": 24.012, "grad_norm": 5.429752349853516, "learning_rate": 2.7979797979797984e-05, "loss": 0.8444, "step": 2730 }, { "epoch": 24.01381818181818, "grad_norm": 7.175288200378418, "learning_rate": 2.7878787878787883e-05, "loss": 0.9787, "step": 2740 }, { "epoch": 24.015636363636364, "grad_norm": 12.047754287719727, "learning_rate": 2.777777777777778e-05, "loss": 0.9811, "step": 2750 }, { "epoch": 24.017454545454545, "grad_norm": 10.042250633239746, "learning_rate": 2.767676767676768e-05, "loss": 0.9755, "step": 2760 }, { "epoch": 24.019272727272728, "grad_norm": 7.948033332824707, "learning_rate": 2.7575757575757578e-05, "loss": 0.9545, "step": 2770 }, { "epoch": 24.02018181818182, "eval_accuracy": 0.5171957671957672, "eval_loss": 0.9763391017913818, "eval_runtime": 147.5827, "eval_samples_per_second": 5.123, "eval_steps_per_second": 0.644, "step": 2775 }, { "epoch": 25.00090909090909, "grad_norm": 6.587329387664795, "learning_rate": 2.7474747474747474e-05, "loss": 1.0554, "step": 2780 }, { "epoch": 25.002727272727274, "grad_norm": 10.31185531616211, "learning_rate": 2.7373737373737374e-05, "loss": 0.8739, "step": 2790 }, { "epoch": 25.004545454545454, "grad_norm": 3.5984859466552734, "learning_rate": 2.7272727272727273e-05, "loss": 0.8191, "step": 2800 }, { "epoch": 25.006363636363638, "grad_norm": 5.685831546783447, "learning_rate": 2.717171717171717e-05, "loss": 0.9804, "step": 2810 }, { "epoch": 25.008181818181818, "grad_norm": 8.268026351928711, "learning_rate": 2.7070707070707075e-05, "loss": 0.8067, "step": 2820 }, { "epoch": 25.01, "grad_norm": 5.9249773025512695, "learning_rate": 2.696969696969697e-05, "loss": 0.8634, "step": 2830 }, { "epoch": 25.01181818181818, "grad_norm": 5.94825553894043, "learning_rate": 2.686868686868687e-05, "loss": 0.8773, "step": 2840 }, { "epoch": 25.013636363636362, "grad_norm": 8.133403778076172, "learning_rate": 2.676767676767677e-05, "loss": 0.9, "step": 2850 }, { "epoch": 25.015454545454546, "grad_norm": 9.554986953735352, "learning_rate": 2.6666666666666667e-05, "loss": 0.9591, "step": 2860 }, { "epoch": 25.017272727272726, "grad_norm": 10.977254867553711, "learning_rate": 2.6565656565656566e-05, "loss": 0.8604, "step": 2870 }, { "epoch": 25.01909090909091, "grad_norm": 7.258336067199707, "learning_rate": 2.6464646464646466e-05, "loss": 0.9187, "step": 2880 }, { "epoch": 25.02018181818182, "eval_accuracy": 0.5211640211640212, "eval_loss": 0.962836503982544, "eval_runtime": 146.3506, "eval_samples_per_second": 5.166, "eval_steps_per_second": 0.649, "step": 2886 }, { "epoch": 26.00072727272727, "grad_norm": 8.66103458404541, "learning_rate": 2.636363636363636e-05, "loss": 0.8217, "step": 2890 }, { "epoch": 26.002545454545455, "grad_norm": 8.29440689086914, "learning_rate": 2.6262626262626268e-05, "loss": 0.8624, "step": 2900 }, { "epoch": 26.004363636363635, "grad_norm": 8.019746780395508, "learning_rate": 2.6161616161616164e-05, "loss": 0.7612, "step": 2910 }, { "epoch": 26.00618181818182, "grad_norm": 13.38365364074707, "learning_rate": 2.6060606060606063e-05, "loss": 0.9354, "step": 2920 }, { "epoch": 26.008, "grad_norm": 9.972203254699707, "learning_rate": 2.5959595959595963e-05, "loss": 0.7413, "step": 2930 }, { "epoch": 26.009818181818183, "grad_norm": 5.007082462310791, "learning_rate": 2.585858585858586e-05, "loss": 0.9473, "step": 2940 }, { "epoch": 26.011636363636363, "grad_norm": 7.641317367553711, "learning_rate": 2.575757575757576e-05, "loss": 0.9514, "step": 2950 }, { "epoch": 26.013454545454547, "grad_norm": 6.921894073486328, "learning_rate": 2.5656565656565658e-05, "loss": 0.8694, "step": 2960 }, { "epoch": 26.015272727272727, "grad_norm": 14.923357963562012, "learning_rate": 2.5555555555555554e-05, "loss": 0.8904, "step": 2970 }, { "epoch": 26.01709090909091, "grad_norm": 9.429475784301758, "learning_rate": 2.5454545454545454e-05, "loss": 0.8831, "step": 2980 }, { "epoch": 26.01890909090909, "grad_norm": 6.4121527671813965, "learning_rate": 2.5353535353535356e-05, "loss": 0.7953, "step": 2990 }, { "epoch": 26.02018181818182, "eval_accuracy": 0.5264550264550265, "eval_loss": 0.9523358941078186, "eval_runtime": 150.2976, "eval_samples_per_second": 5.03, "eval_steps_per_second": 0.632, "step": 2997 }, { "epoch": 27.000545454545456, "grad_norm": 7.7547101974487305, "learning_rate": 2.5252525252525256e-05, "loss": 0.9809, "step": 3000 }, { "epoch": 27.002363636363636, "grad_norm": 9.476554870605469, "learning_rate": 2.5151515151515155e-05, "loss": 0.767, "step": 3010 }, { "epoch": 27.004181818181817, "grad_norm": 9.019051551818848, "learning_rate": 2.505050505050505e-05, "loss": 0.9555, "step": 3020 }, { "epoch": 27.006, "grad_norm": 7.9360833168029785, "learning_rate": 2.494949494949495e-05, "loss": 0.8657, "step": 3030 }, { "epoch": 27.00781818181818, "grad_norm": 7.8032612800598145, "learning_rate": 2.4848484848484847e-05, "loss": 0.7659, "step": 3040 }, { "epoch": 27.009636363636364, "grad_norm": 8.02381706237793, "learning_rate": 2.474747474747475e-05, "loss": 0.9237, "step": 3050 }, { "epoch": 27.011454545454544, "grad_norm": 6.776451587677002, "learning_rate": 2.464646464646465e-05, "loss": 1.0109, "step": 3060 }, { "epoch": 27.013272727272728, "grad_norm": 9.373581886291504, "learning_rate": 2.4545454545454545e-05, "loss": 0.8007, "step": 3070 }, { "epoch": 27.015090909090908, "grad_norm": 13.388670921325684, "learning_rate": 2.4444444444444445e-05, "loss": 0.7814, "step": 3080 }, { "epoch": 27.016909090909092, "grad_norm": 4.5681352615356445, "learning_rate": 2.4343434343434344e-05, "loss": 0.8697, "step": 3090 }, { "epoch": 27.018727272727272, "grad_norm": 3.7407443523406982, "learning_rate": 2.4242424242424244e-05, "loss": 0.8793, "step": 3100 }, { "epoch": 27.02018181818182, "eval_accuracy": 0.5370370370370371, "eval_loss": 0.9977483153343201, "eval_runtime": 147.1287, "eval_samples_per_second": 5.138, "eval_steps_per_second": 0.646, "step": 3108 }, { "epoch": 28.000363636363637, "grad_norm": 8.104622840881348, "learning_rate": 2.4141414141414143e-05, "loss": 0.8612, "step": 3110 }, { "epoch": 28.002181818181818, "grad_norm": 7.6057562828063965, "learning_rate": 2.404040404040404e-05, "loss": 0.9138, "step": 3120 }, { "epoch": 28.004, "grad_norm": 9.901479721069336, "learning_rate": 2.393939393939394e-05, "loss": 0.8845, "step": 3130 }, { "epoch": 28.00581818181818, "grad_norm": 10.897790908813477, "learning_rate": 2.3838383838383842e-05, "loss": 0.7653, "step": 3140 }, { "epoch": 28.007636363636365, "grad_norm": 6.1693267822265625, "learning_rate": 2.3737373737373738e-05, "loss": 0.8565, "step": 3150 }, { "epoch": 28.009454545454545, "grad_norm": 8.267416954040527, "learning_rate": 2.3636363636363637e-05, "loss": 0.8691, "step": 3160 }, { "epoch": 28.011272727272726, "grad_norm": 6.09727144241333, "learning_rate": 2.3535353535353537e-05, "loss": 0.8473, "step": 3170 }, { "epoch": 28.01309090909091, "grad_norm": 6.62100887298584, "learning_rate": 2.3434343434343436e-05, "loss": 0.7564, "step": 3180 }, { "epoch": 28.01490909090909, "grad_norm": 9.10604476928711, "learning_rate": 2.3333333333333336e-05, "loss": 0.7724, "step": 3190 }, { "epoch": 28.016727272727273, "grad_norm": 7.155385494232178, "learning_rate": 2.3232323232323232e-05, "loss": 0.823, "step": 3200 }, { "epoch": 28.018545454545453, "grad_norm": 9.220155715942383, "learning_rate": 2.313131313131313e-05, "loss": 0.7897, "step": 3210 }, { "epoch": 28.02018181818182, "eval_accuracy": 0.5317460317460317, "eval_loss": 0.9964798092842102, "eval_runtime": 147.9044, "eval_samples_per_second": 5.111, "eval_steps_per_second": 0.642, "step": 3219 }, { "epoch": 29.00018181818182, "grad_norm": 10.702564239501953, "learning_rate": 2.3030303030303034e-05, "loss": 0.893, "step": 3220 }, { "epoch": 29.002, "grad_norm": 11.652132034301758, "learning_rate": 2.292929292929293e-05, "loss": 0.9462, "step": 3230 }, { "epoch": 29.003818181818183, "grad_norm": 5.475378036499023, "learning_rate": 2.282828282828283e-05, "loss": 0.745, "step": 3240 }, { "epoch": 29.005636363636363, "grad_norm": 6.293100833892822, "learning_rate": 2.272727272727273e-05, "loss": 0.7957, "step": 3250 }, { "epoch": 29.007454545454546, "grad_norm": 9.700437545776367, "learning_rate": 2.262626262626263e-05, "loss": 0.9725, "step": 3260 }, { "epoch": 29.009272727272727, "grad_norm": 8.564444541931152, "learning_rate": 2.2525252525252528e-05, "loss": 0.9211, "step": 3270 }, { "epoch": 29.01109090909091, "grad_norm": 7.833463191986084, "learning_rate": 2.2424242424242424e-05, "loss": 0.926, "step": 3280 }, { "epoch": 29.01290909090909, "grad_norm": 9.064501762390137, "learning_rate": 2.2323232323232324e-05, "loss": 1.0597, "step": 3290 }, { "epoch": 29.014727272727274, "grad_norm": 9.065460205078125, "learning_rate": 2.2222222222222223e-05, "loss": 0.8668, "step": 3300 }, { "epoch": 29.016545454545454, "grad_norm": 8.223165512084961, "learning_rate": 2.2121212121212123e-05, "loss": 0.878, "step": 3310 }, { "epoch": 29.018363636363638, "grad_norm": 7.36085319519043, "learning_rate": 2.2020202020202022e-05, "loss": 0.8666, "step": 3320 }, { "epoch": 29.02018181818182, "grad_norm": 13.20446491241455, "learning_rate": 2.191919191919192e-05, "loss": 0.8034, "step": 3330 }, { "epoch": 29.02018181818182, "eval_accuracy": 0.5462962962962963, "eval_loss": 0.9271609783172607, "eval_runtime": 150.9049, "eval_samples_per_second": 5.01, "eval_steps_per_second": 0.63, "step": 3330 }, { "epoch": 30.00181818181818, "grad_norm": 8.620079040527344, "learning_rate": 2.1818181818181818e-05, "loss": 0.8551, "step": 3340 }, { "epoch": 30.003636363636364, "grad_norm": 10.10771656036377, "learning_rate": 2.171717171717172e-05, "loss": 0.8917, "step": 3350 }, { "epoch": 30.005454545454544, "grad_norm": 8.590020179748535, "learning_rate": 2.1616161616161617e-05, "loss": 0.7187, "step": 3360 }, { "epoch": 30.007272727272728, "grad_norm": 10.648637771606445, "learning_rate": 2.1515151515151516e-05, "loss": 0.8967, "step": 3370 }, { "epoch": 30.009090909090908, "grad_norm": 6.126356601715088, "learning_rate": 2.1414141414141416e-05, "loss": 0.7979, "step": 3380 }, { "epoch": 30.01090909090909, "grad_norm": 6.431183815002441, "learning_rate": 2.1313131313131315e-05, "loss": 0.7723, "step": 3390 }, { "epoch": 30.012727272727272, "grad_norm": 7.046969413757324, "learning_rate": 2.1212121212121215e-05, "loss": 0.8287, "step": 3400 }, { "epoch": 30.014545454545456, "grad_norm": 10.070467948913574, "learning_rate": 2.111111111111111e-05, "loss": 0.9068, "step": 3410 }, { "epoch": 30.016363636363636, "grad_norm": 8.365399360656738, "learning_rate": 2.101010101010101e-05, "loss": 0.8802, "step": 3420 }, { "epoch": 30.01818181818182, "grad_norm": 6.84988260269165, "learning_rate": 2.090909090909091e-05, "loss": 0.9197, "step": 3430 }, { "epoch": 30.02, "grad_norm": 8.448094367980957, "learning_rate": 2.080808080808081e-05, "loss": 0.8469, "step": 3440 }, { "epoch": 30.02018181818182, "eval_accuracy": 0.5383597883597884, "eval_loss": 0.9230695962905884, "eval_runtime": 148.3769, "eval_samples_per_second": 5.095, "eval_steps_per_second": 0.64, "step": 3441 }, { "epoch": 31.001636363636365, "grad_norm": 9.007742881774902, "learning_rate": 2.070707070707071e-05, "loss": 0.9572, "step": 3450 }, { "epoch": 31.003454545454545, "grad_norm": 11.113166809082031, "learning_rate": 2.0606060606060608e-05, "loss": 0.8089, "step": 3460 }, { "epoch": 31.00527272727273, "grad_norm": 6.533670425415039, "learning_rate": 2.0505050505050504e-05, "loss": 0.9231, "step": 3470 }, { "epoch": 31.00709090909091, "grad_norm": 9.098650932312012, "learning_rate": 2.0404040404040407e-05, "loss": 0.868, "step": 3480 }, { "epoch": 31.00890909090909, "grad_norm": 8.53820514678955, "learning_rate": 2.0303030303030303e-05, "loss": 0.8957, "step": 3490 }, { "epoch": 31.010727272727273, "grad_norm": 6.1428422927856445, "learning_rate": 2.0202020202020203e-05, "loss": 0.7182, "step": 3500 }, { "epoch": 31.012545454545453, "grad_norm": 5.717990875244141, "learning_rate": 2.0101010101010102e-05, "loss": 0.829, "step": 3510 }, { "epoch": 31.014363636363637, "grad_norm": 7.748228073120117, "learning_rate": 2e-05, "loss": 0.8067, "step": 3520 }, { "epoch": 31.016181818181817, "grad_norm": 11.068036079406738, "learning_rate": 1.98989898989899e-05, "loss": 0.8186, "step": 3530 }, { "epoch": 31.018, "grad_norm": 8.458252906799316, "learning_rate": 1.9797979797979797e-05, "loss": 0.8751, "step": 3540 }, { "epoch": 31.01981818181818, "grad_norm": 6.225341796875, "learning_rate": 1.9696969696969697e-05, "loss": 0.79, "step": 3550 }, { "epoch": 31.02018181818182, "eval_accuracy": 0.5727513227513228, "eval_loss": 0.9281449913978577, "eval_runtime": 150.2805, "eval_samples_per_second": 5.031, "eval_steps_per_second": 0.632, "step": 3552 }, { "epoch": 32.00145454545454, "grad_norm": 10.177803993225098, "learning_rate": 1.95959595959596e-05, "loss": 0.9101, "step": 3560 }, { "epoch": 32.00327272727273, "grad_norm": 10.928609848022461, "learning_rate": 1.9494949494949496e-05, "loss": 0.8019, "step": 3570 }, { "epoch": 32.00509090909091, "grad_norm": 8.741167068481445, "learning_rate": 1.9393939393939395e-05, "loss": 0.9915, "step": 3580 }, { "epoch": 32.00690909090909, "grad_norm": 4.667022228240967, "learning_rate": 1.9292929292929295e-05, "loss": 0.7823, "step": 3590 }, { "epoch": 32.00872727272727, "grad_norm": 8.407429695129395, "learning_rate": 1.919191919191919e-05, "loss": 0.9603, "step": 3600 }, { "epoch": 32.01054545454546, "grad_norm": 8.828394889831543, "learning_rate": 1.9090909090909094e-05, "loss": 0.8425, "step": 3610 }, { "epoch": 32.01236363636364, "grad_norm": 7.08673620223999, "learning_rate": 1.898989898989899e-05, "loss": 0.9535, "step": 3620 }, { "epoch": 32.01418181818182, "grad_norm": 5.232740879058838, "learning_rate": 1.888888888888889e-05, "loss": 0.6998, "step": 3630 }, { "epoch": 32.016, "grad_norm": 8.102818489074707, "learning_rate": 1.878787878787879e-05, "loss": 0.8431, "step": 3640 }, { "epoch": 32.01781818181818, "grad_norm": 9.122153282165527, "learning_rate": 1.8686868686868688e-05, "loss": 0.755, "step": 3650 }, { "epoch": 32.019636363636366, "grad_norm": 14.438915252685547, "learning_rate": 1.8585858585858588e-05, "loss": 0.8516, "step": 3660 }, { "epoch": 32.02018181818182, "eval_accuracy": 0.5568783068783069, "eval_loss": 0.9310464262962341, "eval_runtime": 149.3465, "eval_samples_per_second": 5.062, "eval_steps_per_second": 0.636, "step": 3663 }, { "epoch": 33.00127272727273, "grad_norm": 8.778589248657227, "learning_rate": 1.8484848484848487e-05, "loss": 0.7724, "step": 3670 }, { "epoch": 33.00309090909091, "grad_norm": 26.57788848876953, "learning_rate": 1.8383838383838383e-05, "loss": 0.9285, "step": 3680 }, { "epoch": 33.00490909090909, "grad_norm": 7.8436689376831055, "learning_rate": 1.8282828282828286e-05, "loss": 0.7533, "step": 3690 }, { "epoch": 33.006727272727275, "grad_norm": 8.450164794921875, "learning_rate": 1.8181818181818182e-05, "loss": 0.7167, "step": 3700 }, { "epoch": 33.008545454545455, "grad_norm": 6.676935195922852, "learning_rate": 1.808080808080808e-05, "loss": 0.8161, "step": 3710 }, { "epoch": 33.010363636363635, "grad_norm": 9.457823753356934, "learning_rate": 1.797979797979798e-05, "loss": 0.7213, "step": 3720 }, { "epoch": 33.012181818181816, "grad_norm": 6.458945274353027, "learning_rate": 1.787878787878788e-05, "loss": 0.791, "step": 3730 }, { "epoch": 33.014, "grad_norm": 10.989433288574219, "learning_rate": 1.777777777777778e-05, "loss": 0.9043, "step": 3740 }, { "epoch": 33.01581818181818, "grad_norm": 9.787355422973633, "learning_rate": 1.7676767676767676e-05, "loss": 0.8733, "step": 3750 }, { "epoch": 33.01763636363636, "grad_norm": 13.30040168762207, "learning_rate": 1.7575757575757576e-05, "loss": 0.7123, "step": 3760 }, { "epoch": 33.01945454545454, "grad_norm": 11.642248153686523, "learning_rate": 1.7474747474747475e-05, "loss": 0.8138, "step": 3770 }, { "epoch": 33.02018181818182, "eval_accuracy": 0.5674603174603174, "eval_loss": 0.9582036733627319, "eval_runtime": 147.1713, "eval_samples_per_second": 5.137, "eval_steps_per_second": 0.646, "step": 3774 }, { "epoch": 34.00109090909091, "grad_norm": 9.875617027282715, "learning_rate": 1.7373737373737375e-05, "loss": 0.7208, "step": 3780 }, { "epoch": 34.00290909090909, "grad_norm": 19.283206939697266, "learning_rate": 1.7272727272727274e-05, "loss": 0.7816, "step": 3790 }, { "epoch": 34.00472727272727, "grad_norm": 14.294538497924805, "learning_rate": 1.7171717171717173e-05, "loss": 0.9556, "step": 3800 }, { "epoch": 34.00654545454545, "grad_norm": 13.240488052368164, "learning_rate": 1.707070707070707e-05, "loss": 0.9312, "step": 3810 }, { "epoch": 34.00836363636363, "grad_norm": 11.781719207763672, "learning_rate": 1.6969696969696972e-05, "loss": 0.8563, "step": 3820 }, { "epoch": 34.01018181818182, "grad_norm": 6.2492218017578125, "learning_rate": 1.686868686868687e-05, "loss": 0.6618, "step": 3830 }, { "epoch": 34.012, "grad_norm": 11.331971168518066, "learning_rate": 1.6767676767676768e-05, "loss": 0.7962, "step": 3840 }, { "epoch": 34.01381818181818, "grad_norm": 8.892802238464355, "learning_rate": 1.6666666666666667e-05, "loss": 0.858, "step": 3850 }, { "epoch": 34.01563636363636, "grad_norm": 9.716336250305176, "learning_rate": 1.6565656565656567e-05, "loss": 0.7353, "step": 3860 }, { "epoch": 34.01745454545455, "grad_norm": 14.95606517791748, "learning_rate": 1.6464646464646466e-05, "loss": 0.6582, "step": 3870 }, { "epoch": 34.01927272727273, "grad_norm": 12.510953903198242, "learning_rate": 1.6363636363636366e-05, "loss": 0.8322, "step": 3880 }, { "epoch": 34.02018181818182, "eval_accuracy": 0.5621693121693122, "eval_loss": 0.9740582704544067, "eval_runtime": 146.7236, "eval_samples_per_second": 5.153, "eval_steps_per_second": 0.647, "step": 3885 }, { "epoch": 35.00090909090909, "grad_norm": 10.233559608459473, "learning_rate": 1.6262626262626262e-05, "loss": 0.7694, "step": 3890 }, { "epoch": 35.00272727272727, "grad_norm": 12.976431846618652, "learning_rate": 1.6161616161616165e-05, "loss": 0.7612, "step": 3900 }, { "epoch": 35.00454545454546, "grad_norm": 11.562703132629395, "learning_rate": 1.606060606060606e-05, "loss": 0.8862, "step": 3910 }, { "epoch": 35.00636363636364, "grad_norm": 9.613603591918945, "learning_rate": 1.595959595959596e-05, "loss": 0.6993, "step": 3920 }, { "epoch": 35.00818181818182, "grad_norm": 12.087981224060059, "learning_rate": 1.585858585858586e-05, "loss": 0.9855, "step": 3930 }, { "epoch": 35.01, "grad_norm": 7.05510139465332, "learning_rate": 1.5757575757575756e-05, "loss": 0.8415, "step": 3940 }, { "epoch": 35.011818181818185, "grad_norm": 8.785040855407715, "learning_rate": 1.565656565656566e-05, "loss": 0.773, "step": 3950 }, { "epoch": 35.013636363636365, "grad_norm": 10.331352233886719, "learning_rate": 1.5555555555555555e-05, "loss": 0.7655, "step": 3960 }, { "epoch": 35.015454545454546, "grad_norm": 5.654623031616211, "learning_rate": 1.5454545454545454e-05, "loss": 0.7274, "step": 3970 }, { "epoch": 35.017272727272726, "grad_norm": 12.63278579711914, "learning_rate": 1.5353535353535354e-05, "loss": 0.9074, "step": 3980 }, { "epoch": 35.019090909090906, "grad_norm": 7.924473285675049, "learning_rate": 1.5252525252525255e-05, "loss": 0.8064, "step": 3990 }, { "epoch": 35.02018181818182, "eval_accuracy": 0.5753968253968254, "eval_loss": 0.9572974443435669, "eval_runtime": 150.5167, "eval_samples_per_second": 5.023, "eval_steps_per_second": 0.631, "step": 3996 }, { "epoch": 36.000727272727275, "grad_norm": 11.757354736328125, "learning_rate": 1.5151515151515153e-05, "loss": 0.8435, "step": 4000 }, { "epoch": 36.002545454545455, "grad_norm": 13.549497604370117, "learning_rate": 1.505050505050505e-05, "loss": 0.7881, "step": 4010 }, { "epoch": 36.004363636363635, "grad_norm": 11.874682426452637, "learning_rate": 1.494949494949495e-05, "loss": 0.8561, "step": 4020 }, { "epoch": 36.006181818181815, "grad_norm": 11.986287117004395, "learning_rate": 1.484848484848485e-05, "loss": 0.8678, "step": 4030 }, { "epoch": 36.008, "grad_norm": 9.599827766418457, "learning_rate": 1.4747474747474749e-05, "loss": 0.6111, "step": 4040 }, { "epoch": 36.00981818181818, "grad_norm": 12.461723327636719, "learning_rate": 1.4646464646464647e-05, "loss": 0.8555, "step": 4050 }, { "epoch": 36.01163636363636, "grad_norm": 10.647710800170898, "learning_rate": 1.4545454545454545e-05, "loss": 0.8921, "step": 4060 }, { "epoch": 36.01345454545454, "grad_norm": 7.4742350578308105, "learning_rate": 1.4444444444444444e-05, "loss": 0.7072, "step": 4070 }, { "epoch": 36.01527272727273, "grad_norm": 10.952604293823242, "learning_rate": 1.4343434343434345e-05, "loss": 0.8367, "step": 4080 }, { "epoch": 36.01709090909091, "grad_norm": 13.392800331115723, "learning_rate": 1.4242424242424243e-05, "loss": 0.762, "step": 4090 }, { "epoch": 36.01890909090909, "grad_norm": 8.420698165893555, "learning_rate": 1.4141414141414141e-05, "loss": 0.8767, "step": 4100 }, { "epoch": 36.02018181818182, "eval_accuracy": 0.5714285714285714, "eval_loss": 0.9290463924407959, "eval_runtime": 148.5027, "eval_samples_per_second": 5.091, "eval_steps_per_second": 0.64, "step": 4107 }, { "epoch": 37.00054545454545, "grad_norm": 14.011470794677734, "learning_rate": 1.404040404040404e-05, "loss": 0.8077, "step": 4110 }, { "epoch": 37.00236363636364, "grad_norm": 5.852056980133057, "learning_rate": 1.3939393939393942e-05, "loss": 0.8372, "step": 4120 }, { "epoch": 37.00418181818182, "grad_norm": 14.729412078857422, "learning_rate": 1.383838383838384e-05, "loss": 0.7565, "step": 4130 }, { "epoch": 37.006, "grad_norm": 15.823266983032227, "learning_rate": 1.3737373737373737e-05, "loss": 0.7657, "step": 4140 }, { "epoch": 37.00781818181818, "grad_norm": 8.077898025512695, "learning_rate": 1.3636363636363637e-05, "loss": 0.8377, "step": 4150 }, { "epoch": 37.00963636363636, "grad_norm": 19.177709579467773, "learning_rate": 1.3535353535353538e-05, "loss": 0.9433, "step": 4160 }, { "epoch": 37.01145454545455, "grad_norm": 10.09786605834961, "learning_rate": 1.3434343434343436e-05, "loss": 0.7564, "step": 4170 }, { "epoch": 37.01327272727273, "grad_norm": 7.137022972106934, "learning_rate": 1.3333333333333333e-05, "loss": 0.8067, "step": 4180 }, { "epoch": 37.01509090909091, "grad_norm": 6.618945121765137, "learning_rate": 1.3232323232323233e-05, "loss": 0.803, "step": 4190 }, { "epoch": 37.01690909090909, "grad_norm": 15.50954532623291, "learning_rate": 1.3131313131313134e-05, "loss": 0.8677, "step": 4200 }, { "epoch": 37.018727272727276, "grad_norm": 7.7649030685424805, "learning_rate": 1.3030303030303032e-05, "loss": 0.7978, "step": 4210 }, { "epoch": 37.02018181818182, "eval_accuracy": 0.5727513227513228, "eval_loss": 0.9448675513267517, "eval_runtime": 147.0248, "eval_samples_per_second": 5.142, "eval_steps_per_second": 0.646, "step": 4218 }, { "epoch": 38.00036363636364, "grad_norm": 9.288775444030762, "learning_rate": 1.292929292929293e-05, "loss": 0.6643, "step": 4220 }, { "epoch": 38.00218181818182, "grad_norm": 14.049256324768066, "learning_rate": 1.2828282828282829e-05, "loss": 0.8048, "step": 4230 }, { "epoch": 38.004, "grad_norm": 6.196603775024414, "learning_rate": 1.2727272727272727e-05, "loss": 0.6936, "step": 4240 }, { "epoch": 38.005818181818185, "grad_norm": 16.842363357543945, "learning_rate": 1.2626262626262628e-05, "loss": 0.9143, "step": 4250 }, { "epoch": 38.007636363636365, "grad_norm": 10.34643840789795, "learning_rate": 1.2525252525252526e-05, "loss": 0.7569, "step": 4260 }, { "epoch": 38.009454545454545, "grad_norm": 5.936524868011475, "learning_rate": 1.2424242424242424e-05, "loss": 0.7721, "step": 4270 }, { "epoch": 38.011272727272726, "grad_norm": 8.305501937866211, "learning_rate": 1.2323232323232325e-05, "loss": 0.7388, "step": 4280 }, { "epoch": 38.013090909090906, "grad_norm": 10.99225902557373, "learning_rate": 1.2222222222222222e-05, "loss": 0.6871, "step": 4290 }, { "epoch": 38.01490909090909, "grad_norm": 11.384906768798828, "learning_rate": 1.2121212121212122e-05, "loss": 0.8106, "step": 4300 }, { "epoch": 38.01672727272727, "grad_norm": 10.89456844329834, "learning_rate": 1.202020202020202e-05, "loss": 0.7482, "step": 4310 }, { "epoch": 38.01854545454545, "grad_norm": 7.169947147369385, "learning_rate": 1.1919191919191921e-05, "loss": 0.8113, "step": 4320 }, { "epoch": 38.02018181818182, "eval_accuracy": 0.578042328042328, "eval_loss": 0.949256956577301, "eval_runtime": 148.4198, "eval_samples_per_second": 5.094, "eval_steps_per_second": 0.64, "step": 4329 }, { "epoch": 39.000181818181815, "grad_norm": 6.558440208435059, "learning_rate": 1.1818181818181819e-05, "loss": 0.6821, "step": 4330 }, { "epoch": 39.002, "grad_norm": 9.286921501159668, "learning_rate": 1.1717171717171718e-05, "loss": 0.868, "step": 4340 }, { "epoch": 39.00381818181818, "grad_norm": 11.597467422485352, "learning_rate": 1.1616161616161616e-05, "loss": 0.6401, "step": 4350 }, { "epoch": 39.00563636363636, "grad_norm": 16.700910568237305, "learning_rate": 1.1515151515151517e-05, "loss": 0.7055, "step": 4360 }, { "epoch": 39.00745454545454, "grad_norm": 11.440078735351562, "learning_rate": 1.1414141414141415e-05, "loss": 0.7684, "step": 4370 }, { "epoch": 39.00927272727273, "grad_norm": 9.321911811828613, "learning_rate": 1.1313131313131314e-05, "loss": 0.7973, "step": 4380 }, { "epoch": 39.01109090909091, "grad_norm": 8.111784934997559, "learning_rate": 1.1212121212121212e-05, "loss": 0.7334, "step": 4390 }, { "epoch": 39.01290909090909, "grad_norm": 7.027799606323242, "learning_rate": 1.1111111111111112e-05, "loss": 0.6968, "step": 4400 }, { "epoch": 39.01472727272727, "grad_norm": 11.222505569458008, "learning_rate": 1.1010101010101011e-05, "loss": 0.806, "step": 4410 }, { "epoch": 39.01654545454546, "grad_norm": 13.890769958496094, "learning_rate": 1.0909090909090909e-05, "loss": 0.8326, "step": 4420 }, { "epoch": 39.01836363636364, "grad_norm": 14.440450668334961, "learning_rate": 1.0808080808080808e-05, "loss": 0.7795, "step": 4430 }, { "epoch": 39.02018181818182, "grad_norm": 7.134138584136963, "learning_rate": 1.0707070707070708e-05, "loss": 0.8065, "step": 4440 }, { "epoch": 39.02018181818182, "eval_accuracy": 0.5925925925925926, "eval_loss": 0.9015074372291565, "eval_runtime": 145.6584, "eval_samples_per_second": 5.19, "eval_steps_per_second": 0.652, "step": 4440 }, { "epoch": 40.00181818181818, "grad_norm": 6.643571376800537, "learning_rate": 1.0606060606060607e-05, "loss": 0.7311, "step": 4450 }, { "epoch": 40.00363636363636, "grad_norm": 8.529888153076172, "learning_rate": 1.0505050505050505e-05, "loss": 0.7338, "step": 4460 }, { "epoch": 40.00545454545455, "grad_norm": 6.874202728271484, "learning_rate": 1.0404040404040405e-05, "loss": 0.7362, "step": 4470 }, { "epoch": 40.00727272727273, "grad_norm": 12.398289680480957, "learning_rate": 1.0303030303030304e-05, "loss": 0.9457, "step": 4480 }, { "epoch": 40.00909090909091, "grad_norm": 9.524188041687012, "learning_rate": 1.0202020202020204e-05, "loss": 0.6851, "step": 4490 }, { "epoch": 40.01090909090909, "grad_norm": 8.724706649780273, "learning_rate": 1.0101010101010101e-05, "loss": 0.8305, "step": 4500 }, { "epoch": 40.012727272727275, "grad_norm": 7.507053852081299, "learning_rate": 1e-05, "loss": 0.8247, "step": 4510 }, { "epoch": 40.014545454545456, "grad_norm": 17.32568359375, "learning_rate": 9.898989898989899e-06, "loss": 0.7536, "step": 4520 }, { "epoch": 40.016363636363636, "grad_norm": 9.178523063659668, "learning_rate": 9.7979797979798e-06, "loss": 0.8062, "step": 4530 }, { "epoch": 40.018181818181816, "grad_norm": 9.23918342590332, "learning_rate": 9.696969696969698e-06, "loss": 0.7517, "step": 4540 }, { "epoch": 40.02, "grad_norm": 5.558004379272461, "learning_rate": 9.595959595959595e-06, "loss": 0.7989, "step": 4550 }, { "epoch": 40.02018181818182, "eval_accuracy": 0.5886243386243386, "eval_loss": 0.9139331579208374, "eval_runtime": 149.2134, "eval_samples_per_second": 5.067, "eval_steps_per_second": 0.637, "step": 4551 }, { "epoch": 41.001636363636365, "grad_norm": 10.740436553955078, "learning_rate": 9.494949494949495e-06, "loss": 0.8113, "step": 4560 }, { "epoch": 41.003454545454545, "grad_norm": 12.195236206054688, "learning_rate": 9.393939393939394e-06, "loss": 0.8884, "step": 4570 }, { "epoch": 41.005272727272725, "grad_norm": 9.886882781982422, "learning_rate": 9.292929292929294e-06, "loss": 0.6887, "step": 4580 }, { "epoch": 41.00709090909091, "grad_norm": 14.517189025878906, "learning_rate": 9.191919191919192e-06, "loss": 0.7739, "step": 4590 }, { "epoch": 41.00890909090909, "grad_norm": 11.440573692321777, "learning_rate": 9.090909090909091e-06, "loss": 0.6939, "step": 4600 }, { "epoch": 41.01072727272727, "grad_norm": 8.60914421081543, "learning_rate": 8.98989898989899e-06, "loss": 0.6769, "step": 4610 }, { "epoch": 41.01254545454545, "grad_norm": 5.601386070251465, "learning_rate": 8.88888888888889e-06, "loss": 0.7519, "step": 4620 }, { "epoch": 41.01436363636363, "grad_norm": 7.747749328613281, "learning_rate": 8.787878787878788e-06, "loss": 0.7842, "step": 4630 }, { "epoch": 41.01618181818182, "grad_norm": 11.570269584655762, "learning_rate": 8.686868686868687e-06, "loss": 0.7867, "step": 4640 }, { "epoch": 41.018, "grad_norm": 15.08242416381836, "learning_rate": 8.585858585858587e-06, "loss": 0.7528, "step": 4650 }, { "epoch": 41.01981818181818, "grad_norm": 7.267955780029297, "learning_rate": 8.484848484848486e-06, "loss": 0.6323, "step": 4660 }, { "epoch": 41.02018181818182, "eval_accuracy": 0.5992063492063492, "eval_loss": 0.9004061818122864, "eval_runtime": 148.3224, "eval_samples_per_second": 5.097, "eval_steps_per_second": 0.64, "step": 4662 }, { "epoch": 42.00145454545454, "grad_norm": 9.609107971191406, "learning_rate": 8.383838383838384e-06, "loss": 0.7849, "step": 4670 }, { "epoch": 42.00327272727273, "grad_norm": 12.100410461425781, "learning_rate": 8.282828282828283e-06, "loss": 0.806, "step": 4680 }, { "epoch": 42.00509090909091, "grad_norm": 10.153985977172852, "learning_rate": 8.181818181818183e-06, "loss": 0.669, "step": 4690 }, { "epoch": 42.00690909090909, "grad_norm": 14.402464866638184, "learning_rate": 8.080808080808082e-06, "loss": 0.6636, "step": 4700 }, { "epoch": 42.00872727272727, "grad_norm": 14.053479194641113, "learning_rate": 7.97979797979798e-06, "loss": 0.769, "step": 4710 }, { "epoch": 42.01054545454546, "grad_norm": 7.945107460021973, "learning_rate": 7.878787878787878e-06, "loss": 0.6945, "step": 4720 }, { "epoch": 42.01236363636364, "grad_norm": 8.050100326538086, "learning_rate": 7.777777777777777e-06, "loss": 0.6112, "step": 4730 }, { "epoch": 42.01418181818182, "grad_norm": 13.17475414276123, "learning_rate": 7.676767676767677e-06, "loss": 0.8035, "step": 4740 }, { "epoch": 42.016, "grad_norm": 16.584096908569336, "learning_rate": 7.5757575757575764e-06, "loss": 0.7637, "step": 4750 }, { "epoch": 42.01781818181818, "grad_norm": 11.471580505371094, "learning_rate": 7.474747474747475e-06, "loss": 0.7247, "step": 4760 }, { "epoch": 42.019636363636366, "grad_norm": 11.29985237121582, "learning_rate": 7.3737373737373745e-06, "loss": 0.6847, "step": 4770 }, { "epoch": 42.02018181818182, "eval_accuracy": 0.6124338624338624, "eval_loss": 0.9082908034324646, "eval_runtime": 147.6625, "eval_samples_per_second": 5.12, "eval_steps_per_second": 0.643, "step": 4773 }, { "epoch": 43.00127272727273, "grad_norm": 10.324508666992188, "learning_rate": 7.272727272727272e-06, "loss": 0.5586, "step": 4780 }, { "epoch": 43.00309090909091, "grad_norm": 10.838313102722168, "learning_rate": 7.171717171717173e-06, "loss": 0.7385, "step": 4790 }, { "epoch": 43.00490909090909, "grad_norm": 6.829562187194824, "learning_rate": 7.0707070707070704e-06, "loss": 0.6887, "step": 4800 }, { "epoch": 43.006727272727275, "grad_norm": 19.654333114624023, "learning_rate": 6.969696969696971e-06, "loss": 0.8211, "step": 4810 }, { "epoch": 43.008545454545455, "grad_norm": 11.248620986938477, "learning_rate": 6.8686868686868685e-06, "loss": 0.7529, "step": 4820 }, { "epoch": 43.010363636363635, "grad_norm": 14.996559143066406, "learning_rate": 6.767676767676769e-06, "loss": 0.764, "step": 4830 }, { "epoch": 43.012181818181816, "grad_norm": 13.435885429382324, "learning_rate": 6.666666666666667e-06, "loss": 0.7792, "step": 4840 }, { "epoch": 43.014, "grad_norm": 16.040420532226562, "learning_rate": 6.565656565656567e-06, "loss": 0.9152, "step": 4850 }, { "epoch": 43.01581818181818, "grad_norm": 9.068537712097168, "learning_rate": 6.464646464646465e-06, "loss": 0.7527, "step": 4860 }, { "epoch": 43.01763636363636, "grad_norm": 9.895467758178711, "learning_rate": 6.363636363636363e-06, "loss": 0.7526, "step": 4870 }, { "epoch": 43.01945454545454, "grad_norm": 12.693032264709473, "learning_rate": 6.262626262626263e-06, "loss": 0.7711, "step": 4880 }, { "epoch": 43.02018181818182, "eval_accuracy": 0.5978835978835979, "eval_loss": 0.9023100137710571, "eval_runtime": 146.9949, "eval_samples_per_second": 5.143, "eval_steps_per_second": 0.646, "step": 4884 }, { "epoch": 44.00109090909091, "grad_norm": 9.122892379760742, "learning_rate": 6.161616161616162e-06, "loss": 0.8924, "step": 4890 }, { "epoch": 44.00290909090909, "grad_norm": 16.091691970825195, "learning_rate": 6.060606060606061e-06, "loss": 0.8003, "step": 4900 }, { "epoch": 44.00472727272727, "grad_norm": 11.019819259643555, "learning_rate": 5.9595959595959605e-06, "loss": 0.7017, "step": 4910 }, { "epoch": 44.00654545454545, "grad_norm": 5.1934332847595215, "learning_rate": 5.858585858585859e-06, "loss": 0.5691, "step": 4920 }, { "epoch": 44.00836363636363, "grad_norm": 15.653286933898926, "learning_rate": 5.7575757575757586e-06, "loss": 0.7065, "step": 4930 }, { "epoch": 44.01018181818182, "grad_norm": 11.122913360595703, "learning_rate": 5.656565656565657e-06, "loss": 0.706, "step": 4940 }, { "epoch": 44.012, "grad_norm": 24.985450744628906, "learning_rate": 5.555555555555556e-06, "loss": 0.8322, "step": 4950 }, { "epoch": 44.01381818181818, "grad_norm": 19.167444229125977, "learning_rate": 5.4545454545454545e-06, "loss": 0.6324, "step": 4960 }, { "epoch": 44.01563636363636, "grad_norm": 10.249974250793457, "learning_rate": 5.353535353535354e-06, "loss": 0.7215, "step": 4970 }, { "epoch": 44.01745454545455, "grad_norm": 5.51936149597168, "learning_rate": 5.2525252525252526e-06, "loss": 0.6117, "step": 4980 }, { "epoch": 44.01927272727273, "grad_norm": 6.55401611328125, "learning_rate": 5.151515151515152e-06, "loss": 0.5815, "step": 4990 }, { "epoch": 44.02018181818182, "eval_accuracy": 0.6058201058201058, "eval_loss": 0.9247404336929321, "eval_runtime": 148.7396, "eval_samples_per_second": 5.083, "eval_steps_per_second": 0.639, "step": 4995 }, { "epoch": 45.00090909090909, "grad_norm": 10.085493087768555, "learning_rate": 5.050505050505051e-06, "loss": 0.7191, "step": 5000 }, { "epoch": 45.00272727272727, "grad_norm": 9.831320762634277, "learning_rate": 4.949494949494949e-06, "loss": 0.7192, "step": 5010 }, { "epoch": 45.00454545454546, "grad_norm": 12.230847358703613, "learning_rate": 4.848484848484849e-06, "loss": 0.5632, "step": 5020 }, { "epoch": 45.00636363636364, "grad_norm": 12.282224655151367, "learning_rate": 4.747474747474747e-06, "loss": 0.9129, "step": 5030 }, { "epoch": 45.00818181818182, "grad_norm": 13.632772445678711, "learning_rate": 4.646464646464647e-06, "loss": 0.6591, "step": 5040 }, { "epoch": 45.01, "grad_norm": 9.214385986328125, "learning_rate": 4.5454545454545455e-06, "loss": 0.7707, "step": 5050 }, { "epoch": 45.011818181818185, "grad_norm": 10.159058570861816, "learning_rate": 4.444444444444445e-06, "loss": 0.7463, "step": 5060 }, { "epoch": 45.013636363636365, "grad_norm": 12.011441230773926, "learning_rate": 4.343434343434344e-06, "loss": 0.6294, "step": 5070 }, { "epoch": 45.015454545454546, "grad_norm": 5.8238115310668945, "learning_rate": 4.242424242424243e-06, "loss": 0.7006, "step": 5080 }, { "epoch": 45.017272727272726, "grad_norm": 12.484506607055664, "learning_rate": 4.141414141414142e-06, "loss": 0.7661, "step": 5090 }, { "epoch": 45.019090909090906, "grad_norm": 13.438883781433105, "learning_rate": 4.040404040404041e-06, "loss": 0.8821, "step": 5100 }, { "epoch": 45.02018181818182, "eval_accuracy": 0.6058201058201058, "eval_loss": 0.9070749282836914, "eval_runtime": 148.301, "eval_samples_per_second": 5.098, "eval_steps_per_second": 0.641, "step": 5106 }, { "epoch": 46.000727272727275, "grad_norm": 8.703226089477539, "learning_rate": 3.939393939393939e-06, "loss": 0.7494, "step": 5110 }, { "epoch": 46.002545454545455, "grad_norm": 14.767084121704102, "learning_rate": 3.8383838383838385e-06, "loss": 0.7142, "step": 5120 }, { "epoch": 46.004363636363635, "grad_norm": 12.91554069519043, "learning_rate": 3.7373737373737375e-06, "loss": 0.6795, "step": 5130 }, { "epoch": 46.006181818181815, "grad_norm": 10.742305755615234, "learning_rate": 3.636363636363636e-06, "loss": 0.6425, "step": 5140 }, { "epoch": 46.008, "grad_norm": 14.223603248596191, "learning_rate": 3.5353535353535352e-06, "loss": 0.8141, "step": 5150 }, { "epoch": 46.00981818181818, "grad_norm": 8.773728370666504, "learning_rate": 3.4343434343434343e-06, "loss": 0.6197, "step": 5160 }, { "epoch": 46.01163636363636, "grad_norm": 5.544559478759766, "learning_rate": 3.3333333333333333e-06, "loss": 0.6588, "step": 5170 }, { "epoch": 46.01345454545454, "grad_norm": 7.724081039428711, "learning_rate": 3.2323232323232324e-06, "loss": 0.7757, "step": 5180 }, { "epoch": 46.01527272727273, "grad_norm": 15.705133438110352, "learning_rate": 3.1313131313131314e-06, "loss": 0.8568, "step": 5190 }, { "epoch": 46.01709090909091, "grad_norm": 10.857434272766113, "learning_rate": 3.0303030303030305e-06, "loss": 0.6365, "step": 5200 }, { "epoch": 46.01890909090909, "grad_norm": 11.509218215942383, "learning_rate": 2.9292929292929295e-06, "loss": 0.7436, "step": 5210 }, { "epoch": 46.02018181818182, "eval_accuracy": 0.6084656084656085, "eval_loss": 0.8924344182014465, "eval_runtime": 144.2666, "eval_samples_per_second": 5.24, "eval_steps_per_second": 0.659, "step": 5217 }, { "epoch": 47.00054545454545, "grad_norm": 11.28231143951416, "learning_rate": 2.8282828282828286e-06, "loss": 0.7133, "step": 5220 }, { "epoch": 47.00236363636364, "grad_norm": 25.72589683532715, "learning_rate": 2.7272727272727272e-06, "loss": 0.7184, "step": 5230 }, { "epoch": 47.00418181818182, "grad_norm": 13.874494552612305, "learning_rate": 2.6262626262626263e-06, "loss": 0.8662, "step": 5240 }, { "epoch": 47.006, "grad_norm": 10.938508987426758, "learning_rate": 2.5252525252525253e-06, "loss": 0.5704, "step": 5250 }, { "epoch": 47.00781818181818, "grad_norm": 7.734257698059082, "learning_rate": 2.4242424242424244e-06, "loss": 0.6648, "step": 5260 }, { "epoch": 47.00963636363636, "grad_norm": 8.946194648742676, "learning_rate": 2.3232323232323234e-06, "loss": 0.8833, "step": 5270 }, { "epoch": 47.01145454545455, "grad_norm": 20.528419494628906, "learning_rate": 2.2222222222222225e-06, "loss": 0.7525, "step": 5280 }, { "epoch": 47.01327272727273, "grad_norm": 15.024425506591797, "learning_rate": 2.1212121212121216e-06, "loss": 0.6796, "step": 5290 }, { "epoch": 47.01509090909091, "grad_norm": 7.91912317276001, "learning_rate": 2.0202020202020206e-06, "loss": 0.7555, "step": 5300 }, { "epoch": 47.01690909090909, "grad_norm": 16.979326248168945, "learning_rate": 1.9191919191919192e-06, "loss": 0.5724, "step": 5310 }, { "epoch": 47.018727272727276, "grad_norm": 7.550643444061279, "learning_rate": 1.818181818181818e-06, "loss": 0.6863, "step": 5320 }, { "epoch": 47.02018181818182, "eval_accuracy": 0.6111111111111112, "eval_loss": 0.8965290784835815, "eval_runtime": 146.2721, "eval_samples_per_second": 5.168, "eval_steps_per_second": 0.649, "step": 5328 }, { "epoch": 48.00036363636364, "grad_norm": 13.027327537536621, "learning_rate": 1.7171717171717171e-06, "loss": 0.5784, "step": 5330 }, { "epoch": 48.00218181818182, "grad_norm": 9.1978759765625, "learning_rate": 1.6161616161616162e-06, "loss": 0.6893, "step": 5340 }, { "epoch": 48.004, "grad_norm": 8.523101806640625, "learning_rate": 1.5151515151515152e-06, "loss": 0.6099, "step": 5350 }, { "epoch": 48.005818181818185, "grad_norm": 15.44604778289795, "learning_rate": 1.4141414141414143e-06, "loss": 0.7395, "step": 5360 }, { "epoch": 48.007636363636365, "grad_norm": 12.837162971496582, "learning_rate": 1.3131313131313131e-06, "loss": 0.7887, "step": 5370 }, { "epoch": 48.009454545454545, "grad_norm": 16.95429039001465, "learning_rate": 1.2121212121212122e-06, "loss": 0.8216, "step": 5380 }, { "epoch": 48.011272727272726, "grad_norm": 11.841358184814453, "learning_rate": 1.1111111111111112e-06, "loss": 0.7102, "step": 5390 }, { "epoch": 48.013090909090906, "grad_norm": 12.675179481506348, "learning_rate": 1.0101010101010103e-06, "loss": 0.8177, "step": 5400 }, { "epoch": 48.01490909090909, "grad_norm": 13.496634483337402, "learning_rate": 9.09090909090909e-07, "loss": 0.7247, "step": 5410 }, { "epoch": 48.01672727272727, "grad_norm": 11.762472152709961, "learning_rate": 8.080808080808081e-07, "loss": 0.6307, "step": 5420 }, { "epoch": 48.01854545454545, "grad_norm": 25.6706485748291, "learning_rate": 7.070707070707071e-07, "loss": 0.7035, "step": 5430 }, { "epoch": 48.02018181818182, "eval_accuracy": 0.6044973544973545, "eval_loss": 0.8940863013267517, "eval_runtime": 149.5287, "eval_samples_per_second": 5.056, "eval_steps_per_second": 0.635, "step": 5439 }, { "epoch": 49.000181818181815, "grad_norm": 12.681800842285156, "learning_rate": 6.060606060606061e-07, "loss": 0.5424, "step": 5440 }, { "epoch": 49.002, "grad_norm": 10.071161270141602, "learning_rate": 5.050505050505052e-07, "loss": 0.6097, "step": 5450 }, { "epoch": 49.00381818181818, "grad_norm": 11.494267463684082, "learning_rate": 4.0404040404040405e-07, "loss": 0.5644, "step": 5460 }, { "epoch": 49.00563636363636, "grad_norm": 15.089944839477539, "learning_rate": 3.0303030303030305e-07, "loss": 0.714, "step": 5470 }, { "epoch": 49.00745454545454, "grad_norm": 8.669146537780762, "learning_rate": 2.0202020202020202e-07, "loss": 0.7086, "step": 5480 }, { "epoch": 49.00927272727273, "grad_norm": 10.859286308288574, "learning_rate": 1.0101010101010101e-07, "loss": 0.6569, "step": 5490 }, { "epoch": 49.01109090909091, "grad_norm": 10.49977970123291, "learning_rate": 0.0, "loss": 0.6348, "step": 5500 }, { "epoch": 49.01109090909091, "eval_accuracy": 0.6124338624338624, "eval_loss": 0.8949828743934631, "eval_runtime": 148.7108, "eval_samples_per_second": 5.084, "eval_steps_per_second": 0.639, "step": 5500 }, { "epoch": 49.01109090909091, "step": 5500, "total_flos": 2.3965589411203056e+19, "train_loss": 0.8993404138738459, "train_runtime": 16738.8602, "train_samples_per_second": 2.629, "train_steps_per_second": 0.329 }, { "epoch": 49.01109090909091, "eval_accuracy": 0.5595238095238095, "eval_loss": 0.9923801422119141, "eval_runtime": 148.7567, "eval_samples_per_second": 5.082, "eval_steps_per_second": 0.639, "step": 5500 } ], "logging_steps": 10, "max_steps": 5500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.3965589411203056e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }