{ "best_metric": 2.546696424484253, "best_model_checkpoint": "miner_id_24/checkpoint-1350", "epoch": 2.5906735751295336, "eval_steps": 150, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017271157167530224, "eval_loss": 4.038626194000244, "eval_runtime": 3.4855, "eval_samples_per_second": 140.01, "eval_steps_per_second": 35.003, "step": 1 }, { "epoch": 0.017271157167530225, "grad_norm": 75.89480590820312, "learning_rate": 0.0001, "loss": 14.3934, "step": 10 }, { "epoch": 0.03454231433506045, "grad_norm": 111.20323181152344, "learning_rate": 0.0001, "loss": 14.1053, "step": 20 }, { "epoch": 0.05181347150259067, "grad_norm": 135.59878540039062, "learning_rate": 0.0001, "loss": 13.9889, "step": 30 }, { "epoch": 0.0690846286701209, "grad_norm": 170.99771118164062, "learning_rate": 0.0001, "loss": 13.7206, "step": 40 }, { "epoch": 0.08635578583765112, "grad_norm": 135.28236389160156, "learning_rate": 0.0001, "loss": 13.0562, "step": 50 }, { "epoch": 0.10362694300518134, "grad_norm": 156.28713989257812, "learning_rate": 0.0001, "loss": 11.7938, "step": 60 }, { "epoch": 0.12089810017271158, "grad_norm": 269.24566650390625, "learning_rate": 0.0001, "loss": 12.3869, "step": 70 }, { "epoch": 0.1381692573402418, "grad_norm": 150.54861450195312, "learning_rate": 0.0001, "loss": 12.0717, "step": 80 }, { "epoch": 0.15544041450777202, "grad_norm": 133.45846557617188, "learning_rate": 0.0001, "loss": 11.5019, "step": 90 }, { "epoch": 0.17271157167530224, "grad_norm": 93.638916015625, "learning_rate": 0.0001, "loss": 11.7609, "step": 100 }, { "epoch": 0.18998272884283246, "grad_norm": 79.70787048339844, "learning_rate": 0.0001, "loss": 11.2305, "step": 110 }, { "epoch": 0.20725388601036268, "grad_norm": 110.97862243652344, "learning_rate": 0.0001, "loss": 11.5846, "step": 120 }, { "epoch": 0.22452504317789293, "grad_norm": 97.05016326904297, "learning_rate": 0.0001, "loss": 11.0253, "step": 130 }, { "epoch": 0.24179620034542315, "grad_norm": 123.05204010009766, "learning_rate": 0.0001, "loss": 11.4952, "step": 140 }, { "epoch": 0.25906735751295334, "grad_norm": 122.82206726074219, "learning_rate": 0.0001, "loss": 11.2932, "step": 150 }, { "epoch": 0.25906735751295334, "eval_loss": 2.9152116775512695, "eval_runtime": 3.3951, "eval_samples_per_second": 143.736, "eval_steps_per_second": 35.934, "step": 150 }, { "epoch": 0.2763385146804836, "grad_norm": 83.58716583251953, "learning_rate": 0.0001, "loss": 11.2958, "step": 160 }, { "epoch": 0.29360967184801384, "grad_norm": 100.84962463378906, "learning_rate": 0.0001, "loss": 10.9797, "step": 170 }, { "epoch": 0.31088082901554404, "grad_norm": 69.06895446777344, "learning_rate": 0.0001, "loss": 11.1063, "step": 180 }, { "epoch": 0.3281519861830743, "grad_norm": 129.27708435058594, "learning_rate": 0.0001, "loss": 11.2391, "step": 190 }, { "epoch": 0.3454231433506045, "grad_norm": 94.75996398925781, "learning_rate": 0.0001, "loss": 11.2505, "step": 200 }, { "epoch": 0.3626943005181347, "grad_norm": 108.81437683105469, "learning_rate": 0.0001, "loss": 10.9523, "step": 210 }, { "epoch": 0.3799654576856649, "grad_norm": 146.96475219726562, "learning_rate": 0.0001, "loss": 10.9382, "step": 220 }, { "epoch": 0.39723661485319517, "grad_norm": 65.99649047851562, "learning_rate": 0.0001, "loss": 10.9976, "step": 230 }, { "epoch": 0.41450777202072536, "grad_norm": 90.05879211425781, "learning_rate": 0.0001, "loss": 10.4706, "step": 240 }, { "epoch": 0.4317789291882556, "grad_norm": 172.34547424316406, "learning_rate": 0.0001, "loss": 11.2257, "step": 250 }, { "epoch": 0.44905008635578586, "grad_norm": 96.2286605834961, "learning_rate": 0.0001, "loss": 10.9538, "step": 260 }, { "epoch": 0.46632124352331605, "grad_norm": 69.3274154663086, "learning_rate": 0.0001, "loss": 10.185, "step": 270 }, { "epoch": 0.4835924006908463, "grad_norm": 75.67520904541016, "learning_rate": 0.0001, "loss": 10.663, "step": 280 }, { "epoch": 0.5008635578583766, "grad_norm": 129.46115112304688, "learning_rate": 0.0001, "loss": 10.758, "step": 290 }, { "epoch": 0.5181347150259067, "grad_norm": 121.0255126953125, "learning_rate": 0.0001, "loss": 10.8523, "step": 300 }, { "epoch": 0.5181347150259067, "eval_loss": 2.736581325531006, "eval_runtime": 3.3484, "eval_samples_per_second": 145.74, "eval_steps_per_second": 36.435, "step": 300 }, { "epoch": 0.5354058721934369, "grad_norm": 84.29546356201172, "learning_rate": 0.0001, "loss": 10.9785, "step": 310 }, { "epoch": 0.5526770293609672, "grad_norm": 72.99498748779297, "learning_rate": 0.0001, "loss": 10.669, "step": 320 }, { "epoch": 0.5699481865284974, "grad_norm": 62.89350891113281, "learning_rate": 0.0001, "loss": 10.3593, "step": 330 }, { "epoch": 0.5872193436960277, "grad_norm": 160.30242919921875, "learning_rate": 0.0001, "loss": 10.7546, "step": 340 }, { "epoch": 0.6044905008635578, "grad_norm": 75.04741668701172, "learning_rate": 0.0001, "loss": 10.9507, "step": 350 }, { "epoch": 0.6217616580310881, "grad_norm": 76.93465423583984, "learning_rate": 0.0001, "loss": 10.5212, "step": 360 }, { "epoch": 0.6390328151986183, "grad_norm": 116.82339477539062, "learning_rate": 0.0001, "loss": 10.7363, "step": 370 }, { "epoch": 0.6563039723661486, "grad_norm": 606.068603515625, "learning_rate": 0.0001, "loss": 10.8678, "step": 380 }, { "epoch": 0.6735751295336787, "grad_norm": 351.8760681152344, "learning_rate": 0.0001, "loss": 12.1544, "step": 390 }, { "epoch": 0.690846286701209, "grad_norm": 151.93214416503906, "learning_rate": 0.0001, "loss": 12.3248, "step": 400 }, { "epoch": 0.7081174438687392, "grad_norm": 395.1159362792969, "learning_rate": 0.0001, "loss": 11.5917, "step": 410 }, { "epoch": 0.7253886010362695, "grad_norm": 118.90939331054688, "learning_rate": 0.0001, "loss": 10.4903, "step": 420 }, { "epoch": 0.7426597582037997, "grad_norm": 274.8316955566406, "learning_rate": 0.0001, "loss": 10.5821, "step": 430 }, { "epoch": 0.7599309153713298, "grad_norm": 194.53819274902344, "learning_rate": 0.0001, "loss": 11.1816, "step": 440 }, { "epoch": 0.7772020725388601, "grad_norm": 136.4066619873047, "learning_rate": 0.0001, "loss": 11.514, "step": 450 }, { "epoch": 0.7772020725388601, "eval_loss": 2.930330276489258, "eval_runtime": 3.3819, "eval_samples_per_second": 144.297, "eval_steps_per_second": 36.074, "step": 450 }, { "epoch": 0.7944732297063903, "grad_norm": 368.5176086425781, "learning_rate": 0.0001, "loss": 12.3824, "step": 460 }, { "epoch": 0.8117443868739206, "grad_norm": 425.0867919921875, "learning_rate": 0.0001, "loss": 12.3026, "step": 470 }, { "epoch": 0.8290155440414507, "grad_norm": 218.0021209716797, "learning_rate": 0.0001, "loss": 11.8672, "step": 480 }, { "epoch": 0.846286701208981, "grad_norm": 316.37115478515625, "learning_rate": 0.0001, "loss": 11.5297, "step": 490 }, { "epoch": 0.8635578583765112, "grad_norm": 78.69461822509766, "learning_rate": 0.0001, "loss": 11.9958, "step": 500 }, { "epoch": 0.8808290155440415, "grad_norm": 267.04888916015625, "learning_rate": 0.0001, "loss": 11.0739, "step": 510 }, { "epoch": 0.8981001727115717, "grad_norm": 115.52953338623047, "learning_rate": 0.0001, "loss": 10.3282, "step": 520 }, { "epoch": 0.9153713298791019, "grad_norm": 104.83427429199219, "learning_rate": 0.0001, "loss": 10.7548, "step": 530 }, { "epoch": 0.9326424870466321, "grad_norm": 156.23126220703125, "learning_rate": 0.0001, "loss": 10.634, "step": 540 }, { "epoch": 0.9499136442141624, "grad_norm": 76.87721252441406, "learning_rate": 0.0001, "loss": 11.2733, "step": 550 }, { "epoch": 0.9671848013816926, "grad_norm": 133.16722106933594, "learning_rate": 0.0001, "loss": 10.8275, "step": 560 }, { "epoch": 0.9844559585492227, "grad_norm": 229.51499938964844, "learning_rate": 0.0001, "loss": 11.3043, "step": 570 }, { "epoch": 1.001727115716753, "grad_norm": 63.176795959472656, "learning_rate": 0.0001, "loss": 10.7135, "step": 580 }, { "epoch": 1.0189982728842832, "grad_norm": 104.19017791748047, "learning_rate": 0.0001, "loss": 10.3569, "step": 590 }, { "epoch": 1.0362694300518134, "grad_norm": 63.779815673828125, "learning_rate": 0.0001, "loss": 10.2036, "step": 600 }, { "epoch": 1.0362694300518134, "eval_loss": 2.6248672008514404, "eval_runtime": 3.3501, "eval_samples_per_second": 145.667, "eval_steps_per_second": 36.417, "step": 600 }, { "epoch": 1.0535405872193437, "grad_norm": 83.07618713378906, "learning_rate": 0.0001, "loss": 10.3781, "step": 610 }, { "epoch": 1.0708117443868739, "grad_norm": 90.20931243896484, "learning_rate": 0.0001, "loss": 10.3507, "step": 620 }, { "epoch": 1.0880829015544042, "grad_norm": 92.54613494873047, "learning_rate": 0.0001, "loss": 10.8331, "step": 630 }, { "epoch": 1.1053540587219344, "grad_norm": 105.11112213134766, "learning_rate": 0.0001, "loss": 10.6137, "step": 640 }, { "epoch": 1.1226252158894645, "grad_norm": 98.22175598144531, "learning_rate": 0.0001, "loss": 10.4826, "step": 650 }, { "epoch": 1.1398963730569949, "grad_norm": 75.88497161865234, "learning_rate": 0.0001, "loss": 10.122, "step": 660 }, { "epoch": 1.157167530224525, "grad_norm": 108.49153137207031, "learning_rate": 0.0001, "loss": 10.5046, "step": 670 }, { "epoch": 1.1744386873920551, "grad_norm": 88.94380187988281, "learning_rate": 0.0001, "loss": 10.6064, "step": 680 }, { "epoch": 1.1917098445595855, "grad_norm": 67.67328643798828, "learning_rate": 0.0001, "loss": 10.2257, "step": 690 }, { "epoch": 1.2089810017271156, "grad_norm": 64.91503143310547, "learning_rate": 0.0001, "loss": 10.6785, "step": 700 }, { "epoch": 1.226252158894646, "grad_norm": 137.12078857421875, "learning_rate": 0.0001, "loss": 10.69, "step": 710 }, { "epoch": 1.2435233160621761, "grad_norm": 131.67543029785156, "learning_rate": 0.0001, "loss": 11.0192, "step": 720 }, { "epoch": 1.2607944732297063, "grad_norm": 111.2732162475586, "learning_rate": 0.0001, "loss": 10.4661, "step": 730 }, { "epoch": 1.2780656303972366, "grad_norm": 104.6572265625, "learning_rate": 0.0001, "loss": 10.5554, "step": 740 }, { "epoch": 1.2953367875647668, "grad_norm": 54.92211151123047, "learning_rate": 0.0001, "loss": 9.9844, "step": 750 }, { "epoch": 1.2953367875647668, "eval_loss": 2.6087870597839355, "eval_runtime": 3.4493, "eval_samples_per_second": 141.476, "eval_steps_per_second": 35.369, "step": 750 }, { "epoch": 1.3126079447322971, "grad_norm": 66.25089263916016, "learning_rate": 0.0001, "loss": 10.4141, "step": 760 }, { "epoch": 1.3298791018998273, "grad_norm": 149.7973175048828, "learning_rate": 0.0001, "loss": 10.9567, "step": 770 }, { "epoch": 1.3471502590673574, "grad_norm": 88.66106414794922, "learning_rate": 0.0001, "loss": 10.173, "step": 780 }, { "epoch": 1.3644214162348878, "grad_norm": 124.81246185302734, "learning_rate": 0.0001, "loss": 10.4141, "step": 790 }, { "epoch": 1.381692573402418, "grad_norm": 138.77783203125, "learning_rate": 0.0001, "loss": 10.3441, "step": 800 }, { "epoch": 1.3989637305699483, "grad_norm": 93.12714385986328, "learning_rate": 0.0001, "loss": 9.9939, "step": 810 }, { "epoch": 1.4162348877374784, "grad_norm": 163.06826782226562, "learning_rate": 0.0001, "loss": 10.404, "step": 820 }, { "epoch": 1.4335060449050085, "grad_norm": 105.33958435058594, "learning_rate": 0.0001, "loss": 10.9251, "step": 830 }, { "epoch": 1.450777202072539, "grad_norm": 158.2169189453125, "learning_rate": 0.0001, "loss": 10.7292, "step": 840 }, { "epoch": 1.468048359240069, "grad_norm": 115.02069854736328, "learning_rate": 0.0001, "loss": 10.3911, "step": 850 }, { "epoch": 1.4853195164075994, "grad_norm": 120.8702163696289, "learning_rate": 0.0001, "loss": 10.3324, "step": 860 }, { "epoch": 1.5025906735751295, "grad_norm": 83.17007446289062, "learning_rate": 0.0001, "loss": 10.5342, "step": 870 }, { "epoch": 1.5198618307426597, "grad_norm": 92.21144104003906, "learning_rate": 0.0001, "loss": 11.1533, "step": 880 }, { "epoch": 1.5371329879101898, "grad_norm": 56.907371520996094, "learning_rate": 0.0001, "loss": 10.692, "step": 890 }, { "epoch": 1.5544041450777202, "grad_norm": 67.13449096679688, "learning_rate": 0.0001, "loss": 10.5367, "step": 900 }, { "epoch": 1.5544041450777202, "eval_loss": 2.6833252906799316, "eval_runtime": 3.3528, "eval_samples_per_second": 145.549, "eval_steps_per_second": 36.387, "step": 900 }, { "epoch": 1.5716753022452505, "grad_norm": 87.14239501953125, "learning_rate": 0.0001, "loss": 10.772, "step": 910 }, { "epoch": 1.5889464594127807, "grad_norm": 79.53548431396484, "learning_rate": 0.0001, "loss": 9.8992, "step": 920 }, { "epoch": 1.6062176165803108, "grad_norm": 130.53680419921875, "learning_rate": 0.0001, "loss": 10.54, "step": 930 }, { "epoch": 1.623488773747841, "grad_norm": 76.92542266845703, "learning_rate": 0.0001, "loss": 10.7549, "step": 940 }, { "epoch": 1.6407599309153713, "grad_norm": 86.40167236328125, "learning_rate": 0.0001, "loss": 10.7878, "step": 950 }, { "epoch": 1.6580310880829017, "grad_norm": 82.14942169189453, "learning_rate": 0.0001, "loss": 10.5113, "step": 960 }, { "epoch": 1.6753022452504318, "grad_norm": 91.56623077392578, "learning_rate": 0.0001, "loss": 10.1856, "step": 970 }, { "epoch": 1.692573402417962, "grad_norm": 94.72366333007812, "learning_rate": 0.0001, "loss": 10.6023, "step": 980 }, { "epoch": 1.709844559585492, "grad_norm": 78.64158630371094, "learning_rate": 0.0001, "loss": 10.2007, "step": 990 }, { "epoch": 1.7271157167530224, "grad_norm": 118.63618469238281, "learning_rate": 0.0001, "loss": 10.1417, "step": 1000 }, { "epoch": 1.7443868739205528, "grad_norm": 82.75341033935547, "learning_rate": 0.0001, "loss": 10.265, "step": 1010 }, { "epoch": 1.761658031088083, "grad_norm": 161.32315063476562, "learning_rate": 0.0001, "loss": 10.3127, "step": 1020 }, { "epoch": 1.778929188255613, "grad_norm": 82.68830108642578, "learning_rate": 0.0001, "loss": 11.0381, "step": 1030 }, { "epoch": 1.7962003454231432, "grad_norm": 81.87013244628906, "learning_rate": 0.0001, "loss": 10.7413, "step": 1040 }, { "epoch": 1.8134715025906736, "grad_norm": 113.54767608642578, "learning_rate": 0.0001, "loss": 9.9794, "step": 1050 }, { "epoch": 1.8134715025906736, "eval_loss": 2.610283374786377, "eval_runtime": 3.4188, "eval_samples_per_second": 142.741, "eval_steps_per_second": 35.685, "step": 1050 }, { "epoch": 1.830742659758204, "grad_norm": 99.90003967285156, "learning_rate": 0.0001, "loss": 10.5983, "step": 1060 }, { "epoch": 1.848013816925734, "grad_norm": 154.58657836914062, "learning_rate": 0.0001, "loss": 10.9577, "step": 1070 }, { "epoch": 1.8652849740932642, "grad_norm": 128.96095275878906, "learning_rate": 0.0001, "loss": 10.8509, "step": 1080 }, { "epoch": 1.8825561312607944, "grad_norm": 47.479736328125, "learning_rate": 0.0001, "loss": 10.682, "step": 1090 }, { "epoch": 1.8998272884283247, "grad_norm": 93.58959197998047, "learning_rate": 0.0001, "loss": 10.4838, "step": 1100 }, { "epoch": 1.917098445595855, "grad_norm": 65.81804656982422, "learning_rate": 0.0001, "loss": 10.0903, "step": 1110 }, { "epoch": 1.9343696027633852, "grad_norm": 100.26107788085938, "learning_rate": 0.0001, "loss": 10.5065, "step": 1120 }, { "epoch": 1.9516407599309153, "grad_norm": 80.01396942138672, "learning_rate": 0.0001, "loss": 10.5274, "step": 1130 }, { "epoch": 1.9689119170984455, "grad_norm": 121.59380340576172, "learning_rate": 0.0001, "loss": 10.174, "step": 1140 }, { "epoch": 1.9861830742659758, "grad_norm": 93.56069946289062, "learning_rate": 0.0001, "loss": 10.1463, "step": 1150 }, { "epoch": 2.003454231433506, "grad_norm": 72.67021942138672, "learning_rate": 0.0001, "loss": 10.1582, "step": 1160 }, { "epoch": 2.0207253886010363, "grad_norm": 77.44140625, "learning_rate": 0.0001, "loss": 10.1227, "step": 1170 }, { "epoch": 2.0379965457685665, "grad_norm": 66.51920318603516, "learning_rate": 0.0001, "loss": 10.0345, "step": 1180 }, { "epoch": 2.0552677029360966, "grad_norm": 68.12701416015625, "learning_rate": 0.0001, "loss": 9.9762, "step": 1190 }, { "epoch": 2.0725388601036268, "grad_norm": 87.24915313720703, "learning_rate": 0.0001, "loss": 9.4328, "step": 1200 }, { "epoch": 2.0725388601036268, "eval_loss": 2.5481302738189697, "eval_runtime": 3.4381, "eval_samples_per_second": 141.94, "eval_steps_per_second": 35.485, "step": 1200 }, { "epoch": 2.0898100172711573, "grad_norm": 157.5850067138672, "learning_rate": 0.0001, "loss": 9.9501, "step": 1210 }, { "epoch": 2.1070811744386875, "grad_norm": 69.93534088134766, "learning_rate": 0.0001, "loss": 10.2953, "step": 1220 }, { "epoch": 2.1243523316062176, "grad_norm": 84.55315399169922, "learning_rate": 0.0001, "loss": 10.0178, "step": 1230 }, { "epoch": 2.1416234887737478, "grad_norm": 66.14724731445312, "learning_rate": 0.0001, "loss": 10.1136, "step": 1240 }, { "epoch": 2.158894645941278, "grad_norm": 108.00819396972656, "learning_rate": 0.0001, "loss": 9.7876, "step": 1250 }, { "epoch": 2.1761658031088085, "grad_norm": 68.95934295654297, "learning_rate": 0.0001, "loss": 10.1318, "step": 1260 }, { "epoch": 2.1934369602763386, "grad_norm": 126.5031967163086, "learning_rate": 0.0001, "loss": 9.4767, "step": 1270 }, { "epoch": 2.2107081174438687, "grad_norm": 149.7384796142578, "learning_rate": 0.0001, "loss": 9.763, "step": 1280 }, { "epoch": 2.227979274611399, "grad_norm": 51.13591384887695, "learning_rate": 0.0001, "loss": 10.1029, "step": 1290 }, { "epoch": 2.245250431778929, "grad_norm": 87.11122131347656, "learning_rate": 0.0001, "loss": 9.8839, "step": 1300 }, { "epoch": 2.2625215889464596, "grad_norm": 47.78391647338867, "learning_rate": 0.0001, "loss": 10.0033, "step": 1310 }, { "epoch": 2.2797927461139897, "grad_norm": 133.7624969482422, "learning_rate": 0.0001, "loss": 9.8414, "step": 1320 }, { "epoch": 2.29706390328152, "grad_norm": 74.67383575439453, "learning_rate": 0.0001, "loss": 9.6827, "step": 1330 }, { "epoch": 2.31433506044905, "grad_norm": 78.1319808959961, "learning_rate": 0.0001, "loss": 10.096, "step": 1340 }, { "epoch": 2.33160621761658, "grad_norm": 179.70462036132812, "learning_rate": 0.0001, "loss": 10.1758, "step": 1350 }, { "epoch": 2.33160621761658, "eval_loss": 2.546696424484253, "eval_runtime": 3.4992, "eval_samples_per_second": 139.461, "eval_steps_per_second": 34.865, "step": 1350 }, { "epoch": 2.3488773747841103, "grad_norm": 146.88360595703125, "learning_rate": 0.0001, "loss": 10.1538, "step": 1360 }, { "epoch": 2.366148531951641, "grad_norm": 186.4013671875, "learning_rate": 0.0001, "loss": 10.0984, "step": 1370 }, { "epoch": 2.383419689119171, "grad_norm": 78.0069351196289, "learning_rate": 0.0001, "loss": 10.1345, "step": 1380 }, { "epoch": 2.400690846286701, "grad_norm": 61.604061126708984, "learning_rate": 0.0001, "loss": 10.3753, "step": 1390 }, { "epoch": 2.4179620034542313, "grad_norm": 76.18042755126953, "learning_rate": 0.0001, "loss": 10.3891, "step": 1400 }, { "epoch": 2.4352331606217614, "grad_norm": 108.20265197753906, "learning_rate": 0.0001, "loss": 10.0772, "step": 1410 }, { "epoch": 2.452504317789292, "grad_norm": 76.92615509033203, "learning_rate": 0.0001, "loss": 9.8017, "step": 1420 }, { "epoch": 2.469775474956822, "grad_norm": 138.38558959960938, "learning_rate": 0.0001, "loss": 9.6187, "step": 1430 }, { "epoch": 2.4870466321243523, "grad_norm": 171.43023681640625, "learning_rate": 0.0001, "loss": 9.9722, "step": 1440 }, { "epoch": 2.5043177892918824, "grad_norm": 136.78587341308594, "learning_rate": 0.0001, "loss": 11.1977, "step": 1450 }, { "epoch": 2.5215889464594126, "grad_norm": 144.79205322265625, "learning_rate": 0.0001, "loss": 10.4862, "step": 1460 }, { "epoch": 2.538860103626943, "grad_norm": 127.37538146972656, "learning_rate": 0.0001, "loss": 10.8806, "step": 1470 }, { "epoch": 2.5561312607944733, "grad_norm": 122.02765655517578, "learning_rate": 0.0001, "loss": 10.4921, "step": 1480 }, { "epoch": 2.5734024179620034, "grad_norm": 653.7969970703125, "learning_rate": 0.0001, "loss": 10.2225, "step": 1490 }, { "epoch": 2.5906735751295336, "grad_norm": 125.18404388427734, "learning_rate": 0.0001, "loss": 10.6133, "step": 1500 }, { "epoch": 2.5906735751295336, "eval_loss": 2.5841307640075684, "eval_runtime": 3.5077, "eval_samples_per_second": 139.122, "eval_steps_per_second": 34.78, "step": 1500 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.654607288513331e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }