{ "best_metric": 2.7592430114746094, "best_model_checkpoint": "miner_id_24/checkpoint-1350", "epoch": 0.028963959180126862, "eval_steps": 150, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.9309306120084574e-05, "eval_loss": 3.049531936645508, "eval_runtime": 325.6693, "eval_samples_per_second": 133.915, "eval_steps_per_second": 33.479, "step": 1 }, { "epoch": 0.00019309306120084574, "grad_norm": 17.593538284301758, "learning_rate": 0.0001, "loss": 10.3405, "step": 10 }, { "epoch": 0.0003861861224016915, "grad_norm": 25.346281051635742, "learning_rate": 0.0001, "loss": 10.4921, "step": 20 }, { "epoch": 0.0005792791836025373, "grad_norm": 29.28500747680664, "learning_rate": 0.0001, "loss": 11.2535, "step": 30 }, { "epoch": 0.000772372244803383, "grad_norm": 25.79044532775879, "learning_rate": 0.0001, "loss": 12.771, "step": 40 }, { "epoch": 0.0009654653060042288, "grad_norm": 29.85230827331543, "learning_rate": 0.0001, "loss": 15.3004, "step": 50 }, { "epoch": 0.0011585583672050746, "grad_norm": 18.505237579345703, "learning_rate": 0.0001, "loss": 10.1685, "step": 60 }, { "epoch": 0.0013516514284059203, "grad_norm": 20.607799530029297, "learning_rate": 0.0001, "loss": 10.0184, "step": 70 }, { "epoch": 0.001544744489606766, "grad_norm": 28.849403381347656, "learning_rate": 0.0001, "loss": 10.9882, "step": 80 }, { "epoch": 0.0017378375508076116, "grad_norm": 22.832693099975586, "learning_rate": 0.0001, "loss": 12.2328, "step": 90 }, { "epoch": 0.0019309306120084576, "grad_norm": 36.24290084838867, "learning_rate": 0.0001, "loss": 15.052, "step": 100 }, { "epoch": 0.002124023673209303, "grad_norm": 26.353515625, "learning_rate": 0.0001, "loss": 9.8579, "step": 110 }, { "epoch": 0.002317116734410149, "grad_norm": 36.15186309814453, "learning_rate": 0.0001, "loss": 10.1559, "step": 120 }, { "epoch": 0.002510209795610995, "grad_norm": 25.59376335144043, "learning_rate": 0.0001, "loss": 10.5048, "step": 130 }, { "epoch": 0.0027033028568118405, "grad_norm": 45.6728630065918, "learning_rate": 0.0001, "loss": 11.7695, "step": 140 }, { "epoch": 0.0028963959180126862, "grad_norm": 67.35182189941406, "learning_rate": 0.0001, "loss": 14.9163, "step": 150 }, { "epoch": 0.0028963959180126862, "eval_loss": 2.915919542312622, "eval_runtime": 325.4265, "eval_samples_per_second": 134.015, "eval_steps_per_second": 33.504, "step": 150 }, { "epoch": 0.003089488979213532, "grad_norm": 23.811389923095703, "learning_rate": 0.0001, "loss": 10.1832, "step": 160 }, { "epoch": 0.0032825820404143776, "grad_norm": 33.760498046875, "learning_rate": 0.0001, "loss": 10.6903, "step": 170 }, { "epoch": 0.0034756751016152233, "grad_norm": 36.039459228515625, "learning_rate": 0.0001, "loss": 10.9393, "step": 180 }, { "epoch": 0.003668768162816069, "grad_norm": 47.975990295410156, "learning_rate": 0.0001, "loss": 12.2215, "step": 190 }, { "epoch": 0.003861861224016915, "grad_norm": 44.83605194091797, "learning_rate": 0.0001, "loss": 14.0036, "step": 200 }, { "epoch": 0.00405495428521776, "grad_norm": 27.559823989868164, "learning_rate": 0.0001, "loss": 10.8778, "step": 210 }, { "epoch": 0.004248047346418606, "grad_norm": 29.685842514038086, "learning_rate": 0.0001, "loss": 10.3398, "step": 220 }, { "epoch": 0.004441140407619453, "grad_norm": 32.39301681518555, "learning_rate": 0.0001, "loss": 10.8218, "step": 230 }, { "epoch": 0.004634233468820298, "grad_norm": 26.63850212097168, "learning_rate": 0.0001, "loss": 11.7435, "step": 240 }, { "epoch": 0.004827326530021144, "grad_norm": 42.46340560913086, "learning_rate": 0.0001, "loss": 13.7549, "step": 250 }, { "epoch": 0.00502041959122199, "grad_norm": 41.16633605957031, "learning_rate": 0.0001, "loss": 10.0357, "step": 260 }, { "epoch": 0.005213512652422835, "grad_norm": 29.798715591430664, "learning_rate": 0.0001, "loss": 9.9566, "step": 270 }, { "epoch": 0.005406605713623681, "grad_norm": 26.712194442749023, "learning_rate": 0.0001, "loss": 10.8858, "step": 280 }, { "epoch": 0.005599698774824527, "grad_norm": 24.423017501831055, "learning_rate": 0.0001, "loss": 12.0604, "step": 290 }, { "epoch": 0.0057927918360253725, "grad_norm": 32.55687713623047, "learning_rate": 0.0001, "loss": 13.7669, "step": 300 }, { "epoch": 0.0057927918360253725, "eval_loss": 2.8850886821746826, "eval_runtime": 326.0923, "eval_samples_per_second": 133.741, "eval_steps_per_second": 33.435, "step": 300 }, { "epoch": 0.005985884897226218, "grad_norm": 17.198266983032227, "learning_rate": 0.0001, "loss": 10.4193, "step": 310 }, { "epoch": 0.006178977958427064, "grad_norm": 18.960155487060547, "learning_rate": 0.0001, "loss": 10.4948, "step": 320 }, { "epoch": 0.0063720710196279095, "grad_norm": 15.217443466186523, "learning_rate": 0.0001, "loss": 10.8575, "step": 330 }, { "epoch": 0.006565164080828755, "grad_norm": 17.301410675048828, "learning_rate": 0.0001, "loss": 11.7949, "step": 340 }, { "epoch": 0.006758257142029601, "grad_norm": 44.01465606689453, "learning_rate": 0.0001, "loss": 14.4235, "step": 350 }, { "epoch": 0.006951350203230447, "grad_norm": 18.623449325561523, "learning_rate": 0.0001, "loss": 10.6294, "step": 360 }, { "epoch": 0.007144443264431292, "grad_norm": 15.859786987304688, "learning_rate": 0.0001, "loss": 10.3981, "step": 370 }, { "epoch": 0.007337536325632138, "grad_norm": 15.925846099853516, "learning_rate": 0.0001, "loss": 10.6106, "step": 380 }, { "epoch": 0.0075306293868329845, "grad_norm": 23.462480545043945, "learning_rate": 0.0001, "loss": 11.4959, "step": 390 }, { "epoch": 0.00772372244803383, "grad_norm": 34.727569580078125, "learning_rate": 0.0001, "loss": 13.6495, "step": 400 }, { "epoch": 0.007916815509234676, "grad_norm": 13.482560157775879, "learning_rate": 0.0001, "loss": 9.9286, "step": 410 }, { "epoch": 0.00810990857043552, "grad_norm": 19.61520004272461, "learning_rate": 0.0001, "loss": 10.4189, "step": 420 }, { "epoch": 0.008303001631636367, "grad_norm": 20.92690658569336, "learning_rate": 0.0001, "loss": 10.7233, "step": 430 }, { "epoch": 0.008496094692837212, "grad_norm": 40.37583541870117, "learning_rate": 0.0001, "loss": 11.906, "step": 440 }, { "epoch": 0.008689187754038059, "grad_norm": 54.6865234375, "learning_rate": 0.0001, "loss": 14.2049, "step": 450 }, { "epoch": 0.008689187754038059, "eval_loss": 2.856499433517456, "eval_runtime": 326.449, "eval_samples_per_second": 133.595, "eval_steps_per_second": 33.399, "step": 450 }, { "epoch": 0.008882280815238905, "grad_norm": 31.493087768554688, "learning_rate": 0.0001, "loss": 10.422, "step": 460 }, { "epoch": 0.00907537387643975, "grad_norm": 25.92237091064453, "learning_rate": 0.0001, "loss": 10.5121, "step": 470 }, { "epoch": 0.009268466937640597, "grad_norm": 23.767501831054688, "learning_rate": 0.0001, "loss": 10.6667, "step": 480 }, { "epoch": 0.009461559998841441, "grad_norm": 29.84052848815918, "learning_rate": 0.0001, "loss": 11.8893, "step": 490 }, { "epoch": 0.009654653060042288, "grad_norm": 42.1278190612793, "learning_rate": 0.0001, "loss": 13.8487, "step": 500 }, { "epoch": 0.009847746121243133, "grad_norm": 18.233877182006836, "learning_rate": 0.0001, "loss": 10.4124, "step": 510 }, { "epoch": 0.01004083918244398, "grad_norm": 24.33700180053711, "learning_rate": 0.0001, "loss": 10.3656, "step": 520 }, { "epoch": 0.010233932243644824, "grad_norm": 21.17763900756836, "learning_rate": 0.0001, "loss": 10.4647, "step": 530 }, { "epoch": 0.01042702530484567, "grad_norm": 47.72334289550781, "learning_rate": 0.0001, "loss": 11.6587, "step": 540 }, { "epoch": 0.010620118366046516, "grad_norm": 32.690364837646484, "learning_rate": 0.0001, "loss": 13.7886, "step": 550 }, { "epoch": 0.010813211427247362, "grad_norm": 14.042930603027344, "learning_rate": 0.0001, "loss": 10.1801, "step": 560 }, { "epoch": 0.011006304488448207, "grad_norm": 13.466276168823242, "learning_rate": 0.0001, "loss": 9.7874, "step": 570 }, { "epoch": 0.011199397549649054, "grad_norm": 13.901522636413574, "learning_rate": 0.0001, "loss": 10.7971, "step": 580 }, { "epoch": 0.011392490610849898, "grad_norm": 15.882288932800293, "learning_rate": 0.0001, "loss": 11.6839, "step": 590 }, { "epoch": 0.011585583672050745, "grad_norm": 31.395444869995117, "learning_rate": 0.0001, "loss": 13.7779, "step": 600 }, { "epoch": 0.011585583672050745, "eval_loss": 2.848609447479248, "eval_runtime": 328.4742, "eval_samples_per_second": 132.771, "eval_steps_per_second": 33.193, "step": 600 }, { "epoch": 0.011778676733251591, "grad_norm": 16.243173599243164, "learning_rate": 0.0001, "loss": 10.3243, "step": 610 }, { "epoch": 0.011971769794452436, "grad_norm": 10.895001411437988, "learning_rate": 0.0001, "loss": 10.1884, "step": 620 }, { "epoch": 0.012164862855653283, "grad_norm": 12.364214897155762, "learning_rate": 0.0001, "loss": 10.5667, "step": 630 }, { "epoch": 0.012357955916854128, "grad_norm": 14.893481254577637, "learning_rate": 0.0001, "loss": 11.7055, "step": 640 }, { "epoch": 0.012551048978054974, "grad_norm": 31.42432403564453, "learning_rate": 0.0001, "loss": 13.6669, "step": 650 }, { "epoch": 0.012744142039255819, "grad_norm": 11.641709327697754, "learning_rate": 0.0001, "loss": 10.4052, "step": 660 }, { "epoch": 0.012937235100456666, "grad_norm": 12.934191703796387, "learning_rate": 0.0001, "loss": 10.1611, "step": 670 }, { "epoch": 0.01313032816165751, "grad_norm": 28.218975067138672, "learning_rate": 0.0001, "loss": 10.7762, "step": 680 }, { "epoch": 0.013323421222858357, "grad_norm": 16.593141555786133, "learning_rate": 0.0001, "loss": 11.8962, "step": 690 }, { "epoch": 0.013516514284059202, "grad_norm": 23.61168098449707, "learning_rate": 0.0001, "loss": 13.6789, "step": 700 }, { "epoch": 0.013709607345260048, "grad_norm": 11.673768043518066, "learning_rate": 0.0001, "loss": 10.1343, "step": 710 }, { "epoch": 0.013902700406460893, "grad_norm": 13.041866302490234, "learning_rate": 0.0001, "loss": 10.4441, "step": 720 }, { "epoch": 0.01409579346766174, "grad_norm": 15.724223136901855, "learning_rate": 0.0001, "loss": 10.759, "step": 730 }, { "epoch": 0.014288886528862585, "grad_norm": 15.264642715454102, "learning_rate": 0.0001, "loss": 11.8888, "step": 740 }, { "epoch": 0.014481979590063431, "grad_norm": 37.832698822021484, "learning_rate": 0.0001, "loss": 13.6694, "step": 750 }, { "epoch": 0.014481979590063431, "eval_loss": 2.8133044242858887, "eval_runtime": 330.5652, "eval_samples_per_second": 131.932, "eval_steps_per_second": 32.983, "step": 750 }, { "epoch": 0.014675072651264276, "grad_norm": 13.753046035766602, "learning_rate": 0.0001, "loss": 10.2511, "step": 760 }, { "epoch": 0.014868165712465123, "grad_norm": 12.730433464050293, "learning_rate": 0.0001, "loss": 10.2344, "step": 770 }, { "epoch": 0.015061258773665969, "grad_norm": 18.035993576049805, "learning_rate": 0.0001, "loss": 10.7321, "step": 780 }, { "epoch": 0.015254351834866814, "grad_norm": 20.784934997558594, "learning_rate": 0.0001, "loss": 11.6791, "step": 790 }, { "epoch": 0.01544744489606766, "grad_norm": 29.870254516601562, "learning_rate": 0.0001, "loss": 13.6885, "step": 800 }, { "epoch": 0.015640537957268507, "grad_norm": 16.11637306213379, "learning_rate": 0.0001, "loss": 10.3119, "step": 810 }, { "epoch": 0.015833631018469352, "grad_norm": 12.562838554382324, "learning_rate": 0.0001, "loss": 10.0703, "step": 820 }, { "epoch": 0.016026724079670197, "grad_norm": 11.796956062316895, "learning_rate": 0.0001, "loss": 10.7967, "step": 830 }, { "epoch": 0.01621981714087104, "grad_norm": 18.159549713134766, "learning_rate": 0.0001, "loss": 11.2847, "step": 840 }, { "epoch": 0.01641291020207189, "grad_norm": 22.098344802856445, "learning_rate": 0.0001, "loss": 13.9348, "step": 850 }, { "epoch": 0.016606003263272735, "grad_norm": 10.987359046936035, "learning_rate": 0.0001, "loss": 10.1845, "step": 860 }, { "epoch": 0.01679909632447358, "grad_norm": 22.796899795532227, "learning_rate": 0.0001, "loss": 10.2937, "step": 870 }, { "epoch": 0.016992189385674424, "grad_norm": 14.24060344696045, "learning_rate": 0.0001, "loss": 10.6819, "step": 880 }, { "epoch": 0.017185282446875273, "grad_norm": 14.327189445495605, "learning_rate": 0.0001, "loss": 11.411, "step": 890 }, { "epoch": 0.017378375508076117, "grad_norm": 25.815399169921875, "learning_rate": 0.0001, "loss": 13.5363, "step": 900 }, { "epoch": 0.017378375508076117, "eval_loss": 2.7856554985046387, "eval_runtime": 328.0056, "eval_samples_per_second": 132.961, "eval_steps_per_second": 33.24, "step": 900 }, { "epoch": 0.017571468569276962, "grad_norm": 14.01270580291748, "learning_rate": 0.0001, "loss": 10.0269, "step": 910 }, { "epoch": 0.01776456163047781, "grad_norm": 12.662365913391113, "learning_rate": 0.0001, "loss": 10.0288, "step": 920 }, { "epoch": 0.017957654691678655, "grad_norm": 11.21884822845459, "learning_rate": 0.0001, "loss": 10.5588, "step": 930 }, { "epoch": 0.0181507477528795, "grad_norm": 14.193058013916016, "learning_rate": 0.0001, "loss": 11.3954, "step": 940 }, { "epoch": 0.018343840814080345, "grad_norm": 33.61137008666992, "learning_rate": 0.0001, "loss": 13.412, "step": 950 }, { "epoch": 0.018536933875281193, "grad_norm": 8.29185962677002, "learning_rate": 0.0001, "loss": 10.1405, "step": 960 }, { "epoch": 0.018730026936482038, "grad_norm": 9.83952522277832, "learning_rate": 0.0001, "loss": 9.9334, "step": 970 }, { "epoch": 0.018923119997682883, "grad_norm": 13.321242332458496, "learning_rate": 0.0001, "loss": 10.2569, "step": 980 }, { "epoch": 0.019116213058883728, "grad_norm": 11.480195045471191, "learning_rate": 0.0001, "loss": 11.5664, "step": 990 }, { "epoch": 0.019309306120084576, "grad_norm": 31.19932746887207, "learning_rate": 0.0001, "loss": 13.4732, "step": 1000 }, { "epoch": 0.01950239918128542, "grad_norm": 8.18012809753418, "learning_rate": 0.0001, "loss": 9.6682, "step": 1010 }, { "epoch": 0.019695492242486266, "grad_norm": 13.29211139678955, "learning_rate": 0.0001, "loss": 9.7695, "step": 1020 }, { "epoch": 0.01988858530368711, "grad_norm": 10.953620910644531, "learning_rate": 0.0001, "loss": 10.6678, "step": 1030 }, { "epoch": 0.02008167836488796, "grad_norm": 13.800012588500977, "learning_rate": 0.0001, "loss": 11.9463, "step": 1040 }, { "epoch": 0.020274771426088804, "grad_norm": 38.13134765625, "learning_rate": 0.0001, "loss": 13.1925, "step": 1050 }, { "epoch": 0.020274771426088804, "eval_loss": 2.779891014099121, "eval_runtime": 331.3018, "eval_samples_per_second": 131.638, "eval_steps_per_second": 32.91, "step": 1050 }, { "epoch": 0.02046786448728965, "grad_norm": 9.869668006896973, "learning_rate": 0.0001, "loss": 10.3068, "step": 1060 }, { "epoch": 0.020660957548490497, "grad_norm": 16.215045928955078, "learning_rate": 0.0001, "loss": 10.0212, "step": 1070 }, { "epoch": 0.02085405060969134, "grad_norm": 13.320286750793457, "learning_rate": 0.0001, "loss": 10.9438, "step": 1080 }, { "epoch": 0.021047143670892186, "grad_norm": 17.76883888244629, "learning_rate": 0.0001, "loss": 11.2812, "step": 1090 }, { "epoch": 0.02124023673209303, "grad_norm": 34.5752067565918, "learning_rate": 0.0001, "loss": 13.2006, "step": 1100 }, { "epoch": 0.02143332979329388, "grad_norm": 13.33773422241211, "learning_rate": 0.0001, "loss": 10.145, "step": 1110 }, { "epoch": 0.021626422854494724, "grad_norm": 13.584674835205078, "learning_rate": 0.0001, "loss": 9.9768, "step": 1120 }, { "epoch": 0.02181951591569557, "grad_norm": 16.620107650756836, "learning_rate": 0.0001, "loss": 10.4809, "step": 1130 }, { "epoch": 0.022012608976896414, "grad_norm": 20.761350631713867, "learning_rate": 0.0001, "loss": 11.4802, "step": 1140 }, { "epoch": 0.022205702038097262, "grad_norm": 42.86295700073242, "learning_rate": 0.0001, "loss": 13.3743, "step": 1150 }, { "epoch": 0.022398795099298107, "grad_norm": 14.914039611816406, "learning_rate": 0.0001, "loss": 9.9647, "step": 1160 }, { "epoch": 0.022591888160498952, "grad_norm": 18.681671142578125, "learning_rate": 0.0001, "loss": 9.6733, "step": 1170 }, { "epoch": 0.022784981221699797, "grad_norm": 16.26973533630371, "learning_rate": 0.0001, "loss": 10.5368, "step": 1180 }, { "epoch": 0.022978074282900645, "grad_norm": 18.916370391845703, "learning_rate": 0.0001, "loss": 11.4679, "step": 1190 }, { "epoch": 0.02317116734410149, "grad_norm": 43.991939544677734, "learning_rate": 0.0001, "loss": 13.6126, "step": 1200 }, { "epoch": 0.02317116734410149, "eval_loss": 2.76770281791687, "eval_runtime": 324.3376, "eval_samples_per_second": 134.465, "eval_steps_per_second": 33.616, "step": 1200 }, { "epoch": 0.023364260405302335, "grad_norm": 19.019067764282227, "learning_rate": 0.0001, "loss": 9.9333, "step": 1210 }, { "epoch": 0.023557353466503183, "grad_norm": 14.600743293762207, "learning_rate": 0.0001, "loss": 10.1374, "step": 1220 }, { "epoch": 0.023750446527704028, "grad_norm": 16.908267974853516, "learning_rate": 0.0001, "loss": 10.7748, "step": 1230 }, { "epoch": 0.023943539588904873, "grad_norm": 25.043933868408203, "learning_rate": 0.0001, "loss": 11.5941, "step": 1240 }, { "epoch": 0.024136632650105717, "grad_norm": 31.190759658813477, "learning_rate": 0.0001, "loss": 13.5434, "step": 1250 }, { "epoch": 0.024329725711306566, "grad_norm": 11.28165054321289, "learning_rate": 0.0001, "loss": 9.8626, "step": 1260 }, { "epoch": 0.02452281877250741, "grad_norm": 12.803401947021484, "learning_rate": 0.0001, "loss": 9.9016, "step": 1270 }, { "epoch": 0.024715911833708255, "grad_norm": 10.985638618469238, "learning_rate": 0.0001, "loss": 10.5182, "step": 1280 }, { "epoch": 0.0249090048949091, "grad_norm": 14.558553695678711, "learning_rate": 0.0001, "loss": 11.4006, "step": 1290 }, { "epoch": 0.02510209795610995, "grad_norm": 28.10079002380371, "learning_rate": 0.0001, "loss": 13.2948, "step": 1300 }, { "epoch": 0.025295191017310793, "grad_norm": 10.139495849609375, "learning_rate": 0.0001, "loss": 9.7414, "step": 1310 }, { "epoch": 0.025488284078511638, "grad_norm": 12.202920913696289, "learning_rate": 0.0001, "loss": 9.915, "step": 1320 }, { "epoch": 0.025681377139712483, "grad_norm": 14.947649002075195, "learning_rate": 0.0001, "loss": 10.6419, "step": 1330 }, { "epoch": 0.02587447020091333, "grad_norm": 16.732439041137695, "learning_rate": 0.0001, "loss": 11.9169, "step": 1340 }, { "epoch": 0.026067563262114176, "grad_norm": 32.5897216796875, "learning_rate": 0.0001, "loss": 13.0356, "step": 1350 }, { "epoch": 0.026067563262114176, "eval_loss": 2.7592430114746094, "eval_runtime": 326.8015, "eval_samples_per_second": 133.451, "eval_steps_per_second": 33.363, "step": 1350 }, { "epoch": 0.02626065632331502, "grad_norm": 10.175602912902832, "learning_rate": 0.0001, "loss": 10.0645, "step": 1360 }, { "epoch": 0.026453749384515866, "grad_norm": 13.354422569274902, "learning_rate": 0.0001, "loss": 9.9626, "step": 1370 }, { "epoch": 0.026646842445716714, "grad_norm": 14.017874717712402, "learning_rate": 0.0001, "loss": 10.6526, "step": 1380 }, { "epoch": 0.02683993550691756, "grad_norm": 15.533336639404297, "learning_rate": 0.0001, "loss": 11.3396, "step": 1390 }, { "epoch": 0.027033028568118404, "grad_norm": 21.672401428222656, "learning_rate": 0.0001, "loss": 13.0943, "step": 1400 }, { "epoch": 0.027226121629319252, "grad_norm": 11.563956260681152, "learning_rate": 0.0001, "loss": 10.1492, "step": 1410 }, { "epoch": 0.027419214690520097, "grad_norm": 9.616212844848633, "learning_rate": 0.0001, "loss": 10.1169, "step": 1420 }, { "epoch": 0.02761230775172094, "grad_norm": 14.188048362731934, "learning_rate": 0.0001, "loss": 10.3995, "step": 1430 }, { "epoch": 0.027805400812921786, "grad_norm": 13.804783821105957, "learning_rate": 0.0001, "loss": 11.1481, "step": 1440 }, { "epoch": 0.027998493874122635, "grad_norm": 23.23021125793457, "learning_rate": 0.0001, "loss": 13.1522, "step": 1450 }, { "epoch": 0.02819158693532348, "grad_norm": 7.562139511108398, "learning_rate": 0.0001, "loss": 10.1138, "step": 1460 }, { "epoch": 0.028384679996524324, "grad_norm": 26.057680130004883, "learning_rate": 0.0001, "loss": 9.7856, "step": 1470 }, { "epoch": 0.02857777305772517, "grad_norm": 12.248312950134277, "learning_rate": 0.0001, "loss": 10.4052, "step": 1480 }, { "epoch": 0.028770866118926017, "grad_norm": 14.60325813293457, "learning_rate": 0.0001, "loss": 11.6516, "step": 1490 }, { "epoch": 0.028963959180126862, "grad_norm": 52.02046585083008, "learning_rate": 0.0001, "loss": 13.2193, "step": 1500 }, { "epoch": 0.028963959180126862, "eval_loss": 2.770695209503174, "eval_runtime": 330.0972, "eval_samples_per_second": 132.119, "eval_steps_per_second": 33.03, "step": 1500 } ], "logging_steps": 10, "max_steps": 1500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0944605381853184e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }