{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 268, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0037313432835820895, "grad_norm": 2.2083193788646778, "learning_rate": 3.5714285714285714e-06, "loss": 0.8468, "step": 1 }, { "epoch": 0.007462686567164179, "grad_norm": 2.2022661529563963, "learning_rate": 7.142857142857143e-06, "loss": 0.8672, "step": 2 }, { "epoch": 0.011194029850746268, "grad_norm": 2.096047179025918, "learning_rate": 1.0714285714285714e-05, "loss": 0.8534, "step": 3 }, { "epoch": 0.014925373134328358, "grad_norm": 1.4544030602725047, "learning_rate": 1.4285714285714285e-05, "loss": 0.8275, "step": 4 }, { "epoch": 0.018656716417910446, "grad_norm": 1.8353135999863304, "learning_rate": 1.785714285714286e-05, "loss": 0.8331, "step": 5 }, { "epoch": 0.022388059701492536, "grad_norm": 2.3376111584302106, "learning_rate": 2.1428571428571428e-05, "loss": 0.8146, "step": 6 }, { "epoch": 0.026119402985074626, "grad_norm": 2.1200568170888707, "learning_rate": 2.5e-05, "loss": 0.7896, "step": 7 }, { "epoch": 0.029850746268656716, "grad_norm": 1.4877729903052965, "learning_rate": 2.857142857142857e-05, "loss": 0.7767, "step": 8 }, { "epoch": 0.033582089552238806, "grad_norm": 0.9865560967134063, "learning_rate": 3.2142857142857144e-05, "loss": 0.7353, "step": 9 }, { "epoch": 0.03731343283582089, "grad_norm": 0.7812631557310801, "learning_rate": 3.571428571428572e-05, "loss": 0.7249, "step": 10 }, { "epoch": 0.041044776119402986, "grad_norm": 0.8222515329785233, "learning_rate": 3.928571428571429e-05, "loss": 0.7132, "step": 11 }, { "epoch": 0.04477611940298507, "grad_norm": 0.6903222794506202, "learning_rate": 4.2857142857142856e-05, "loss": 0.7025, "step": 12 }, { "epoch": 0.048507462686567165, "grad_norm": 0.5938656411295028, "learning_rate": 4.642857142857143e-05, "loss": 0.7015, "step": 13 }, { "epoch": 0.05223880597014925, "grad_norm": 0.617938425702766, "learning_rate": 5e-05, "loss": 0.7014, "step": 14 }, { "epoch": 0.055970149253731345, "grad_norm": 0.5020622563852112, "learning_rate": 4.999827900623038e-05, "loss": 0.6946, "step": 15 }, { "epoch": 0.05970149253731343, "grad_norm": 0.4967335870944291, "learning_rate": 4.999311628819437e-05, "loss": 0.6741, "step": 16 }, { "epoch": 0.06343283582089553, "grad_norm": 0.5223517838430579, "learning_rate": 4.998451263567024e-05, "loss": 0.697, "step": 17 }, { "epoch": 0.06716417910447761, "grad_norm": 0.3811681582666189, "learning_rate": 4.9972469364820877e-05, "loss": 0.6724, "step": 18 }, { "epoch": 0.0708955223880597, "grad_norm": 0.3899755933584027, "learning_rate": 4.995698831799242e-05, "loss": 0.6746, "step": 19 }, { "epoch": 0.07462686567164178, "grad_norm": 0.38167889193863935, "learning_rate": 4.993807186343243e-05, "loss": 0.6547, "step": 20 }, { "epoch": 0.07835820895522388, "grad_norm": 0.34376355784438994, "learning_rate": 4.9915722894927626e-05, "loss": 0.6555, "step": 21 }, { "epoch": 0.08208955223880597, "grad_norm": 0.3757869253891852, "learning_rate": 4.988994483136115e-05, "loss": 0.6325, "step": 22 }, { "epoch": 0.08582089552238806, "grad_norm": 0.3167321816874193, "learning_rate": 4.986074161618962e-05, "loss": 0.6428, "step": 23 }, { "epoch": 0.08955223880597014, "grad_norm": 0.3318375375489798, "learning_rate": 4.982811771683982e-05, "loss": 0.6392, "step": 24 }, { "epoch": 0.09328358208955224, "grad_norm": 0.35533751495096894, "learning_rate": 4.979207812402531e-05, "loss": 0.648, "step": 25 }, { "epoch": 0.09701492537313433, "grad_norm": 0.28839425158997484, "learning_rate": 4.975262835098295e-05, "loss": 0.6506, "step": 26 }, { "epoch": 0.10074626865671642, "grad_norm": 0.39706709966248505, "learning_rate": 4.970977443262949e-05, "loss": 0.6429, "step": 27 }, { "epoch": 0.1044776119402985, "grad_norm": 0.307300403151087, "learning_rate": 4.966352292463842e-05, "loss": 0.6289, "step": 28 }, { "epoch": 0.10820895522388059, "grad_norm": 0.32063642705798723, "learning_rate": 4.9613880902437035e-05, "loss": 0.6263, "step": 29 }, { "epoch": 0.11194029850746269, "grad_norm": 0.27096377706084435, "learning_rate": 4.956085596012407e-05, "loss": 0.6326, "step": 30 }, { "epoch": 0.11567164179104478, "grad_norm": 0.2923423890856896, "learning_rate": 4.950445620930801e-05, "loss": 0.6275, "step": 31 }, { "epoch": 0.11940298507462686, "grad_norm": 0.25099362351139276, "learning_rate": 4.944469027786616e-05, "loss": 0.6175, "step": 32 }, { "epoch": 0.12313432835820895, "grad_norm": 0.2574228502672499, "learning_rate": 4.938156730862481e-05, "loss": 0.621, "step": 33 }, { "epoch": 0.12686567164179105, "grad_norm": 0.28064221046194, "learning_rate": 4.931509695796055e-05, "loss": 0.6124, "step": 34 }, { "epoch": 0.13059701492537312, "grad_norm": 0.24520058089913865, "learning_rate": 4.924528939432311e-05, "loss": 0.6212, "step": 35 }, { "epoch": 0.13432835820895522, "grad_norm": 0.26510125360357145, "learning_rate": 4.917215529667979e-05, "loss": 0.6271, "step": 36 }, { "epoch": 0.13805970149253732, "grad_norm": 0.23216542656777295, "learning_rate": 4.909570585288181e-05, "loss": 0.5953, "step": 37 }, { "epoch": 0.1417910447761194, "grad_norm": 0.23246158232683492, "learning_rate": 4.901595275795287e-05, "loss": 0.6036, "step": 38 }, { "epoch": 0.1455223880597015, "grad_norm": 0.25772206222760663, "learning_rate": 4.8932908212300035e-05, "loss": 0.6137, "step": 39 }, { "epoch": 0.14925373134328357, "grad_norm": 0.2335484338372156, "learning_rate": 4.884658491984735e-05, "loss": 0.6132, "step": 40 }, { "epoch": 0.15298507462686567, "grad_norm": 0.23644464362754578, "learning_rate": 4.875699608609248e-05, "loss": 0.6045, "step": 41 }, { "epoch": 0.15671641791044777, "grad_norm": 0.25297544238791025, "learning_rate": 4.8664155416086495e-05, "loss": 0.6113, "step": 42 }, { "epoch": 0.16044776119402984, "grad_norm": 0.22864069373384371, "learning_rate": 4.8568077112337355e-05, "loss": 0.6264, "step": 43 }, { "epoch": 0.16417910447761194, "grad_norm": 0.2846806052878089, "learning_rate": 4.846877587263728e-05, "loss": 0.6084, "step": 44 }, { "epoch": 0.16791044776119404, "grad_norm": 0.2039160781081297, "learning_rate": 4.8366266887814235e-05, "loss": 0.6052, "step": 45 }, { "epoch": 0.17164179104477612, "grad_norm": 0.3428553102775528, "learning_rate": 4.826056583940815e-05, "loss": 0.5889, "step": 46 }, { "epoch": 0.17537313432835822, "grad_norm": 0.27728143115436193, "learning_rate": 4.815168889727199e-05, "loss": 0.6043, "step": 47 }, { "epoch": 0.1791044776119403, "grad_norm": 0.2779749061147266, "learning_rate": 4.803965271709811e-05, "loss": 0.5862, "step": 48 }, { "epoch": 0.1828358208955224, "grad_norm": 0.2513301701642567, "learning_rate": 4.792447443787034e-05, "loss": 0.5918, "step": 49 }, { "epoch": 0.1865671641791045, "grad_norm": 0.21866653759965723, "learning_rate": 4.780617167924209e-05, "loss": 0.5977, "step": 50 }, { "epoch": 0.19029850746268656, "grad_norm": 0.2645286012627394, "learning_rate": 4.7684762538840976e-05, "loss": 0.6037, "step": 51 }, { "epoch": 0.19402985074626866, "grad_norm": 0.2544272948372345, "learning_rate": 4.756026558950025e-05, "loss": 0.5968, "step": 52 }, { "epoch": 0.19776119402985073, "grad_norm": 0.24989633057850902, "learning_rate": 4.743269987641761e-05, "loss": 0.5925, "step": 53 }, { "epoch": 0.20149253731343283, "grad_norm": 0.23175963767366708, "learning_rate": 4.730208491424174e-05, "loss": 0.607, "step": 54 }, { "epoch": 0.20522388059701493, "grad_norm": 0.22451432592453308, "learning_rate": 4.716844068408693e-05, "loss": 0.5822, "step": 55 }, { "epoch": 0.208955223880597, "grad_norm": 0.2052475140816284, "learning_rate": 4.7031787630476544e-05, "loss": 0.5741, "step": 56 }, { "epoch": 0.2126865671641791, "grad_norm": 0.21978018664973234, "learning_rate": 4.6892146658215334e-05, "loss": 0.5931, "step": 57 }, { "epoch": 0.21641791044776118, "grad_norm": 0.2123273846078205, "learning_rate": 4.674953912919161e-05, "loss": 0.5996, "step": 58 }, { "epoch": 0.22014925373134328, "grad_norm": 0.24986687098106464, "learning_rate": 4.660398685910928e-05, "loss": 0.594, "step": 59 }, { "epoch": 0.22388059701492538, "grad_norm": 0.24580836120637442, "learning_rate": 4.6455512114150546e-05, "loss": 0.598, "step": 60 }, { "epoch": 0.22761194029850745, "grad_norm": 0.2724905364167634, "learning_rate": 4.630413760756973e-05, "loss": 0.5872, "step": 61 }, { "epoch": 0.23134328358208955, "grad_norm": 0.28606080361850245, "learning_rate": 4.614988649621864e-05, "loss": 0.5777, "step": 62 }, { "epoch": 0.23507462686567165, "grad_norm": 0.34193454503580145, "learning_rate": 4.599278237700407e-05, "loss": 0.5719, "step": 63 }, { "epoch": 0.23880597014925373, "grad_norm": 0.38444879379990926, "learning_rate": 4.583284928327805e-05, "loss": 0.5831, "step": 64 }, { "epoch": 0.24253731343283583, "grad_norm": 0.4734083914361801, "learning_rate": 4.5670111681161296e-05, "loss": 0.5939, "step": 65 }, { "epoch": 0.2462686567164179, "grad_norm": 0.5102574028338418, "learning_rate": 4.550459446580039e-05, "loss": 0.5762, "step": 66 }, { "epoch": 0.25, "grad_norm": 0.5117139342435484, "learning_rate": 4.5336322957559466e-05, "loss": 0.5833, "step": 67 }, { "epoch": 0.2537313432835821, "grad_norm": 0.46014525616930846, "learning_rate": 4.516532289814674e-05, "loss": 0.5785, "step": 68 }, { "epoch": 0.2574626865671642, "grad_norm": 0.3249014117473111, "learning_rate": 4.499162044667658e-05, "loss": 0.5657, "step": 69 }, { "epoch": 0.26119402985074625, "grad_norm": 0.3622697864552333, "learning_rate": 4.481524217566783e-05, "loss": 0.5949, "step": 70 }, { "epoch": 0.26492537313432835, "grad_norm": 0.5969547201230023, "learning_rate": 4.463621506697873e-05, "loss": 0.5925, "step": 71 }, { "epoch": 0.26865671641791045, "grad_norm": 0.48644887972799394, "learning_rate": 4.4454566507679395e-05, "loss": 0.5719, "step": 72 }, { "epoch": 0.27238805970149255, "grad_norm": 0.3624896588775397, "learning_rate": 4.4270324285862176e-05, "loss": 0.5786, "step": 73 }, { "epoch": 0.27611940298507465, "grad_norm": 0.47385159428007284, "learning_rate": 4.4083516586390694e-05, "loss": 0.5657, "step": 74 }, { "epoch": 0.2798507462686567, "grad_norm": 0.44563905818070665, "learning_rate": 4.3894171986588217e-05, "loss": 0.5842, "step": 75 }, { "epoch": 0.2835820895522388, "grad_norm": 0.2593586224740128, "learning_rate": 4.370231945186601e-05, "loss": 0.56, "step": 76 }, { "epoch": 0.2873134328358209, "grad_norm": 0.4522021891247066, "learning_rate": 4.350798833129225e-05, "loss": 0.5866, "step": 77 }, { "epoch": 0.291044776119403, "grad_norm": 0.39845211355965504, "learning_rate": 4.331120835310228e-05, "loss": 0.567, "step": 78 }, { "epoch": 0.2947761194029851, "grad_norm": 0.27334227409442136, "learning_rate": 4.3112009620150904e-05, "loss": 0.5803, "step": 79 }, { "epoch": 0.29850746268656714, "grad_norm": 0.3868529615890604, "learning_rate": 4.29104226053073e-05, "loss": 0.5927, "step": 80 }, { "epoch": 0.30223880597014924, "grad_norm": 0.34084096101485734, "learning_rate": 4.27064781467934e-05, "loss": 0.5686, "step": 81 }, { "epoch": 0.30597014925373134, "grad_norm": 0.30871015922653666, "learning_rate": 4.250020744346629e-05, "loss": 0.5832, "step": 82 }, { "epoch": 0.30970149253731344, "grad_norm": 0.3394963709668159, "learning_rate": 4.229164205004556e-05, "loss": 0.5713, "step": 83 }, { "epoch": 0.31343283582089554, "grad_norm": 0.29351269277041925, "learning_rate": 4.208081387228612e-05, "loss": 0.5712, "step": 84 }, { "epoch": 0.31716417910447764, "grad_norm": 0.3053584503146514, "learning_rate": 4.186775516209732e-05, "loss": 0.5692, "step": 85 }, { "epoch": 0.3208955223880597, "grad_norm": 0.29587612520776385, "learning_rate": 4.165249851260921e-05, "loss": 0.5798, "step": 86 }, { "epoch": 0.3246268656716418, "grad_norm": 0.32552232556821503, "learning_rate": 4.143507685318645e-05, "loss": 0.5767, "step": 87 }, { "epoch": 0.3283582089552239, "grad_norm": 0.29291818526208413, "learning_rate": 4.121552344439093e-05, "loss": 0.5682, "step": 88 }, { "epoch": 0.332089552238806, "grad_norm": 0.2925717688387159, "learning_rate": 4.099387187289365e-05, "loss": 0.5709, "step": 89 }, { "epoch": 0.3358208955223881, "grad_norm": 0.2997729199324307, "learning_rate": 4.077015604633669e-05, "loss": 0.5803, "step": 90 }, { "epoch": 0.33955223880597013, "grad_norm": 0.2477501647432846, "learning_rate": 4.054441018814617e-05, "loss": 0.5749, "step": 91 }, { "epoch": 0.34328358208955223, "grad_norm": 0.2860907976058691, "learning_rate": 4.031666883229678e-05, "loss": 0.5795, "step": 92 }, { "epoch": 0.34701492537313433, "grad_norm": 0.2617839660889754, "learning_rate": 4.008696681802895e-05, "loss": 0.5775, "step": 93 }, { "epoch": 0.35074626865671643, "grad_norm": 0.2843433667963072, "learning_rate": 3.985533928451914e-05, "loss": 0.5706, "step": 94 }, { "epoch": 0.35447761194029853, "grad_norm": 0.28519715770092186, "learning_rate": 3.962182166550441e-05, "loss": 0.5655, "step": 95 }, { "epoch": 0.3582089552238806, "grad_norm": 0.24670408239759686, "learning_rate": 3.938644968386188e-05, "loss": 0.5689, "step": 96 }, { "epoch": 0.3619402985074627, "grad_norm": 0.29258871269480563, "learning_rate": 3.9149259346143906e-05, "loss": 0.5628, "step": 97 }, { "epoch": 0.3656716417910448, "grad_norm": 0.23665483935616044, "learning_rate": 3.8910286937069894e-05, "loss": 0.5671, "step": 98 }, { "epoch": 0.3694029850746269, "grad_norm": 0.26718445384439204, "learning_rate": 3.866956901397559e-05, "loss": 0.5743, "step": 99 }, { "epoch": 0.373134328358209, "grad_norm": 0.260170899341738, "learning_rate": 3.8427142401220634e-05, "loss": 0.5745, "step": 100 }, { "epoch": 0.376865671641791, "grad_norm": 0.19182691360944223, "learning_rate": 3.818304418455526e-05, "loss": 0.5716, "step": 101 }, { "epoch": 0.3805970149253731, "grad_norm": 0.24944411136952396, "learning_rate": 3.7937311705447016e-05, "loss": 0.5568, "step": 102 }, { "epoch": 0.3843283582089552, "grad_norm": 0.2580117795680708, "learning_rate": 3.7689982555368405e-05, "loss": 0.5572, "step": 103 }, { "epoch": 0.3880597014925373, "grad_norm": 0.22020286184402807, "learning_rate": 3.74410945700462e-05, "loss": 0.5699, "step": 104 }, { "epoch": 0.3917910447761194, "grad_norm": 0.24252691253631364, "learning_rate": 3.71906858236735e-05, "loss": 0.5739, "step": 105 }, { "epoch": 0.39552238805970147, "grad_norm": 0.2710125706293663, "learning_rate": 3.693879462308516e-05, "loss": 0.569, "step": 106 }, { "epoch": 0.39925373134328357, "grad_norm": 0.2312782320329894, "learning_rate": 3.6685459501897786e-05, "loss": 0.5856, "step": 107 }, { "epoch": 0.40298507462686567, "grad_norm": 0.21651770531820405, "learning_rate": 3.643071921461497e-05, "loss": 0.5694, "step": 108 }, { "epoch": 0.40671641791044777, "grad_norm": 0.2216291879017097, "learning_rate": 3.617461273069867e-05, "loss": 0.5766, "step": 109 }, { "epoch": 0.41044776119402987, "grad_norm": 0.23475493172298043, "learning_rate": 3.591717922860785e-05, "loss": 0.5651, "step": 110 }, { "epoch": 0.4141791044776119, "grad_norm": 0.22295703407533332, "learning_rate": 3.565845808980501e-05, "loss": 0.567, "step": 111 }, { "epoch": 0.417910447761194, "grad_norm": 0.22757699129422, "learning_rate": 3.539848889273175e-05, "loss": 0.5671, "step": 112 }, { "epoch": 0.4216417910447761, "grad_norm": 0.24971066493741892, "learning_rate": 3.5137311406754156e-05, "loss": 0.5583, "step": 113 }, { "epoch": 0.4253731343283582, "grad_norm": 0.22619687569258193, "learning_rate": 3.487496558607898e-05, "loss": 0.5696, "step": 114 }, { "epoch": 0.4291044776119403, "grad_norm": 0.2058883240626454, "learning_rate": 3.46114915636416e-05, "loss": 0.5578, "step": 115 }, { "epoch": 0.43283582089552236, "grad_norm": 0.20275069322754283, "learning_rate": 3.4346929644966564e-05, "loss": 0.5547, "step": 116 }, { "epoch": 0.43656716417910446, "grad_norm": 0.24953635839842572, "learning_rate": 3.4081320302001754e-05, "loss": 0.5622, "step": 117 }, { "epoch": 0.44029850746268656, "grad_norm": 0.21495775389364913, "learning_rate": 3.38147041669271e-05, "loss": 0.5796, "step": 118 }, { "epoch": 0.44402985074626866, "grad_norm": 0.21438281980899815, "learning_rate": 3.354712202593882e-05, "loss": 0.5673, "step": 119 }, { "epoch": 0.44776119402985076, "grad_norm": 0.2130910034654793, "learning_rate": 3.3278614813010034e-05, "loss": 0.5609, "step": 120 }, { "epoch": 0.45149253731343286, "grad_norm": 0.20480609196454375, "learning_rate": 3.3009223603628786e-05, "loss": 0.564, "step": 121 }, { "epoch": 0.4552238805970149, "grad_norm": 0.23086267369600774, "learning_rate": 3.273898960851443e-05, "loss": 0.567, "step": 122 }, { "epoch": 0.458955223880597, "grad_norm": 0.18984512919428015, "learning_rate": 3.2467954167313384e-05, "loss": 0.5622, "step": 123 }, { "epoch": 0.4626865671641791, "grad_norm": 0.22925876538840964, "learning_rate": 3.219615874227504e-05, "loss": 0.5627, "step": 124 }, { "epoch": 0.4664179104477612, "grad_norm": 0.21444042494495835, "learning_rate": 3.1923644911909e-05, "loss": 0.5529, "step": 125 }, { "epoch": 0.4701492537313433, "grad_norm": 0.1985364469637859, "learning_rate": 3.165045436462451e-05, "loss": 0.5656, "step": 126 }, { "epoch": 0.47388059701492535, "grad_norm": 0.22073396554037217, "learning_rate": 3.137662889235313e-05, "loss": 0.5718, "step": 127 }, { "epoch": 0.47761194029850745, "grad_norm": 0.20687594272641843, "learning_rate": 3.110221038415545e-05, "loss": 0.5564, "step": 128 }, { "epoch": 0.48134328358208955, "grad_norm": 0.18996036356061075, "learning_rate": 3.082724081981306e-05, "loss": 0.5599, "step": 129 }, { "epoch": 0.48507462686567165, "grad_norm": 0.2076433111639105, "learning_rate": 3.0551762263406576e-05, "loss": 0.5487, "step": 130 }, { "epoch": 0.48880597014925375, "grad_norm": 0.19967128784484245, "learning_rate": 3.027581685688084e-05, "loss": 0.5478, "step": 131 }, { "epoch": 0.4925373134328358, "grad_norm": 0.19863395822305602, "learning_rate": 2.999944681359811e-05, "loss": 0.549, "step": 132 }, { "epoch": 0.4962686567164179, "grad_norm": 0.23896614524468549, "learning_rate": 2.9722694411880375e-05, "loss": 0.5616, "step": 133 }, { "epoch": 0.5, "grad_norm": 0.2160200658976899, "learning_rate": 2.9445601988541782e-05, "loss": 0.563, "step": 134 }, { "epoch": 0.503731343283582, "grad_norm": 0.20801723160927582, "learning_rate": 2.9168211932412042e-05, "loss": 0.5672, "step": 135 }, { "epoch": 0.5074626865671642, "grad_norm": 0.21097398636851195, "learning_rate": 2.88905666778519e-05, "loss": 0.5599, "step": 136 }, { "epoch": 0.5111940298507462, "grad_norm": 0.1991636199874378, "learning_rate": 2.8612708698261632e-05, "loss": 0.5655, "step": 137 }, { "epoch": 0.5149253731343284, "grad_norm": 0.2067349458340914, "learning_rate": 2.8334680499583617e-05, "loss": 0.5648, "step": 138 }, { "epoch": 0.5186567164179104, "grad_norm": 0.19931701118114634, "learning_rate": 2.8056524613799888e-05, "loss": 0.5572, "step": 139 }, { "epoch": 0.5223880597014925, "grad_norm": 0.19188565947709005, "learning_rate": 2.777828359242567e-05, "loss": 0.5681, "step": 140 }, { "epoch": 0.5261194029850746, "grad_norm": 0.20945169754166926, "learning_rate": 2.7500000000000004e-05, "loss": 0.5619, "step": 141 }, { "epoch": 0.5298507462686567, "grad_norm": 0.18764650588001947, "learning_rate": 2.722171640757434e-05, "loss": 0.5398, "step": 142 }, { "epoch": 0.5335820895522388, "grad_norm": 0.23149436363329826, "learning_rate": 2.6943475386200117e-05, "loss": 0.5555, "step": 143 }, { "epoch": 0.5373134328358209, "grad_norm": 0.18948167279403233, "learning_rate": 2.6665319500416385e-05, "loss": 0.5607, "step": 144 }, { "epoch": 0.5410447761194029, "grad_norm": 0.22746724049007466, "learning_rate": 2.6387291301738377e-05, "loss": 0.5608, "step": 145 }, { "epoch": 0.5447761194029851, "grad_norm": 0.1862272101430662, "learning_rate": 2.6109433322148112e-05, "loss": 0.5463, "step": 146 }, { "epoch": 0.5485074626865671, "grad_norm": 0.22207423336221527, "learning_rate": 2.5831788067587963e-05, "loss": 0.5546, "step": 147 }, { "epoch": 0.5522388059701493, "grad_norm": 0.2048557174287191, "learning_rate": 2.555439801145823e-05, "loss": 0.5644, "step": 148 }, { "epoch": 0.5559701492537313, "grad_norm": 0.22794890458188047, "learning_rate": 2.527730558811963e-05, "loss": 0.5522, "step": 149 }, { "epoch": 0.5597014925373134, "grad_norm": 0.19785496178287987, "learning_rate": 2.50005531864019e-05, "loss": 0.5507, "step": 150 }, { "epoch": 0.5634328358208955, "grad_norm": 0.210864842713951, "learning_rate": 2.4724183143119155e-05, "loss": 0.5468, "step": 151 }, { "epoch": 0.5671641791044776, "grad_norm": 0.20378922596042334, "learning_rate": 2.4448237736593422e-05, "loss": 0.548, "step": 152 }, { "epoch": 0.5708955223880597, "grad_norm": 0.18703940583216236, "learning_rate": 2.417275918018695e-05, "loss": 0.5673, "step": 153 }, { "epoch": 0.5746268656716418, "grad_norm": 0.23569564146888447, "learning_rate": 2.3897789615844557e-05, "loss": 0.5657, "step": 154 }, { "epoch": 0.5783582089552238, "grad_norm": 0.18231721639802242, "learning_rate": 2.362337110764688e-05, "loss": 0.542, "step": 155 }, { "epoch": 0.582089552238806, "grad_norm": 0.22176053800453152, "learning_rate": 2.3349545635375498e-05, "loss": 0.5626, "step": 156 }, { "epoch": 0.585820895522388, "grad_norm": 0.21615884674882344, "learning_rate": 2.307635508809101e-05, "loss": 0.5496, "step": 157 }, { "epoch": 0.5895522388059702, "grad_norm": 0.1941490885580029, "learning_rate": 2.2803841257724962e-05, "loss": 0.5574, "step": 158 }, { "epoch": 0.5932835820895522, "grad_norm": 0.20955613151155947, "learning_rate": 2.2532045832686615e-05, "loss": 0.5543, "step": 159 }, { "epoch": 0.5970149253731343, "grad_norm": 0.19828074785545058, "learning_rate": 2.226101039148557e-05, "loss": 0.5368, "step": 160 }, { "epoch": 0.6007462686567164, "grad_norm": 0.16777779459261297, "learning_rate": 2.1990776396371227e-05, "loss": 0.5505, "step": 161 }, { "epoch": 0.6044776119402985, "grad_norm": 0.21966969125322472, "learning_rate": 2.1721385186989978e-05, "loss": 0.5504, "step": 162 }, { "epoch": 0.6082089552238806, "grad_norm": 0.18436872555731795, "learning_rate": 2.145287797406119e-05, "loss": 0.5442, "step": 163 }, { "epoch": 0.6119402985074627, "grad_norm": 0.19625085958906968, "learning_rate": 2.1185295833072914e-05, "loss": 0.5527, "step": 164 }, { "epoch": 0.6156716417910447, "grad_norm": 0.2003859081658998, "learning_rate": 2.0918679697998252e-05, "loss": 0.5571, "step": 165 }, { "epoch": 0.6194029850746269, "grad_norm": 0.18504719754332433, "learning_rate": 2.0653070355033438e-05, "loss": 0.5542, "step": 166 }, { "epoch": 0.6231343283582089, "grad_norm": 0.17608209837916533, "learning_rate": 2.03885084363584e-05, "loss": 0.5482, "step": 167 }, { "epoch": 0.6268656716417911, "grad_norm": 0.1876116502662929, "learning_rate": 2.0125034413921024e-05, "loss": 0.5484, "step": 168 }, { "epoch": 0.6305970149253731, "grad_norm": 0.18302181742881515, "learning_rate": 1.9862688593245853e-05, "loss": 0.5562, "step": 169 }, { "epoch": 0.6343283582089553, "grad_norm": 0.19018739710680807, "learning_rate": 1.9601511107268255e-05, "loss": 0.5501, "step": 170 }, { "epoch": 0.6380597014925373, "grad_norm": 0.17999497596422973, "learning_rate": 1.9341541910194995e-05, "loss": 0.5425, "step": 171 }, { "epoch": 0.6417910447761194, "grad_norm": 0.18663424523807143, "learning_rate": 1.9082820771392157e-05, "loss": 0.5546, "step": 172 }, { "epoch": 0.6455223880597015, "grad_norm": 0.1698529977539088, "learning_rate": 1.8825387269301338e-05, "loss": 0.5318, "step": 173 }, { "epoch": 0.6492537313432836, "grad_norm": 0.19929073100641015, "learning_rate": 1.8569280785385046e-05, "loss": 0.5655, "step": 174 }, { "epoch": 0.6529850746268657, "grad_norm": 0.18686275940168323, "learning_rate": 1.8314540498102216e-05, "loss": 0.5608, "step": 175 }, { "epoch": 0.6567164179104478, "grad_norm": 0.20398465264796287, "learning_rate": 1.806120537691485e-05, "loss": 0.5487, "step": 176 }, { "epoch": 0.6604477611940298, "grad_norm": 0.17781647257248, "learning_rate": 1.7809314176326514e-05, "loss": 0.5363, "step": 177 }, { "epoch": 0.664179104477612, "grad_norm": 0.18882399302247135, "learning_rate": 1.7558905429953805e-05, "loss": 0.546, "step": 178 }, { "epoch": 0.667910447761194, "grad_norm": 0.17991868413443657, "learning_rate": 1.731001744463161e-05, "loss": 0.5448, "step": 179 }, { "epoch": 0.6716417910447762, "grad_norm": 0.16491240832836918, "learning_rate": 1.7062688294552992e-05, "loss": 0.5397, "step": 180 }, { "epoch": 0.6753731343283582, "grad_norm": 0.18822234506905258, "learning_rate": 1.6816955815444746e-05, "loss": 0.5509, "step": 181 }, { "epoch": 0.6791044776119403, "grad_norm": 0.18270213198792692, "learning_rate": 1.657285759877937e-05, "loss": 0.5488, "step": 182 }, { "epoch": 0.6828358208955224, "grad_norm": 0.19794173715824212, "learning_rate": 1.6330430986024414e-05, "loss": 0.5373, "step": 183 }, { "epoch": 0.6865671641791045, "grad_norm": 0.1635886625673262, "learning_rate": 1.6089713062930108e-05, "loss": 0.55, "step": 184 }, { "epoch": 0.6902985074626866, "grad_norm": 0.18251503726209914, "learning_rate": 1.5850740653856096e-05, "loss": 0.5456, "step": 185 }, { "epoch": 0.6940298507462687, "grad_norm": 0.16851290996101398, "learning_rate": 1.5613550316138116e-05, "loss": 0.5405, "step": 186 }, { "epoch": 0.6977611940298507, "grad_norm": 0.1685709239092689, "learning_rate": 1.5378178334495596e-05, "loss": 0.5564, "step": 187 }, { "epoch": 0.7014925373134329, "grad_norm": 0.15776177320181445, "learning_rate": 1.5144660715480877e-05, "loss": 0.5467, "step": 188 }, { "epoch": 0.7052238805970149, "grad_norm": 0.1864079536909339, "learning_rate": 1.4913033181971056e-05, "loss": 0.5634, "step": 189 }, { "epoch": 0.7089552238805971, "grad_norm": 0.16320475438696894, "learning_rate": 1.4683331167703218e-05, "loss": 0.5437, "step": 190 }, { "epoch": 0.7126865671641791, "grad_norm": 0.16993506201791542, "learning_rate": 1.4455589811853833e-05, "loss": 0.5518, "step": 191 }, { "epoch": 0.7164179104477612, "grad_norm": 0.15987266956348012, "learning_rate": 1.4229843953663313e-05, "loss": 0.5578, "step": 192 }, { "epoch": 0.7201492537313433, "grad_norm": 0.15746667871485898, "learning_rate": 1.4006128127106363e-05, "loss": 0.5552, "step": 193 }, { "epoch": 0.7238805970149254, "grad_norm": 0.16064149261880517, "learning_rate": 1.3784476555609077e-05, "loss": 0.5581, "step": 194 }, { "epoch": 0.7276119402985075, "grad_norm": 0.16499282150547817, "learning_rate": 1.356492314681356e-05, "loss": 0.5415, "step": 195 }, { "epoch": 0.7313432835820896, "grad_norm": 0.16737033944308755, "learning_rate": 1.3347501487390801e-05, "loss": 0.5493, "step": 196 }, { "epoch": 0.7350746268656716, "grad_norm": 0.1694946522170064, "learning_rate": 1.3132244837902674e-05, "loss": 0.5464, "step": 197 }, { "epoch": 0.7388059701492538, "grad_norm": 0.1500674830196698, "learning_rate": 1.2919186127713885e-05, "loss": 0.547, "step": 198 }, { "epoch": 0.7425373134328358, "grad_norm": 0.1621730888453231, "learning_rate": 1.2708357949954441e-05, "loss": 0.5411, "step": 199 }, { "epoch": 0.746268656716418, "grad_norm": 0.1522204131523946, "learning_rate": 1.2499792556533716e-05, "loss": 0.5525, "step": 200 }, { "epoch": 0.75, "grad_norm": 0.17328074458036194, "learning_rate": 1.229352185320661e-05, "loss": 0.5632, "step": 201 }, { "epoch": 0.753731343283582, "grad_norm": 0.15782749346311695, "learning_rate": 1.20895773946927e-05, "loss": 0.5451, "step": 202 }, { "epoch": 0.7574626865671642, "grad_norm": 0.16114594831252302, "learning_rate": 1.1887990379849101e-05, "loss": 0.5469, "step": 203 }, { "epoch": 0.7611940298507462, "grad_norm": 0.15115605162775136, "learning_rate": 1.1688791646897726e-05, "loss": 0.5391, "step": 204 }, { "epoch": 0.7649253731343284, "grad_norm": 0.1668591931410943, "learning_rate": 1.1492011668707753e-05, "loss": 0.5295, "step": 205 }, { "epoch": 0.7686567164179104, "grad_norm": 0.14747494287735852, "learning_rate": 1.1297680548133993e-05, "loss": 0.5519, "step": 206 }, { "epoch": 0.7723880597014925, "grad_norm": 0.1566369908547312, "learning_rate": 1.110582801341179e-05, "loss": 0.5529, "step": 207 }, { "epoch": 0.7761194029850746, "grad_norm": 0.16579316433849137, "learning_rate": 1.0916483413609315e-05, "loss": 0.5569, "step": 208 }, { "epoch": 0.7798507462686567, "grad_norm": 0.16236476949170714, "learning_rate": 1.0729675714137831e-05, "loss": 0.5484, "step": 209 }, { "epoch": 0.7835820895522388, "grad_norm": 0.1613244237155475, "learning_rate": 1.0545433492320603e-05, "loss": 0.5394, "step": 210 }, { "epoch": 0.7873134328358209, "grad_norm": 0.1572184147364716, "learning_rate": 1.0363784933021276e-05, "loss": 0.5423, "step": 211 }, { "epoch": 0.7910447761194029, "grad_norm": 0.1524912628586707, "learning_rate": 1.0184757824332187e-05, "loss": 0.54, "step": 212 }, { "epoch": 0.7947761194029851, "grad_norm": 0.15963097447119412, "learning_rate": 1.0008379553323415e-05, "loss": 0.55, "step": 213 }, { "epoch": 0.7985074626865671, "grad_norm": 0.15017948107917775, "learning_rate": 9.834677101853265e-06, "loss": 0.5362, "step": 214 }, { "epoch": 0.8022388059701493, "grad_norm": 0.14940423000768321, "learning_rate": 9.663677042440537e-06, "loss": 0.5529, "step": 215 }, { "epoch": 0.8059701492537313, "grad_norm": 0.15611126841305084, "learning_rate": 9.495405534199617e-06, "loss": 0.5599, "step": 216 }, { "epoch": 0.8097014925373134, "grad_norm": 0.15270616034620718, "learning_rate": 9.329888318838716e-06, "loss": 0.5398, "step": 217 }, { "epoch": 0.8134328358208955, "grad_norm": 0.14627271440930706, "learning_rate": 9.167150716721954e-06, "loss": 0.542, "step": 218 }, { "epoch": 0.8171641791044776, "grad_norm": 0.15008282595880507, "learning_rate": 9.007217622995933e-06, "loss": 0.5393, "step": 219 }, { "epoch": 0.8208955223880597, "grad_norm": 0.1551326390476609, "learning_rate": 8.850113503781367e-06, "loss": 0.5397, "step": 220 }, { "epoch": 0.8246268656716418, "grad_norm": 0.14881610648093943, "learning_rate": 8.69586239243027e-06, "loss": 0.5472, "step": 221 }, { "epoch": 0.8283582089552238, "grad_norm": 0.1470716434089977, "learning_rate": 8.54448788584946e-06, "loss": 0.5565, "step": 222 }, { "epoch": 0.832089552238806, "grad_norm": 0.1511297735047479, "learning_rate": 8.396013140890732e-06, "loss": 0.5512, "step": 223 }, { "epoch": 0.835820895522388, "grad_norm": 0.16141843554448523, "learning_rate": 8.250460870808394e-06, "loss": 0.5458, "step": 224 }, { "epoch": 0.8395522388059702, "grad_norm": 0.13832420859327427, "learning_rate": 8.107853341784671e-06, "loss": 0.5517, "step": 225 }, { "epoch": 0.8432835820895522, "grad_norm": 0.13996716173382923, "learning_rate": 7.968212369523462e-06, "loss": 0.5316, "step": 226 }, { "epoch": 0.8470149253731343, "grad_norm": 0.16247360822292523, "learning_rate": 7.831559315913068e-06, "loss": 0.5462, "step": 227 }, { "epoch": 0.8507462686567164, "grad_norm": 0.15709786181066085, "learning_rate": 7.697915085758266e-06, "loss": 0.5518, "step": 228 }, { "epoch": 0.8544776119402985, "grad_norm": 0.14250963305179876, "learning_rate": 7.567300123582388e-06, "loss": 0.535, "step": 229 }, { "epoch": 0.8582089552238806, "grad_norm": 0.1513836071462614, "learning_rate": 7.439734410499752e-06, "loss": 0.5698, "step": 230 }, { "epoch": 0.8619402985074627, "grad_norm": 0.14863808070276965, "learning_rate": 7.315237461159027e-06, "loss": 0.5605, "step": 231 }, { "epoch": 0.8656716417910447, "grad_norm": 0.15760982127034756, "learning_rate": 7.193828320757909e-06, "loss": 0.5562, "step": 232 }, { "epoch": 0.8694029850746269, "grad_norm": 0.1456055426246458, "learning_rate": 7.075525562129664e-06, "loss": 0.5492, "step": 233 }, { "epoch": 0.8731343283582089, "grad_norm": 0.14817070452531245, "learning_rate": 6.960347282901894e-06, "loss": 0.5322, "step": 234 }, { "epoch": 0.8768656716417911, "grad_norm": 0.1692185341827961, "learning_rate": 6.848311102728011e-06, "loss": 0.537, "step": 235 }, { "epoch": 0.8805970149253731, "grad_norm": 0.14103968254965088, "learning_rate": 6.739434160591852e-06, "loss": 0.5489, "step": 236 }, { "epoch": 0.8843283582089553, "grad_norm": 0.13624128681075703, "learning_rate": 6.633733112185768e-06, "loss": 0.5396, "step": 237 }, { "epoch": 0.8880597014925373, "grad_norm": 0.14992040022443223, "learning_rate": 6.531224127362726e-06, "loss": 0.532, "step": 238 }, { "epoch": 0.8917910447761194, "grad_norm": 0.1465832133718501, "learning_rate": 6.431922887662643e-06, "loss": 0.5498, "step": 239 }, { "epoch": 0.8955223880597015, "grad_norm": 0.16714418405362014, "learning_rate": 6.335844583913515e-06, "loss": 0.5457, "step": 240 }, { "epoch": 0.8992537313432836, "grad_norm": 0.1476740856625115, "learning_rate": 6.243003913907528e-06, "loss": 0.5577, "step": 241 }, { "epoch": 0.9029850746268657, "grad_norm": 0.13933748041232719, "learning_rate": 6.153415080152655e-06, "loss": 0.5458, "step": 242 }, { "epoch": 0.9067164179104478, "grad_norm": 0.16674390208669543, "learning_rate": 6.067091787699972e-06, "loss": 0.5531, "step": 243 }, { "epoch": 0.9104477611940298, "grad_norm": 0.15190991358688, "learning_rate": 5.984047242047134e-06, "loss": 0.5394, "step": 244 }, { "epoch": 0.914179104477612, "grad_norm": 0.135868472825757, "learning_rate": 5.904294147118193e-06, "loss": 0.5388, "step": 245 }, { "epoch": 0.917910447761194, "grad_norm": 0.14221884760432008, "learning_rate": 5.827844703320216e-06, "loss": 0.5489, "step": 246 }, { "epoch": 0.9216417910447762, "grad_norm": 0.13121689332045716, "learning_rate": 5.754710605676892e-06, "loss": 0.5463, "step": 247 }, { "epoch": 0.9253731343283582, "grad_norm": 0.14449918487529032, "learning_rate": 5.684903042039452e-06, "loss": 0.5284, "step": 248 }, { "epoch": 0.9291044776119403, "grad_norm": 0.13928914797758526, "learning_rate": 5.6184326913751945e-06, "loss": 0.5461, "step": 249 }, { "epoch": 0.9328358208955224, "grad_norm": 0.1417660900521393, "learning_rate": 5.555309722133842e-06, "loss": 0.5473, "step": 250 }, { "epoch": 0.9365671641791045, "grad_norm": 0.13614441656849596, "learning_rate": 5.495543790691992e-06, "loss": 0.5447, "step": 251 }, { "epoch": 0.9402985074626866, "grad_norm": 0.13631385772434718, "learning_rate": 5.439144039875931e-06, "loss": 0.5382, "step": 252 }, { "epoch": 0.9440298507462687, "grad_norm": 0.13669185965950084, "learning_rate": 5.386119097562968e-06, "loss": 0.5339, "step": 253 }, { "epoch": 0.9477611940298507, "grad_norm": 0.13822893660633595, "learning_rate": 5.336477075361577e-06, "loss": 0.5385, "step": 254 }, { "epoch": 0.9514925373134329, "grad_norm": 0.13624801019787705, "learning_rate": 5.290225567370509e-06, "loss": 0.542, "step": 255 }, { "epoch": 0.9552238805970149, "grad_norm": 0.13973740270508966, "learning_rate": 5.247371649017059e-06, "loss": 0.5448, "step": 256 }, { "epoch": 0.9589552238805971, "grad_norm": 0.1365138111666047, "learning_rate": 5.207921875974695e-06, "loss": 0.5431, "step": 257 }, { "epoch": 0.9626865671641791, "grad_norm": 0.1389889394286217, "learning_rate": 5.171882283160185e-06, "loss": 0.5314, "step": 258 }, { "epoch": 0.9664179104477612, "grad_norm": 0.1390994755040206, "learning_rate": 5.139258383810381e-06, "loss": 0.5465, "step": 259 }, { "epoch": 0.9701492537313433, "grad_norm": 0.13964028117544125, "learning_rate": 5.110055168638854e-06, "loss": 0.5498, "step": 260 }, { "epoch": 0.9738805970149254, "grad_norm": 0.13747670670156698, "learning_rate": 5.0842771050723815e-06, "loss": 0.5477, "step": 261 }, { "epoch": 0.9776119402985075, "grad_norm": 0.13556021540576652, "learning_rate": 5.06192813656757e-06, "loss": 0.5497, "step": 262 }, { "epoch": 0.9813432835820896, "grad_norm": 0.13349257592048414, "learning_rate": 5.0430116820075814e-06, "loss": 0.5361, "step": 263 }, { "epoch": 0.9850746268656716, "grad_norm": 0.13864201116785443, "learning_rate": 5.027530635179121e-06, "loss": 0.5401, "step": 264 }, { "epoch": 0.9888059701492538, "grad_norm": 0.13826663628336222, "learning_rate": 5.0154873643297575e-06, "loss": 0.5607, "step": 265 }, { "epoch": 0.9925373134328358, "grad_norm": 0.13977666438190228, "learning_rate": 5.00688371180563e-06, "loss": 0.5441, "step": 266 }, { "epoch": 0.996268656716418, "grad_norm": 0.13615065428921053, "learning_rate": 5.001720993769619e-06, "loss": 0.5472, "step": 267 }, { "epoch": 1.0, "grad_norm": 0.13622571621787738, "learning_rate": 5e-06, "loss": 0.5426, "step": 268 }, { "epoch": 1.0, "step": 268, "total_flos": 488621249396736.0, "train_loss": 0.579121366365632, "train_runtime": 3078.0231, "train_samples_per_second": 11.144, "train_steps_per_second": 0.087 } ], "logging_steps": 1, "max_steps": 268, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 488621249396736.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }