{ "best_metric": 1.7213791608810425, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.43360433604336046, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002168021680216802, "grad_norm": 29.56697654724121, "learning_rate": 1e-05, "loss": 6.7917, "step": 1 }, { "epoch": 0.002168021680216802, "eval_loss": 2.9564599990844727, "eval_runtime": 9.3786, "eval_samples_per_second": 82.848, "eval_steps_per_second": 20.792, "step": 1 }, { "epoch": 0.004336043360433604, "grad_norm": 22.844648361206055, "learning_rate": 2e-05, "loss": 8.8215, "step": 2 }, { "epoch": 0.0065040650406504065, "grad_norm": 27.447298049926758, "learning_rate": 3e-05, "loss": 7.7656, "step": 3 }, { "epoch": 0.008672086720867209, "grad_norm": 25.369524002075195, "learning_rate": 4e-05, "loss": 9.5796, "step": 4 }, { "epoch": 0.01084010840108401, "grad_norm": 24.90519905090332, "learning_rate": 5e-05, "loss": 8.8202, "step": 5 }, { "epoch": 0.013008130081300813, "grad_norm": 23.38373565673828, "learning_rate": 6e-05, "loss": 8.6864, "step": 6 }, { "epoch": 0.015176151761517615, "grad_norm": 23.274526596069336, "learning_rate": 7e-05, "loss": 8.4283, "step": 7 }, { "epoch": 0.017344173441734417, "grad_norm": 25.13153076171875, "learning_rate": 8e-05, "loss": 9.4565, "step": 8 }, { "epoch": 0.01951219512195122, "grad_norm": 30.048437118530273, "learning_rate": 9e-05, "loss": 9.5829, "step": 9 }, { "epoch": 0.02168021680216802, "grad_norm": 27.801332473754883, "learning_rate": 0.0001, "loss": 9.0878, "step": 10 }, { "epoch": 0.023848238482384824, "grad_norm": 32.128700256347656, "learning_rate": 9.999316524962345e-05, "loss": 9.333, "step": 11 }, { "epoch": 0.026016260162601626, "grad_norm": 36.04731369018555, "learning_rate": 9.997266286704631e-05, "loss": 9.1451, "step": 12 }, { "epoch": 0.028184281842818428, "grad_norm": 36.079429626464844, "learning_rate": 9.993849845741524e-05, "loss": 9.3416, "step": 13 }, { "epoch": 0.03035230352303523, "grad_norm": 35.72296905517578, "learning_rate": 9.989068136093873e-05, "loss": 9.3537, "step": 14 }, { "epoch": 0.032520325203252036, "grad_norm": 39.040367126464844, "learning_rate": 9.98292246503335e-05, "loss": 9.0205, "step": 15 }, { "epoch": 0.034688346883468835, "grad_norm": 58.99087905883789, "learning_rate": 9.975414512725057e-05, "loss": 9.3102, "step": 16 }, { "epoch": 0.03685636856368564, "grad_norm": 45.97587585449219, "learning_rate": 9.966546331768191e-05, "loss": 9.2068, "step": 17 }, { "epoch": 0.03902439024390244, "grad_norm": 40.5340461730957, "learning_rate": 9.956320346634876e-05, "loss": 9.2059, "step": 18 }, { "epoch": 0.041192411924119245, "grad_norm": 44.311119079589844, "learning_rate": 9.944739353007344e-05, "loss": 8.7438, "step": 19 }, { "epoch": 0.04336043360433604, "grad_norm": 48.79315948486328, "learning_rate": 9.931806517013612e-05, "loss": 10.1446, "step": 20 }, { "epoch": 0.04552845528455285, "grad_norm": 44.125003814697266, "learning_rate": 9.917525374361912e-05, "loss": 9.969, "step": 21 }, { "epoch": 0.04769647696476965, "grad_norm": 41.132083892822266, "learning_rate": 9.901899829374047e-05, "loss": 9.857, "step": 22 }, { "epoch": 0.04986449864498645, "grad_norm": 41.87261962890625, "learning_rate": 9.884934153917997e-05, "loss": 10.0139, "step": 23 }, { "epoch": 0.05203252032520325, "grad_norm": 39.67081832885742, "learning_rate": 9.86663298624003e-05, "loss": 8.8703, "step": 24 }, { "epoch": 0.05420054200542006, "grad_norm": 44.639129638671875, "learning_rate": 9.847001329696653e-05, "loss": 9.3676, "step": 25 }, { "epoch": 0.056368563685636856, "grad_norm": 58.59317398071289, "learning_rate": 9.826044551386744e-05, "loss": 9.9945, "step": 26 }, { "epoch": 0.05853658536585366, "grad_norm": 58.96311569213867, "learning_rate": 9.803768380684242e-05, "loss": 9.7769, "step": 27 }, { "epoch": 0.06070460704607046, "grad_norm": 101.34839630126953, "learning_rate": 9.780178907671789e-05, "loss": 9.6022, "step": 28 }, { "epoch": 0.06287262872628727, "grad_norm": 55.3743782043457, "learning_rate": 9.755282581475769e-05, "loss": 9.5984, "step": 29 }, { "epoch": 0.06504065040650407, "grad_norm": 52.611061096191406, "learning_rate": 9.729086208503174e-05, "loss": 9.5768, "step": 30 }, { "epoch": 0.06720867208672086, "grad_norm": 63.68136215209961, "learning_rate": 9.701596950580806e-05, "loss": 10.0241, "step": 31 }, { "epoch": 0.06937669376693767, "grad_norm": 131.1375732421875, "learning_rate": 9.672822322997305e-05, "loss": 10.19, "step": 32 }, { "epoch": 0.07154471544715447, "grad_norm": 82.2738037109375, "learning_rate": 9.642770192448536e-05, "loss": 10.1478, "step": 33 }, { "epoch": 0.07371273712737128, "grad_norm": 61.93830490112305, "learning_rate": 9.611448774886924e-05, "loss": 10.0923, "step": 34 }, { "epoch": 0.07588075880758807, "grad_norm": 60.45297622680664, "learning_rate": 9.578866633275288e-05, "loss": 12.7574, "step": 35 }, { "epoch": 0.07804878048780488, "grad_norm": 105.29837036132812, "learning_rate": 9.545032675245813e-05, "loss": 10.0525, "step": 36 }, { "epoch": 0.08021680216802168, "grad_norm": 86.53067016601562, "learning_rate": 9.509956150664796e-05, "loss": 9.5383, "step": 37 }, { "epoch": 0.08238482384823849, "grad_norm": 118.15338134765625, "learning_rate": 9.473646649103818e-05, "loss": 10.3553, "step": 38 }, { "epoch": 0.08455284552845528, "grad_norm": 79.79243469238281, "learning_rate": 9.43611409721806e-05, "loss": 11.9819, "step": 39 }, { "epoch": 0.08672086720867209, "grad_norm": 101.70763397216797, "learning_rate": 9.397368756032445e-05, "loss": 12.6661, "step": 40 }, { "epoch": 0.08888888888888889, "grad_norm": 108.63827514648438, "learning_rate": 9.357421218136386e-05, "loss": 10.515, "step": 41 }, { "epoch": 0.0910569105691057, "grad_norm": 87.01688385009766, "learning_rate": 9.316282404787871e-05, "loss": 9.7758, "step": 42 }, { "epoch": 0.09322493224932249, "grad_norm": 131.51397705078125, "learning_rate": 9.273963562927695e-05, "loss": 7.3259, "step": 43 }, { "epoch": 0.0953929539295393, "grad_norm": 88.70328521728516, "learning_rate": 9.230476262104677e-05, "loss": 6.2769, "step": 44 }, { "epoch": 0.0975609756097561, "grad_norm": 123.07538604736328, "learning_rate": 9.185832391312644e-05, "loss": 5.2226, "step": 45 }, { "epoch": 0.0997289972899729, "grad_norm": 98.83578491210938, "learning_rate": 9.140044155740101e-05, "loss": 4.1133, "step": 46 }, { "epoch": 0.1018970189701897, "grad_norm": 183.61734008789062, "learning_rate": 9.093124073433463e-05, "loss": 3.8164, "step": 47 }, { "epoch": 0.1040650406504065, "grad_norm": 211.9551239013672, "learning_rate": 9.045084971874738e-05, "loss": 3.5105, "step": 48 }, { "epoch": 0.10623306233062331, "grad_norm": 154.25930786132812, "learning_rate": 8.995939984474624e-05, "loss": 3.2991, "step": 49 }, { "epoch": 0.10840108401084012, "grad_norm": 167.2089385986328, "learning_rate": 8.945702546981969e-05, "loss": 4.4288, "step": 50 }, { "epoch": 0.10840108401084012, "eval_loss": 2.758779287338257, "eval_runtime": 9.348, "eval_samples_per_second": 83.119, "eval_steps_per_second": 20.86, "step": 50 }, { "epoch": 0.11056910569105691, "grad_norm": 157.94134521484375, "learning_rate": 8.894386393810563e-05, "loss": 7.8429, "step": 51 }, { "epoch": 0.11273712737127371, "grad_norm": 81.84595489501953, "learning_rate": 8.842005554284296e-05, "loss": 12.011, "step": 52 }, { "epoch": 0.11490514905149052, "grad_norm": 71.08673095703125, "learning_rate": 8.788574348801675e-05, "loss": 14.1127, "step": 53 }, { "epoch": 0.11707317073170732, "grad_norm": 59.624229431152344, "learning_rate": 8.73410738492077e-05, "loss": 11.356, "step": 54 }, { "epoch": 0.11924119241192412, "grad_norm": 67.42726135253906, "learning_rate": 8.678619553365659e-05, "loss": 10.9877, "step": 55 }, { "epoch": 0.12140921409214092, "grad_norm": 40.2747688293457, "learning_rate": 8.622126023955446e-05, "loss": 10.6976, "step": 56 }, { "epoch": 0.12357723577235773, "grad_norm": 60.5155029296875, "learning_rate": 8.564642241456986e-05, "loss": 9.8698, "step": 57 }, { "epoch": 0.12574525745257453, "grad_norm": 35.934452056884766, "learning_rate": 8.506183921362443e-05, "loss": 9.3362, "step": 58 }, { "epoch": 0.12791327913279132, "grad_norm": 52.79769515991211, "learning_rate": 8.44676704559283e-05, "loss": 9.4859, "step": 59 }, { "epoch": 0.13008130081300814, "grad_norm": 69.04476165771484, "learning_rate": 8.386407858128706e-05, "loss": 9.3703, "step": 60 }, { "epoch": 0.13224932249322494, "grad_norm": 51.741085052490234, "learning_rate": 8.32512286056924e-05, "loss": 9.3188, "step": 61 }, { "epoch": 0.13441734417344173, "grad_norm": 41.74775314331055, "learning_rate": 8.262928807620843e-05, "loss": 8.6789, "step": 62 }, { "epoch": 0.13658536585365855, "grad_norm": 34.91529846191406, "learning_rate": 8.199842702516583e-05, "loss": 9.4133, "step": 63 }, { "epoch": 0.13875338753387534, "grad_norm": 33.73413848876953, "learning_rate": 8.135881792367686e-05, "loss": 8.4024, "step": 64 }, { "epoch": 0.14092140921409213, "grad_norm": 52.59294891357422, "learning_rate": 8.07106356344834e-05, "loss": 9.213, "step": 65 }, { "epoch": 0.14308943089430895, "grad_norm": 44.27796936035156, "learning_rate": 8.005405736415126e-05, "loss": 9.4206, "step": 66 }, { "epoch": 0.14525745257452574, "grad_norm": 32.0435905456543, "learning_rate": 7.938926261462366e-05, "loss": 9.1629, "step": 67 }, { "epoch": 0.14742547425474256, "grad_norm": 56.0610237121582, "learning_rate": 7.871643313414718e-05, "loss": 8.8188, "step": 68 }, { "epoch": 0.14959349593495935, "grad_norm": 31.955718994140625, "learning_rate": 7.803575286758364e-05, "loss": 10.0232, "step": 69 }, { "epoch": 0.15176151761517614, "grad_norm": 39.53376770019531, "learning_rate": 7.734740790612136e-05, "loss": 9.4435, "step": 70 }, { "epoch": 0.15392953929539296, "grad_norm": 28.377092361450195, "learning_rate": 7.66515864363997e-05, "loss": 9.36, "step": 71 }, { "epoch": 0.15609756097560976, "grad_norm": 64.54959106445312, "learning_rate": 7.594847868906076e-05, "loss": 9.6367, "step": 72 }, { "epoch": 0.15826558265582655, "grad_norm": 38.5235710144043, "learning_rate": 7.52382768867422e-05, "loss": 9.7493, "step": 73 }, { "epoch": 0.16043360433604337, "grad_norm": 45.679569244384766, "learning_rate": 7.452117519152542e-05, "loss": 9.6859, "step": 74 }, { "epoch": 0.16260162601626016, "grad_norm": 40.02254104614258, "learning_rate": 7.379736965185368e-05, "loss": 10.0519, "step": 75 }, { "epoch": 0.16476964769647698, "grad_norm": 38.8911018371582, "learning_rate": 7.30670581489344e-05, "loss": 9.3061, "step": 76 }, { "epoch": 0.16693766937669377, "grad_norm": 32.40317916870117, "learning_rate": 7.233044034264034e-05, "loss": 9.2322, "step": 77 }, { "epoch": 0.16910569105691056, "grad_norm": 37.332603454589844, "learning_rate": 7.158771761692464e-05, "loss": 9.3109, "step": 78 }, { "epoch": 0.17127371273712738, "grad_norm": 38.216148376464844, "learning_rate": 7.083909302476453e-05, "loss": 9.8138, "step": 79 }, { "epoch": 0.17344173441734417, "grad_norm": 39.74815368652344, "learning_rate": 7.008477123264848e-05, "loss": 8.7014, "step": 80 }, { "epoch": 0.17560975609756097, "grad_norm": 44.80895233154297, "learning_rate": 6.932495846462261e-05, "loss": 8.8614, "step": 81 }, { "epoch": 0.17777777777777778, "grad_norm": 40.72815704345703, "learning_rate": 6.855986244591104e-05, "loss": 9.1287, "step": 82 }, { "epoch": 0.17994579945799458, "grad_norm": 63.297245025634766, "learning_rate": 6.778969234612584e-05, "loss": 9.0044, "step": 83 }, { "epoch": 0.1821138211382114, "grad_norm": 49.13942337036133, "learning_rate": 6.701465872208216e-05, "loss": 10.1192, "step": 84 }, { "epoch": 0.1842818428184282, "grad_norm": 52.749820709228516, "learning_rate": 6.623497346023418e-05, "loss": 9.4146, "step": 85 }, { "epoch": 0.18644986449864498, "grad_norm": 53.51035690307617, "learning_rate": 6.545084971874738e-05, "loss": 8.7282, "step": 86 }, { "epoch": 0.1886178861788618, "grad_norm": 65.0464096069336, "learning_rate": 6.466250186922325e-05, "loss": 8.2676, "step": 87 }, { "epoch": 0.1907859078590786, "grad_norm": 99.9498291015625, "learning_rate": 6.387014543809223e-05, "loss": 9.0324, "step": 88 }, { "epoch": 0.19295392953929538, "grad_norm": 79.33631896972656, "learning_rate": 6.307399704769099e-05, "loss": 5.7613, "step": 89 }, { "epoch": 0.1951219512195122, "grad_norm": 96.63216400146484, "learning_rate": 6.227427435703997e-05, "loss": 6.6183, "step": 90 }, { "epoch": 0.197289972899729, "grad_norm": 89.3034439086914, "learning_rate": 6.147119600233758e-05, "loss": 4.3904, "step": 91 }, { "epoch": 0.1994579945799458, "grad_norm": 82.56645965576172, "learning_rate": 6.066498153718735e-05, "loss": 5.1966, "step": 92 }, { "epoch": 0.2016260162601626, "grad_norm": 72.83575439453125, "learning_rate": 5.985585137257401e-05, "loss": 2.9211, "step": 93 }, { "epoch": 0.2037940379403794, "grad_norm": 124.21688842773438, "learning_rate": 5.90440267166055e-05, "loss": 2.023, "step": 94 }, { "epoch": 0.20596205962059622, "grad_norm": 103.04071044921875, "learning_rate": 5.8229729514036705e-05, "loss": 2.1432, "step": 95 }, { "epoch": 0.208130081300813, "grad_norm": 49.1472282409668, "learning_rate": 5.74131823855921e-05, "loss": 1.2012, "step": 96 }, { "epoch": 0.2102981029810298, "grad_norm": 93.72968292236328, "learning_rate": 5.6594608567103456e-05, "loss": 1.6871, "step": 97 }, { "epoch": 0.21246612466124662, "grad_norm": 100.07096862792969, "learning_rate": 5.577423184847932e-05, "loss": 1.3877, "step": 98 }, { "epoch": 0.2146341463414634, "grad_norm": 39.16847610473633, "learning_rate": 5.495227651252315e-05, "loss": 0.7818, "step": 99 }, { "epoch": 0.21680216802168023, "grad_norm": 56.72313690185547, "learning_rate": 5.4128967273616625e-05, "loss": 2.602, "step": 100 }, { "epoch": 0.21680216802168023, "eval_loss": 1.9244413375854492, "eval_runtime": 9.3416, "eval_samples_per_second": 83.177, "eval_steps_per_second": 20.874, "step": 100 }, { "epoch": 0.21897018970189702, "grad_norm": 35.613861083984375, "learning_rate": 5.330452921628497e-05, "loss": 4.964, "step": 101 }, { "epoch": 0.22113821138211381, "grad_norm": 40.78394317626953, "learning_rate": 5.247918773366112e-05, "loss": 11.7878, "step": 102 }, { "epoch": 0.22330623306233063, "grad_norm": 29.11028480529785, "learning_rate": 5.165316846586541e-05, "loss": 9.1479, "step": 103 }, { "epoch": 0.22547425474254743, "grad_norm": 35.1927604675293, "learning_rate": 5.0826697238317935e-05, "loss": 10.0267, "step": 104 }, { "epoch": 0.22764227642276422, "grad_norm": 49.512245178222656, "learning_rate": 5e-05, "loss": 8.8112, "step": 105 }, { "epoch": 0.22981029810298104, "grad_norm": 41.58905029296875, "learning_rate": 4.917330276168208e-05, "loss": 8.0448, "step": 106 }, { "epoch": 0.23197831978319783, "grad_norm": 29.24210548400879, "learning_rate": 4.834683153413459e-05, "loss": 8.3196, "step": 107 }, { "epoch": 0.23414634146341465, "grad_norm": 28.055253982543945, "learning_rate": 4.7520812266338885e-05, "loss": 8.8809, "step": 108 }, { "epoch": 0.23631436314363144, "grad_norm": 23.70207977294922, "learning_rate": 4.669547078371504e-05, "loss": 8.9278, "step": 109 }, { "epoch": 0.23848238482384823, "grad_norm": 39.66957092285156, "learning_rate": 4.5871032726383386e-05, "loss": 9.1542, "step": 110 }, { "epoch": 0.24065040650406505, "grad_norm": 22.6226749420166, "learning_rate": 4.504772348747687e-05, "loss": 8.8319, "step": 111 }, { "epoch": 0.24281842818428184, "grad_norm": 32.19573211669922, "learning_rate": 4.4225768151520694e-05, "loss": 9.4761, "step": 112 }, { "epoch": 0.24498644986449863, "grad_norm": 53.35695266723633, "learning_rate": 4.3405391432896555e-05, "loss": 8.1584, "step": 113 }, { "epoch": 0.24715447154471545, "grad_norm": 22.227432250976562, "learning_rate": 4.2586817614407895e-05, "loss": 9.1379, "step": 114 }, { "epoch": 0.24932249322493225, "grad_norm": 19.7288818359375, "learning_rate": 4.17702704859633e-05, "loss": 8.3286, "step": 115 }, { "epoch": 0.25149051490514907, "grad_norm": 20.1420841217041, "learning_rate": 4.095597328339452e-05, "loss": 8.4622, "step": 116 }, { "epoch": 0.25365853658536586, "grad_norm": 20.648754119873047, "learning_rate": 4.0144148627425993e-05, "loss": 8.313, "step": 117 }, { "epoch": 0.25582655826558265, "grad_norm": 18.873445510864258, "learning_rate": 3.933501846281267e-05, "loss": 8.4576, "step": 118 }, { "epoch": 0.25799457994579944, "grad_norm": 19.276405334472656, "learning_rate": 3.852880399766243e-05, "loss": 8.8261, "step": 119 }, { "epoch": 0.2601626016260163, "grad_norm": 26.74322509765625, "learning_rate": 3.772572564296005e-05, "loss": 8.7568, "step": 120 }, { "epoch": 0.2623306233062331, "grad_norm": 32.53742980957031, "learning_rate": 3.6926002952309016e-05, "loss": 9.0809, "step": 121 }, { "epoch": 0.26449864498644987, "grad_norm": 20.733869552612305, "learning_rate": 3.612985456190778e-05, "loss": 8.9164, "step": 122 }, { "epoch": 0.26666666666666666, "grad_norm": 25.21817970275879, "learning_rate": 3.533749813077677e-05, "loss": 9.0173, "step": 123 }, { "epoch": 0.26883468834688345, "grad_norm": 21.50628089904785, "learning_rate": 3.4549150281252636e-05, "loss": 8.8179, "step": 124 }, { "epoch": 0.27100271002710025, "grad_norm": 20.020416259765625, "learning_rate": 3.3765026539765834e-05, "loss": 9.306, "step": 125 }, { "epoch": 0.2731707317073171, "grad_norm": 20.90715789794922, "learning_rate": 3.298534127791785e-05, "loss": 9.0805, "step": 126 }, { "epoch": 0.2753387533875339, "grad_norm": 21.389572143554688, "learning_rate": 3.221030765387417e-05, "loss": 9.0048, "step": 127 }, { "epoch": 0.2775067750677507, "grad_norm": 26.47258758544922, "learning_rate": 3.144013755408895e-05, "loss": 8.9581, "step": 128 }, { "epoch": 0.27967479674796747, "grad_norm": 26.793968200683594, "learning_rate": 3.0675041535377405e-05, "loss": 8.1696, "step": 129 }, { "epoch": 0.28184281842818426, "grad_norm": 23.339202880859375, "learning_rate": 2.991522876735154e-05, "loss": 8.7234, "step": 130 }, { "epoch": 0.2840108401084011, "grad_norm": 26.5540771484375, "learning_rate": 2.916090697523549e-05, "loss": 8.2496, "step": 131 }, { "epoch": 0.2861788617886179, "grad_norm": 26.76554298400879, "learning_rate": 2.8412282383075363e-05, "loss": 8.164, "step": 132 }, { "epoch": 0.2883468834688347, "grad_norm": 25.66951560974121, "learning_rate": 2.766955965735968e-05, "loss": 8.5713, "step": 133 }, { "epoch": 0.2905149051490515, "grad_norm": 27.322519302368164, "learning_rate": 2.693294185106562e-05, "loss": 7.706, "step": 134 }, { "epoch": 0.2926829268292683, "grad_norm": 29.11598777770996, "learning_rate": 2.6202630348146324e-05, "loss": 9.3715, "step": 135 }, { "epoch": 0.2948509485094851, "grad_norm": 29.22189712524414, "learning_rate": 2.547882480847461e-05, "loss": 8.8805, "step": 136 }, { "epoch": 0.2970189701897019, "grad_norm": 40.12459182739258, "learning_rate": 2.476172311325783e-05, "loss": 7.1181, "step": 137 }, { "epoch": 0.2991869918699187, "grad_norm": 37.60042190551758, "learning_rate": 2.405152131093926e-05, "loss": 6.6358, "step": 138 }, { "epoch": 0.3013550135501355, "grad_norm": 44.48047637939453, "learning_rate": 2.3348413563600325e-05, "loss": 7.6617, "step": 139 }, { "epoch": 0.3035230352303523, "grad_norm": 69.62174224853516, "learning_rate": 2.2652592093878666e-05, "loss": 5.7551, "step": 140 }, { "epoch": 0.3056910569105691, "grad_norm": 52.42491149902344, "learning_rate": 2.196424713241637e-05, "loss": 2.2201, "step": 141 }, { "epoch": 0.30785907859078593, "grad_norm": 51.486175537109375, "learning_rate": 2.128356686585282e-05, "loss": 3.3775, "step": 142 }, { "epoch": 0.3100271002710027, "grad_norm": 58.227561950683594, "learning_rate": 2.061073738537635e-05, "loss": 1.907, "step": 143 }, { "epoch": 0.3121951219512195, "grad_norm": 51.19715881347656, "learning_rate": 1.9945942635848748e-05, "loss": 1.293, "step": 144 }, { "epoch": 0.3143631436314363, "grad_norm": 83.05162048339844, "learning_rate": 1.928936436551661e-05, "loss": 1.3919, "step": 145 }, { "epoch": 0.3165311653116531, "grad_norm": 46.72926712036133, "learning_rate": 1.8641182076323148e-05, "loss": 0.5313, "step": 146 }, { "epoch": 0.31869918699186994, "grad_norm": 61.03974914550781, "learning_rate": 1.800157297483417e-05, "loss": 0.7563, "step": 147 }, { "epoch": 0.32086720867208673, "grad_norm": 50.94718551635742, "learning_rate": 1.7370711923791567e-05, "loss": 0.8253, "step": 148 }, { "epoch": 0.3230352303523035, "grad_norm": 23.519289016723633, "learning_rate": 1.6748771394307585e-05, "loss": 0.4298, "step": 149 }, { "epoch": 0.3252032520325203, "grad_norm": 37.908599853515625, "learning_rate": 1.6135921418712956e-05, "loss": 1.921, "step": 150 }, { "epoch": 0.3252032520325203, "eval_loss": 1.788693904876709, "eval_runtime": 9.3835, "eval_samples_per_second": 82.805, "eval_steps_per_second": 20.781, "step": 150 }, { "epoch": 0.3273712737127371, "grad_norm": 23.393863677978516, "learning_rate": 1.553232954407171e-05, "loss": 6.4719, "step": 151 }, { "epoch": 0.32953929539295396, "grad_norm": 67.25944519042969, "learning_rate": 1.4938160786375572e-05, "loss": 9.8484, "step": 152 }, { "epoch": 0.33170731707317075, "grad_norm": 22.97135353088379, "learning_rate": 1.435357758543015e-05, "loss": 9.9367, "step": 153 }, { "epoch": 0.33387533875338754, "grad_norm": 21.3618221282959, "learning_rate": 1.3778739760445552e-05, "loss": 9.5086, "step": 154 }, { "epoch": 0.33604336043360433, "grad_norm": 24.72620391845703, "learning_rate": 1.3213804466343421e-05, "loss": 8.4169, "step": 155 }, { "epoch": 0.3382113821138211, "grad_norm": 23.45319366455078, "learning_rate": 1.2658926150792322e-05, "loss": 8.6031, "step": 156 }, { "epoch": 0.3403794037940379, "grad_norm": 29.1568660736084, "learning_rate": 1.2114256511983274e-05, "loss": 8.9896, "step": 157 }, { "epoch": 0.34254742547425476, "grad_norm": 21.772184371948242, "learning_rate": 1.157994445715706e-05, "loss": 8.3909, "step": 158 }, { "epoch": 0.34471544715447155, "grad_norm": 17.402957916259766, "learning_rate": 1.1056136061894384e-05, "loss": 8.1657, "step": 159 }, { "epoch": 0.34688346883468835, "grad_norm": 21.280426025390625, "learning_rate": 1.0542974530180327e-05, "loss": 8.2439, "step": 160 }, { "epoch": 0.34905149051490514, "grad_norm": 18.396419525146484, "learning_rate": 1.0040600155253765e-05, "loss": 8.3202, "step": 161 }, { "epoch": 0.35121951219512193, "grad_norm": 25.474815368652344, "learning_rate": 9.549150281252633e-06, "loss": 8.4802, "step": 162 }, { "epoch": 0.3533875338753388, "grad_norm": 22.321760177612305, "learning_rate": 9.068759265665384e-06, "loss": 8.9247, "step": 163 }, { "epoch": 0.35555555555555557, "grad_norm": 19.393081665039062, "learning_rate": 8.599558442598998e-06, "loss": 8.2353, "step": 164 }, { "epoch": 0.35772357723577236, "grad_norm": 22.461700439453125, "learning_rate": 8.141676086873572e-06, "loss": 8.3274, "step": 165 }, { "epoch": 0.35989159891598915, "grad_norm": 16.620500564575195, "learning_rate": 7.695237378953223e-06, "loss": 8.5916, "step": 166 }, { "epoch": 0.36205962059620594, "grad_norm": 21.414871215820312, "learning_rate": 7.260364370723044e-06, "loss": 8.1114, "step": 167 }, { "epoch": 0.3642276422764228, "grad_norm": 46.16633224487305, "learning_rate": 6.837175952121306e-06, "loss": 8.3523, "step": 168 }, { "epoch": 0.3663956639566396, "grad_norm": 28.88848114013672, "learning_rate": 6.425787818636131e-06, "loss": 7.8829, "step": 169 }, { "epoch": 0.3685636856368564, "grad_norm": 28.91385269165039, "learning_rate": 6.026312439675552e-06, "loss": 9.0534, "step": 170 }, { "epoch": 0.37073170731707317, "grad_norm": 20.6962833404541, "learning_rate": 5.6388590278194096e-06, "loss": 9.1964, "step": 171 }, { "epoch": 0.37289972899728996, "grad_norm": 19.90764808654785, "learning_rate": 5.263533508961827e-06, "loss": 9.1433, "step": 172 }, { "epoch": 0.37506775067750675, "grad_norm": 22.188579559326172, "learning_rate": 4.900438493352055e-06, "loss": 9.7097, "step": 173 }, { "epoch": 0.3772357723577236, "grad_norm": 19.773141860961914, "learning_rate": 4.549673247541875e-06, "loss": 8.6201, "step": 174 }, { "epoch": 0.3794037940379404, "grad_norm": 29.198694229125977, "learning_rate": 4.2113336672471245e-06, "loss": 8.9861, "step": 175 }, { "epoch": 0.3815718157181572, "grad_norm": 25.00130844116211, "learning_rate": 3.885512251130763e-06, "loss": 8.8726, "step": 176 }, { "epoch": 0.383739837398374, "grad_norm": 23.308883666992188, "learning_rate": 3.5722980755146517e-06, "loss": 9.429, "step": 177 }, { "epoch": 0.38590785907859076, "grad_norm": 22.925567626953125, "learning_rate": 3.271776770026963e-06, "loss": 8.6373, "step": 178 }, { "epoch": 0.3880758807588076, "grad_norm": 23.13882827758789, "learning_rate": 2.9840304941919415e-06, "loss": 9.5522, "step": 179 }, { "epoch": 0.3902439024390244, "grad_norm": 39.564605712890625, "learning_rate": 2.7091379149682685e-06, "loss": 8.0607, "step": 180 }, { "epoch": 0.3924119241192412, "grad_norm": 23.98958396911621, "learning_rate": 2.4471741852423237e-06, "loss": 9.0663, "step": 181 }, { "epoch": 0.394579945799458, "grad_norm": 27.768707275390625, "learning_rate": 2.1982109232821178e-06, "loss": 7.6105, "step": 182 }, { "epoch": 0.3967479674796748, "grad_norm": 27.029319763183594, "learning_rate": 1.962316193157593e-06, "loss": 7.9193, "step": 183 }, { "epoch": 0.3989159891598916, "grad_norm": 26.111921310424805, "learning_rate": 1.7395544861325718e-06, "loss": 8.5289, "step": 184 }, { "epoch": 0.4010840108401084, "grad_norm": 28.225522994995117, "learning_rate": 1.5299867030334814e-06, "loss": 9.1447, "step": 185 }, { "epoch": 0.4032520325203252, "grad_norm": 41.012489318847656, "learning_rate": 1.333670137599713e-06, "loss": 9.9929, "step": 186 }, { "epoch": 0.405420054200542, "grad_norm": 30.45234489440918, "learning_rate": 1.1506584608200367e-06, "loss": 8.7301, "step": 187 }, { "epoch": 0.4075880758807588, "grad_norm": 39.41157913208008, "learning_rate": 9.810017062595322e-07, "loss": 5.2084, "step": 188 }, { "epoch": 0.4097560975609756, "grad_norm": 44.882301330566406, "learning_rate": 8.247462563808817e-07, "loss": 4.658, "step": 189 }, { "epoch": 0.41192411924119243, "grad_norm": 67.9166488647461, "learning_rate": 6.819348298638839e-07, "loss": 6.5314, "step": 190 }, { "epoch": 0.4140921409214092, "grad_norm": 67.44029235839844, "learning_rate": 5.526064699265753e-07, "loss": 2.9418, "step": 191 }, { "epoch": 0.416260162601626, "grad_norm": 42.995975494384766, "learning_rate": 4.367965336512403e-07, "loss": 1.5539, "step": 192 }, { "epoch": 0.4184281842818428, "grad_norm": 72.60922241210938, "learning_rate": 3.3453668231809286e-07, "loss": 3.2945, "step": 193 }, { "epoch": 0.4205962059620596, "grad_norm": 59.845577239990234, "learning_rate": 2.458548727494292e-07, "loss": 2.0104, "step": 194 }, { "epoch": 0.42276422764227645, "grad_norm": 49.002201080322266, "learning_rate": 1.7077534966650766e-07, "loss": 0.894, "step": 195 }, { "epoch": 0.42493224932249324, "grad_norm": 50.669837951660156, "learning_rate": 1.0931863906127327e-07, "loss": 1.0598, "step": 196 }, { "epoch": 0.42710027100271003, "grad_norm": 38.22815704345703, "learning_rate": 6.150154258476315e-08, "loss": 0.7984, "step": 197 }, { "epoch": 0.4292682926829268, "grad_norm": 47.77947235107422, "learning_rate": 2.7337132953697554e-08, "loss": 1.0837, "step": 198 }, { "epoch": 0.4314363143631436, "grad_norm": 41.17764663696289, "learning_rate": 6.834750376549792e-09, "loss": 0.6525, "step": 199 }, { "epoch": 0.43360433604336046, "grad_norm": 48.83980941772461, "learning_rate": 0.0, "loss": 2.9829, "step": 200 }, { "epoch": 0.43360433604336046, "eval_loss": 1.7213791608810425, "eval_runtime": 9.3323, "eval_samples_per_second": 83.259, "eval_steps_per_second": 20.895, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5703441863081984e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }