{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 399, "global_step": 797, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012547051442910915, "grad_norm": 19.01241111755371, "learning_rate": 4.347826086956522e-07, "loss": 3.7662, "step": 1 }, { "epoch": 0.0012547051442910915, "eval_loss": 3.5013513565063477, "eval_runtime": 6.1042, "eval_samples_per_second": 109.924, "eval_steps_per_second": 6.881, "step": 1 }, { "epoch": 0.002509410288582183, "grad_norm": 16.68044090270996, "learning_rate": 8.695652173913044e-07, "loss": 4.9852, "step": 2 }, { "epoch": 0.0037641154328732747, "grad_norm": 20.054622650146484, "learning_rate": 1.3043478260869566e-06, "loss": 3.371, "step": 3 }, { "epoch": 0.005018820577164366, "grad_norm": 17.74432373046875, "learning_rate": 1.7391304347826088e-06, "loss": 3.2279, "step": 4 }, { "epoch": 0.006273525721455458, "grad_norm": 15.706853866577148, "learning_rate": 2.173913043478261e-06, "loss": 3.2849, "step": 5 }, { "epoch": 0.0075282308657465494, "grad_norm": 16.103614807128906, "learning_rate": 2.6086956521739132e-06, "loss": 3.3386, "step": 6 }, { "epoch": 0.00878293601003764, "grad_norm": 17.866008758544922, "learning_rate": 3.043478260869566e-06, "loss": 3.7729, "step": 7 }, { "epoch": 0.010037641154328732, "grad_norm": 15.147605895996094, "learning_rate": 3.4782608695652175e-06, "loss": 3.7953, "step": 8 }, { "epoch": 0.011292346298619825, "grad_norm": 18.901615142822266, "learning_rate": 3.91304347826087e-06, "loss": 3.5709, "step": 9 }, { "epoch": 0.012547051442910916, "grad_norm": 15.003190994262695, "learning_rate": 4.347826086956522e-06, "loss": 4.4118, "step": 10 }, { "epoch": 0.013801756587202008, "grad_norm": 16.06182098388672, "learning_rate": 4.782608695652174e-06, "loss": 3.6714, "step": 11 }, { "epoch": 0.015056461731493099, "grad_norm": 17.36846923828125, "learning_rate": 5.2173913043478265e-06, "loss": 3.8605, "step": 12 }, { "epoch": 0.01631116687578419, "grad_norm": 17.79241180419922, "learning_rate": 5.652173913043479e-06, "loss": 3.5272, "step": 13 }, { "epoch": 0.01756587202007528, "grad_norm": 14.266884803771973, "learning_rate": 6.086956521739132e-06, "loss": 3.6034, "step": 14 }, { "epoch": 0.018820577164366373, "grad_norm": 15.910148620605469, "learning_rate": 6.521739130434783e-06, "loss": 3.1557, "step": 15 }, { "epoch": 0.020075282308657464, "grad_norm": 16.65166473388672, "learning_rate": 6.956521739130435e-06, "loss": 3.1012, "step": 16 }, { "epoch": 0.02132998745294856, "grad_norm": 14.562183380126953, "learning_rate": 7.391304347826087e-06, "loss": 3.7211, "step": 17 }, { "epoch": 0.02258469259723965, "grad_norm": 12.386832237243652, "learning_rate": 7.82608695652174e-06, "loss": 3.4626, "step": 18 }, { "epoch": 0.02383939774153074, "grad_norm": 17.05539894104004, "learning_rate": 8.260869565217392e-06, "loss": 3.4974, "step": 19 }, { "epoch": 0.025094102885821833, "grad_norm": 20.27201271057129, "learning_rate": 8.695652173913044e-06, "loss": 3.6714, "step": 20 }, { "epoch": 0.026348808030112924, "grad_norm": 14.487604141235352, "learning_rate": 9.130434782608697e-06, "loss": 3.3083, "step": 21 }, { "epoch": 0.027603513174404015, "grad_norm": 16.102643966674805, "learning_rate": 9.565217391304349e-06, "loss": 3.4909, "step": 22 }, { "epoch": 0.028858218318695106, "grad_norm": 19.385705947875977, "learning_rate": 1e-05, "loss": 3.6031, "step": 23 }, { "epoch": 0.030112923462986198, "grad_norm": 16.015605926513672, "learning_rate": 9.999958813277235e-06, "loss": 3.2406, "step": 24 }, { "epoch": 0.03136762860727729, "grad_norm": 13.68839168548584, "learning_rate": 9.999835253787472e-06, "loss": 3.3278, "step": 25 }, { "epoch": 0.03262233375156838, "grad_norm": 14.528271675109863, "learning_rate": 9.999629323566323e-06, "loss": 2.9922, "step": 26 }, { "epoch": 0.033877038895859475, "grad_norm": 17.02483558654785, "learning_rate": 9.99934102600642e-06, "loss": 3.2691, "step": 27 }, { "epoch": 0.03513174404015056, "grad_norm": 16.797338485717773, "learning_rate": 9.998970365857374e-06, "loss": 3.3698, "step": 28 }, { "epoch": 0.03638644918444166, "grad_norm": 14.764263153076172, "learning_rate": 9.998517349225698e-06, "loss": 3.2543, "step": 29 }, { "epoch": 0.037641154328732745, "grad_norm": 14.670446395874023, "learning_rate": 9.9979819835747e-06, "loss": 3.2072, "step": 30 }, { "epoch": 0.03889585947302384, "grad_norm": 21.932998657226562, "learning_rate": 9.997364277724362e-06, "loss": 3.3685, "step": 31 }, { "epoch": 0.04015056461731493, "grad_norm": 18.700294494628906, "learning_rate": 9.996664241851197e-06, "loss": 2.9231, "step": 32 }, { "epoch": 0.04140526976160602, "grad_norm": 13.988219261169434, "learning_rate": 9.99588188748808e-06, "loss": 3.0859, "step": 33 }, { "epoch": 0.04265997490589712, "grad_norm": 15.462250709533691, "learning_rate": 9.995017227524049e-06, "loss": 3.5131, "step": 34 }, { "epoch": 0.043914680050188205, "grad_norm": 18.01273536682129, "learning_rate": 9.994070276204115e-06, "loss": 3.7054, "step": 35 }, { "epoch": 0.0451693851944793, "grad_norm": 21.7486515045166, "learning_rate": 9.993041049129005e-06, "loss": 3.2674, "step": 36 }, { "epoch": 0.04642409033877039, "grad_norm": 19.066349029541016, "learning_rate": 9.991929563254913e-06, "loss": 3.3495, "step": 37 }, { "epoch": 0.04767879548306148, "grad_norm": 20.077730178833008, "learning_rate": 9.990735836893226e-06, "loss": 3.396, "step": 38 }, { "epoch": 0.04893350062735257, "grad_norm": 14.028002738952637, "learning_rate": 9.989459889710214e-06, "loss": 2.9541, "step": 39 }, { "epoch": 0.050188205771643665, "grad_norm": 16.57832908630371, "learning_rate": 9.988101742726708e-06, "loss": 4.1811, "step": 40 }, { "epoch": 0.05144291091593475, "grad_norm": 16.29513931274414, "learning_rate": 9.986661418317759e-06, "loss": 3.282, "step": 41 }, { "epoch": 0.05269761606022585, "grad_norm": 19.623991012573242, "learning_rate": 9.985138940212264e-06, "loss": 2.7565, "step": 42 }, { "epoch": 0.053952321204516936, "grad_norm": 17.891677856445312, "learning_rate": 9.983534333492575e-06, "loss": 3.2153, "step": 43 }, { "epoch": 0.05520702634880803, "grad_norm": 15.788727760314941, "learning_rate": 9.981847624594093e-06, "loss": 3.2207, "step": 44 }, { "epoch": 0.056461731493099125, "grad_norm": 17.83070182800293, "learning_rate": 9.980078841304817e-06, "loss": 3.2656, "step": 45 }, { "epoch": 0.05771643663739021, "grad_norm": 15.635571479797363, "learning_rate": 9.978228012764904e-06, "loss": 3.0155, "step": 46 }, { "epoch": 0.05897114178168131, "grad_norm": 24.62743377685547, "learning_rate": 9.97629516946618e-06, "loss": 3.1795, "step": 47 }, { "epoch": 0.060225846925972396, "grad_norm": 16.98169708251953, "learning_rate": 9.974280343251637e-06, "loss": 3.3745, "step": 48 }, { "epoch": 0.06148055207026349, "grad_norm": 19.704118728637695, "learning_rate": 9.97218356731491e-06, "loss": 3.0855, "step": 49 }, { "epoch": 0.06273525721455459, "grad_norm": 15.05843734741211, "learning_rate": 9.970004876199731e-06, "loss": 2.9471, "step": 50 }, { "epoch": 0.06398996235884567, "grad_norm": 20.934194564819336, "learning_rate": 9.967744305799358e-06, "loss": 2.3807, "step": 51 }, { "epoch": 0.06524466750313676, "grad_norm": 19.842937469482422, "learning_rate": 9.965401893355985e-06, "loss": 3.0332, "step": 52 }, { "epoch": 0.06649937264742785, "grad_norm": 14.741073608398438, "learning_rate": 9.962977677460132e-06, "loss": 3.2762, "step": 53 }, { "epoch": 0.06775407779171895, "grad_norm": 16.709836959838867, "learning_rate": 9.96047169805e-06, "loss": 2.8045, "step": 54 }, { "epoch": 0.06900878293601004, "grad_norm": 22.069616317749023, "learning_rate": 9.957883996410821e-06, "loss": 2.9735, "step": 55 }, { "epoch": 0.07026348808030113, "grad_norm": 17.32145881652832, "learning_rate": 9.955214615174174e-06, "loss": 3.2817, "step": 56 }, { "epoch": 0.07151819322459223, "grad_norm": 16.994312286376953, "learning_rate": 9.952463598317286e-06, "loss": 2.9389, "step": 57 }, { "epoch": 0.07277289836888332, "grad_norm": 16.050113677978516, "learning_rate": 9.949630991162304e-06, "loss": 2.6915, "step": 58 }, { "epoch": 0.0740276035131744, "grad_norm": 12.047767639160156, "learning_rate": 9.946716840375552e-06, "loss": 3.1678, "step": 59 }, { "epoch": 0.07528230865746549, "grad_norm": 21.182559967041016, "learning_rate": 9.943721193966755e-06, "loss": 3.0534, "step": 60 }, { "epoch": 0.07653701380175659, "grad_norm": 15.370920181274414, "learning_rate": 9.940644101288259e-06, "loss": 2.9404, "step": 61 }, { "epoch": 0.07779171894604768, "grad_norm": 17.937530517578125, "learning_rate": 9.937485613034209e-06, "loss": 3.1182, "step": 62 }, { "epoch": 0.07904642409033877, "grad_norm": 15.242935180664062, "learning_rate": 9.934245781239714e-06, "loss": 3.2562, "step": 63 }, { "epoch": 0.08030112923462986, "grad_norm": 19.794172286987305, "learning_rate": 9.93092465928e-06, "loss": 2.9321, "step": 64 }, { "epoch": 0.08155583437892096, "grad_norm": 12.679776191711426, "learning_rate": 9.927522301869515e-06, "loss": 2.4835, "step": 65 }, { "epoch": 0.08281053952321205, "grad_norm": 17.018342971801758, "learning_rate": 9.924038765061042e-06, "loss": 2.7064, "step": 66 }, { "epoch": 0.08406524466750313, "grad_norm": 17.553468704223633, "learning_rate": 9.920474106244764e-06, "loss": 3.509, "step": 67 }, { "epoch": 0.08531994981179424, "grad_norm": 18.098421096801758, "learning_rate": 9.91682838414733e-06, "loss": 3.1203, "step": 68 }, { "epoch": 0.08657465495608532, "grad_norm": 20.72344398498535, "learning_rate": 9.913101658830879e-06, "loss": 3.1849, "step": 69 }, { "epoch": 0.08782936010037641, "grad_norm": 18.358638763427734, "learning_rate": 9.909293991692049e-06, "loss": 3.4645, "step": 70 }, { "epoch": 0.0890840652446675, "grad_norm": 19.074031829833984, "learning_rate": 9.905405445460972e-06, "loss": 2.9314, "step": 71 }, { "epoch": 0.0903387703889586, "grad_norm": 18.550411224365234, "learning_rate": 9.90143608420024e-06, "loss": 2.8456, "step": 72 }, { "epoch": 0.09159347553324969, "grad_norm": 23.823490142822266, "learning_rate": 9.897385973303845e-06, "loss": 3.3129, "step": 73 }, { "epoch": 0.09284818067754078, "grad_norm": 17.849550247192383, "learning_rate": 9.893255179496106e-06, "loss": 2.7676, "step": 74 }, { "epoch": 0.09410288582183186, "grad_norm": 14.884727478027344, "learning_rate": 9.889043770830566e-06, "loss": 2.9774, "step": 75 }, { "epoch": 0.09535759096612297, "grad_norm": 15.499114990234375, "learning_rate": 9.884751816688873e-06, "loss": 2.5129, "step": 76 }, { "epoch": 0.09661229611041405, "grad_norm": 17.986732482910156, "learning_rate": 9.880379387779637e-06, "loss": 3.5453, "step": 77 }, { "epoch": 0.09786700125470514, "grad_norm": 16.63545036315918, "learning_rate": 9.875926556137265e-06, "loss": 2.9293, "step": 78 }, { "epoch": 0.09912170639899624, "grad_norm": 17.408201217651367, "learning_rate": 9.871393395120774e-06, "loss": 3.1488, "step": 79 }, { "epoch": 0.10037641154328733, "grad_norm": 17.50285530090332, "learning_rate": 9.866779979412583e-06, "loss": 2.7078, "step": 80 }, { "epoch": 0.10163111668757842, "grad_norm": 16.590560913085938, "learning_rate": 9.862086385017283e-06, "loss": 2.8491, "step": 81 }, { "epoch": 0.1028858218318695, "grad_norm": 18.618976593017578, "learning_rate": 9.85731268926038e-06, "loss": 3.0485, "step": 82 }, { "epoch": 0.10414052697616061, "grad_norm": 17.413230895996094, "learning_rate": 9.852458970787027e-06, "loss": 3.0812, "step": 83 }, { "epoch": 0.1053952321204517, "grad_norm": 14.060961723327637, "learning_rate": 9.847525309560729e-06, "loss": 2.5551, "step": 84 }, { "epoch": 0.10664993726474278, "grad_norm": 14.511148452758789, "learning_rate": 9.842511786862018e-06, "loss": 2.8406, "step": 85 }, { "epoch": 0.10790464240903387, "grad_norm": 18.97178077697754, "learning_rate": 9.837418485287126e-06, "loss": 3.2963, "step": 86 }, { "epoch": 0.10915934755332497, "grad_norm": 13.818567276000977, "learning_rate": 9.832245488746612e-06, "loss": 2.6757, "step": 87 }, { "epoch": 0.11041405269761606, "grad_norm": 18.294200897216797, "learning_rate": 9.826992882463982e-06, "loss": 2.3428, "step": 88 }, { "epoch": 0.11166875784190715, "grad_norm": 17.605432510375977, "learning_rate": 9.821660752974294e-06, "loss": 2.8555, "step": 89 }, { "epoch": 0.11292346298619825, "grad_norm": 16.119766235351562, "learning_rate": 9.816249188122724e-06, "loss": 2.8055, "step": 90 }, { "epoch": 0.11417816813048934, "grad_norm": 16.537944793701172, "learning_rate": 9.81075827706312e-06, "loss": 2.7496, "step": 91 }, { "epoch": 0.11543287327478043, "grad_norm": 18.349796295166016, "learning_rate": 9.805188110256533e-06, "loss": 2.5472, "step": 92 }, { "epoch": 0.11668757841907151, "grad_norm": 21.679128646850586, "learning_rate": 9.799538779469734e-06, "loss": 2.9006, "step": 93 }, { "epoch": 0.11794228356336262, "grad_norm": 15.701348304748535, "learning_rate": 9.793810377773688e-06, "loss": 2.434, "step": 94 }, { "epoch": 0.1191969887076537, "grad_norm": 17.04868507385254, "learning_rate": 9.78800299954203e-06, "loss": 2.4092, "step": 95 }, { "epoch": 0.12045169385194479, "grad_norm": 17.143634796142578, "learning_rate": 9.782116740449515e-06, "loss": 2.979, "step": 96 }, { "epoch": 0.12170639899623588, "grad_norm": 16.7327880859375, "learning_rate": 9.776151697470431e-06, "loss": 2.9258, "step": 97 }, { "epoch": 0.12296110414052698, "grad_norm": 19.429100036621094, "learning_rate": 9.770107968877004e-06, "loss": 3.0748, "step": 98 }, { "epoch": 0.12421580928481807, "grad_norm": 15.504218101501465, "learning_rate": 9.763985654237785e-06, "loss": 3.0054, "step": 99 }, { "epoch": 0.12547051442910917, "grad_norm": 16.84503936767578, "learning_rate": 9.757784854416006e-06, "loss": 3.2136, "step": 100 }, { "epoch": 0.12672521957340024, "grad_norm": 16.334318161010742, "learning_rate": 9.751505671567914e-06, "loss": 2.5939, "step": 101 }, { "epoch": 0.12797992471769135, "grad_norm": 15.902310371398926, "learning_rate": 9.745148209141094e-06, "loss": 2.4743, "step": 102 }, { "epoch": 0.12923462986198245, "grad_norm": 13.628096580505371, "learning_rate": 9.738712571872765e-06, "loss": 2.2579, "step": 103 }, { "epoch": 0.13048933500627352, "grad_norm": 17.617816925048828, "learning_rate": 9.732198865788047e-06, "loss": 2.4754, "step": 104 }, { "epoch": 0.13174404015056462, "grad_norm": 18.667858123779297, "learning_rate": 9.725607198198227e-06, "loss": 2.6638, "step": 105 }, { "epoch": 0.1329987452948557, "grad_norm": 15.029777526855469, "learning_rate": 9.718937677698976e-06, "loss": 2.8075, "step": 106 }, { "epoch": 0.1342534504391468, "grad_norm": 18.5529727935791, "learning_rate": 9.712190414168573e-06, "loss": 2.627, "step": 107 }, { "epoch": 0.1355081555834379, "grad_norm": 17.021556854248047, "learning_rate": 9.705365518766085e-06, "loss": 2.2912, "step": 108 }, { "epoch": 0.13676286072772897, "grad_norm": 17.83435821533203, "learning_rate": 9.698463103929542e-06, "loss": 2.3247, "step": 109 }, { "epoch": 0.13801756587202008, "grad_norm": 17.74312400817871, "learning_rate": 9.691483283374085e-06, "loss": 2.5844, "step": 110 }, { "epoch": 0.13927227101631118, "grad_norm": 22.43841552734375, "learning_rate": 9.684426172090084e-06, "loss": 3.1616, "step": 111 }, { "epoch": 0.14052697616060225, "grad_norm": 16.035985946655273, "learning_rate": 9.677291886341256e-06, "loss": 2.5391, "step": 112 }, { "epoch": 0.14178168130489335, "grad_norm": 20.342103958129883, "learning_rate": 9.670080543662742e-06, "loss": 2.5258, "step": 113 }, { "epoch": 0.14303638644918445, "grad_norm": 20.725093841552734, "learning_rate": 9.662792262859167e-06, "loss": 2.5076, "step": 114 }, { "epoch": 0.14429109159347553, "grad_norm": 15.233530044555664, "learning_rate": 9.655427164002692e-06, "loss": 2.3355, "step": 115 }, { "epoch": 0.14554579673776663, "grad_norm": 15.496427536010742, "learning_rate": 9.647985368431031e-06, "loss": 2.5312, "step": 116 }, { "epoch": 0.1468005018820577, "grad_norm": 24.412311553955078, "learning_rate": 9.640466998745456e-06, "loss": 2.7875, "step": 117 }, { "epoch": 0.1480552070263488, "grad_norm": 15.683626174926758, "learning_rate": 9.632872178808766e-06, "loss": 2.2883, "step": 118 }, { "epoch": 0.1493099121706399, "grad_norm": 17.257770538330078, "learning_rate": 9.625201033743262e-06, "loss": 2.8936, "step": 119 }, { "epoch": 0.15056461731493098, "grad_norm": 19.208641052246094, "learning_rate": 9.617453689928668e-06, "loss": 2.7428, "step": 120 }, { "epoch": 0.15181932245922208, "grad_norm": 17.00638771057129, "learning_rate": 9.609630275000072e-06, "loss": 2.5065, "step": 121 }, { "epoch": 0.15307402760351319, "grad_norm": 17.896059036254883, "learning_rate": 9.601730917845798e-06, "loss": 2.4492, "step": 122 }, { "epoch": 0.15432873274780426, "grad_norm": 17.655044555664062, "learning_rate": 9.5937557486053e-06, "loss": 2.3202, "step": 123 }, { "epoch": 0.15558343789209536, "grad_norm": 19.35125732421875, "learning_rate": 9.585704898667015e-06, "loss": 2.5956, "step": 124 }, { "epoch": 0.15683814303638646, "grad_norm": 17.047664642333984, "learning_rate": 9.577578500666187e-06, "loss": 2.547, "step": 125 }, { "epoch": 0.15809284818067754, "grad_norm": 17.756309509277344, "learning_rate": 9.5693766884827e-06, "loss": 2.6131, "step": 126 }, { "epoch": 0.15934755332496864, "grad_norm": 18.9345760345459, "learning_rate": 9.561099597238862e-06, "loss": 2.4613, "step": 127 }, { "epoch": 0.1606022584692597, "grad_norm": 16.88786506652832, "learning_rate": 9.552747363297172e-06, "loss": 2.363, "step": 128 }, { "epoch": 0.1618569636135508, "grad_norm": 17.6533203125, "learning_rate": 9.544320124258093e-06, "loss": 2.453, "step": 129 }, { "epoch": 0.16311166875784192, "grad_norm": 19.48556137084961, "learning_rate": 9.535818018957768e-06, "loss": 2.2917, "step": 130 }, { "epoch": 0.164366373902133, "grad_norm": 17.511598587036133, "learning_rate": 9.527241187465735e-06, "loss": 2.2477, "step": 131 }, { "epoch": 0.1656210790464241, "grad_norm": 15.644845008850098, "learning_rate": 9.518589771082627e-06, "loss": 2.6145, "step": 132 }, { "epoch": 0.1668757841907152, "grad_norm": 13.586119651794434, "learning_rate": 9.509863912337843e-06, "loss": 2.3622, "step": 133 }, { "epoch": 0.16813048933500627, "grad_norm": 18.941696166992188, "learning_rate": 9.501063754987188e-06, "loss": 2.4396, "step": 134 }, { "epoch": 0.16938519447929737, "grad_norm": 19.57110023498535, "learning_rate": 9.492189444010522e-06, "loss": 2.082, "step": 135 }, { "epoch": 0.17063989962358847, "grad_norm": 16.997098922729492, "learning_rate": 9.483241125609358e-06, "loss": 2.1185, "step": 136 }, { "epoch": 0.17189460476787954, "grad_norm": 20.234926223754883, "learning_rate": 9.47421894720446e-06, "loss": 2.487, "step": 137 }, { "epoch": 0.17314930991217065, "grad_norm": 20.660642623901367, "learning_rate": 9.465123057433413e-06, "loss": 2.1378, "step": 138 }, { "epoch": 0.17440401505646172, "grad_norm": 21.305038452148438, "learning_rate": 9.455953606148172e-06, "loss": 2.7265, "step": 139 }, { "epoch": 0.17565872020075282, "grad_norm": 20.652212142944336, "learning_rate": 9.446710744412595e-06, "loss": 2.3179, "step": 140 }, { "epoch": 0.17691342534504392, "grad_norm": 22.552457809448242, "learning_rate": 9.437394624499957e-06, "loss": 2.2027, "step": 141 }, { "epoch": 0.178168130489335, "grad_norm": 18.889108657836914, "learning_rate": 9.428005399890442e-06, "loss": 2.3326, "step": 142 }, { "epoch": 0.1794228356336261, "grad_norm": 18.121183395385742, "learning_rate": 9.418543225268598e-06, "loss": 2.0905, "step": 143 }, { "epoch": 0.1806775407779172, "grad_norm": 28.54220199584961, "learning_rate": 9.409008256520814e-06, "loss": 2.1567, "step": 144 }, { "epoch": 0.18193224592220827, "grad_norm": 28.761722564697266, "learning_rate": 9.399400650732735e-06, "loss": 2.3487, "step": 145 }, { "epoch": 0.18318695106649938, "grad_norm": 20.803058624267578, "learning_rate": 9.38972056618668e-06, "loss": 2.4545, "step": 146 }, { "epoch": 0.18444165621079048, "grad_norm": 14.15235424041748, "learning_rate": 9.379968162359034e-06, "loss": 2.1002, "step": 147 }, { "epoch": 0.18569636135508155, "grad_norm": 18.501392364501953, "learning_rate": 9.370143599917617e-06, "loss": 2.1081, "step": 148 }, { "epoch": 0.18695106649937265, "grad_norm": 23.19183921813965, "learning_rate": 9.36024704071904e-06, "loss": 2.2682, "step": 149 }, { "epoch": 0.18820577164366373, "grad_norm": 21.424211502075195, "learning_rate": 9.350278647806037e-06, "loss": 2.3408, "step": 150 }, { "epoch": 0.18946047678795483, "grad_norm": 22.568864822387695, "learning_rate": 9.340238585404787e-06, "loss": 2.357, "step": 151 }, { "epoch": 0.19071518193224593, "grad_norm": 17.558080673217773, "learning_rate": 9.330127018922195e-06, "loss": 2.1341, "step": 152 }, { "epoch": 0.191969887076537, "grad_norm": 21.05203628540039, "learning_rate": 9.319944114943171e-06, "loss": 2.736, "step": 153 }, { "epoch": 0.1932245922208281, "grad_norm": 28.293092727661133, "learning_rate": 9.309690041227898e-06, "loss": 2.4961, "step": 154 }, { "epoch": 0.1944792973651192, "grad_norm": 21.68331527709961, "learning_rate": 9.299364966709051e-06, "loss": 2.2222, "step": 155 }, { "epoch": 0.19573400250941028, "grad_norm": 28.366355895996094, "learning_rate": 9.28896906148902e-06, "loss": 2.719, "step": 156 }, { "epoch": 0.19698870765370138, "grad_norm": 25.245935440063477, "learning_rate": 9.278502496837116e-06, "loss": 2.4558, "step": 157 }, { "epoch": 0.19824341279799249, "grad_norm": 34.29158020019531, "learning_rate": 9.267965445186733e-06, "loss": 2.1928, "step": 158 }, { "epoch": 0.19949811794228356, "grad_norm": 23.639026641845703, "learning_rate": 9.257358080132524e-06, "loss": 1.8916, "step": 159 }, { "epoch": 0.20075282308657466, "grad_norm": 17.318647384643555, "learning_rate": 9.24668057642753e-06, "loss": 2.2254, "step": 160 }, { "epoch": 0.20200752823086573, "grad_norm": 18.8333740234375, "learning_rate": 9.235933109980302e-06, "loss": 2.0529, "step": 161 }, { "epoch": 0.20326223337515684, "grad_norm": 20.41586685180664, "learning_rate": 9.225115857852015e-06, "loss": 2.0644, "step": 162 }, { "epoch": 0.20451693851944794, "grad_norm": 22.13117218017578, "learning_rate": 9.214228998253526e-06, "loss": 2.2199, "step": 163 }, { "epoch": 0.205771643663739, "grad_norm": 22.590608596801758, "learning_rate": 9.20327271054247e-06, "loss": 1.9851, "step": 164 }, { "epoch": 0.20702634880803011, "grad_norm": 19.450021743774414, "learning_rate": 9.192247175220276e-06, "loss": 2.1396, "step": 165 }, { "epoch": 0.20828105395232122, "grad_norm": 24.714031219482422, "learning_rate": 9.181152573929215e-06, "loss": 2.0162, "step": 166 }, { "epoch": 0.2095357590966123, "grad_norm": 25.66572380065918, "learning_rate": 9.16998908944939e-06, "loss": 2.1091, "step": 167 }, { "epoch": 0.2107904642409034, "grad_norm": 24.950700759887695, "learning_rate": 9.15875690569574e-06, "loss": 2.2533, "step": 168 }, { "epoch": 0.2120451693851945, "grad_norm": 23.020002365112305, "learning_rate": 9.147456207714998e-06, "loss": 2.3229, "step": 169 }, { "epoch": 0.21329987452948557, "grad_norm": 22.205028533935547, "learning_rate": 9.13608718168265e-06, "loss": 2.3614, "step": 170 }, { "epoch": 0.21455457967377667, "grad_norm": 19.170259475708008, "learning_rate": 9.124650014899868e-06, "loss": 2.1497, "step": 171 }, { "epoch": 0.21580928481806774, "grad_norm": 18.129199981689453, "learning_rate": 9.113144895790416e-06, "loss": 2.2325, "step": 172 }, { "epoch": 0.21706398996235884, "grad_norm": 18.413124084472656, "learning_rate": 9.101572013897555e-06, "loss": 1.8652, "step": 173 }, { "epoch": 0.21831869510664995, "grad_norm": 18.207448959350586, "learning_rate": 9.089931559880918e-06, "loss": 1.9094, "step": 174 }, { "epoch": 0.21957340025094102, "grad_norm": 26.02681541442871, "learning_rate": 9.078223725513366e-06, "loss": 2.2922, "step": 175 }, { "epoch": 0.22082810539523212, "grad_norm": 30.541122436523438, "learning_rate": 9.066448703677828e-06, "loss": 1.8914, "step": 176 }, { "epoch": 0.22208281053952322, "grad_norm": 19.35504722595215, "learning_rate": 9.05460668836413e-06, "loss": 2.0448, "step": 177 }, { "epoch": 0.2233375156838143, "grad_norm": 24.406612396240234, "learning_rate": 9.04269787466579e-06, "loss": 2.2088, "step": 178 }, { "epoch": 0.2245922208281054, "grad_norm": 28.934782028198242, "learning_rate": 9.030722458776815e-06, "loss": 2.0474, "step": 179 }, { "epoch": 0.2258469259723965, "grad_norm": 23.718971252441406, "learning_rate": 9.018680637988456e-06, "loss": 2.1075, "step": 180 }, { "epoch": 0.22710163111668757, "grad_norm": 19.34891700744629, "learning_rate": 9.006572610685969e-06, "loss": 2.0024, "step": 181 }, { "epoch": 0.22835633626097868, "grad_norm": 17.186641693115234, "learning_rate": 8.994398576345335e-06, "loss": 1.8304, "step": 182 }, { "epoch": 0.22961104140526975, "grad_norm": 23.781911849975586, "learning_rate": 8.982158735529991e-06, "loss": 1.8478, "step": 183 }, { "epoch": 0.23086574654956085, "grad_norm": 28.87154769897461, "learning_rate": 8.969853289887507e-06, "loss": 1.9214, "step": 184 }, { "epoch": 0.23212045169385195, "grad_norm": 24.24917221069336, "learning_rate": 8.957482442146271e-06, "loss": 1.8442, "step": 185 }, { "epoch": 0.23337515683814303, "grad_norm": 23.922151565551758, "learning_rate": 8.945046396112158e-06, "loss": 1.9284, "step": 186 }, { "epoch": 0.23462986198243413, "grad_norm": 22.065723419189453, "learning_rate": 8.932545356665157e-06, "loss": 1.8711, "step": 187 }, { "epoch": 0.23588456712672523, "grad_norm": 28.266712188720703, "learning_rate": 8.919979529756008e-06, "loss": 1.8295, "step": 188 }, { "epoch": 0.2371392722710163, "grad_norm": 22.024778366088867, "learning_rate": 8.907349122402803e-06, "loss": 1.9236, "step": 189 }, { "epoch": 0.2383939774153074, "grad_norm": 17.683101654052734, "learning_rate": 8.894654342687574e-06, "loss": 1.8348, "step": 190 }, { "epoch": 0.2396486825595985, "grad_norm": 26.601009368896484, "learning_rate": 8.881895399752873e-06, "loss": 1.7325, "step": 191 }, { "epoch": 0.24090338770388958, "grad_norm": 30.148361206054688, "learning_rate": 8.869072503798315e-06, "loss": 2.0121, "step": 192 }, { "epoch": 0.24215809284818068, "grad_norm": 23.811433792114258, "learning_rate": 8.85618586607713e-06, "loss": 1.7341, "step": 193 }, { "epoch": 0.24341279799247176, "grad_norm": 17.06600570678711, "learning_rate": 8.843235698892661e-06, "loss": 1.7895, "step": 194 }, { "epoch": 0.24466750313676286, "grad_norm": 21.146913528442383, "learning_rate": 8.83022221559489e-06, "loss": 1.8371, "step": 195 }, { "epoch": 0.24592220828105396, "grad_norm": 22.374889373779297, "learning_rate": 8.81714563057691e-06, "loss": 2.0259, "step": 196 }, { "epoch": 0.24717691342534504, "grad_norm": 23.482807159423828, "learning_rate": 8.80400615927139e-06, "loss": 2.126, "step": 197 }, { "epoch": 0.24843161856963614, "grad_norm": 20.430444717407227, "learning_rate": 8.790804018147039e-06, "loss": 1.5703, "step": 198 }, { "epoch": 0.24968632371392724, "grad_norm": 29.053224563598633, "learning_rate": 8.777539424705022e-06, "loss": 1.9014, "step": 199 }, { "epoch": 0.25094102885821834, "grad_norm": 22.412776947021484, "learning_rate": 8.764212597475397e-06, "loss": 1.9072, "step": 200 }, { "epoch": 0.2521957340025094, "grad_norm": 27.57085418701172, "learning_rate": 8.750823756013498e-06, "loss": 2.0304, "step": 201 }, { "epoch": 0.2534504391468005, "grad_norm": 21.350475311279297, "learning_rate": 8.737373120896325e-06, "loss": 1.797, "step": 202 }, { "epoch": 0.2547051442910916, "grad_norm": 25.71649169921875, "learning_rate": 8.72386091371891e-06, "loss": 1.9805, "step": 203 }, { "epoch": 0.2559598494353827, "grad_norm": 24.62053108215332, "learning_rate": 8.710287357090666e-06, "loss": 1.6377, "step": 204 }, { "epoch": 0.2572145545796738, "grad_norm": 26.515974044799805, "learning_rate": 8.696652674631716e-06, "loss": 2.2071, "step": 205 }, { "epoch": 0.2584692597239649, "grad_norm": 22.19689178466797, "learning_rate": 8.68295709096922e-06, "loss": 1.8681, "step": 206 }, { "epoch": 0.25972396486825594, "grad_norm": 22.31092643737793, "learning_rate": 8.669200831733655e-06, "loss": 1.643, "step": 207 }, { "epoch": 0.26097867001254704, "grad_norm": 18.85532569885254, "learning_rate": 8.655384123555117e-06, "loss": 1.669, "step": 208 }, { "epoch": 0.26223337515683814, "grad_norm": 24.516279220581055, "learning_rate": 8.64150719405958e-06, "loss": 1.8626, "step": 209 }, { "epoch": 0.26348808030112925, "grad_norm": 20.873056411743164, "learning_rate": 8.627570271865143e-06, "loss": 1.6009, "step": 210 }, { "epoch": 0.26474278544542035, "grad_norm": 26.961584091186523, "learning_rate": 8.613573586578262e-06, "loss": 1.8991, "step": 211 }, { "epoch": 0.2659974905897114, "grad_norm": 23.05677032470703, "learning_rate": 8.599517368789981e-06, "loss": 1.6264, "step": 212 }, { "epoch": 0.2672521957340025, "grad_norm": 23.3626766204834, "learning_rate": 8.585401850072114e-06, "loss": 1.763, "step": 213 }, { "epoch": 0.2685069008782936, "grad_norm": 22.876678466796875, "learning_rate": 8.571227262973444e-06, "loss": 1.8171, "step": 214 }, { "epoch": 0.2697616060225847, "grad_norm": 21.870689392089844, "learning_rate": 8.55699384101589e-06, "loss": 1.7618, "step": 215 }, { "epoch": 0.2710163111668758, "grad_norm": 23.80776023864746, "learning_rate": 8.54270181869065e-06, "loss": 1.7353, "step": 216 }, { "epoch": 0.2722710163111669, "grad_norm": 21.69217872619629, "learning_rate": 8.528351431454352e-06, "loss": 1.8667, "step": 217 }, { "epoch": 0.27352572145545795, "grad_norm": 22.88399887084961, "learning_rate": 8.513942915725159e-06, "loss": 1.7512, "step": 218 }, { "epoch": 0.27478042659974905, "grad_norm": 22.40818977355957, "learning_rate": 8.499476508878894e-06, "loss": 1.7168, "step": 219 }, { "epoch": 0.27603513174404015, "grad_norm": 25.04762840270996, "learning_rate": 8.484952449245107e-06, "loss": 1.6717, "step": 220 }, { "epoch": 0.27728983688833125, "grad_norm": 22.810468673706055, "learning_rate": 8.470370976103171e-06, "loss": 1.8007, "step": 221 }, { "epoch": 0.27854454203262236, "grad_norm": 24.604190826416016, "learning_rate": 8.455732329678317e-06, "loss": 1.9564, "step": 222 }, { "epoch": 0.2797992471769134, "grad_norm": 27.309738159179688, "learning_rate": 8.441036751137697e-06, "loss": 1.6334, "step": 223 }, { "epoch": 0.2810539523212045, "grad_norm": 29.318500518798828, "learning_rate": 8.426284482586397e-06, "loss": 1.6922, "step": 224 }, { "epoch": 0.2823086574654956, "grad_norm": 28.5482177734375, "learning_rate": 8.411475767063454e-06, "loss": 1.8862, "step": 225 }, { "epoch": 0.2835633626097867, "grad_norm": 25.247356414794922, "learning_rate": 8.396610848537858e-06, "loss": 1.7688, "step": 226 }, { "epoch": 0.2848180677540778, "grad_norm": 24.79906463623047, "learning_rate": 8.381689971904514e-06, "loss": 1.7844, "step": 227 }, { "epoch": 0.2860727728983689, "grad_norm": 28.987627029418945, "learning_rate": 8.36671338298023e-06, "loss": 1.7785, "step": 228 }, { "epoch": 0.28732747804265996, "grad_norm": 25.145153045654297, "learning_rate": 8.35168132849965e-06, "loss": 1.7741, "step": 229 }, { "epoch": 0.28858218318695106, "grad_norm": 22.089122772216797, "learning_rate": 8.336594056111197e-06, "loss": 1.5078, "step": 230 }, { "epoch": 0.28983688833124216, "grad_norm": 27.65213966369629, "learning_rate": 8.321451814372998e-06, "loss": 1.7603, "step": 231 }, { "epoch": 0.29109159347553326, "grad_norm": 33.60897445678711, "learning_rate": 8.306254852748773e-06, "loss": 1.7254, "step": 232 }, { "epoch": 0.29234629861982436, "grad_norm": 25.02092933654785, "learning_rate": 8.29100342160374e-06, "loss": 1.795, "step": 233 }, { "epoch": 0.2936010037641154, "grad_norm": 21.960206985473633, "learning_rate": 8.275697772200491e-06, "loss": 1.7087, "step": 234 }, { "epoch": 0.2948557089084065, "grad_norm": 29.953306198120117, "learning_rate": 8.260338156694836e-06, "loss": 1.4295, "step": 235 }, { "epoch": 0.2961104140526976, "grad_norm": 26.209787368774414, "learning_rate": 8.244924828131668e-06, "loss": 1.4427, "step": 236 }, { "epoch": 0.2973651191969887, "grad_norm": 23.775861740112305, "learning_rate": 8.229458040440783e-06, "loss": 1.7755, "step": 237 }, { "epoch": 0.2986198243412798, "grad_norm": 22.297338485717773, "learning_rate": 8.213938048432697e-06, "loss": 1.5213, "step": 238 }, { "epoch": 0.2998745294855709, "grad_norm": 24.113645553588867, "learning_rate": 8.198365107794457e-06, "loss": 1.5942, "step": 239 }, { "epoch": 0.30112923462986196, "grad_norm": 24.177122116088867, "learning_rate": 8.182739475085417e-06, "loss": 1.8395, "step": 240 }, { "epoch": 0.30238393977415307, "grad_norm": 28.40700912475586, "learning_rate": 8.167061407733018e-06, "loss": 1.6086, "step": 241 }, { "epoch": 0.30363864491844417, "grad_norm": 24.49298667907715, "learning_rate": 8.151331164028544e-06, "loss": 1.5645, "step": 242 }, { "epoch": 0.30489335006273527, "grad_norm": 33.37433624267578, "learning_rate": 8.135549003122871e-06, "loss": 1.698, "step": 243 }, { "epoch": 0.30614805520702637, "grad_norm": 24.059009552001953, "learning_rate": 8.119715185022195e-06, "loss": 1.5047, "step": 244 }, { "epoch": 0.3074027603513174, "grad_norm": 29.42665672302246, "learning_rate": 8.103829970583742e-06, "loss": 1.68, "step": 245 }, { "epoch": 0.3086574654956085, "grad_norm": 29.08376121520996, "learning_rate": 8.087893621511487e-06, "loss": 1.5872, "step": 246 }, { "epoch": 0.3099121706398996, "grad_norm": 28.20993995666504, "learning_rate": 8.071906400351823e-06, "loss": 1.6515, "step": 247 }, { "epoch": 0.3111668757841907, "grad_norm": 19.08958625793457, "learning_rate": 8.055868570489247e-06, "loss": 1.4665, "step": 248 }, { "epoch": 0.3124215809284818, "grad_norm": 20.03516960144043, "learning_rate": 8.039780396142023e-06, "loss": 1.6523, "step": 249 }, { "epoch": 0.3136762860727729, "grad_norm": 25.80693244934082, "learning_rate": 8.023642142357821e-06, "loss": 1.7412, "step": 250 }, { "epoch": 0.31493099121706397, "grad_norm": 24.467342376708984, "learning_rate": 8.007454075009352e-06, "loss": 1.5459, "step": 251 }, { "epoch": 0.3161856963613551, "grad_norm": 34.97882843017578, "learning_rate": 7.991216460789997e-06, "loss": 1.7311, "step": 252 }, { "epoch": 0.3174404015056462, "grad_norm": 29.624479293823242, "learning_rate": 7.974929567209399e-06, "loss": 1.7838, "step": 253 }, { "epoch": 0.3186951066499373, "grad_norm": 28.10247039794922, "learning_rate": 7.95859366258907e-06, "loss": 1.7842, "step": 254 }, { "epoch": 0.3199498117942284, "grad_norm": 25.512306213378906, "learning_rate": 7.942209016057954e-06, "loss": 1.6854, "step": 255 }, { "epoch": 0.3212045169385194, "grad_norm": 27.726490020751953, "learning_rate": 7.925775897548013e-06, "loss": 1.7176, "step": 256 }, { "epoch": 0.3224592220828105, "grad_norm": 29.725744247436523, "learning_rate": 7.909294577789765e-06, "loss": 1.6355, "step": 257 }, { "epoch": 0.3237139272271016, "grad_norm": 21.763940811157227, "learning_rate": 7.892765328307828e-06, "loss": 1.614, "step": 258 }, { "epoch": 0.32496863237139273, "grad_norm": 29.157032012939453, "learning_rate": 7.87618842141645e-06, "loss": 1.5684, "step": 259 }, { "epoch": 0.32622333751568383, "grad_norm": 29.150402069091797, "learning_rate": 7.859564130215015e-06, "loss": 1.5138, "step": 260 }, { "epoch": 0.32747804265997493, "grad_norm": 38.0162239074707, "learning_rate": 7.842892728583557e-06, "loss": 1.4729, "step": 261 }, { "epoch": 0.328732747804266, "grad_norm": 28.247106552124023, "learning_rate": 7.826174491178231e-06, "loss": 1.6418, "step": 262 }, { "epoch": 0.3299874529485571, "grad_norm": 28.189817428588867, "learning_rate": 7.809409693426803e-06, "loss": 1.5794, "step": 263 }, { "epoch": 0.3312421580928482, "grad_norm": 34.21451950073242, "learning_rate": 7.792598611524103e-06, "loss": 1.5883, "step": 264 }, { "epoch": 0.3324968632371393, "grad_norm": 27.97997283935547, "learning_rate": 7.775741522427477e-06, "loss": 1.4462, "step": 265 }, { "epoch": 0.3337515683814304, "grad_norm": 27.05823516845703, "learning_rate": 7.75883870385223e-06, "loss": 1.5044, "step": 266 }, { "epoch": 0.33500627352572143, "grad_norm": 29.075641632080078, "learning_rate": 7.741890434267043e-06, "loss": 1.5352, "step": 267 }, { "epoch": 0.33626097867001253, "grad_norm": 36.941951751708984, "learning_rate": 7.724896992889385e-06, "loss": 1.5779, "step": 268 }, { "epoch": 0.33751568381430364, "grad_norm": 28.30890655517578, "learning_rate": 7.707858659680924e-06, "loss": 1.8306, "step": 269 }, { "epoch": 0.33877038895859474, "grad_norm": 28.968425750732422, "learning_rate": 7.690775715342898e-06, "loss": 1.5735, "step": 270 }, { "epoch": 0.34002509410288584, "grad_norm": 23.6066951751709, "learning_rate": 7.67364844131151e-06, "loss": 1.6057, "step": 271 }, { "epoch": 0.34127979924717694, "grad_norm": 31.214929580688477, "learning_rate": 7.656477119753268e-06, "loss": 1.8741, "step": 272 }, { "epoch": 0.342534504391468, "grad_norm": 37.89013671875, "learning_rate": 7.63926203356036e-06, "loss": 1.7272, "step": 273 }, { "epoch": 0.3437892095357591, "grad_norm": 26.85829734802246, "learning_rate": 7.622003466345977e-06, "loss": 1.6312, "step": 274 }, { "epoch": 0.3450439146800502, "grad_norm": 25.076658248901367, "learning_rate": 7.604701702439652e-06, "loss": 1.5652, "step": 275 }, { "epoch": 0.3462986198243413, "grad_norm": 33.68350601196289, "learning_rate": 7.587357026882563e-06, "loss": 1.5935, "step": 276 }, { "epoch": 0.3475533249686324, "grad_norm": 26.654830932617188, "learning_rate": 7.5699697254228496e-06, "loss": 1.4547, "step": 277 }, { "epoch": 0.34880803011292344, "grad_norm": 25.102251052856445, "learning_rate": 7.552540084510896e-06, "loss": 1.6585, "step": 278 }, { "epoch": 0.35006273525721454, "grad_norm": 30.08404541015625, "learning_rate": 7.535068391294618e-06, "loss": 1.7801, "step": 279 }, { "epoch": 0.35131744040150564, "grad_norm": 23.15135955810547, "learning_rate": 7.517554933614729e-06, "loss": 1.4114, "step": 280 }, { "epoch": 0.35257214554579674, "grad_norm": 26.793306350708008, "learning_rate": 7.500000000000001e-06, "loss": 1.5748, "step": 281 }, { "epoch": 0.35382685069008785, "grad_norm": 26.644601821899414, "learning_rate": 7.482403879662505e-06, "loss": 1.7082, "step": 282 }, { "epoch": 0.35508155583437895, "grad_norm": 29.40913200378418, "learning_rate": 7.464766862492856e-06, "loss": 1.5906, "step": 283 }, { "epoch": 0.35633626097867, "grad_norm": 28.093795776367188, "learning_rate": 7.447089239055428e-06, "loss": 1.6122, "step": 284 }, { "epoch": 0.3575909661229611, "grad_norm": 23.78188133239746, "learning_rate": 7.42937130058357e-06, "loss": 1.4623, "step": 285 }, { "epoch": 0.3588456712672522, "grad_norm": 35.69364929199219, "learning_rate": 7.4116133389748115e-06, "loss": 1.6225, "step": 286 }, { "epoch": 0.3601003764115433, "grad_norm": 30.77789306640625, "learning_rate": 7.393815646786047e-06, "loss": 1.5917, "step": 287 }, { "epoch": 0.3613550815558344, "grad_norm": 41.9234619140625, "learning_rate": 7.3759785172287235e-06, "loss": 1.4922, "step": 288 }, { "epoch": 0.36260978670012545, "grad_norm": 26.941680908203125, "learning_rate": 7.358102244164003e-06, "loss": 1.8153, "step": 289 }, { "epoch": 0.36386449184441655, "grad_norm": 27.374059677124023, "learning_rate": 7.340187122097931e-06, "loss": 1.64, "step": 290 }, { "epoch": 0.36511919698870765, "grad_norm": 23.783817291259766, "learning_rate": 7.322233446176571e-06, "loss": 1.5758, "step": 291 }, { "epoch": 0.36637390213299875, "grad_norm": 23.492393493652344, "learning_rate": 7.304241512181152e-06, "loss": 1.479, "step": 292 }, { "epoch": 0.36762860727728985, "grad_norm": 27.81630516052246, "learning_rate": 7.286211616523193e-06, "loss": 1.5494, "step": 293 }, { "epoch": 0.36888331242158096, "grad_norm": 35.152557373046875, "learning_rate": 7.268144056239621e-06, "loss": 1.8003, "step": 294 }, { "epoch": 0.370138017565872, "grad_norm": 24.756799697875977, "learning_rate": 7.250039128987874e-06, "loss": 1.6751, "step": 295 }, { "epoch": 0.3713927227101631, "grad_norm": 30.238140106201172, "learning_rate": 7.231897133040997e-06, "loss": 1.4538, "step": 296 }, { "epoch": 0.3726474278544542, "grad_norm": 25.516706466674805, "learning_rate": 7.213718367282737e-06, "loss": 1.41, "step": 297 }, { "epoch": 0.3739021329987453, "grad_norm": 45.06476593017578, "learning_rate": 7.195503131202607e-06, "loss": 1.5351, "step": 298 }, { "epoch": 0.3751568381430364, "grad_norm": 30.282215118408203, "learning_rate": 7.177251724890957e-06, "loss": 1.6859, "step": 299 }, { "epoch": 0.37641154328732745, "grad_norm": 26.890932083129883, "learning_rate": 7.1589644490340334e-06, "loss": 1.5883, "step": 300 }, { "epoch": 0.37766624843161856, "grad_norm": 29.712207794189453, "learning_rate": 7.14064160490902e-06, "loss": 1.7468, "step": 301 }, { "epoch": 0.37892095357590966, "grad_norm": 23.99646759033203, "learning_rate": 7.122283494379076e-06, "loss": 1.3783, "step": 302 }, { "epoch": 0.38017565872020076, "grad_norm": 28.590595245361328, "learning_rate": 7.103890419888367e-06, "loss": 1.694, "step": 303 }, { "epoch": 0.38143036386449186, "grad_norm": 22.65292739868164, "learning_rate": 7.085462684457076e-06, "loss": 1.5418, "step": 304 }, { "epoch": 0.38268506900878296, "grad_norm": 27.158199310302734, "learning_rate": 7.067000591676416e-06, "loss": 1.6183, "step": 305 }, { "epoch": 0.383939774153074, "grad_norm": 29.83051872253418, "learning_rate": 7.048504445703623e-06, "loss": 1.5936, "step": 306 }, { "epoch": 0.3851944792973651, "grad_norm": 24.005414962768555, "learning_rate": 7.029974551256957e-06, "loss": 1.3992, "step": 307 }, { "epoch": 0.3864491844416562, "grad_norm": 34.38796615600586, "learning_rate": 7.011411213610663e-06, "loss": 1.6884, "step": 308 }, { "epoch": 0.3877038895859473, "grad_norm": 25.36124038696289, "learning_rate": 6.992814738589958e-06, "loss": 1.6561, "step": 309 }, { "epoch": 0.3889585947302384, "grad_norm": 21.46540641784668, "learning_rate": 6.97418543256599e-06, "loss": 1.3287, "step": 310 }, { "epoch": 0.39021329987452946, "grad_norm": 35.439361572265625, "learning_rate": 6.95552360245078e-06, "loss": 1.6699, "step": 311 }, { "epoch": 0.39146800501882056, "grad_norm": 32.73426055908203, "learning_rate": 6.936829555692182e-06, "loss": 1.3947, "step": 312 }, { "epoch": 0.39272271016311167, "grad_norm": 28.283676147460938, "learning_rate": 6.9181036002687985e-06, "loss": 1.4841, "step": 313 }, { "epoch": 0.39397741530740277, "grad_norm": 20.66922378540039, "learning_rate": 6.899346044684928e-06, "loss": 1.3804, "step": 314 }, { "epoch": 0.39523212045169387, "grad_norm": 31.596906661987305, "learning_rate": 6.880557197965465e-06, "loss": 1.467, "step": 315 }, { "epoch": 0.39648682559598497, "grad_norm": 22.125431060791016, "learning_rate": 6.861737369650818e-06, "loss": 1.4638, "step": 316 }, { "epoch": 0.397741530740276, "grad_norm": 26.49312400817871, "learning_rate": 6.84288686979181e-06, "loss": 1.2585, "step": 317 }, { "epoch": 0.3989962358845671, "grad_norm": 31.771793365478516, "learning_rate": 6.824006008944561e-06, "loss": 1.5593, "step": 318 }, { "epoch": 0.4002509410288582, "grad_norm": 33.718238830566406, "learning_rate": 6.805095098165388e-06, "loss": 1.5027, "step": 319 }, { "epoch": 0.4015056461731493, "grad_norm": 27.339921951293945, "learning_rate": 6.786154449005664e-06, "loss": 1.438, "step": 320 }, { "epoch": 0.4027603513174404, "grad_norm": 24.385299682617188, "learning_rate": 6.767184373506698e-06, "loss": 1.5481, "step": 321 }, { "epoch": 0.40401505646173147, "grad_norm": 38.833770751953125, "learning_rate": 6.7481851841945835e-06, "loss": 1.6319, "step": 322 }, { "epoch": 0.40526976160602257, "grad_norm": 27.79740333557129, "learning_rate": 6.7291571940750575e-06, "loss": 1.5855, "step": 323 }, { "epoch": 0.4065244667503137, "grad_norm": 30.081342697143555, "learning_rate": 6.710100716628345e-06, "loss": 1.3305, "step": 324 }, { "epoch": 0.4077791718946048, "grad_norm": 28.723339080810547, "learning_rate": 6.6910160658039835e-06, "loss": 1.5928, "step": 325 }, { "epoch": 0.4090338770388959, "grad_norm": 36.5059814453125, "learning_rate": 6.671903556015664e-06, "loss": 1.7107, "step": 326 }, { "epoch": 0.410288582183187, "grad_norm": 22.986221313476562, "learning_rate": 6.652763502136044e-06, "loss": 1.4106, "step": 327 }, { "epoch": 0.411543287327478, "grad_norm": 31.11964988708496, "learning_rate": 6.633596219491559e-06, "loss": 1.6816, "step": 328 }, { "epoch": 0.4127979924717691, "grad_norm": 25.74013900756836, "learning_rate": 6.614402023857231e-06, "loss": 1.5055, "step": 329 }, { "epoch": 0.41405269761606023, "grad_norm": 30.515594482421875, "learning_rate": 6.595181231451469e-06, "loss": 1.5854, "step": 330 }, { "epoch": 0.41530740276035133, "grad_norm": 37.943180084228516, "learning_rate": 6.57593415893085e-06, "loss": 1.4225, "step": 331 }, { "epoch": 0.41656210790464243, "grad_norm": 30.183914184570312, "learning_rate": 6.556661123384909e-06, "loss": 1.5019, "step": 332 }, { "epoch": 0.4178168130489335, "grad_norm": 35.5178337097168, "learning_rate": 6.5373624423309165e-06, "loss": 1.4571, "step": 333 }, { "epoch": 0.4190715181932246, "grad_norm": 30.98124885559082, "learning_rate": 6.518038433708643e-06, "loss": 1.381, "step": 334 }, { "epoch": 0.4203262233375157, "grad_norm": 31.475486755371094, "learning_rate": 6.498689415875121e-06, "loss": 1.607, "step": 335 }, { "epoch": 0.4215809284818068, "grad_norm": 29.79499053955078, "learning_rate": 6.479315707599407e-06, "loss": 1.3446, "step": 336 }, { "epoch": 0.4228356336260979, "grad_norm": 23.057994842529297, "learning_rate": 6.459917628057319e-06, "loss": 1.4102, "step": 337 }, { "epoch": 0.424090338770389, "grad_norm": 32.09408187866211, "learning_rate": 6.440495496826189e-06, "loss": 1.6248, "step": 338 }, { "epoch": 0.42534504391468003, "grad_norm": 30.396852493286133, "learning_rate": 6.421049633879588e-06, "loss": 1.5172, "step": 339 }, { "epoch": 0.42659974905897113, "grad_norm": 37.36663818359375, "learning_rate": 6.4015803595820635e-06, "loss": 1.6684, "step": 340 }, { "epoch": 0.42785445420326224, "grad_norm": 36.27682876586914, "learning_rate": 6.3820879946838585e-06, "loss": 1.43, "step": 341 }, { "epoch": 0.42910915934755334, "grad_norm": 38.0621223449707, "learning_rate": 6.3625728603156215e-06, "loss": 1.5009, "step": 342 }, { "epoch": 0.43036386449184444, "grad_norm": 30.142953872680664, "learning_rate": 6.3430352779831275e-06, "loss": 1.3865, "step": 343 }, { "epoch": 0.4316185696361355, "grad_norm": 31.03050994873047, "learning_rate": 6.323475569561968e-06, "loss": 1.5305, "step": 344 }, { "epoch": 0.4328732747804266, "grad_norm": 31.472867965698242, "learning_rate": 6.303894057292261e-06, "loss": 1.5711, "step": 345 }, { "epoch": 0.4341279799247177, "grad_norm": 34.335853576660156, "learning_rate": 6.284291063773331e-06, "loss": 1.5281, "step": 346 }, { "epoch": 0.4353826850690088, "grad_norm": 36.837493896484375, "learning_rate": 6.264666911958404e-06, "loss": 1.5468, "step": 347 }, { "epoch": 0.4366373902132999, "grad_norm": 33.03227996826172, "learning_rate": 6.2450219251492795e-06, "loss": 1.483, "step": 348 }, { "epoch": 0.437892095357591, "grad_norm": 28.33861541748047, "learning_rate": 6.225356426991007e-06, "loss": 1.2866, "step": 349 }, { "epoch": 0.43914680050188204, "grad_norm": 27.562910079956055, "learning_rate": 6.205670741466555e-06, "loss": 1.4045, "step": 350 }, { "epoch": 0.44040150564617314, "grad_norm": 31.761911392211914, "learning_rate": 6.185965192891472e-06, "loss": 1.337, "step": 351 }, { "epoch": 0.44165621079046424, "grad_norm": 35.49506378173828, "learning_rate": 6.166240105908547e-06, "loss": 1.6938, "step": 352 }, { "epoch": 0.44291091593475534, "grad_norm": 53.732215881347656, "learning_rate": 6.146495805482451e-06, "loss": 1.5635, "step": 353 }, { "epoch": 0.44416562107904645, "grad_norm": 29.330778121948242, "learning_rate": 6.126732616894397e-06, "loss": 1.5873, "step": 354 }, { "epoch": 0.4454203262233375, "grad_norm": 30.75185203552246, "learning_rate": 6.106950865736777e-06, "loss": 1.4611, "step": 355 }, { "epoch": 0.4466750313676286, "grad_norm": 34.61481857299805, "learning_rate": 6.087150877907786e-06, "loss": 1.5506, "step": 356 }, { "epoch": 0.4479297365119197, "grad_norm": 36.45780563354492, "learning_rate": 6.067332979606069e-06, "loss": 1.5333, "step": 357 }, { "epoch": 0.4491844416562108, "grad_norm": 43.751426696777344, "learning_rate": 6.047497497325341e-06, "loss": 1.5729, "step": 358 }, { "epoch": 0.4504391468005019, "grad_norm": 30.756084442138672, "learning_rate": 6.027644757849004e-06, "loss": 1.4557, "step": 359 }, { "epoch": 0.451693851944793, "grad_norm": 30.46338653564453, "learning_rate": 6.007775088244769e-06, "loss": 1.3311, "step": 360 }, { "epoch": 0.45294855708908405, "grad_norm": 29.494077682495117, "learning_rate": 5.987888815859266e-06, "loss": 1.3893, "step": 361 }, { "epoch": 0.45420326223337515, "grad_norm": 30.151817321777344, "learning_rate": 5.967986268312651e-06, "loss": 1.346, "step": 362 }, { "epoch": 0.45545796737766625, "grad_norm": 35.56706237792969, "learning_rate": 5.948067773493205e-06, "loss": 1.5986, "step": 363 }, { "epoch": 0.45671267252195735, "grad_norm": 26.097820281982422, "learning_rate": 5.928133659551939e-06, "loss": 1.3859, "step": 364 }, { "epoch": 0.45796737766624845, "grad_norm": 28.94278335571289, "learning_rate": 5.908184254897183e-06, "loss": 1.5139, "step": 365 }, { "epoch": 0.4592220828105395, "grad_norm": 36.553123474121094, "learning_rate": 5.888219888189176e-06, "loss": 1.4892, "step": 366 }, { "epoch": 0.4604767879548306, "grad_norm": 106.10436248779297, "learning_rate": 5.8682408883346535e-06, "loss": 1.4375, "step": 367 }, { "epoch": 0.4617314930991217, "grad_norm": 42.712303161621094, "learning_rate": 5.848247584481424e-06, "loss": 1.431, "step": 368 }, { "epoch": 0.4629861982434128, "grad_norm": 37.82698059082031, "learning_rate": 5.828240306012957e-06, "loss": 1.5441, "step": 369 }, { "epoch": 0.4642409033877039, "grad_norm": 35.159000396728516, "learning_rate": 5.808219382542941e-06, "loss": 1.4638, "step": 370 }, { "epoch": 0.465495608531995, "grad_norm": 28.512142181396484, "learning_rate": 5.788185143909868e-06, "loss": 1.4615, "step": 371 }, { "epoch": 0.46675031367628605, "grad_norm": 32.28644943237305, "learning_rate": 5.768137920171593e-06, "loss": 1.4778, "step": 372 }, { "epoch": 0.46800501882057716, "grad_norm": 30.508554458618164, "learning_rate": 5.74807804159989e-06, "loss": 1.656, "step": 373 }, { "epoch": 0.46925972396486826, "grad_norm": 31.334104537963867, "learning_rate": 5.728005838675026e-06, "loss": 1.3335, "step": 374 }, { "epoch": 0.47051442910915936, "grad_norm": 30.219167709350586, "learning_rate": 5.7079216420803e-06, "loss": 1.468, "step": 375 }, { "epoch": 0.47176913425345046, "grad_norm": 40.787261962890625, "learning_rate": 5.68782578269661e-06, "loss": 1.5705, "step": 376 }, { "epoch": 0.4730238393977415, "grad_norm": 36.666656494140625, "learning_rate": 5.66771859159699e-06, "loss": 1.5139, "step": 377 }, { "epoch": 0.4742785445420326, "grad_norm": 33.556617736816406, "learning_rate": 5.647600400041163e-06, "loss": 1.3386, "step": 378 }, { "epoch": 0.4755332496863237, "grad_norm": 28.310293197631836, "learning_rate": 5.6274715394700805e-06, "loss": 1.4892, "step": 379 }, { "epoch": 0.4767879548306148, "grad_norm": 30.385696411132812, "learning_rate": 5.6073323415004635e-06, "loss": 1.4074, "step": 380 }, { "epoch": 0.4780426599749059, "grad_norm": 30.94135856628418, "learning_rate": 5.587183137919332e-06, "loss": 1.3804, "step": 381 }, { "epoch": 0.479297365119197, "grad_norm": 25.842451095581055, "learning_rate": 5.567024260678559e-06, "loss": 1.3756, "step": 382 }, { "epoch": 0.48055207026348806, "grad_norm": 24.24115753173828, "learning_rate": 5.546856041889374e-06, "loss": 1.3217, "step": 383 }, { "epoch": 0.48180677540777916, "grad_norm": 29.69972801208496, "learning_rate": 5.526678813816912e-06, "loss": 1.3114, "step": 384 }, { "epoch": 0.48306148055207027, "grad_norm": 40.6950569152832, "learning_rate": 5.5064929088747324e-06, "loss": 1.6083, "step": 385 }, { "epoch": 0.48431618569636137, "grad_norm": 37.67729949951172, "learning_rate": 5.486298659619346e-06, "loss": 1.5827, "step": 386 }, { "epoch": 0.48557089084065247, "grad_norm": 38.3140754699707, "learning_rate": 5.46609639874473e-06, "loss": 1.3942, "step": 387 }, { "epoch": 0.4868255959849435, "grad_norm": 33.37904739379883, "learning_rate": 5.445886459076848e-06, "loss": 1.5518, "step": 388 }, { "epoch": 0.4880803011292346, "grad_norm": 30.683101654052734, "learning_rate": 5.425669173568179e-06, "loss": 1.3667, "step": 389 }, { "epoch": 0.4893350062735257, "grad_norm": 38.90886306762695, "learning_rate": 5.405444875292213e-06, "loss": 1.6388, "step": 390 }, { "epoch": 0.4905897114178168, "grad_norm": 32.49534606933594, "learning_rate": 5.385213897437975e-06, "loss": 1.3725, "step": 391 }, { "epoch": 0.4918444165621079, "grad_norm": 31.765207290649414, "learning_rate": 5.364976573304538e-06, "loss": 1.4513, "step": 392 }, { "epoch": 0.493099121706399, "grad_norm": 34.01384735107422, "learning_rate": 5.344733236295525e-06, "loss": 1.3848, "step": 393 }, { "epoch": 0.49435382685069007, "grad_norm": 36.31550216674805, "learning_rate": 5.324484219913621e-06, "loss": 1.3873, "step": 394 }, { "epoch": 0.49560853199498117, "grad_norm": 30.318265914916992, "learning_rate": 5.30422985775507e-06, "loss": 1.5321, "step": 395 }, { "epoch": 0.4968632371392723, "grad_norm": 30.169464111328125, "learning_rate": 5.283970483504198e-06, "loss": 1.3799, "step": 396 }, { "epoch": 0.4981179422835634, "grad_norm": 31.82530975341797, "learning_rate": 5.263706430927895e-06, "loss": 1.5295, "step": 397 }, { "epoch": 0.4993726474278545, "grad_norm": 36.714996337890625, "learning_rate": 5.243438033870126e-06, "loss": 1.4037, "step": 398 }, { "epoch": 0.5006273525721455, "grad_norm": 33.54505157470703, "learning_rate": 5.223165626246432e-06, "loss": 1.521, "step": 399 }, { "epoch": 0.5006273525721455, "eval_loss": 1.436629295349121, "eval_runtime": 6.0522, "eval_samples_per_second": 110.869, "eval_steps_per_second": 6.94, "step": 399 }, { "epoch": 0.5018820577164367, "grad_norm": 30.569034576416016, "learning_rate": 5.202889542038428e-06, "loss": 1.3634, "step": 400 }, { "epoch": 0.5031367628607277, "grad_norm": 28.09290885925293, "learning_rate": 5.182610115288296e-06, "loss": 1.4243, "step": 401 }, { "epoch": 0.5043914680050188, "grad_norm": 31.013883590698242, "learning_rate": 5.162327680093284e-06, "loss": 1.5255, "step": 402 }, { "epoch": 0.5056461731493099, "grad_norm": 28.622833251953125, "learning_rate": 5.142042570600212e-06, "loss": 1.143, "step": 403 }, { "epoch": 0.506900878293601, "grad_norm": 34.083290100097656, "learning_rate": 5.121755120999949e-06, "loss": 1.4854, "step": 404 }, { "epoch": 0.5081555834378921, "grad_norm": 29.883394241333008, "learning_rate": 5.101465665521919e-06, "loss": 1.2494, "step": 405 }, { "epoch": 0.5094102885821832, "grad_norm": 36.8629035949707, "learning_rate": 5.081174538428596e-06, "loss": 1.5055, "step": 406 }, { "epoch": 0.5106649937264742, "grad_norm": 39.23841094970703, "learning_rate": 5.060882074009988e-06, "loss": 1.41, "step": 407 }, { "epoch": 0.5119196988707654, "grad_norm": 42.195274353027344, "learning_rate": 5.04058860657814e-06, "loss": 1.5589, "step": 408 }, { "epoch": 0.5131744040150564, "grad_norm": 32.830596923828125, "learning_rate": 5.020294470461615e-06, "loss": 1.3412, "step": 409 }, { "epoch": 0.5144291091593476, "grad_norm": 49.16096496582031, "learning_rate": 5e-06, "loss": 1.5255, "step": 410 }, { "epoch": 0.5156838143036386, "grad_norm": 29.00592613220215, "learning_rate": 4.979705529538385e-06, "loss": 1.4311, "step": 411 }, { "epoch": 0.5169385194479298, "grad_norm": 39.06101608276367, "learning_rate": 4.959411393421863e-06, "loss": 1.3708, "step": 412 }, { "epoch": 0.5181932245922208, "grad_norm": 34.09449768066406, "learning_rate": 4.939117925990013e-06, "loss": 1.484, "step": 413 }, { "epoch": 0.5194479297365119, "grad_norm": 35.57181167602539, "learning_rate": 4.918825461571405e-06, "loss": 1.3226, "step": 414 }, { "epoch": 0.520702634880803, "grad_norm": 29.180233001708984, "learning_rate": 4.8985343344780815e-06, "loss": 1.6168, "step": 415 }, { "epoch": 0.5219573400250941, "grad_norm": 25.967992782592773, "learning_rate": 4.8782448790000525e-06, "loss": 1.4807, "step": 416 }, { "epoch": 0.5232120451693852, "grad_norm": 31.979293823242188, "learning_rate": 4.857957429399788e-06, "loss": 1.4218, "step": 417 }, { "epoch": 0.5244667503136763, "grad_norm": 30.151277542114258, "learning_rate": 4.837672319906717e-06, "loss": 1.4075, "step": 418 }, { "epoch": 0.5257214554579673, "grad_norm": 40.19000244140625, "learning_rate": 4.817389884711706e-06, "loss": 1.6472, "step": 419 }, { "epoch": 0.5269761606022585, "grad_norm": 28.63579559326172, "learning_rate": 4.797110457961575e-06, "loss": 1.1942, "step": 420 }, { "epoch": 0.5282308657465495, "grad_norm": 36.74559020996094, "learning_rate": 4.7768343737535694e-06, "loss": 1.5179, "step": 421 }, { "epoch": 0.5294855708908407, "grad_norm": 30.191770553588867, "learning_rate": 4.756561966129875e-06, "loss": 1.2881, "step": 422 }, { "epoch": 0.5307402760351317, "grad_norm": 31.707502365112305, "learning_rate": 4.736293569072108e-06, "loss": 1.3801, "step": 423 }, { "epoch": 0.5319949811794228, "grad_norm": 25.902997970581055, "learning_rate": 4.716029516495803e-06, "loss": 1.3326, "step": 424 }, { "epoch": 0.533249686323714, "grad_norm": 42.108238220214844, "learning_rate": 4.695770142244931e-06, "loss": 1.529, "step": 425 }, { "epoch": 0.534504391468005, "grad_norm": 31.789140701293945, "learning_rate": 4.6755157800863826e-06, "loss": 1.3478, "step": 426 }, { "epoch": 0.5357590966122961, "grad_norm": 27.96792984008789, "learning_rate": 4.655266763704476e-06, "loss": 1.397, "step": 427 }, { "epoch": 0.5370138017565872, "grad_norm": 31.803890228271484, "learning_rate": 4.635023426695462e-06, "loss": 1.4011, "step": 428 }, { "epoch": 0.5382685069008782, "grad_norm": 35.10597610473633, "learning_rate": 4.614786102562026e-06, "loss": 1.4848, "step": 429 }, { "epoch": 0.5395232120451694, "grad_norm": 31.621994018554688, "learning_rate": 4.594555124707789e-06, "loss": 1.3346, "step": 430 }, { "epoch": 0.5407779171894604, "grad_norm": 33.457908630371094, "learning_rate": 4.574330826431822e-06, "loss": 1.3045, "step": 431 }, { "epoch": 0.5420326223337516, "grad_norm": 31.1467342376709, "learning_rate": 4.554113540923153e-06, "loss": 1.4343, "step": 432 }, { "epoch": 0.5432873274780426, "grad_norm": 31.287960052490234, "learning_rate": 4.533903601255272e-06, "loss": 1.3903, "step": 433 }, { "epoch": 0.5445420326223338, "grad_norm": 26.70494842529297, "learning_rate": 4.513701340380655e-06, "loss": 1.3482, "step": 434 }, { "epoch": 0.5457967377666249, "grad_norm": 44.05613327026367, "learning_rate": 4.493507091125269e-06, "loss": 1.5986, "step": 435 }, { "epoch": 0.5470514429109159, "grad_norm": 29.704072952270508, "learning_rate": 4.473321186183091e-06, "loss": 1.3137, "step": 436 }, { "epoch": 0.548306148055207, "grad_norm": 29.141984939575195, "learning_rate": 4.4531439581106295e-06, "loss": 1.478, "step": 437 }, { "epoch": 0.5495608531994981, "grad_norm": 34.73693084716797, "learning_rate": 4.432975739321444e-06, "loss": 1.5629, "step": 438 }, { "epoch": 0.5508155583437893, "grad_norm": 33.1425666809082, "learning_rate": 4.412816862080668e-06, "loss": 1.3101, "step": 439 }, { "epoch": 0.5520702634880803, "grad_norm": 31.933034896850586, "learning_rate": 4.392667658499539e-06, "loss": 1.3371, "step": 440 }, { "epoch": 0.5533249686323714, "grad_norm": 30.45763397216797, "learning_rate": 4.37252846052992e-06, "loss": 1.3671, "step": 441 }, { "epoch": 0.5545796737766625, "grad_norm": 42.91053009033203, "learning_rate": 4.352399599958837e-06, "loss": 1.4992, "step": 442 }, { "epoch": 0.5558343789209536, "grad_norm": 36.65143585205078, "learning_rate": 4.332281408403011e-06, "loss": 1.4589, "step": 443 }, { "epoch": 0.5570890840652447, "grad_norm": 38.462398529052734, "learning_rate": 4.312174217303391e-06, "loss": 1.2266, "step": 444 }, { "epoch": 0.5583437892095358, "grad_norm": 31.30473518371582, "learning_rate": 4.292078357919701e-06, "loss": 1.4476, "step": 445 }, { "epoch": 0.5595984943538268, "grad_norm": 35.10082244873047, "learning_rate": 4.271994161324977e-06, "loss": 1.4988, "step": 446 }, { "epoch": 0.560853199498118, "grad_norm": 32.5116081237793, "learning_rate": 4.2519219584001106e-06, "loss": 1.4988, "step": 447 }, { "epoch": 0.562107904642409, "grad_norm": 29.34661102294922, "learning_rate": 4.231862079828408e-06, "loss": 1.4725, "step": 448 }, { "epoch": 0.5633626097867002, "grad_norm": 36.072879791259766, "learning_rate": 4.2118148560901325e-06, "loss": 1.4334, "step": 449 }, { "epoch": 0.5646173149309912, "grad_norm": 30.869470596313477, "learning_rate": 4.19178061745706e-06, "loss": 1.3606, "step": 450 }, { "epoch": 0.5658720200752823, "grad_norm": 29.298429489135742, "learning_rate": 4.171759693987046e-06, "loss": 1.2983, "step": 451 }, { "epoch": 0.5671267252195734, "grad_norm": 24.67900276184082, "learning_rate": 4.151752415518577e-06, "loss": 1.2631, "step": 452 }, { "epoch": 0.5683814303638645, "grad_norm": 33.28513717651367, "learning_rate": 4.131759111665349e-06, "loss": 1.3843, "step": 453 }, { "epoch": 0.5696361355081556, "grad_norm": 34.13528823852539, "learning_rate": 4.111780111810826e-06, "loss": 1.4529, "step": 454 }, { "epoch": 0.5708908406524467, "grad_norm": 28.38991355895996, "learning_rate": 4.091815745102818e-06, "loss": 1.5154, "step": 455 }, { "epoch": 0.5721455457967378, "grad_norm": 26.64844512939453, "learning_rate": 4.071866340448062e-06, "loss": 1.3302, "step": 456 }, { "epoch": 0.5734002509410289, "grad_norm": 37.00432205200195, "learning_rate": 4.051932226506797e-06, "loss": 1.3327, "step": 457 }, { "epoch": 0.5746549560853199, "grad_norm": 27.36146354675293, "learning_rate": 4.032013731687351e-06, "loss": 1.361, "step": 458 }, { "epoch": 0.5759096612296111, "grad_norm": 32.78675842285156, "learning_rate": 4.0121111841407345e-06, "loss": 1.4741, "step": 459 }, { "epoch": 0.5771643663739021, "grad_norm": 37.97308349609375, "learning_rate": 3.992224911755234e-06, "loss": 1.5363, "step": 460 }, { "epoch": 0.5784190715181933, "grad_norm": 31.34197235107422, "learning_rate": 3.9723552421509975e-06, "loss": 1.2434, "step": 461 }, { "epoch": 0.5796737766624843, "grad_norm": 36.909828186035156, "learning_rate": 3.95250250267466e-06, "loss": 1.3956, "step": 462 }, { "epoch": 0.5809284818067754, "grad_norm": 47.24994659423828, "learning_rate": 3.932667020393933e-06, "loss": 1.3312, "step": 463 }, { "epoch": 0.5821831869510665, "grad_norm": 35.684608459472656, "learning_rate": 3.912849122092216e-06, "loss": 1.4447, "step": 464 }, { "epoch": 0.5834378920953576, "grad_norm": 36.601715087890625, "learning_rate": 3.8930491342632235e-06, "loss": 1.4177, "step": 465 }, { "epoch": 0.5846925972396487, "grad_norm": 28.328744888305664, "learning_rate": 3.873267383105604e-06, "loss": 1.3929, "step": 466 }, { "epoch": 0.5859473023839398, "grad_norm": 32.12102127075195, "learning_rate": 3.853504194517551e-06, "loss": 1.4941, "step": 467 }, { "epoch": 0.5872020075282308, "grad_norm": 32.12097930908203, "learning_rate": 3.833759894091456e-06, "loss": 1.3292, "step": 468 }, { "epoch": 0.588456712672522, "grad_norm": 26.0775146484375, "learning_rate": 3.814034807108529e-06, "loss": 1.3233, "step": 469 }, { "epoch": 0.589711417816813, "grad_norm": 26.92903709411621, "learning_rate": 3.7943292585334464e-06, "loss": 1.3575, "step": 470 }, { "epoch": 0.5909661229611042, "grad_norm": 35.65913772583008, "learning_rate": 3.774643573008995e-06, "loss": 1.3416, "step": 471 }, { "epoch": 0.5922208281053952, "grad_norm": 44.53237533569336, "learning_rate": 3.754978074850722e-06, "loss": 1.6346, "step": 472 }, { "epoch": 0.5934755332496863, "grad_norm": 33.18136978149414, "learning_rate": 3.7353330880415963e-06, "loss": 1.5085, "step": 473 }, { "epoch": 0.5947302383939774, "grad_norm": 31.07672882080078, "learning_rate": 3.7157089362266695e-06, "loss": 1.3839, "step": 474 }, { "epoch": 0.5959849435382685, "grad_norm": 29.932600021362305, "learning_rate": 3.6961059427077407e-06, "loss": 1.4774, "step": 475 }, { "epoch": 0.5972396486825596, "grad_norm": 27.480052947998047, "learning_rate": 3.6765244304380323e-06, "loss": 1.2551, "step": 476 }, { "epoch": 0.5984943538268507, "grad_norm": 39.4902458190918, "learning_rate": 3.656964722016875e-06, "loss": 1.3972, "step": 477 }, { "epoch": 0.5997490589711418, "grad_norm": 36.17951583862305, "learning_rate": 3.6374271396843797e-06, "loss": 1.2946, "step": 478 }, { "epoch": 0.6010037641154329, "grad_norm": 30.92720603942871, "learning_rate": 3.617912005316142e-06, "loss": 1.2169, "step": 479 }, { "epoch": 0.6022584692597239, "grad_norm": 34.092063903808594, "learning_rate": 3.598419640417938e-06, "loss": 1.3757, "step": 480 }, { "epoch": 0.6035131744040151, "grad_norm": 27.944690704345703, "learning_rate": 3.578950366120414e-06, "loss": 1.2427, "step": 481 }, { "epoch": 0.6047678795483061, "grad_norm": 36.29844665527344, "learning_rate": 3.5595045031738123e-06, "loss": 1.3915, "step": 482 }, { "epoch": 0.6060225846925973, "grad_norm": 36.75183868408203, "learning_rate": 3.540082371942682e-06, "loss": 1.4398, "step": 483 }, { "epoch": 0.6072772898368883, "grad_norm": 28.854524612426758, "learning_rate": 3.5206842924005934e-06, "loss": 1.3392, "step": 484 }, { "epoch": 0.6085319949811794, "grad_norm": 32.42161560058594, "learning_rate": 3.5013105841248794e-06, "loss": 1.5482, "step": 485 }, { "epoch": 0.6097867001254705, "grad_norm": 38.66543960571289, "learning_rate": 3.481961566291358e-06, "loss": 1.4572, "step": 486 }, { "epoch": 0.6110414052697616, "grad_norm": 37.27582550048828, "learning_rate": 3.462637557669084e-06, "loss": 1.3017, "step": 487 }, { "epoch": 0.6122961104140527, "grad_norm": 28.435178756713867, "learning_rate": 3.443338876615092e-06, "loss": 1.3203, "step": 488 }, { "epoch": 0.6135508155583438, "grad_norm": 33.752044677734375, "learning_rate": 3.424065841069152e-06, "loss": 1.5739, "step": 489 }, { "epoch": 0.6148055207026348, "grad_norm": 34.22273635864258, "learning_rate": 3.4048187685485312e-06, "loss": 1.4068, "step": 490 }, { "epoch": 0.616060225846926, "grad_norm": 54.36898422241211, "learning_rate": 3.3855979761427705e-06, "loss": 1.3019, "step": 491 }, { "epoch": 0.617314930991217, "grad_norm": 32.61660385131836, "learning_rate": 3.3664037805084428e-06, "loss": 1.2823, "step": 492 }, { "epoch": 0.6185696361355082, "grad_norm": 34.06522750854492, "learning_rate": 3.347236497863957e-06, "loss": 1.3678, "step": 493 }, { "epoch": 0.6198243412797992, "grad_norm": 29.604419708251953, "learning_rate": 3.3280964439843377e-06, "loss": 1.3285, "step": 494 }, { "epoch": 0.6210790464240903, "grad_norm": 33.45100021362305, "learning_rate": 3.308983934196018e-06, "loss": 1.422, "step": 495 }, { "epoch": 0.6223337515683814, "grad_norm": 33.3889274597168, "learning_rate": 3.289899283371657e-06, "loss": 1.3114, "step": 496 }, { "epoch": 0.6235884567126725, "grad_norm": 30.00410270690918, "learning_rate": 3.2708428059249437e-06, "loss": 1.3216, "step": 497 }, { "epoch": 0.6248431618569636, "grad_norm": 41.03053283691406, "learning_rate": 3.2518148158054186e-06, "loss": 1.4942, "step": 498 }, { "epoch": 0.6260978670012547, "grad_norm": 46.363258361816406, "learning_rate": 3.2328156264933043e-06, "loss": 1.6328, "step": 499 }, { "epoch": 0.6273525721455459, "grad_norm": 37.64637756347656, "learning_rate": 3.2138455509943365e-06, "loss": 1.3816, "step": 500 }, { "epoch": 0.6286072772898369, "grad_norm": 46.19404602050781, "learning_rate": 3.194904901834613e-06, "loss": 1.5756, "step": 501 }, { "epoch": 0.6298619824341279, "grad_norm": 26.028804779052734, "learning_rate": 3.17599399105544e-06, "loss": 1.314, "step": 502 }, { "epoch": 0.6311166875784191, "grad_norm": 31.624303817749023, "learning_rate": 3.1571131302081916e-06, "loss": 1.3178, "step": 503 }, { "epoch": 0.6323713927227101, "grad_norm": 35.267478942871094, "learning_rate": 3.138262630349182e-06, "loss": 1.5758, "step": 504 }, { "epoch": 0.6336260978670013, "grad_norm": 30.934772491455078, "learning_rate": 3.1194428020345375e-06, "loss": 1.4725, "step": 505 }, { "epoch": 0.6348808030112923, "grad_norm": 28.47898292541504, "learning_rate": 3.1006539553150727e-06, "loss": 1.3188, "step": 506 }, { "epoch": 0.6361355081555834, "grad_norm": 38.18532943725586, "learning_rate": 3.081896399731202e-06, "loss": 1.2228, "step": 507 }, { "epoch": 0.6373902132998746, "grad_norm": 35.62003707885742, "learning_rate": 3.063170444307821e-06, "loss": 1.6133, "step": 508 }, { "epoch": 0.6386449184441656, "grad_norm": 58.091861724853516, "learning_rate": 3.044476397549221e-06, "loss": 1.3338, "step": 509 }, { "epoch": 0.6398996235884568, "grad_norm": 31.276124954223633, "learning_rate": 3.02581456743401e-06, "loss": 1.1924, "step": 510 }, { "epoch": 0.6411543287327478, "grad_norm": 36.98395538330078, "learning_rate": 3.0071852614100427e-06, "loss": 1.3475, "step": 511 }, { "epoch": 0.6424090338770388, "grad_norm": 33.80880355834961, "learning_rate": 2.9885887863893394e-06, "loss": 1.2211, "step": 512 }, { "epoch": 0.64366373902133, "grad_norm": 37.08169174194336, "learning_rate": 2.9700254487430448e-06, "loss": 1.3388, "step": 513 }, { "epoch": 0.644918444165621, "grad_norm": 30.51959228515625, "learning_rate": 2.9514955542963775e-06, "loss": 1.4277, "step": 514 }, { "epoch": 0.6461731493099122, "grad_norm": 31.10744285583496, "learning_rate": 2.9329994083235857e-06, "loss": 1.2503, "step": 515 }, { "epoch": 0.6474278544542033, "grad_norm": 32.857383728027344, "learning_rate": 2.9145373155429263e-06, "loss": 1.4776, "step": 516 }, { "epoch": 0.6486825595984943, "grad_norm": 36.374961853027344, "learning_rate": 2.896109580111634e-06, "loss": 1.2288, "step": 517 }, { "epoch": 0.6499372647427855, "grad_norm": 26.020505905151367, "learning_rate": 2.8777165056209256e-06, "loss": 1.2806, "step": 518 }, { "epoch": 0.6511919698870765, "grad_norm": 31.82769775390625, "learning_rate": 2.8593583950909833e-06, "loss": 1.3725, "step": 519 }, { "epoch": 0.6524466750313677, "grad_norm": 36.6817741394043, "learning_rate": 2.8410355509659682e-06, "loss": 1.2934, "step": 520 }, { "epoch": 0.6537013801756587, "grad_norm": 46.93891525268555, "learning_rate": 2.8227482751090445e-06, "loss": 1.4673, "step": 521 }, { "epoch": 0.6549560853199499, "grad_norm": 41.38336181640625, "learning_rate": 2.8044968687973956e-06, "loss": 1.4611, "step": 522 }, { "epoch": 0.6562107904642409, "grad_norm": 37.399681091308594, "learning_rate": 2.786281632717264e-06, "loss": 1.2811, "step": 523 }, { "epoch": 0.657465495608532, "grad_norm": 44.295719146728516, "learning_rate": 2.7681028669590038e-06, "loss": 1.3587, "step": 524 }, { "epoch": 0.6587202007528231, "grad_norm": 33.356292724609375, "learning_rate": 2.749960871012129e-06, "loss": 1.4634, "step": 525 }, { "epoch": 0.6599749058971142, "grad_norm": 38.98143005371094, "learning_rate": 2.73185594376038e-06, "loss": 1.4382, "step": 526 }, { "epoch": 0.6612296110414053, "grad_norm": 30.759475708007812, "learning_rate": 2.7137883834768076e-06, "loss": 1.3081, "step": 527 }, { "epoch": 0.6624843161856964, "grad_norm": 37.871238708496094, "learning_rate": 2.6957584878188496e-06, "loss": 1.3886, "step": 528 }, { "epoch": 0.6637390213299874, "grad_norm": 49.197872161865234, "learning_rate": 2.6777665538234292e-06, "loss": 1.5503, "step": 529 }, { "epoch": 0.6649937264742786, "grad_norm": 37.15614700317383, "learning_rate": 2.6598128779020693e-06, "loss": 1.3044, "step": 530 }, { "epoch": 0.6662484316185696, "grad_norm": 31.275415420532227, "learning_rate": 2.641897755835997e-06, "loss": 1.397, "step": 531 }, { "epoch": 0.6675031367628608, "grad_norm": 41.38181686401367, "learning_rate": 2.6240214827712794e-06, "loss": 1.4281, "step": 532 }, { "epoch": 0.6687578419071518, "grad_norm": 39.80350875854492, "learning_rate": 2.6061843532139563e-06, "loss": 1.4107, "step": 533 }, { "epoch": 0.6700125470514429, "grad_norm": 40.21477508544922, "learning_rate": 2.5883866610251906e-06, "loss": 1.4339, "step": 534 }, { "epoch": 0.671267252195734, "grad_norm": 43.72838592529297, "learning_rate": 2.5706286994164315e-06, "loss": 1.5603, "step": 535 }, { "epoch": 0.6725219573400251, "grad_norm": 27.070802688598633, "learning_rate": 2.5529107609445737e-06, "loss": 1.4321, "step": 536 }, { "epoch": 0.6737766624843162, "grad_norm": 41.055633544921875, "learning_rate": 2.5352331375071437e-06, "loss": 1.4914, "step": 537 }, { "epoch": 0.6750313676286073, "grad_norm": 39.451602935791016, "learning_rate": 2.5175961203374954e-06, "loss": 1.4453, "step": 538 }, { "epoch": 0.6762860727728983, "grad_norm": 38.11553955078125, "learning_rate": 2.5000000000000015e-06, "loss": 1.3918, "step": 539 }, { "epoch": 0.6775407779171895, "grad_norm": 30.756338119506836, "learning_rate": 2.4824450663852716e-06, "loss": 1.1408, "step": 540 }, { "epoch": 0.6787954830614805, "grad_norm": 31.51823616027832, "learning_rate": 2.464931608705384e-06, "loss": 1.5483, "step": 541 }, { "epoch": 0.6800501882057717, "grad_norm": 28.151769638061523, "learning_rate": 2.447459915489106e-06, "loss": 1.2619, "step": 542 }, { "epoch": 0.6813048933500627, "grad_norm": 34.87588119506836, "learning_rate": 2.430030274577151e-06, "loss": 1.3653, "step": 543 }, { "epoch": 0.6825595984943539, "grad_norm": 44.73030090332031, "learning_rate": 2.4126429731174372e-06, "loss": 1.4503, "step": 544 }, { "epoch": 0.6838143036386449, "grad_norm": 35.88227462768555, "learning_rate": 2.3952982975603494e-06, "loss": 1.3246, "step": 545 }, { "epoch": 0.685069008782936, "grad_norm": 27.695951461791992, "learning_rate": 2.3779965336540237e-06, "loss": 1.3869, "step": 546 }, { "epoch": 0.6863237139272271, "grad_norm": 37.88958740234375, "learning_rate": 2.3607379664396414e-06, "loss": 1.4772, "step": 547 }, { "epoch": 0.6875784190715182, "grad_norm": 30.21925926208496, "learning_rate": 2.343522880246734e-06, "loss": 1.3563, "step": 548 }, { "epoch": 0.6888331242158093, "grad_norm": 41.6002197265625, "learning_rate": 2.3263515586884935e-06, "loss": 1.3695, "step": 549 }, { "epoch": 0.6900878293601004, "grad_norm": 29.012378692626953, "learning_rate": 2.3092242846571034e-06, "loss": 1.3925, "step": 550 }, { "epoch": 0.6913425345043914, "grad_norm": 28.30169105529785, "learning_rate": 2.2921413403190774e-06, "loss": 1.3324, "step": 551 }, { "epoch": 0.6925972396486826, "grad_norm": 30.30564308166504, "learning_rate": 2.275103007110616e-06, "loss": 1.3319, "step": 552 }, { "epoch": 0.6938519447929736, "grad_norm": 32.01078796386719, "learning_rate": 2.25810956573296e-06, "loss": 1.2561, "step": 553 }, { "epoch": 0.6951066499372648, "grad_norm": 45.61001205444336, "learning_rate": 2.2411612961477704e-06, "loss": 1.4322, "step": 554 }, { "epoch": 0.6963613550815558, "grad_norm": 39.38789749145508, "learning_rate": 2.224258477572524e-06, "loss": 1.2698, "step": 555 }, { "epoch": 0.6976160602258469, "grad_norm": 41.91701126098633, "learning_rate": 2.2074013884758993e-06, "loss": 1.4422, "step": 556 }, { "epoch": 0.698870765370138, "grad_norm": 32.67595291137695, "learning_rate": 2.190590306573198e-06, "loss": 1.2315, "step": 557 }, { "epoch": 0.7001254705144291, "grad_norm": 33.57855224609375, "learning_rate": 2.17382550882177e-06, "loss": 1.2939, "step": 558 }, { "epoch": 0.7013801756587202, "grad_norm": 30.53522491455078, "learning_rate": 2.1571072714164445e-06, "loss": 1.3556, "step": 559 }, { "epoch": 0.7026348808030113, "grad_norm": 33.44630432128906, "learning_rate": 2.140435869784986e-06, "loss": 1.3701, "step": 560 }, { "epoch": 0.7038895859473023, "grad_norm": 34.59889221191406, "learning_rate": 2.1238115785835512e-06, "loss": 1.5211, "step": 561 }, { "epoch": 0.7051442910915935, "grad_norm": 42.23357009887695, "learning_rate": 2.1072346716921733e-06, "loss": 1.2913, "step": 562 }, { "epoch": 0.7063989962358845, "grad_norm": 32.22030258178711, "learning_rate": 2.0907054222102367e-06, "loss": 1.3462, "step": 563 }, { "epoch": 0.7076537013801757, "grad_norm": 39.91384506225586, "learning_rate": 2.0742241024519886e-06, "loss": 1.3211, "step": 564 }, { "epoch": 0.7089084065244667, "grad_norm": 41.389461517333984, "learning_rate": 2.0577909839420468e-06, "loss": 1.3882, "step": 565 }, { "epoch": 0.7101631116687579, "grad_norm": 25.932300567626953, "learning_rate": 2.0414063374109326e-06, "loss": 1.2911, "step": 566 }, { "epoch": 0.7114178168130489, "grad_norm": 40.37273025512695, "learning_rate": 2.0250704327906025e-06, "loss": 1.3346, "step": 567 }, { "epoch": 0.71267252195734, "grad_norm": 33.203975677490234, "learning_rate": 2.0087835392100034e-06, "loss": 1.3206, "step": 568 }, { "epoch": 0.7139272271016311, "grad_norm": 25.78790283203125, "learning_rate": 1.9925459249906488e-06, "loss": 1.2016, "step": 569 }, { "epoch": 0.7151819322459222, "grad_norm": 26.151403427124023, "learning_rate": 1.9763578576421816e-06, "loss": 1.3088, "step": 570 }, { "epoch": 0.7164366373902133, "grad_norm": 40.70786666870117, "learning_rate": 1.9602196038579774e-06, "loss": 1.2366, "step": 571 }, { "epoch": 0.7176913425345044, "grad_norm": 32.47188949584961, "learning_rate": 1.944131429510754e-06, "loss": 1.3264, "step": 572 }, { "epoch": 0.7189460476787954, "grad_norm": 44.57042694091797, "learning_rate": 1.9280935996481792e-06, "loss": 1.3883, "step": 573 }, { "epoch": 0.7202007528230866, "grad_norm": 37.86323165893555, "learning_rate": 1.9121063784885135e-06, "loss": 1.2686, "step": 574 }, { "epoch": 0.7214554579673776, "grad_norm": 28.20488739013672, "learning_rate": 1.8961700294162578e-06, "loss": 1.3424, "step": 575 }, { "epoch": 0.7227101631116688, "grad_norm": 80.7864761352539, "learning_rate": 1.880284814977807e-06, "loss": 1.4263, "step": 576 }, { "epoch": 0.7239648682559598, "grad_norm": 47.082122802734375, "learning_rate": 1.8644509968771302e-06, "loss": 1.3611, "step": 577 }, { "epoch": 0.7252195734002509, "grad_norm": 27.525779724121094, "learning_rate": 1.8486688359714567e-06, "loss": 1.1818, "step": 578 }, { "epoch": 0.726474278544542, "grad_norm": 26.097383499145508, "learning_rate": 1.832938592266984e-06, "loss": 1.4285, "step": 579 }, { "epoch": 0.7277289836888331, "grad_norm": 27.29695701599121, "learning_rate": 1.8172605249145848e-06, "loss": 1.2213, "step": 580 }, { "epoch": 0.7289836888331243, "grad_norm": 43.18733215332031, "learning_rate": 1.8016348922055448e-06, "loss": 1.3866, "step": 581 }, { "epoch": 0.7302383939774153, "grad_norm": 30.83635139465332, "learning_rate": 1.7860619515673034e-06, "loss": 1.2583, "step": 582 }, { "epoch": 0.7314930991217063, "grad_norm": 38.65605163574219, "learning_rate": 1.7705419595592193e-06, "loss": 1.4949, "step": 583 }, { "epoch": 0.7327478042659975, "grad_norm": 33.9451789855957, "learning_rate": 1.7550751718683339e-06, "loss": 1.4502, "step": 584 }, { "epoch": 0.7340025094102886, "grad_norm": 32.3410530090332, "learning_rate": 1.7396618433051648e-06, "loss": 1.3073, "step": 585 }, { "epoch": 0.7352572145545797, "grad_norm": 31.831172943115234, "learning_rate": 1.7243022277995109e-06, "loss": 1.1989, "step": 586 }, { "epoch": 0.7365119196988708, "grad_norm": 36.86290740966797, "learning_rate": 1.7089965783962608e-06, "loss": 1.4668, "step": 587 }, { "epoch": 0.7377666248431619, "grad_norm": 34.344600677490234, "learning_rate": 1.6937451472512284e-06, "loss": 1.3803, "step": 588 }, { "epoch": 0.739021329987453, "grad_norm": 27.322994232177734, "learning_rate": 1.6785481856270042e-06, "loss": 1.2354, "step": 589 }, { "epoch": 0.740276035131744, "grad_norm": 44.57414245605469, "learning_rate": 1.6634059438888034e-06, "loss": 1.5863, "step": 590 }, { "epoch": 0.7415307402760352, "grad_norm": 33.31477737426758, "learning_rate": 1.6483186715003523e-06, "loss": 1.4086, "step": 591 }, { "epoch": 0.7427854454203262, "grad_norm": 33.885536193847656, "learning_rate": 1.633286617019771e-06, "loss": 1.4022, "step": 592 }, { "epoch": 0.7440401505646174, "grad_norm": 43.636802673339844, "learning_rate": 1.618310028095486e-06, "loss": 1.403, "step": 593 }, { "epoch": 0.7452948557089084, "grad_norm": 38.1976432800293, "learning_rate": 1.6033891514621436e-06, "loss": 1.375, "step": 594 }, { "epoch": 0.7465495608531995, "grad_norm": 27.386051177978516, "learning_rate": 1.5885242329365448e-06, "loss": 1.2411, "step": 595 }, { "epoch": 0.7478042659974906, "grad_norm": 32.94865036010742, "learning_rate": 1.5737155174136042e-06, "loss": 1.3973, "step": 596 }, { "epoch": 0.7490589711417817, "grad_norm": 52.85768127441406, "learning_rate": 1.5589632488623053e-06, "loss": 1.3857, "step": 597 }, { "epoch": 0.7503136762860728, "grad_norm": 30.37677001953125, "learning_rate": 1.5442676703216851e-06, "loss": 1.2986, "step": 598 }, { "epoch": 0.7515683814303639, "grad_norm": 50.629112243652344, "learning_rate": 1.5296290238968303e-06, "loss": 1.4606, "step": 599 }, { "epoch": 0.7528230865746549, "grad_norm": 75.81658172607422, "learning_rate": 1.5150475507548933e-06, "loss": 1.4354, "step": 600 }, { "epoch": 0.7540777917189461, "grad_norm": 32.35127639770508, "learning_rate": 1.500523491121108e-06, "loss": 1.4572, "step": 601 }, { "epoch": 0.7553324968632371, "grad_norm": 36.757484436035156, "learning_rate": 1.4860570842748412e-06, "loss": 1.3798, "step": 602 }, { "epoch": 0.7565872020075283, "grad_norm": 39.54582977294922, "learning_rate": 1.47164856854565e-06, "loss": 1.4334, "step": 603 }, { "epoch": 0.7578419071518193, "grad_norm": 30.180776596069336, "learning_rate": 1.4572981813093507e-06, "loss": 1.4914, "step": 604 }, { "epoch": 0.7590966122961104, "grad_norm": 55.5819091796875, "learning_rate": 1.4430061589841122e-06, "loss": 1.3051, "step": 605 }, { "epoch": 0.7603513174404015, "grad_norm": 41.72428894042969, "learning_rate": 1.4287727370265558e-06, "loss": 1.5724, "step": 606 }, { "epoch": 0.7616060225846926, "grad_norm": 30.067726135253906, "learning_rate": 1.4145981499278877e-06, "loss": 1.2012, "step": 607 }, { "epoch": 0.7628607277289837, "grad_norm": 35.68577194213867, "learning_rate": 1.4004826312100218e-06, "loss": 1.375, "step": 608 }, { "epoch": 0.7641154328732748, "grad_norm": 34.37779998779297, "learning_rate": 1.386426413421738e-06, "loss": 1.4803, "step": 609 }, { "epoch": 0.7653701380175659, "grad_norm": 28.35356330871582, "learning_rate": 1.3724297281348591e-06, "loss": 1.0709, "step": 610 }, { "epoch": 0.766624843161857, "grad_norm": 63.945228576660156, "learning_rate": 1.3584928059404207e-06, "loss": 1.3223, "step": 611 }, { "epoch": 0.767879548306148, "grad_norm": 37.977333068847656, "learning_rate": 1.3446158764448842e-06, "loss": 1.3541, "step": 612 }, { "epoch": 0.7691342534504392, "grad_norm": 33.97459411621094, "learning_rate": 1.3307991682663463e-06, "loss": 1.2762, "step": 613 }, { "epoch": 0.7703889585947302, "grad_norm": 52.56448745727539, "learning_rate": 1.3170429090307824e-06, "loss": 1.4249, "step": 614 }, { "epoch": 0.7716436637390214, "grad_norm": 29.552059173583984, "learning_rate": 1.303347325368285e-06, "loss": 1.3487, "step": 615 }, { "epoch": 0.7728983688833124, "grad_norm": 52.34573745727539, "learning_rate": 1.2897126429093354e-06, "loss": 1.29, "step": 616 }, { "epoch": 0.7741530740276035, "grad_norm": 38.19261932373047, "learning_rate": 1.2761390862810907e-06, "loss": 1.4146, "step": 617 }, { "epoch": 0.7754077791718946, "grad_norm": 36.244651794433594, "learning_rate": 1.2626268791036766e-06, "loss": 1.4714, "step": 618 }, { "epoch": 0.7766624843161857, "grad_norm": 41.59754180908203, "learning_rate": 1.2491762439865034e-06, "loss": 1.2052, "step": 619 }, { "epoch": 0.7779171894604768, "grad_norm": 32.61091232299805, "learning_rate": 1.235787402524603e-06, "loss": 1.2954, "step": 620 }, { "epoch": 0.7791718946047679, "grad_norm": 30.722808837890625, "learning_rate": 1.2224605752949786e-06, "loss": 1.2545, "step": 621 }, { "epoch": 0.7804265997490589, "grad_norm": 36.57342529296875, "learning_rate": 1.2091959818529636e-06, "loss": 1.2536, "step": 622 }, { "epoch": 0.7816813048933501, "grad_norm": 45.92577362060547, "learning_rate": 1.1959938407286099e-06, "loss": 1.3089, "step": 623 }, { "epoch": 0.7829360100376411, "grad_norm": 31.191242218017578, "learning_rate": 1.182854369423091e-06, "loss": 1.2477, "step": 624 }, { "epoch": 0.7841907151819323, "grad_norm": 31.34370231628418, "learning_rate": 1.1697777844051105e-06, "loss": 1.3789, "step": 625 }, { "epoch": 0.7854454203262233, "grad_norm": 27.42989730834961, "learning_rate": 1.1567643011073393e-06, "loss": 1.2446, "step": 626 }, { "epoch": 0.7867001254705144, "grad_norm": 31.601276397705078, "learning_rate": 1.143814133922872e-06, "loss": 1.453, "step": 627 }, { "epoch": 0.7879548306148055, "grad_norm": 42.06584548950195, "learning_rate": 1.1309274962016854e-06, "loss": 1.2825, "step": 628 }, { "epoch": 0.7892095357590966, "grad_norm": 36.16788864135742, "learning_rate": 1.1181046002471292e-06, "loss": 1.3807, "step": 629 }, { "epoch": 0.7904642409033877, "grad_norm": 35.88719177246094, "learning_rate": 1.1053456573124272e-06, "loss": 1.1951, "step": 630 }, { "epoch": 0.7917189460476788, "grad_norm": 43.55876541137695, "learning_rate": 1.0926508775971995e-06, "loss": 1.3084, "step": 631 }, { "epoch": 0.7929736511919699, "grad_norm": 38.98108673095703, "learning_rate": 1.0800204702439937e-06, "loss": 1.336, "step": 632 }, { "epoch": 0.794228356336261, "grad_norm": 34.15788650512695, "learning_rate": 1.0674546433348453e-06, "loss": 1.4309, "step": 633 }, { "epoch": 0.795483061480552, "grad_norm": 42.34593963623047, "learning_rate": 1.0549536038878432e-06, "loss": 1.3815, "step": 634 }, { "epoch": 0.7967377666248432, "grad_norm": 33.58256530761719, "learning_rate": 1.04251755785373e-06, "loss": 1.2034, "step": 635 }, { "epoch": 0.7979924717691342, "grad_norm": 41.538753509521484, "learning_rate": 1.0301467101124956e-06, "loss": 1.3423, "step": 636 }, { "epoch": 0.7992471769134254, "grad_norm": 42.10636901855469, "learning_rate": 1.0178412644700093e-06, "loss": 1.3916, "step": 637 }, { "epoch": 0.8005018820577164, "grad_norm": 31.18490219116211, "learning_rate": 1.0056014236546647e-06, "loss": 1.1455, "step": 638 }, { "epoch": 0.8017565872020075, "grad_norm": 32.616031646728516, "learning_rate": 9.934273893140335e-07, "loss": 1.3136, "step": 639 }, { "epoch": 0.8030112923462986, "grad_norm": 41.29079818725586, "learning_rate": 9.813193620115446e-07, "loss": 1.2788, "step": 640 }, { "epoch": 0.8042659974905897, "grad_norm": 39.024993896484375, "learning_rate": 9.692775412231863e-07, "loss": 1.3029, "step": 641 }, { "epoch": 0.8055207026348808, "grad_norm": 40.532737731933594, "learning_rate": 9.573021253342114e-07, "loss": 1.3518, "step": 642 }, { "epoch": 0.8067754077791719, "grad_norm": 42.95549011230469, "learning_rate": 9.453933116358715e-07, "loss": 1.4456, "step": 643 }, { "epoch": 0.8080301129234629, "grad_norm": 30.134597778320312, "learning_rate": 9.335512963221732e-07, "loss": 1.2561, "step": 644 }, { "epoch": 0.8092848180677541, "grad_norm": 42.78569412231445, "learning_rate": 9.21776274486636e-07, "loss": 1.3378, "step": 645 }, { "epoch": 0.8105395232120451, "grad_norm": 54.95227813720703, "learning_rate": 9.100684401190829e-07, "loss": 1.3858, "step": 646 }, { "epoch": 0.8117942283563363, "grad_norm": 42.90878677368164, "learning_rate": 8.984279861024453e-07, "loss": 1.2899, "step": 647 }, { "epoch": 0.8130489335006273, "grad_norm": 53.56229019165039, "learning_rate": 8.868551042095852e-07, "loss": 1.468, "step": 648 }, { "epoch": 0.8143036386449184, "grad_norm": 31.682039260864258, "learning_rate": 8.753499851001341e-07, "loss": 1.1707, "step": 649 }, { "epoch": 0.8155583437892095, "grad_norm": 31.241701126098633, "learning_rate": 8.639128183173517e-07, "loss": 1.1829, "step": 650 }, { "epoch": 0.8168130489335006, "grad_norm": 33.625938415527344, "learning_rate": 8.525437922850033e-07, "loss": 1.3418, "step": 651 }, { "epoch": 0.8180677540777918, "grad_norm": 30.763322830200195, "learning_rate": 8.412430943042616e-07, "loss": 1.3651, "step": 652 }, { "epoch": 0.8193224592220828, "grad_norm": 48.34621810913086, "learning_rate": 8.30010910550611e-07, "loss": 1.3246, "step": 653 }, { "epoch": 0.820577164366374, "grad_norm": 35.97224426269531, "learning_rate": 8.188474260707857e-07, "loss": 1.422, "step": 654 }, { "epoch": 0.821831869510665, "grad_norm": 31.350204467773438, "learning_rate": 8.077528247797234e-07, "loss": 1.3197, "step": 655 }, { "epoch": 0.823086574654956, "grad_norm": 39.3220329284668, "learning_rate": 7.967272894575312e-07, "loss": 1.3164, "step": 656 }, { "epoch": 0.8243412797992472, "grad_norm": 34.87789535522461, "learning_rate": 7.857710017464737e-07, "loss": 1.3422, "step": 657 }, { "epoch": 0.8255959849435383, "grad_norm": 39.69428634643555, "learning_rate": 7.748841421479875e-07, "loss": 1.2374, "step": 658 }, { "epoch": 0.8268506900878294, "grad_norm": 40.43376541137695, "learning_rate": 7.640668900196985e-07, "loss": 1.3143, "step": 659 }, { "epoch": 0.8281053952321205, "grad_norm": 28.951221466064453, "learning_rate": 7.533194235724728e-07, "loss": 1.315, "step": 660 }, { "epoch": 0.8293601003764115, "grad_norm": 56.01127243041992, "learning_rate": 7.426419198674773e-07, "loss": 1.3279, "step": 661 }, { "epoch": 0.8306148055207027, "grad_norm": 36.56144332885742, "learning_rate": 7.320345548132679e-07, "loss": 1.2427, "step": 662 }, { "epoch": 0.8318695106649937, "grad_norm": 34.64320373535156, "learning_rate": 7.214975031628856e-07, "loss": 1.3805, "step": 663 }, { "epoch": 0.8331242158092849, "grad_norm": 42.90142059326172, "learning_rate": 7.110309385109804e-07, "loss": 1.3778, "step": 664 }, { "epoch": 0.8343789209535759, "grad_norm": 33.45329284667969, "learning_rate": 7.006350332909495e-07, "loss": 1.3461, "step": 665 }, { "epoch": 0.835633626097867, "grad_norm": 39.53373718261719, "learning_rate": 6.903099587721024e-07, "loss": 1.372, "step": 666 }, { "epoch": 0.8368883312421581, "grad_norm": 26.866334915161133, "learning_rate": 6.800558850568295e-07, "loss": 1.1701, "step": 667 }, { "epoch": 0.8381430363864492, "grad_norm": 35.01183319091797, "learning_rate": 6.698729810778065e-07, "loss": 1.2913, "step": 668 }, { "epoch": 0.8393977415307403, "grad_norm": 26.15965461730957, "learning_rate": 6.597614145952136e-07, "loss": 1.1659, "step": 669 }, { "epoch": 0.8406524466750314, "grad_norm": 27.10162925720215, "learning_rate": 6.497213521939638e-07, "loss": 1.176, "step": 670 }, { "epoch": 0.8419071518193224, "grad_norm": 39.48128128051758, "learning_rate": 6.397529592809615e-07, "loss": 1.4855, "step": 671 }, { "epoch": 0.8431618569636136, "grad_norm": 45.1597785949707, "learning_rate": 6.298564000823848e-07, "loss": 1.2702, "step": 672 }, { "epoch": 0.8444165621079046, "grad_norm": 59.02643585205078, "learning_rate": 6.20031837640967e-07, "loss": 1.3335, "step": 673 }, { "epoch": 0.8456712672521958, "grad_norm": 33.48893737792969, "learning_rate": 6.102794338133195e-07, "loss": 1.1215, "step": 674 }, { "epoch": 0.8469259723964868, "grad_norm": 33.40549850463867, "learning_rate": 6.005993492672657e-07, "loss": 1.3049, "step": 675 }, { "epoch": 0.848180677540778, "grad_norm": 28.336149215698242, "learning_rate": 5.909917434791884e-07, "loss": 1.2866, "step": 676 }, { "epoch": 0.849435382685069, "grad_norm": 31.5575008392334, "learning_rate": 5.814567747314049e-07, "loss": 1.1839, "step": 677 }, { "epoch": 0.8506900878293601, "grad_norm": 30.665040969848633, "learning_rate": 5.719946001095617e-07, "loss": 1.3647, "step": 678 }, { "epoch": 0.8519447929736512, "grad_norm": 38.09904098510742, "learning_rate": 5.626053755000421e-07, "loss": 1.3963, "step": 679 }, { "epoch": 0.8531994981179423, "grad_norm": 62.874881744384766, "learning_rate": 5.532892555874059e-07, "loss": 1.2852, "step": 680 }, { "epoch": 0.8544542032622334, "grad_norm": 31.233694076538086, "learning_rate": 5.440463938518304e-07, "loss": 1.487, "step": 681 }, { "epoch": 0.8557089084065245, "grad_norm": 34.371585845947266, "learning_rate": 5.348769425665884e-07, "loss": 1.3499, "step": 682 }, { "epoch": 0.8569636135508155, "grad_norm": 40.928802490234375, "learning_rate": 5.25781052795541e-07, "loss": 1.494, "step": 683 }, { "epoch": 0.8582183186951067, "grad_norm": 47.68248748779297, "learning_rate": 5.167588743906432e-07, "loss": 1.2565, "step": 684 }, { "epoch": 0.8594730238393977, "grad_norm": 31.525768280029297, "learning_rate": 5.078105559894791e-07, "loss": 1.2186, "step": 685 }, { "epoch": 0.8607277289836889, "grad_norm": 41.63323211669922, "learning_rate": 4.989362450128133e-07, "loss": 1.3934, "step": 686 }, { "epoch": 0.8619824341279799, "grad_norm": 29.7374324798584, "learning_rate": 4.901360876621597e-07, "loss": 1.2498, "step": 687 }, { "epoch": 0.863237139272271, "grad_norm": 38.2042350769043, "learning_rate": 4.814102289173733e-07, "loss": 1.1372, "step": 688 }, { "epoch": 0.8644918444165621, "grad_norm": 33.84709930419922, "learning_rate": 4.727588125342669e-07, "loss": 1.218, "step": 689 }, { "epoch": 0.8657465495608532, "grad_norm": 39.36479568481445, "learning_rate": 4.6418198104223434e-07, "loss": 1.3434, "step": 690 }, { "epoch": 0.8670012547051443, "grad_norm": 45.70726776123047, "learning_rate": 4.5567987574190677e-07, "loss": 1.3344, "step": 691 }, { "epoch": 0.8682559598494354, "grad_norm": 42.92964172363281, "learning_rate": 4.4725263670282905e-07, "loss": 1.3247, "step": 692 }, { "epoch": 0.8695106649937264, "grad_norm": 33.368629455566406, "learning_rate": 4.3890040276114044e-07, "loss": 1.3195, "step": 693 }, { "epoch": 0.8707653701380176, "grad_norm": 43.9223518371582, "learning_rate": 4.306233115173009e-07, "loss": 1.3844, "step": 694 }, { "epoch": 0.8720200752823086, "grad_norm": 40.18341064453125, "learning_rate": 4.224214993338149e-07, "loss": 1.3651, "step": 695 }, { "epoch": 0.8732747804265998, "grad_norm": 38.75429916381836, "learning_rate": 4.1429510133298714e-07, "loss": 1.3685, "step": 696 }, { "epoch": 0.8745294855708908, "grad_norm": 41.714378356933594, "learning_rate": 4.062442513947007e-07, "loss": 1.4269, "step": 697 }, { "epoch": 0.875784190715182, "grad_norm": 29.522842407226562, "learning_rate": 3.9826908215420344e-07, "loss": 1.1375, "step": 698 }, { "epoch": 0.877038895859473, "grad_norm": 28.621906280517578, "learning_rate": 3.903697249999289e-07, "loss": 1.3684, "step": 699 }, { "epoch": 0.8782936010037641, "grad_norm": 78.60023498535156, "learning_rate": 3.825463100713317e-07, "loss": 1.3113, "step": 700 }, { "epoch": 0.8795483061480552, "grad_norm": 34.123355865478516, "learning_rate": 3.747989662567403e-07, "loss": 1.4122, "step": 701 }, { "epoch": 0.8808030112923463, "grad_norm": 34.434959411621094, "learning_rate": 3.671278211912338e-07, "loss": 1.4044, "step": 702 }, { "epoch": 0.8820577164366374, "grad_norm": 43.3989372253418, "learning_rate": 3.595330012545445e-07, "loss": 1.3849, "step": 703 }, { "epoch": 0.8833124215809285, "grad_norm": 51.71344757080078, "learning_rate": 3.520146315689693e-07, "loss": 1.4736, "step": 704 }, { "epoch": 0.8845671267252195, "grad_norm": 37.74956130981445, "learning_rate": 3.445728359973094e-07, "loss": 1.5021, "step": 705 }, { "epoch": 0.8858218318695107, "grad_norm": 38.12771224975586, "learning_rate": 3.372077371408361e-07, "loss": 1.3782, "step": 706 }, { "epoch": 0.8870765370138017, "grad_norm": 45.82014465332031, "learning_rate": 3.299194563372604e-07, "loss": 1.4072, "step": 707 }, { "epoch": 0.8883312421580929, "grad_norm": 41.57502746582031, "learning_rate": 3.22708113658744e-07, "loss": 1.1852, "step": 708 }, { "epoch": 0.8895859473023839, "grad_norm": 40.33243179321289, "learning_rate": 3.1557382790991686e-07, "loss": 1.2315, "step": 709 }, { "epoch": 0.890840652446675, "grad_norm": 50.13658142089844, "learning_rate": 3.085167166259162e-07, "loss": 1.5278, "step": 710 }, { "epoch": 0.8920953575909661, "grad_norm": 43.55479431152344, "learning_rate": 3.015368960704584e-07, "loss": 1.214, "step": 711 }, { "epoch": 0.8933500627352572, "grad_norm": 40.6564826965332, "learning_rate": 2.9463448123391634e-07, "loss": 1.2893, "step": 712 }, { "epoch": 0.8946047678795483, "grad_norm": 36.575809478759766, "learning_rate": 2.878095858314278e-07, "loss": 1.2348, "step": 713 }, { "epoch": 0.8958594730238394, "grad_norm": 43.1509895324707, "learning_rate": 2.810623223010245e-07, "loss": 1.2692, "step": 714 }, { "epoch": 0.8971141781681304, "grad_norm": 30.058103561401367, "learning_rate": 2.743928018017744e-07, "loss": 1.2322, "step": 715 }, { "epoch": 0.8983688833124216, "grad_norm": 29.974342346191406, "learning_rate": 2.67801134211953e-07, "loss": 1.1901, "step": 716 }, { "epoch": 0.8996235884567126, "grad_norm": 35.170406341552734, "learning_rate": 2.612874281272371e-07, "loss": 1.2897, "step": 717 }, { "epoch": 0.9008782936010038, "grad_norm": 36.794464111328125, "learning_rate": 2.548517908589077e-07, "loss": 1.4094, "step": 718 }, { "epoch": 0.9021329987452948, "grad_norm": 34.6309700012207, "learning_rate": 2.4849432843208786e-07, "loss": 1.2453, "step": 719 }, { "epoch": 0.903387703889586, "grad_norm": 41.007938385009766, "learning_rate": 2.422151455839955e-07, "loss": 1.479, "step": 720 }, { "epoch": 0.904642409033877, "grad_norm": 35.56821823120117, "learning_rate": 2.3601434576221548e-07, "loss": 1.2376, "step": 721 }, { "epoch": 0.9058971141781681, "grad_norm": 40.16046905517578, "learning_rate": 2.2989203112299685e-07, "loss": 1.3773, "step": 722 }, { "epoch": 0.9071518193224593, "grad_norm": 31.831424713134766, "learning_rate": 2.2384830252957068e-07, "loss": 1.2387, "step": 723 }, { "epoch": 0.9084065244667503, "grad_norm": 102.80229187011719, "learning_rate": 2.178832595504854e-07, "loss": 1.3606, "step": 724 }, { "epoch": 0.9096612296110415, "grad_norm": 37.90886688232422, "learning_rate": 2.1199700045797077e-07, "loss": 1.4478, "step": 725 }, { "epoch": 0.9109159347553325, "grad_norm": 36.04559326171875, "learning_rate": 2.0618962222631434e-07, "loss": 1.4465, "step": 726 }, { "epoch": 0.9121706398996235, "grad_norm": 45.584922790527344, "learning_rate": 2.0046122053026697e-07, "loss": 1.3702, "step": 727 }, { "epoch": 0.9134253450439147, "grad_norm": 39.56161117553711, "learning_rate": 1.9481188974346698e-07, "loss": 1.2966, "step": 728 }, { "epoch": 0.9146800501882058, "grad_norm": 48.845314025878906, "learning_rate": 1.8924172293688148e-07, "loss": 1.3017, "step": 729 }, { "epoch": 0.9159347553324969, "grad_norm": 35.209503173828125, "learning_rate": 1.8375081187727683e-07, "loss": 1.2948, "step": 730 }, { "epoch": 0.917189460476788, "grad_norm": 35.617698669433594, "learning_rate": 1.7833924702570725e-07, "loss": 1.1993, "step": 731 }, { "epoch": 0.918444165621079, "grad_norm": 29.306623458862305, "learning_rate": 1.7300711753601985e-07, "loss": 1.2074, "step": 732 }, { "epoch": 0.9196988707653702, "grad_norm": 34.39566421508789, "learning_rate": 1.677545112533896e-07, "loss": 1.3316, "step": 733 }, { "epoch": 0.9209535759096612, "grad_norm": 36.989356994628906, "learning_rate": 1.6258151471287397e-07, "loss": 1.3134, "step": 734 }, { "epoch": 0.9222082810539524, "grad_norm": 48.13298034667969, "learning_rate": 1.5748821313798124e-07, "loss": 1.3963, "step": 735 }, { "epoch": 0.9234629861982434, "grad_norm": 39.777278900146484, "learning_rate": 1.5247469043927153e-07, "loss": 1.3866, "step": 736 }, { "epoch": 0.9247176913425345, "grad_norm": 31.973005294799805, "learning_rate": 1.4754102921297363e-07, "loss": 1.2392, "step": 737 }, { "epoch": 0.9259723964868256, "grad_norm": 31.995790481567383, "learning_rate": 1.4268731073962094e-07, "loss": 1.2198, "step": 738 }, { "epoch": 0.9272271016311167, "grad_norm": 33.672569274902344, "learning_rate": 1.3791361498271704e-07, "loss": 1.3004, "step": 739 }, { "epoch": 0.9284818067754078, "grad_norm": 31.81163787841797, "learning_rate": 1.3322002058741678e-07, "loss": 1.3826, "step": 740 }, { "epoch": 0.9297365119196989, "grad_norm": 32.36835479736328, "learning_rate": 1.2860660487922616e-07, "loss": 1.4068, "step": 741 }, { "epoch": 0.93099121706399, "grad_norm": 43.015193939208984, "learning_rate": 1.240734438627361e-07, "loss": 1.381, "step": 742 }, { "epoch": 0.9322459222082811, "grad_norm": 44.727230072021484, "learning_rate": 1.196206122203647e-07, "loss": 1.3348, "step": 743 }, { "epoch": 0.9335006273525721, "grad_norm": 29.804079055786133, "learning_rate": 1.1524818331112853e-07, "loss": 1.2291, "step": 744 }, { "epoch": 0.9347553324968633, "grad_norm": 46.379451751708984, "learning_rate": 1.1095622916943494e-07, "loss": 1.4644, "step": 745 }, { "epoch": 0.9360100376411543, "grad_norm": 31.480005264282227, "learning_rate": 1.0674482050389457e-07, "loss": 1.2402, "step": 746 }, { "epoch": 0.9372647427854455, "grad_norm": 25.78557586669922, "learning_rate": 1.0261402669615505e-07, "loss": 1.3798, "step": 747 }, { "epoch": 0.9385194479297365, "grad_norm": 41.28335189819336, "learning_rate": 9.856391579976032e-08, "loss": 1.3066, "step": 748 }, { "epoch": 0.9397741530740276, "grad_norm": 42.25539779663086, "learning_rate": 9.459455453902866e-08, "loss": 1.3258, "step": 749 }, { "epoch": 0.9410288582183187, "grad_norm": 39.32608413696289, "learning_rate": 9.070600830795251e-08, "loss": 1.3086, "step": 750 }, { "epoch": 0.9422835633626098, "grad_norm": 30.92926025390625, "learning_rate": 8.68983411691221e-08, "loss": 1.1993, "step": 751 }, { "epoch": 0.9435382685069009, "grad_norm": 33.10255813598633, "learning_rate": 8.317161585266964e-08, "loss": 1.1975, "step": 752 }, { "epoch": 0.944792973651192, "grad_norm": 42.903900146484375, "learning_rate": 7.952589375523567e-08, "loss": 1.1978, "step": 753 }, { "epoch": 0.946047678795483, "grad_norm": 28.121461868286133, "learning_rate": 7.59612349389599e-08, "loss": 1.222, "step": 754 }, { "epoch": 0.9473023839397742, "grad_norm": 35.83945083618164, "learning_rate": 7.247769813048644e-08, "loss": 1.2472, "step": 755 }, { "epoch": 0.9485570890840652, "grad_norm": 37.46073532104492, "learning_rate": 6.907534072000177e-08, "loss": 1.5055, "step": 756 }, { "epoch": 0.9498117942283564, "grad_norm": 27.392000198364258, "learning_rate": 6.575421876028721e-08, "loss": 1.1948, "step": 757 }, { "epoch": 0.9510664993726474, "grad_norm": 51.311744689941406, "learning_rate": 6.251438696579293e-08, "loss": 1.3754, "step": 758 }, { "epoch": 0.9523212045169385, "grad_norm": 36.097373962402344, "learning_rate": 5.935589871174208e-08, "loss": 1.1822, "step": 759 }, { "epoch": 0.9535759096612296, "grad_norm": 32.62606430053711, "learning_rate": 5.627880603324532e-08, "loss": 1.271, "step": 760 }, { "epoch": 0.9548306148055207, "grad_norm": 37.016719818115234, "learning_rate": 5.3283159624448745e-08, "loss": 1.1878, "step": 761 }, { "epoch": 0.9560853199498118, "grad_norm": 37.270118713378906, "learning_rate": 5.0369008837696244e-08, "loss": 1.2704, "step": 762 }, { "epoch": 0.9573400250941029, "grad_norm": 37.098854064941406, "learning_rate": 4.753640168271456e-08, "loss": 1.2812, "step": 763 }, { "epoch": 0.958594730238394, "grad_norm": 44.55942916870117, "learning_rate": 4.478538482582617e-08, "loss": 1.2366, "step": 764 }, { "epoch": 0.9598494353826851, "grad_norm": 30.313488006591797, "learning_rate": 4.211600358917989e-08, "loss": 1.2731, "step": 765 }, { "epoch": 0.9611041405269761, "grad_norm": 52.65010452270508, "learning_rate": 3.9528301950000345e-08, "loss": 1.4277, "step": 766 }, { "epoch": 0.9623588456712673, "grad_norm": 34.024227142333984, "learning_rate": 3.702232253986804e-08, "loss": 1.3047, "step": 767 }, { "epoch": 0.9636135508155583, "grad_norm": 49.82564163208008, "learning_rate": 3.4598106644014863e-08, "loss": 1.2943, "step": 768 }, { "epoch": 0.9648682559598495, "grad_norm": 42.5301513671875, "learning_rate": 3.2255694200643003e-08, "loss": 1.3643, "step": 769 }, { "epoch": 0.9661229611041405, "grad_norm": 36.81052017211914, "learning_rate": 2.9995123800270476e-08, "loss": 1.4252, "step": 770 }, { "epoch": 0.9673776662484316, "grad_norm": 35.52188491821289, "learning_rate": 2.7816432685091598e-08, "loss": 1.35, "step": 771 }, { "epoch": 0.9686323713927227, "grad_norm": 30.83523941040039, "learning_rate": 2.5719656748364184e-08, "loss": 1.2627, "step": 772 }, { "epoch": 0.9698870765370138, "grad_norm": 29.04794692993164, "learning_rate": 2.370483053382111e-08, "loss": 1.2903, "step": 773 }, { "epoch": 0.9711417816813049, "grad_norm": 36.21467208862305, "learning_rate": 2.177198723509688e-08, "loss": 1.3589, "step": 774 }, { "epoch": 0.972396486825596, "grad_norm": 30.13644790649414, "learning_rate": 1.992115869518474e-08, "loss": 1.2922, "step": 775 }, { "epoch": 0.973651191969887, "grad_norm": 50.431663513183594, "learning_rate": 1.8152375405909305e-08, "loss": 1.2573, "step": 776 }, { "epoch": 0.9749058971141782, "grad_norm": 50.13302230834961, "learning_rate": 1.6465666507425314e-08, "loss": 1.4401, "step": 777 }, { "epoch": 0.9761606022584692, "grad_norm": 46.383636474609375, "learning_rate": 1.4861059787736886e-08, "loss": 1.424, "step": 778 }, { "epoch": 0.9774153074027604, "grad_norm": 34.33049011230469, "learning_rate": 1.333858168224178e-08, "loss": 1.2715, "step": 779 }, { "epoch": 0.9786700125470514, "grad_norm": 42.03940963745117, "learning_rate": 1.1898257273292857e-08, "loss": 1.2918, "step": 780 }, { "epoch": 0.9799247176913425, "grad_norm": 42.43777847290039, "learning_rate": 1.0540110289786742e-08, "loss": 1.5214, "step": 781 }, { "epoch": 0.9811794228356336, "grad_norm": 31.801700592041016, "learning_rate": 9.264163106774138e-09, "loss": 1.2777, "step": 782 }, { "epoch": 0.9824341279799247, "grad_norm": 49.655391693115234, "learning_rate": 8.07043674508623e-09, "loss": 1.2324, "step": 783 }, { "epoch": 0.9836888331242158, "grad_norm": 37.17424011230469, "learning_rate": 6.958950870994963e-09, "loss": 1.2559, "step": 784 }, { "epoch": 0.9849435382685069, "grad_norm": 33.83037567138672, "learning_rate": 5.929723795884967e-09, "loss": 1.2658, "step": 785 }, { "epoch": 0.986198243412798, "grad_norm": 49.56622314453125, "learning_rate": 4.982772475951026e-09, "loss": 1.2301, "step": 786 }, { "epoch": 0.9874529485570891, "grad_norm": 71.51993560791016, "learning_rate": 4.1181125119221785e-09, "loss": 1.4287, "step": 787 }, { "epoch": 0.9887076537013801, "grad_norm": 37.62562942504883, "learning_rate": 3.3357581488030476e-09, "loss": 1.4585, "step": 788 }, { "epoch": 0.9899623588456713, "grad_norm": 44.091552734375, "learning_rate": 2.635722275638464e-09, "loss": 1.5654, "step": 789 }, { "epoch": 0.9912170639899623, "grad_norm": 67.96106719970703, "learning_rate": 2.0180164253008614e-09, "loss": 1.3665, "step": 790 }, { "epoch": 0.9924717691342535, "grad_norm": 38.18610763549805, "learning_rate": 1.4826507743032071e-09, "loss": 1.1607, "step": 791 }, { "epoch": 0.9937264742785445, "grad_norm": 36.40510940551758, "learning_rate": 1.029634142627467e-09, "loss": 1.2769, "step": 792 }, { "epoch": 0.9949811794228356, "grad_norm": 34.85893630981445, "learning_rate": 6.589739935819461e-10, "loss": 1.3029, "step": 793 }, { "epoch": 0.9962358845671268, "grad_norm": 36.350643157958984, "learning_rate": 3.7067643367749707e-10, "loss": 1.2861, "step": 794 }, { "epoch": 0.9974905897114178, "grad_norm": 38.7654914855957, "learning_rate": 1.6474621252704494e-10, "loss": 1.1653, "step": 795 }, { "epoch": 0.998745294855709, "grad_norm": 72.9517822265625, "learning_rate": 4.118672276620661e-11, "loss": 1.3579, "step": 796 }, { "epoch": 1.0, "grad_norm": 33.991390228271484, "learning_rate": 0.0, "loss": 1.346, "step": 797 } ], "logging_steps": 1, "max_steps": 797, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4921722755088384.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }