FatCat87's picture
Upload folder using huggingface_hub
7bca4da verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 399,
"global_step": 797,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012547051442910915,
"grad_norm": 19.01241111755371,
"learning_rate": 4.347826086956522e-07,
"loss": 3.7662,
"step": 1
},
{
"epoch": 0.0012547051442910915,
"eval_loss": 3.5013513565063477,
"eval_runtime": 6.1042,
"eval_samples_per_second": 109.924,
"eval_steps_per_second": 6.881,
"step": 1
},
{
"epoch": 0.002509410288582183,
"grad_norm": 16.68044090270996,
"learning_rate": 8.695652173913044e-07,
"loss": 4.9852,
"step": 2
},
{
"epoch": 0.0037641154328732747,
"grad_norm": 20.054622650146484,
"learning_rate": 1.3043478260869566e-06,
"loss": 3.371,
"step": 3
},
{
"epoch": 0.005018820577164366,
"grad_norm": 17.74432373046875,
"learning_rate": 1.7391304347826088e-06,
"loss": 3.2279,
"step": 4
},
{
"epoch": 0.006273525721455458,
"grad_norm": 15.706853866577148,
"learning_rate": 2.173913043478261e-06,
"loss": 3.2849,
"step": 5
},
{
"epoch": 0.0075282308657465494,
"grad_norm": 16.103614807128906,
"learning_rate": 2.6086956521739132e-06,
"loss": 3.3386,
"step": 6
},
{
"epoch": 0.00878293601003764,
"grad_norm": 17.866008758544922,
"learning_rate": 3.043478260869566e-06,
"loss": 3.7729,
"step": 7
},
{
"epoch": 0.010037641154328732,
"grad_norm": 15.147605895996094,
"learning_rate": 3.4782608695652175e-06,
"loss": 3.7953,
"step": 8
},
{
"epoch": 0.011292346298619825,
"grad_norm": 18.901615142822266,
"learning_rate": 3.91304347826087e-06,
"loss": 3.5709,
"step": 9
},
{
"epoch": 0.012547051442910916,
"grad_norm": 15.003190994262695,
"learning_rate": 4.347826086956522e-06,
"loss": 4.4118,
"step": 10
},
{
"epoch": 0.013801756587202008,
"grad_norm": 16.06182098388672,
"learning_rate": 4.782608695652174e-06,
"loss": 3.6714,
"step": 11
},
{
"epoch": 0.015056461731493099,
"grad_norm": 17.36846923828125,
"learning_rate": 5.2173913043478265e-06,
"loss": 3.8605,
"step": 12
},
{
"epoch": 0.01631116687578419,
"grad_norm": 17.79241180419922,
"learning_rate": 5.652173913043479e-06,
"loss": 3.5272,
"step": 13
},
{
"epoch": 0.01756587202007528,
"grad_norm": 14.266884803771973,
"learning_rate": 6.086956521739132e-06,
"loss": 3.6034,
"step": 14
},
{
"epoch": 0.018820577164366373,
"grad_norm": 15.910148620605469,
"learning_rate": 6.521739130434783e-06,
"loss": 3.1557,
"step": 15
},
{
"epoch": 0.020075282308657464,
"grad_norm": 16.65166473388672,
"learning_rate": 6.956521739130435e-06,
"loss": 3.1012,
"step": 16
},
{
"epoch": 0.02132998745294856,
"grad_norm": 14.562183380126953,
"learning_rate": 7.391304347826087e-06,
"loss": 3.7211,
"step": 17
},
{
"epoch": 0.02258469259723965,
"grad_norm": 12.386832237243652,
"learning_rate": 7.82608695652174e-06,
"loss": 3.4626,
"step": 18
},
{
"epoch": 0.02383939774153074,
"grad_norm": 17.05539894104004,
"learning_rate": 8.260869565217392e-06,
"loss": 3.4974,
"step": 19
},
{
"epoch": 0.025094102885821833,
"grad_norm": 20.27201271057129,
"learning_rate": 8.695652173913044e-06,
"loss": 3.6714,
"step": 20
},
{
"epoch": 0.026348808030112924,
"grad_norm": 14.487604141235352,
"learning_rate": 9.130434782608697e-06,
"loss": 3.3083,
"step": 21
},
{
"epoch": 0.027603513174404015,
"grad_norm": 16.102643966674805,
"learning_rate": 9.565217391304349e-06,
"loss": 3.4909,
"step": 22
},
{
"epoch": 0.028858218318695106,
"grad_norm": 19.385705947875977,
"learning_rate": 1e-05,
"loss": 3.6031,
"step": 23
},
{
"epoch": 0.030112923462986198,
"grad_norm": 16.015605926513672,
"learning_rate": 9.999958813277235e-06,
"loss": 3.2406,
"step": 24
},
{
"epoch": 0.03136762860727729,
"grad_norm": 13.68839168548584,
"learning_rate": 9.999835253787472e-06,
"loss": 3.3278,
"step": 25
},
{
"epoch": 0.03262233375156838,
"grad_norm": 14.528271675109863,
"learning_rate": 9.999629323566323e-06,
"loss": 2.9922,
"step": 26
},
{
"epoch": 0.033877038895859475,
"grad_norm": 17.02483558654785,
"learning_rate": 9.99934102600642e-06,
"loss": 3.2691,
"step": 27
},
{
"epoch": 0.03513174404015056,
"grad_norm": 16.797338485717773,
"learning_rate": 9.998970365857374e-06,
"loss": 3.3698,
"step": 28
},
{
"epoch": 0.03638644918444166,
"grad_norm": 14.764263153076172,
"learning_rate": 9.998517349225698e-06,
"loss": 3.2543,
"step": 29
},
{
"epoch": 0.037641154328732745,
"grad_norm": 14.670446395874023,
"learning_rate": 9.9979819835747e-06,
"loss": 3.2072,
"step": 30
},
{
"epoch": 0.03889585947302384,
"grad_norm": 21.932998657226562,
"learning_rate": 9.997364277724362e-06,
"loss": 3.3685,
"step": 31
},
{
"epoch": 0.04015056461731493,
"grad_norm": 18.700294494628906,
"learning_rate": 9.996664241851197e-06,
"loss": 2.9231,
"step": 32
},
{
"epoch": 0.04140526976160602,
"grad_norm": 13.988219261169434,
"learning_rate": 9.99588188748808e-06,
"loss": 3.0859,
"step": 33
},
{
"epoch": 0.04265997490589712,
"grad_norm": 15.462250709533691,
"learning_rate": 9.995017227524049e-06,
"loss": 3.5131,
"step": 34
},
{
"epoch": 0.043914680050188205,
"grad_norm": 18.01273536682129,
"learning_rate": 9.994070276204115e-06,
"loss": 3.7054,
"step": 35
},
{
"epoch": 0.0451693851944793,
"grad_norm": 21.7486515045166,
"learning_rate": 9.993041049129005e-06,
"loss": 3.2674,
"step": 36
},
{
"epoch": 0.04642409033877039,
"grad_norm": 19.066349029541016,
"learning_rate": 9.991929563254913e-06,
"loss": 3.3495,
"step": 37
},
{
"epoch": 0.04767879548306148,
"grad_norm": 20.077730178833008,
"learning_rate": 9.990735836893226e-06,
"loss": 3.396,
"step": 38
},
{
"epoch": 0.04893350062735257,
"grad_norm": 14.028002738952637,
"learning_rate": 9.989459889710214e-06,
"loss": 2.9541,
"step": 39
},
{
"epoch": 0.050188205771643665,
"grad_norm": 16.57832908630371,
"learning_rate": 9.988101742726708e-06,
"loss": 4.1811,
"step": 40
},
{
"epoch": 0.05144291091593475,
"grad_norm": 16.29513931274414,
"learning_rate": 9.986661418317759e-06,
"loss": 3.282,
"step": 41
},
{
"epoch": 0.05269761606022585,
"grad_norm": 19.623991012573242,
"learning_rate": 9.985138940212264e-06,
"loss": 2.7565,
"step": 42
},
{
"epoch": 0.053952321204516936,
"grad_norm": 17.891677856445312,
"learning_rate": 9.983534333492575e-06,
"loss": 3.2153,
"step": 43
},
{
"epoch": 0.05520702634880803,
"grad_norm": 15.788727760314941,
"learning_rate": 9.981847624594093e-06,
"loss": 3.2207,
"step": 44
},
{
"epoch": 0.056461731493099125,
"grad_norm": 17.83070182800293,
"learning_rate": 9.980078841304817e-06,
"loss": 3.2656,
"step": 45
},
{
"epoch": 0.05771643663739021,
"grad_norm": 15.635571479797363,
"learning_rate": 9.978228012764904e-06,
"loss": 3.0155,
"step": 46
},
{
"epoch": 0.05897114178168131,
"grad_norm": 24.62743377685547,
"learning_rate": 9.97629516946618e-06,
"loss": 3.1795,
"step": 47
},
{
"epoch": 0.060225846925972396,
"grad_norm": 16.98169708251953,
"learning_rate": 9.974280343251637e-06,
"loss": 3.3745,
"step": 48
},
{
"epoch": 0.06148055207026349,
"grad_norm": 19.704118728637695,
"learning_rate": 9.97218356731491e-06,
"loss": 3.0855,
"step": 49
},
{
"epoch": 0.06273525721455459,
"grad_norm": 15.05843734741211,
"learning_rate": 9.970004876199731e-06,
"loss": 2.9471,
"step": 50
},
{
"epoch": 0.06398996235884567,
"grad_norm": 20.934194564819336,
"learning_rate": 9.967744305799358e-06,
"loss": 2.3807,
"step": 51
},
{
"epoch": 0.06524466750313676,
"grad_norm": 19.842937469482422,
"learning_rate": 9.965401893355985e-06,
"loss": 3.0332,
"step": 52
},
{
"epoch": 0.06649937264742785,
"grad_norm": 14.741073608398438,
"learning_rate": 9.962977677460132e-06,
"loss": 3.2762,
"step": 53
},
{
"epoch": 0.06775407779171895,
"grad_norm": 16.709836959838867,
"learning_rate": 9.96047169805e-06,
"loss": 2.8045,
"step": 54
},
{
"epoch": 0.06900878293601004,
"grad_norm": 22.069616317749023,
"learning_rate": 9.957883996410821e-06,
"loss": 2.9735,
"step": 55
},
{
"epoch": 0.07026348808030113,
"grad_norm": 17.32145881652832,
"learning_rate": 9.955214615174174e-06,
"loss": 3.2817,
"step": 56
},
{
"epoch": 0.07151819322459223,
"grad_norm": 16.994312286376953,
"learning_rate": 9.952463598317286e-06,
"loss": 2.9389,
"step": 57
},
{
"epoch": 0.07277289836888332,
"grad_norm": 16.050113677978516,
"learning_rate": 9.949630991162304e-06,
"loss": 2.6915,
"step": 58
},
{
"epoch": 0.0740276035131744,
"grad_norm": 12.047767639160156,
"learning_rate": 9.946716840375552e-06,
"loss": 3.1678,
"step": 59
},
{
"epoch": 0.07528230865746549,
"grad_norm": 21.182559967041016,
"learning_rate": 9.943721193966755e-06,
"loss": 3.0534,
"step": 60
},
{
"epoch": 0.07653701380175659,
"grad_norm": 15.370920181274414,
"learning_rate": 9.940644101288259e-06,
"loss": 2.9404,
"step": 61
},
{
"epoch": 0.07779171894604768,
"grad_norm": 17.937530517578125,
"learning_rate": 9.937485613034209e-06,
"loss": 3.1182,
"step": 62
},
{
"epoch": 0.07904642409033877,
"grad_norm": 15.242935180664062,
"learning_rate": 9.934245781239714e-06,
"loss": 3.2562,
"step": 63
},
{
"epoch": 0.08030112923462986,
"grad_norm": 19.794172286987305,
"learning_rate": 9.93092465928e-06,
"loss": 2.9321,
"step": 64
},
{
"epoch": 0.08155583437892096,
"grad_norm": 12.679776191711426,
"learning_rate": 9.927522301869515e-06,
"loss": 2.4835,
"step": 65
},
{
"epoch": 0.08281053952321205,
"grad_norm": 17.018342971801758,
"learning_rate": 9.924038765061042e-06,
"loss": 2.7064,
"step": 66
},
{
"epoch": 0.08406524466750313,
"grad_norm": 17.553468704223633,
"learning_rate": 9.920474106244764e-06,
"loss": 3.509,
"step": 67
},
{
"epoch": 0.08531994981179424,
"grad_norm": 18.098421096801758,
"learning_rate": 9.91682838414733e-06,
"loss": 3.1203,
"step": 68
},
{
"epoch": 0.08657465495608532,
"grad_norm": 20.72344398498535,
"learning_rate": 9.913101658830879e-06,
"loss": 3.1849,
"step": 69
},
{
"epoch": 0.08782936010037641,
"grad_norm": 18.358638763427734,
"learning_rate": 9.909293991692049e-06,
"loss": 3.4645,
"step": 70
},
{
"epoch": 0.0890840652446675,
"grad_norm": 19.074031829833984,
"learning_rate": 9.905405445460972e-06,
"loss": 2.9314,
"step": 71
},
{
"epoch": 0.0903387703889586,
"grad_norm": 18.550411224365234,
"learning_rate": 9.90143608420024e-06,
"loss": 2.8456,
"step": 72
},
{
"epoch": 0.09159347553324969,
"grad_norm": 23.823490142822266,
"learning_rate": 9.897385973303845e-06,
"loss": 3.3129,
"step": 73
},
{
"epoch": 0.09284818067754078,
"grad_norm": 17.849550247192383,
"learning_rate": 9.893255179496106e-06,
"loss": 2.7676,
"step": 74
},
{
"epoch": 0.09410288582183186,
"grad_norm": 14.884727478027344,
"learning_rate": 9.889043770830566e-06,
"loss": 2.9774,
"step": 75
},
{
"epoch": 0.09535759096612297,
"grad_norm": 15.499114990234375,
"learning_rate": 9.884751816688873e-06,
"loss": 2.5129,
"step": 76
},
{
"epoch": 0.09661229611041405,
"grad_norm": 17.986732482910156,
"learning_rate": 9.880379387779637e-06,
"loss": 3.5453,
"step": 77
},
{
"epoch": 0.09786700125470514,
"grad_norm": 16.63545036315918,
"learning_rate": 9.875926556137265e-06,
"loss": 2.9293,
"step": 78
},
{
"epoch": 0.09912170639899624,
"grad_norm": 17.408201217651367,
"learning_rate": 9.871393395120774e-06,
"loss": 3.1488,
"step": 79
},
{
"epoch": 0.10037641154328733,
"grad_norm": 17.50285530090332,
"learning_rate": 9.866779979412583e-06,
"loss": 2.7078,
"step": 80
},
{
"epoch": 0.10163111668757842,
"grad_norm": 16.590560913085938,
"learning_rate": 9.862086385017283e-06,
"loss": 2.8491,
"step": 81
},
{
"epoch": 0.1028858218318695,
"grad_norm": 18.618976593017578,
"learning_rate": 9.85731268926038e-06,
"loss": 3.0485,
"step": 82
},
{
"epoch": 0.10414052697616061,
"grad_norm": 17.413230895996094,
"learning_rate": 9.852458970787027e-06,
"loss": 3.0812,
"step": 83
},
{
"epoch": 0.1053952321204517,
"grad_norm": 14.060961723327637,
"learning_rate": 9.847525309560729e-06,
"loss": 2.5551,
"step": 84
},
{
"epoch": 0.10664993726474278,
"grad_norm": 14.511148452758789,
"learning_rate": 9.842511786862018e-06,
"loss": 2.8406,
"step": 85
},
{
"epoch": 0.10790464240903387,
"grad_norm": 18.97178077697754,
"learning_rate": 9.837418485287126e-06,
"loss": 3.2963,
"step": 86
},
{
"epoch": 0.10915934755332497,
"grad_norm": 13.818567276000977,
"learning_rate": 9.832245488746612e-06,
"loss": 2.6757,
"step": 87
},
{
"epoch": 0.11041405269761606,
"grad_norm": 18.294200897216797,
"learning_rate": 9.826992882463982e-06,
"loss": 2.3428,
"step": 88
},
{
"epoch": 0.11166875784190715,
"grad_norm": 17.605432510375977,
"learning_rate": 9.821660752974294e-06,
"loss": 2.8555,
"step": 89
},
{
"epoch": 0.11292346298619825,
"grad_norm": 16.119766235351562,
"learning_rate": 9.816249188122724e-06,
"loss": 2.8055,
"step": 90
},
{
"epoch": 0.11417816813048934,
"grad_norm": 16.537944793701172,
"learning_rate": 9.81075827706312e-06,
"loss": 2.7496,
"step": 91
},
{
"epoch": 0.11543287327478043,
"grad_norm": 18.349796295166016,
"learning_rate": 9.805188110256533e-06,
"loss": 2.5472,
"step": 92
},
{
"epoch": 0.11668757841907151,
"grad_norm": 21.679128646850586,
"learning_rate": 9.799538779469734e-06,
"loss": 2.9006,
"step": 93
},
{
"epoch": 0.11794228356336262,
"grad_norm": 15.701348304748535,
"learning_rate": 9.793810377773688e-06,
"loss": 2.434,
"step": 94
},
{
"epoch": 0.1191969887076537,
"grad_norm": 17.04868507385254,
"learning_rate": 9.78800299954203e-06,
"loss": 2.4092,
"step": 95
},
{
"epoch": 0.12045169385194479,
"grad_norm": 17.143634796142578,
"learning_rate": 9.782116740449515e-06,
"loss": 2.979,
"step": 96
},
{
"epoch": 0.12170639899623588,
"grad_norm": 16.7327880859375,
"learning_rate": 9.776151697470431e-06,
"loss": 2.9258,
"step": 97
},
{
"epoch": 0.12296110414052698,
"grad_norm": 19.429100036621094,
"learning_rate": 9.770107968877004e-06,
"loss": 3.0748,
"step": 98
},
{
"epoch": 0.12421580928481807,
"grad_norm": 15.504218101501465,
"learning_rate": 9.763985654237785e-06,
"loss": 3.0054,
"step": 99
},
{
"epoch": 0.12547051442910917,
"grad_norm": 16.84503936767578,
"learning_rate": 9.757784854416006e-06,
"loss": 3.2136,
"step": 100
},
{
"epoch": 0.12672521957340024,
"grad_norm": 16.334318161010742,
"learning_rate": 9.751505671567914e-06,
"loss": 2.5939,
"step": 101
},
{
"epoch": 0.12797992471769135,
"grad_norm": 15.902310371398926,
"learning_rate": 9.745148209141094e-06,
"loss": 2.4743,
"step": 102
},
{
"epoch": 0.12923462986198245,
"grad_norm": 13.628096580505371,
"learning_rate": 9.738712571872765e-06,
"loss": 2.2579,
"step": 103
},
{
"epoch": 0.13048933500627352,
"grad_norm": 17.617816925048828,
"learning_rate": 9.732198865788047e-06,
"loss": 2.4754,
"step": 104
},
{
"epoch": 0.13174404015056462,
"grad_norm": 18.667858123779297,
"learning_rate": 9.725607198198227e-06,
"loss": 2.6638,
"step": 105
},
{
"epoch": 0.1329987452948557,
"grad_norm": 15.029777526855469,
"learning_rate": 9.718937677698976e-06,
"loss": 2.8075,
"step": 106
},
{
"epoch": 0.1342534504391468,
"grad_norm": 18.5529727935791,
"learning_rate": 9.712190414168573e-06,
"loss": 2.627,
"step": 107
},
{
"epoch": 0.1355081555834379,
"grad_norm": 17.021556854248047,
"learning_rate": 9.705365518766085e-06,
"loss": 2.2912,
"step": 108
},
{
"epoch": 0.13676286072772897,
"grad_norm": 17.83435821533203,
"learning_rate": 9.698463103929542e-06,
"loss": 2.3247,
"step": 109
},
{
"epoch": 0.13801756587202008,
"grad_norm": 17.74312400817871,
"learning_rate": 9.691483283374085e-06,
"loss": 2.5844,
"step": 110
},
{
"epoch": 0.13927227101631118,
"grad_norm": 22.43841552734375,
"learning_rate": 9.684426172090084e-06,
"loss": 3.1616,
"step": 111
},
{
"epoch": 0.14052697616060225,
"grad_norm": 16.035985946655273,
"learning_rate": 9.677291886341256e-06,
"loss": 2.5391,
"step": 112
},
{
"epoch": 0.14178168130489335,
"grad_norm": 20.342103958129883,
"learning_rate": 9.670080543662742e-06,
"loss": 2.5258,
"step": 113
},
{
"epoch": 0.14303638644918445,
"grad_norm": 20.725093841552734,
"learning_rate": 9.662792262859167e-06,
"loss": 2.5076,
"step": 114
},
{
"epoch": 0.14429109159347553,
"grad_norm": 15.233530044555664,
"learning_rate": 9.655427164002692e-06,
"loss": 2.3355,
"step": 115
},
{
"epoch": 0.14554579673776663,
"grad_norm": 15.496427536010742,
"learning_rate": 9.647985368431031e-06,
"loss": 2.5312,
"step": 116
},
{
"epoch": 0.1468005018820577,
"grad_norm": 24.412311553955078,
"learning_rate": 9.640466998745456e-06,
"loss": 2.7875,
"step": 117
},
{
"epoch": 0.1480552070263488,
"grad_norm": 15.683626174926758,
"learning_rate": 9.632872178808766e-06,
"loss": 2.2883,
"step": 118
},
{
"epoch": 0.1493099121706399,
"grad_norm": 17.257770538330078,
"learning_rate": 9.625201033743262e-06,
"loss": 2.8936,
"step": 119
},
{
"epoch": 0.15056461731493098,
"grad_norm": 19.208641052246094,
"learning_rate": 9.617453689928668e-06,
"loss": 2.7428,
"step": 120
},
{
"epoch": 0.15181932245922208,
"grad_norm": 17.00638771057129,
"learning_rate": 9.609630275000072e-06,
"loss": 2.5065,
"step": 121
},
{
"epoch": 0.15307402760351319,
"grad_norm": 17.896059036254883,
"learning_rate": 9.601730917845798e-06,
"loss": 2.4492,
"step": 122
},
{
"epoch": 0.15432873274780426,
"grad_norm": 17.655044555664062,
"learning_rate": 9.5937557486053e-06,
"loss": 2.3202,
"step": 123
},
{
"epoch": 0.15558343789209536,
"grad_norm": 19.35125732421875,
"learning_rate": 9.585704898667015e-06,
"loss": 2.5956,
"step": 124
},
{
"epoch": 0.15683814303638646,
"grad_norm": 17.047664642333984,
"learning_rate": 9.577578500666187e-06,
"loss": 2.547,
"step": 125
},
{
"epoch": 0.15809284818067754,
"grad_norm": 17.756309509277344,
"learning_rate": 9.5693766884827e-06,
"loss": 2.6131,
"step": 126
},
{
"epoch": 0.15934755332496864,
"grad_norm": 18.9345760345459,
"learning_rate": 9.561099597238862e-06,
"loss": 2.4613,
"step": 127
},
{
"epoch": 0.1606022584692597,
"grad_norm": 16.88786506652832,
"learning_rate": 9.552747363297172e-06,
"loss": 2.363,
"step": 128
},
{
"epoch": 0.1618569636135508,
"grad_norm": 17.6533203125,
"learning_rate": 9.544320124258093e-06,
"loss": 2.453,
"step": 129
},
{
"epoch": 0.16311166875784192,
"grad_norm": 19.48556137084961,
"learning_rate": 9.535818018957768e-06,
"loss": 2.2917,
"step": 130
},
{
"epoch": 0.164366373902133,
"grad_norm": 17.511598587036133,
"learning_rate": 9.527241187465735e-06,
"loss": 2.2477,
"step": 131
},
{
"epoch": 0.1656210790464241,
"grad_norm": 15.644845008850098,
"learning_rate": 9.518589771082627e-06,
"loss": 2.6145,
"step": 132
},
{
"epoch": 0.1668757841907152,
"grad_norm": 13.586119651794434,
"learning_rate": 9.509863912337843e-06,
"loss": 2.3622,
"step": 133
},
{
"epoch": 0.16813048933500627,
"grad_norm": 18.941696166992188,
"learning_rate": 9.501063754987188e-06,
"loss": 2.4396,
"step": 134
},
{
"epoch": 0.16938519447929737,
"grad_norm": 19.57110023498535,
"learning_rate": 9.492189444010522e-06,
"loss": 2.082,
"step": 135
},
{
"epoch": 0.17063989962358847,
"grad_norm": 16.997098922729492,
"learning_rate": 9.483241125609358e-06,
"loss": 2.1185,
"step": 136
},
{
"epoch": 0.17189460476787954,
"grad_norm": 20.234926223754883,
"learning_rate": 9.47421894720446e-06,
"loss": 2.487,
"step": 137
},
{
"epoch": 0.17314930991217065,
"grad_norm": 20.660642623901367,
"learning_rate": 9.465123057433413e-06,
"loss": 2.1378,
"step": 138
},
{
"epoch": 0.17440401505646172,
"grad_norm": 21.305038452148438,
"learning_rate": 9.455953606148172e-06,
"loss": 2.7265,
"step": 139
},
{
"epoch": 0.17565872020075282,
"grad_norm": 20.652212142944336,
"learning_rate": 9.446710744412595e-06,
"loss": 2.3179,
"step": 140
},
{
"epoch": 0.17691342534504392,
"grad_norm": 22.552457809448242,
"learning_rate": 9.437394624499957e-06,
"loss": 2.2027,
"step": 141
},
{
"epoch": 0.178168130489335,
"grad_norm": 18.889108657836914,
"learning_rate": 9.428005399890442e-06,
"loss": 2.3326,
"step": 142
},
{
"epoch": 0.1794228356336261,
"grad_norm": 18.121183395385742,
"learning_rate": 9.418543225268598e-06,
"loss": 2.0905,
"step": 143
},
{
"epoch": 0.1806775407779172,
"grad_norm": 28.54220199584961,
"learning_rate": 9.409008256520814e-06,
"loss": 2.1567,
"step": 144
},
{
"epoch": 0.18193224592220827,
"grad_norm": 28.761722564697266,
"learning_rate": 9.399400650732735e-06,
"loss": 2.3487,
"step": 145
},
{
"epoch": 0.18318695106649938,
"grad_norm": 20.803058624267578,
"learning_rate": 9.38972056618668e-06,
"loss": 2.4545,
"step": 146
},
{
"epoch": 0.18444165621079048,
"grad_norm": 14.15235424041748,
"learning_rate": 9.379968162359034e-06,
"loss": 2.1002,
"step": 147
},
{
"epoch": 0.18569636135508155,
"grad_norm": 18.501392364501953,
"learning_rate": 9.370143599917617e-06,
"loss": 2.1081,
"step": 148
},
{
"epoch": 0.18695106649937265,
"grad_norm": 23.19183921813965,
"learning_rate": 9.36024704071904e-06,
"loss": 2.2682,
"step": 149
},
{
"epoch": 0.18820577164366373,
"grad_norm": 21.424211502075195,
"learning_rate": 9.350278647806037e-06,
"loss": 2.3408,
"step": 150
},
{
"epoch": 0.18946047678795483,
"grad_norm": 22.568864822387695,
"learning_rate": 9.340238585404787e-06,
"loss": 2.357,
"step": 151
},
{
"epoch": 0.19071518193224593,
"grad_norm": 17.558080673217773,
"learning_rate": 9.330127018922195e-06,
"loss": 2.1341,
"step": 152
},
{
"epoch": 0.191969887076537,
"grad_norm": 21.05203628540039,
"learning_rate": 9.319944114943171e-06,
"loss": 2.736,
"step": 153
},
{
"epoch": 0.1932245922208281,
"grad_norm": 28.293092727661133,
"learning_rate": 9.309690041227898e-06,
"loss": 2.4961,
"step": 154
},
{
"epoch": 0.1944792973651192,
"grad_norm": 21.68331527709961,
"learning_rate": 9.299364966709051e-06,
"loss": 2.2222,
"step": 155
},
{
"epoch": 0.19573400250941028,
"grad_norm": 28.366355895996094,
"learning_rate": 9.28896906148902e-06,
"loss": 2.719,
"step": 156
},
{
"epoch": 0.19698870765370138,
"grad_norm": 25.245935440063477,
"learning_rate": 9.278502496837116e-06,
"loss": 2.4558,
"step": 157
},
{
"epoch": 0.19824341279799249,
"grad_norm": 34.29158020019531,
"learning_rate": 9.267965445186733e-06,
"loss": 2.1928,
"step": 158
},
{
"epoch": 0.19949811794228356,
"grad_norm": 23.639026641845703,
"learning_rate": 9.257358080132524e-06,
"loss": 1.8916,
"step": 159
},
{
"epoch": 0.20075282308657466,
"grad_norm": 17.318647384643555,
"learning_rate": 9.24668057642753e-06,
"loss": 2.2254,
"step": 160
},
{
"epoch": 0.20200752823086573,
"grad_norm": 18.8333740234375,
"learning_rate": 9.235933109980302e-06,
"loss": 2.0529,
"step": 161
},
{
"epoch": 0.20326223337515684,
"grad_norm": 20.41586685180664,
"learning_rate": 9.225115857852015e-06,
"loss": 2.0644,
"step": 162
},
{
"epoch": 0.20451693851944794,
"grad_norm": 22.13117218017578,
"learning_rate": 9.214228998253526e-06,
"loss": 2.2199,
"step": 163
},
{
"epoch": 0.205771643663739,
"grad_norm": 22.590608596801758,
"learning_rate": 9.20327271054247e-06,
"loss": 1.9851,
"step": 164
},
{
"epoch": 0.20702634880803011,
"grad_norm": 19.450021743774414,
"learning_rate": 9.192247175220276e-06,
"loss": 2.1396,
"step": 165
},
{
"epoch": 0.20828105395232122,
"grad_norm": 24.714031219482422,
"learning_rate": 9.181152573929215e-06,
"loss": 2.0162,
"step": 166
},
{
"epoch": 0.2095357590966123,
"grad_norm": 25.66572380065918,
"learning_rate": 9.16998908944939e-06,
"loss": 2.1091,
"step": 167
},
{
"epoch": 0.2107904642409034,
"grad_norm": 24.950700759887695,
"learning_rate": 9.15875690569574e-06,
"loss": 2.2533,
"step": 168
},
{
"epoch": 0.2120451693851945,
"grad_norm": 23.020002365112305,
"learning_rate": 9.147456207714998e-06,
"loss": 2.3229,
"step": 169
},
{
"epoch": 0.21329987452948557,
"grad_norm": 22.205028533935547,
"learning_rate": 9.13608718168265e-06,
"loss": 2.3614,
"step": 170
},
{
"epoch": 0.21455457967377667,
"grad_norm": 19.170259475708008,
"learning_rate": 9.124650014899868e-06,
"loss": 2.1497,
"step": 171
},
{
"epoch": 0.21580928481806774,
"grad_norm": 18.129199981689453,
"learning_rate": 9.113144895790416e-06,
"loss": 2.2325,
"step": 172
},
{
"epoch": 0.21706398996235884,
"grad_norm": 18.413124084472656,
"learning_rate": 9.101572013897555e-06,
"loss": 1.8652,
"step": 173
},
{
"epoch": 0.21831869510664995,
"grad_norm": 18.207448959350586,
"learning_rate": 9.089931559880918e-06,
"loss": 1.9094,
"step": 174
},
{
"epoch": 0.21957340025094102,
"grad_norm": 26.02681541442871,
"learning_rate": 9.078223725513366e-06,
"loss": 2.2922,
"step": 175
},
{
"epoch": 0.22082810539523212,
"grad_norm": 30.541122436523438,
"learning_rate": 9.066448703677828e-06,
"loss": 1.8914,
"step": 176
},
{
"epoch": 0.22208281053952322,
"grad_norm": 19.35504722595215,
"learning_rate": 9.05460668836413e-06,
"loss": 2.0448,
"step": 177
},
{
"epoch": 0.2233375156838143,
"grad_norm": 24.406612396240234,
"learning_rate": 9.04269787466579e-06,
"loss": 2.2088,
"step": 178
},
{
"epoch": 0.2245922208281054,
"grad_norm": 28.934782028198242,
"learning_rate": 9.030722458776815e-06,
"loss": 2.0474,
"step": 179
},
{
"epoch": 0.2258469259723965,
"grad_norm": 23.718971252441406,
"learning_rate": 9.018680637988456e-06,
"loss": 2.1075,
"step": 180
},
{
"epoch": 0.22710163111668757,
"grad_norm": 19.34891700744629,
"learning_rate": 9.006572610685969e-06,
"loss": 2.0024,
"step": 181
},
{
"epoch": 0.22835633626097868,
"grad_norm": 17.186641693115234,
"learning_rate": 8.994398576345335e-06,
"loss": 1.8304,
"step": 182
},
{
"epoch": 0.22961104140526975,
"grad_norm": 23.781911849975586,
"learning_rate": 8.982158735529991e-06,
"loss": 1.8478,
"step": 183
},
{
"epoch": 0.23086574654956085,
"grad_norm": 28.87154769897461,
"learning_rate": 8.969853289887507e-06,
"loss": 1.9214,
"step": 184
},
{
"epoch": 0.23212045169385195,
"grad_norm": 24.24917221069336,
"learning_rate": 8.957482442146271e-06,
"loss": 1.8442,
"step": 185
},
{
"epoch": 0.23337515683814303,
"grad_norm": 23.922151565551758,
"learning_rate": 8.945046396112158e-06,
"loss": 1.9284,
"step": 186
},
{
"epoch": 0.23462986198243413,
"grad_norm": 22.065723419189453,
"learning_rate": 8.932545356665157e-06,
"loss": 1.8711,
"step": 187
},
{
"epoch": 0.23588456712672523,
"grad_norm": 28.266712188720703,
"learning_rate": 8.919979529756008e-06,
"loss": 1.8295,
"step": 188
},
{
"epoch": 0.2371392722710163,
"grad_norm": 22.024778366088867,
"learning_rate": 8.907349122402803e-06,
"loss": 1.9236,
"step": 189
},
{
"epoch": 0.2383939774153074,
"grad_norm": 17.683101654052734,
"learning_rate": 8.894654342687574e-06,
"loss": 1.8348,
"step": 190
},
{
"epoch": 0.2396486825595985,
"grad_norm": 26.601009368896484,
"learning_rate": 8.881895399752873e-06,
"loss": 1.7325,
"step": 191
},
{
"epoch": 0.24090338770388958,
"grad_norm": 30.148361206054688,
"learning_rate": 8.869072503798315e-06,
"loss": 2.0121,
"step": 192
},
{
"epoch": 0.24215809284818068,
"grad_norm": 23.811433792114258,
"learning_rate": 8.85618586607713e-06,
"loss": 1.7341,
"step": 193
},
{
"epoch": 0.24341279799247176,
"grad_norm": 17.06600570678711,
"learning_rate": 8.843235698892661e-06,
"loss": 1.7895,
"step": 194
},
{
"epoch": 0.24466750313676286,
"grad_norm": 21.146913528442383,
"learning_rate": 8.83022221559489e-06,
"loss": 1.8371,
"step": 195
},
{
"epoch": 0.24592220828105396,
"grad_norm": 22.374889373779297,
"learning_rate": 8.81714563057691e-06,
"loss": 2.0259,
"step": 196
},
{
"epoch": 0.24717691342534504,
"grad_norm": 23.482807159423828,
"learning_rate": 8.80400615927139e-06,
"loss": 2.126,
"step": 197
},
{
"epoch": 0.24843161856963614,
"grad_norm": 20.430444717407227,
"learning_rate": 8.790804018147039e-06,
"loss": 1.5703,
"step": 198
},
{
"epoch": 0.24968632371392724,
"grad_norm": 29.053224563598633,
"learning_rate": 8.777539424705022e-06,
"loss": 1.9014,
"step": 199
},
{
"epoch": 0.25094102885821834,
"grad_norm": 22.412776947021484,
"learning_rate": 8.764212597475397e-06,
"loss": 1.9072,
"step": 200
},
{
"epoch": 0.2521957340025094,
"grad_norm": 27.57085418701172,
"learning_rate": 8.750823756013498e-06,
"loss": 2.0304,
"step": 201
},
{
"epoch": 0.2534504391468005,
"grad_norm": 21.350475311279297,
"learning_rate": 8.737373120896325e-06,
"loss": 1.797,
"step": 202
},
{
"epoch": 0.2547051442910916,
"grad_norm": 25.71649169921875,
"learning_rate": 8.72386091371891e-06,
"loss": 1.9805,
"step": 203
},
{
"epoch": 0.2559598494353827,
"grad_norm": 24.62053108215332,
"learning_rate": 8.710287357090666e-06,
"loss": 1.6377,
"step": 204
},
{
"epoch": 0.2572145545796738,
"grad_norm": 26.515974044799805,
"learning_rate": 8.696652674631716e-06,
"loss": 2.2071,
"step": 205
},
{
"epoch": 0.2584692597239649,
"grad_norm": 22.19689178466797,
"learning_rate": 8.68295709096922e-06,
"loss": 1.8681,
"step": 206
},
{
"epoch": 0.25972396486825594,
"grad_norm": 22.31092643737793,
"learning_rate": 8.669200831733655e-06,
"loss": 1.643,
"step": 207
},
{
"epoch": 0.26097867001254704,
"grad_norm": 18.85532569885254,
"learning_rate": 8.655384123555117e-06,
"loss": 1.669,
"step": 208
},
{
"epoch": 0.26223337515683814,
"grad_norm": 24.516279220581055,
"learning_rate": 8.64150719405958e-06,
"loss": 1.8626,
"step": 209
},
{
"epoch": 0.26348808030112925,
"grad_norm": 20.873056411743164,
"learning_rate": 8.627570271865143e-06,
"loss": 1.6009,
"step": 210
},
{
"epoch": 0.26474278544542035,
"grad_norm": 26.961584091186523,
"learning_rate": 8.613573586578262e-06,
"loss": 1.8991,
"step": 211
},
{
"epoch": 0.2659974905897114,
"grad_norm": 23.05677032470703,
"learning_rate": 8.599517368789981e-06,
"loss": 1.6264,
"step": 212
},
{
"epoch": 0.2672521957340025,
"grad_norm": 23.3626766204834,
"learning_rate": 8.585401850072114e-06,
"loss": 1.763,
"step": 213
},
{
"epoch": 0.2685069008782936,
"grad_norm": 22.876678466796875,
"learning_rate": 8.571227262973444e-06,
"loss": 1.8171,
"step": 214
},
{
"epoch": 0.2697616060225847,
"grad_norm": 21.870689392089844,
"learning_rate": 8.55699384101589e-06,
"loss": 1.7618,
"step": 215
},
{
"epoch": 0.2710163111668758,
"grad_norm": 23.80776023864746,
"learning_rate": 8.54270181869065e-06,
"loss": 1.7353,
"step": 216
},
{
"epoch": 0.2722710163111669,
"grad_norm": 21.69217872619629,
"learning_rate": 8.528351431454352e-06,
"loss": 1.8667,
"step": 217
},
{
"epoch": 0.27352572145545795,
"grad_norm": 22.88399887084961,
"learning_rate": 8.513942915725159e-06,
"loss": 1.7512,
"step": 218
},
{
"epoch": 0.27478042659974905,
"grad_norm": 22.40818977355957,
"learning_rate": 8.499476508878894e-06,
"loss": 1.7168,
"step": 219
},
{
"epoch": 0.27603513174404015,
"grad_norm": 25.04762840270996,
"learning_rate": 8.484952449245107e-06,
"loss": 1.6717,
"step": 220
},
{
"epoch": 0.27728983688833125,
"grad_norm": 22.810468673706055,
"learning_rate": 8.470370976103171e-06,
"loss": 1.8007,
"step": 221
},
{
"epoch": 0.27854454203262236,
"grad_norm": 24.604190826416016,
"learning_rate": 8.455732329678317e-06,
"loss": 1.9564,
"step": 222
},
{
"epoch": 0.2797992471769134,
"grad_norm": 27.309738159179688,
"learning_rate": 8.441036751137697e-06,
"loss": 1.6334,
"step": 223
},
{
"epoch": 0.2810539523212045,
"grad_norm": 29.318500518798828,
"learning_rate": 8.426284482586397e-06,
"loss": 1.6922,
"step": 224
},
{
"epoch": 0.2823086574654956,
"grad_norm": 28.5482177734375,
"learning_rate": 8.411475767063454e-06,
"loss": 1.8862,
"step": 225
},
{
"epoch": 0.2835633626097867,
"grad_norm": 25.247356414794922,
"learning_rate": 8.396610848537858e-06,
"loss": 1.7688,
"step": 226
},
{
"epoch": 0.2848180677540778,
"grad_norm": 24.79906463623047,
"learning_rate": 8.381689971904514e-06,
"loss": 1.7844,
"step": 227
},
{
"epoch": 0.2860727728983689,
"grad_norm": 28.987627029418945,
"learning_rate": 8.36671338298023e-06,
"loss": 1.7785,
"step": 228
},
{
"epoch": 0.28732747804265996,
"grad_norm": 25.145153045654297,
"learning_rate": 8.35168132849965e-06,
"loss": 1.7741,
"step": 229
},
{
"epoch": 0.28858218318695106,
"grad_norm": 22.089122772216797,
"learning_rate": 8.336594056111197e-06,
"loss": 1.5078,
"step": 230
},
{
"epoch": 0.28983688833124216,
"grad_norm": 27.65213966369629,
"learning_rate": 8.321451814372998e-06,
"loss": 1.7603,
"step": 231
},
{
"epoch": 0.29109159347553326,
"grad_norm": 33.60897445678711,
"learning_rate": 8.306254852748773e-06,
"loss": 1.7254,
"step": 232
},
{
"epoch": 0.29234629861982436,
"grad_norm": 25.02092933654785,
"learning_rate": 8.29100342160374e-06,
"loss": 1.795,
"step": 233
},
{
"epoch": 0.2936010037641154,
"grad_norm": 21.960206985473633,
"learning_rate": 8.275697772200491e-06,
"loss": 1.7087,
"step": 234
},
{
"epoch": 0.2948557089084065,
"grad_norm": 29.953306198120117,
"learning_rate": 8.260338156694836e-06,
"loss": 1.4295,
"step": 235
},
{
"epoch": 0.2961104140526976,
"grad_norm": 26.209787368774414,
"learning_rate": 8.244924828131668e-06,
"loss": 1.4427,
"step": 236
},
{
"epoch": 0.2973651191969887,
"grad_norm": 23.775861740112305,
"learning_rate": 8.229458040440783e-06,
"loss": 1.7755,
"step": 237
},
{
"epoch": 0.2986198243412798,
"grad_norm": 22.297338485717773,
"learning_rate": 8.213938048432697e-06,
"loss": 1.5213,
"step": 238
},
{
"epoch": 0.2998745294855709,
"grad_norm": 24.113645553588867,
"learning_rate": 8.198365107794457e-06,
"loss": 1.5942,
"step": 239
},
{
"epoch": 0.30112923462986196,
"grad_norm": 24.177122116088867,
"learning_rate": 8.182739475085417e-06,
"loss": 1.8395,
"step": 240
},
{
"epoch": 0.30238393977415307,
"grad_norm": 28.40700912475586,
"learning_rate": 8.167061407733018e-06,
"loss": 1.6086,
"step": 241
},
{
"epoch": 0.30363864491844417,
"grad_norm": 24.49298667907715,
"learning_rate": 8.151331164028544e-06,
"loss": 1.5645,
"step": 242
},
{
"epoch": 0.30489335006273527,
"grad_norm": 33.37433624267578,
"learning_rate": 8.135549003122871e-06,
"loss": 1.698,
"step": 243
},
{
"epoch": 0.30614805520702637,
"grad_norm": 24.059009552001953,
"learning_rate": 8.119715185022195e-06,
"loss": 1.5047,
"step": 244
},
{
"epoch": 0.3074027603513174,
"grad_norm": 29.42665672302246,
"learning_rate": 8.103829970583742e-06,
"loss": 1.68,
"step": 245
},
{
"epoch": 0.3086574654956085,
"grad_norm": 29.08376121520996,
"learning_rate": 8.087893621511487e-06,
"loss": 1.5872,
"step": 246
},
{
"epoch": 0.3099121706398996,
"grad_norm": 28.20993995666504,
"learning_rate": 8.071906400351823e-06,
"loss": 1.6515,
"step": 247
},
{
"epoch": 0.3111668757841907,
"grad_norm": 19.08958625793457,
"learning_rate": 8.055868570489247e-06,
"loss": 1.4665,
"step": 248
},
{
"epoch": 0.3124215809284818,
"grad_norm": 20.03516960144043,
"learning_rate": 8.039780396142023e-06,
"loss": 1.6523,
"step": 249
},
{
"epoch": 0.3136762860727729,
"grad_norm": 25.80693244934082,
"learning_rate": 8.023642142357821e-06,
"loss": 1.7412,
"step": 250
},
{
"epoch": 0.31493099121706397,
"grad_norm": 24.467342376708984,
"learning_rate": 8.007454075009352e-06,
"loss": 1.5459,
"step": 251
},
{
"epoch": 0.3161856963613551,
"grad_norm": 34.97882843017578,
"learning_rate": 7.991216460789997e-06,
"loss": 1.7311,
"step": 252
},
{
"epoch": 0.3174404015056462,
"grad_norm": 29.624479293823242,
"learning_rate": 7.974929567209399e-06,
"loss": 1.7838,
"step": 253
},
{
"epoch": 0.3186951066499373,
"grad_norm": 28.10247039794922,
"learning_rate": 7.95859366258907e-06,
"loss": 1.7842,
"step": 254
},
{
"epoch": 0.3199498117942284,
"grad_norm": 25.512306213378906,
"learning_rate": 7.942209016057954e-06,
"loss": 1.6854,
"step": 255
},
{
"epoch": 0.3212045169385194,
"grad_norm": 27.726490020751953,
"learning_rate": 7.925775897548013e-06,
"loss": 1.7176,
"step": 256
},
{
"epoch": 0.3224592220828105,
"grad_norm": 29.725744247436523,
"learning_rate": 7.909294577789765e-06,
"loss": 1.6355,
"step": 257
},
{
"epoch": 0.3237139272271016,
"grad_norm": 21.763940811157227,
"learning_rate": 7.892765328307828e-06,
"loss": 1.614,
"step": 258
},
{
"epoch": 0.32496863237139273,
"grad_norm": 29.157032012939453,
"learning_rate": 7.87618842141645e-06,
"loss": 1.5684,
"step": 259
},
{
"epoch": 0.32622333751568383,
"grad_norm": 29.150402069091797,
"learning_rate": 7.859564130215015e-06,
"loss": 1.5138,
"step": 260
},
{
"epoch": 0.32747804265997493,
"grad_norm": 38.0162239074707,
"learning_rate": 7.842892728583557e-06,
"loss": 1.4729,
"step": 261
},
{
"epoch": 0.328732747804266,
"grad_norm": 28.247106552124023,
"learning_rate": 7.826174491178231e-06,
"loss": 1.6418,
"step": 262
},
{
"epoch": 0.3299874529485571,
"grad_norm": 28.189817428588867,
"learning_rate": 7.809409693426803e-06,
"loss": 1.5794,
"step": 263
},
{
"epoch": 0.3312421580928482,
"grad_norm": 34.21451950073242,
"learning_rate": 7.792598611524103e-06,
"loss": 1.5883,
"step": 264
},
{
"epoch": 0.3324968632371393,
"grad_norm": 27.97997283935547,
"learning_rate": 7.775741522427477e-06,
"loss": 1.4462,
"step": 265
},
{
"epoch": 0.3337515683814304,
"grad_norm": 27.05823516845703,
"learning_rate": 7.75883870385223e-06,
"loss": 1.5044,
"step": 266
},
{
"epoch": 0.33500627352572143,
"grad_norm": 29.075641632080078,
"learning_rate": 7.741890434267043e-06,
"loss": 1.5352,
"step": 267
},
{
"epoch": 0.33626097867001253,
"grad_norm": 36.941951751708984,
"learning_rate": 7.724896992889385e-06,
"loss": 1.5779,
"step": 268
},
{
"epoch": 0.33751568381430364,
"grad_norm": 28.30890655517578,
"learning_rate": 7.707858659680924e-06,
"loss": 1.8306,
"step": 269
},
{
"epoch": 0.33877038895859474,
"grad_norm": 28.968425750732422,
"learning_rate": 7.690775715342898e-06,
"loss": 1.5735,
"step": 270
},
{
"epoch": 0.34002509410288584,
"grad_norm": 23.6066951751709,
"learning_rate": 7.67364844131151e-06,
"loss": 1.6057,
"step": 271
},
{
"epoch": 0.34127979924717694,
"grad_norm": 31.214929580688477,
"learning_rate": 7.656477119753268e-06,
"loss": 1.8741,
"step": 272
},
{
"epoch": 0.342534504391468,
"grad_norm": 37.89013671875,
"learning_rate": 7.63926203356036e-06,
"loss": 1.7272,
"step": 273
},
{
"epoch": 0.3437892095357591,
"grad_norm": 26.85829734802246,
"learning_rate": 7.622003466345977e-06,
"loss": 1.6312,
"step": 274
},
{
"epoch": 0.3450439146800502,
"grad_norm": 25.076658248901367,
"learning_rate": 7.604701702439652e-06,
"loss": 1.5652,
"step": 275
},
{
"epoch": 0.3462986198243413,
"grad_norm": 33.68350601196289,
"learning_rate": 7.587357026882563e-06,
"loss": 1.5935,
"step": 276
},
{
"epoch": 0.3475533249686324,
"grad_norm": 26.654830932617188,
"learning_rate": 7.5699697254228496e-06,
"loss": 1.4547,
"step": 277
},
{
"epoch": 0.34880803011292344,
"grad_norm": 25.102251052856445,
"learning_rate": 7.552540084510896e-06,
"loss": 1.6585,
"step": 278
},
{
"epoch": 0.35006273525721454,
"grad_norm": 30.08404541015625,
"learning_rate": 7.535068391294618e-06,
"loss": 1.7801,
"step": 279
},
{
"epoch": 0.35131744040150564,
"grad_norm": 23.15135955810547,
"learning_rate": 7.517554933614729e-06,
"loss": 1.4114,
"step": 280
},
{
"epoch": 0.35257214554579674,
"grad_norm": 26.793306350708008,
"learning_rate": 7.500000000000001e-06,
"loss": 1.5748,
"step": 281
},
{
"epoch": 0.35382685069008785,
"grad_norm": 26.644601821899414,
"learning_rate": 7.482403879662505e-06,
"loss": 1.7082,
"step": 282
},
{
"epoch": 0.35508155583437895,
"grad_norm": 29.40913200378418,
"learning_rate": 7.464766862492856e-06,
"loss": 1.5906,
"step": 283
},
{
"epoch": 0.35633626097867,
"grad_norm": 28.093795776367188,
"learning_rate": 7.447089239055428e-06,
"loss": 1.6122,
"step": 284
},
{
"epoch": 0.3575909661229611,
"grad_norm": 23.78188133239746,
"learning_rate": 7.42937130058357e-06,
"loss": 1.4623,
"step": 285
},
{
"epoch": 0.3588456712672522,
"grad_norm": 35.69364929199219,
"learning_rate": 7.4116133389748115e-06,
"loss": 1.6225,
"step": 286
},
{
"epoch": 0.3601003764115433,
"grad_norm": 30.77789306640625,
"learning_rate": 7.393815646786047e-06,
"loss": 1.5917,
"step": 287
},
{
"epoch": 0.3613550815558344,
"grad_norm": 41.9234619140625,
"learning_rate": 7.3759785172287235e-06,
"loss": 1.4922,
"step": 288
},
{
"epoch": 0.36260978670012545,
"grad_norm": 26.941680908203125,
"learning_rate": 7.358102244164003e-06,
"loss": 1.8153,
"step": 289
},
{
"epoch": 0.36386449184441655,
"grad_norm": 27.374059677124023,
"learning_rate": 7.340187122097931e-06,
"loss": 1.64,
"step": 290
},
{
"epoch": 0.36511919698870765,
"grad_norm": 23.783817291259766,
"learning_rate": 7.322233446176571e-06,
"loss": 1.5758,
"step": 291
},
{
"epoch": 0.36637390213299875,
"grad_norm": 23.492393493652344,
"learning_rate": 7.304241512181152e-06,
"loss": 1.479,
"step": 292
},
{
"epoch": 0.36762860727728985,
"grad_norm": 27.81630516052246,
"learning_rate": 7.286211616523193e-06,
"loss": 1.5494,
"step": 293
},
{
"epoch": 0.36888331242158096,
"grad_norm": 35.152557373046875,
"learning_rate": 7.268144056239621e-06,
"loss": 1.8003,
"step": 294
},
{
"epoch": 0.370138017565872,
"grad_norm": 24.756799697875977,
"learning_rate": 7.250039128987874e-06,
"loss": 1.6751,
"step": 295
},
{
"epoch": 0.3713927227101631,
"grad_norm": 30.238140106201172,
"learning_rate": 7.231897133040997e-06,
"loss": 1.4538,
"step": 296
},
{
"epoch": 0.3726474278544542,
"grad_norm": 25.516706466674805,
"learning_rate": 7.213718367282737e-06,
"loss": 1.41,
"step": 297
},
{
"epoch": 0.3739021329987453,
"grad_norm": 45.06476593017578,
"learning_rate": 7.195503131202607e-06,
"loss": 1.5351,
"step": 298
},
{
"epoch": 0.3751568381430364,
"grad_norm": 30.282215118408203,
"learning_rate": 7.177251724890957e-06,
"loss": 1.6859,
"step": 299
},
{
"epoch": 0.37641154328732745,
"grad_norm": 26.890932083129883,
"learning_rate": 7.1589644490340334e-06,
"loss": 1.5883,
"step": 300
},
{
"epoch": 0.37766624843161856,
"grad_norm": 29.712207794189453,
"learning_rate": 7.14064160490902e-06,
"loss": 1.7468,
"step": 301
},
{
"epoch": 0.37892095357590966,
"grad_norm": 23.99646759033203,
"learning_rate": 7.122283494379076e-06,
"loss": 1.3783,
"step": 302
},
{
"epoch": 0.38017565872020076,
"grad_norm": 28.590595245361328,
"learning_rate": 7.103890419888367e-06,
"loss": 1.694,
"step": 303
},
{
"epoch": 0.38143036386449186,
"grad_norm": 22.65292739868164,
"learning_rate": 7.085462684457076e-06,
"loss": 1.5418,
"step": 304
},
{
"epoch": 0.38268506900878296,
"grad_norm": 27.158199310302734,
"learning_rate": 7.067000591676416e-06,
"loss": 1.6183,
"step": 305
},
{
"epoch": 0.383939774153074,
"grad_norm": 29.83051872253418,
"learning_rate": 7.048504445703623e-06,
"loss": 1.5936,
"step": 306
},
{
"epoch": 0.3851944792973651,
"grad_norm": 24.005414962768555,
"learning_rate": 7.029974551256957e-06,
"loss": 1.3992,
"step": 307
},
{
"epoch": 0.3864491844416562,
"grad_norm": 34.38796615600586,
"learning_rate": 7.011411213610663e-06,
"loss": 1.6884,
"step": 308
},
{
"epoch": 0.3877038895859473,
"grad_norm": 25.36124038696289,
"learning_rate": 6.992814738589958e-06,
"loss": 1.6561,
"step": 309
},
{
"epoch": 0.3889585947302384,
"grad_norm": 21.46540641784668,
"learning_rate": 6.97418543256599e-06,
"loss": 1.3287,
"step": 310
},
{
"epoch": 0.39021329987452946,
"grad_norm": 35.439361572265625,
"learning_rate": 6.95552360245078e-06,
"loss": 1.6699,
"step": 311
},
{
"epoch": 0.39146800501882056,
"grad_norm": 32.73426055908203,
"learning_rate": 6.936829555692182e-06,
"loss": 1.3947,
"step": 312
},
{
"epoch": 0.39272271016311167,
"grad_norm": 28.283676147460938,
"learning_rate": 6.9181036002687985e-06,
"loss": 1.4841,
"step": 313
},
{
"epoch": 0.39397741530740277,
"grad_norm": 20.66922378540039,
"learning_rate": 6.899346044684928e-06,
"loss": 1.3804,
"step": 314
},
{
"epoch": 0.39523212045169387,
"grad_norm": 31.596906661987305,
"learning_rate": 6.880557197965465e-06,
"loss": 1.467,
"step": 315
},
{
"epoch": 0.39648682559598497,
"grad_norm": 22.125431060791016,
"learning_rate": 6.861737369650818e-06,
"loss": 1.4638,
"step": 316
},
{
"epoch": 0.397741530740276,
"grad_norm": 26.49312400817871,
"learning_rate": 6.84288686979181e-06,
"loss": 1.2585,
"step": 317
},
{
"epoch": 0.3989962358845671,
"grad_norm": 31.771793365478516,
"learning_rate": 6.824006008944561e-06,
"loss": 1.5593,
"step": 318
},
{
"epoch": 0.4002509410288582,
"grad_norm": 33.718238830566406,
"learning_rate": 6.805095098165388e-06,
"loss": 1.5027,
"step": 319
},
{
"epoch": 0.4015056461731493,
"grad_norm": 27.339921951293945,
"learning_rate": 6.786154449005664e-06,
"loss": 1.438,
"step": 320
},
{
"epoch": 0.4027603513174404,
"grad_norm": 24.385299682617188,
"learning_rate": 6.767184373506698e-06,
"loss": 1.5481,
"step": 321
},
{
"epoch": 0.40401505646173147,
"grad_norm": 38.833770751953125,
"learning_rate": 6.7481851841945835e-06,
"loss": 1.6319,
"step": 322
},
{
"epoch": 0.40526976160602257,
"grad_norm": 27.79740333557129,
"learning_rate": 6.7291571940750575e-06,
"loss": 1.5855,
"step": 323
},
{
"epoch": 0.4065244667503137,
"grad_norm": 30.081342697143555,
"learning_rate": 6.710100716628345e-06,
"loss": 1.3305,
"step": 324
},
{
"epoch": 0.4077791718946048,
"grad_norm": 28.723339080810547,
"learning_rate": 6.6910160658039835e-06,
"loss": 1.5928,
"step": 325
},
{
"epoch": 0.4090338770388959,
"grad_norm": 36.5059814453125,
"learning_rate": 6.671903556015664e-06,
"loss": 1.7107,
"step": 326
},
{
"epoch": 0.410288582183187,
"grad_norm": 22.986221313476562,
"learning_rate": 6.652763502136044e-06,
"loss": 1.4106,
"step": 327
},
{
"epoch": 0.411543287327478,
"grad_norm": 31.11964988708496,
"learning_rate": 6.633596219491559e-06,
"loss": 1.6816,
"step": 328
},
{
"epoch": 0.4127979924717691,
"grad_norm": 25.74013900756836,
"learning_rate": 6.614402023857231e-06,
"loss": 1.5055,
"step": 329
},
{
"epoch": 0.41405269761606023,
"grad_norm": 30.515594482421875,
"learning_rate": 6.595181231451469e-06,
"loss": 1.5854,
"step": 330
},
{
"epoch": 0.41530740276035133,
"grad_norm": 37.943180084228516,
"learning_rate": 6.57593415893085e-06,
"loss": 1.4225,
"step": 331
},
{
"epoch": 0.41656210790464243,
"grad_norm": 30.183914184570312,
"learning_rate": 6.556661123384909e-06,
"loss": 1.5019,
"step": 332
},
{
"epoch": 0.4178168130489335,
"grad_norm": 35.5178337097168,
"learning_rate": 6.5373624423309165e-06,
"loss": 1.4571,
"step": 333
},
{
"epoch": 0.4190715181932246,
"grad_norm": 30.98124885559082,
"learning_rate": 6.518038433708643e-06,
"loss": 1.381,
"step": 334
},
{
"epoch": 0.4203262233375157,
"grad_norm": 31.475486755371094,
"learning_rate": 6.498689415875121e-06,
"loss": 1.607,
"step": 335
},
{
"epoch": 0.4215809284818068,
"grad_norm": 29.79499053955078,
"learning_rate": 6.479315707599407e-06,
"loss": 1.3446,
"step": 336
},
{
"epoch": 0.4228356336260979,
"grad_norm": 23.057994842529297,
"learning_rate": 6.459917628057319e-06,
"loss": 1.4102,
"step": 337
},
{
"epoch": 0.424090338770389,
"grad_norm": 32.09408187866211,
"learning_rate": 6.440495496826189e-06,
"loss": 1.6248,
"step": 338
},
{
"epoch": 0.42534504391468003,
"grad_norm": 30.396852493286133,
"learning_rate": 6.421049633879588e-06,
"loss": 1.5172,
"step": 339
},
{
"epoch": 0.42659974905897113,
"grad_norm": 37.36663818359375,
"learning_rate": 6.4015803595820635e-06,
"loss": 1.6684,
"step": 340
},
{
"epoch": 0.42785445420326224,
"grad_norm": 36.27682876586914,
"learning_rate": 6.3820879946838585e-06,
"loss": 1.43,
"step": 341
},
{
"epoch": 0.42910915934755334,
"grad_norm": 38.0621223449707,
"learning_rate": 6.3625728603156215e-06,
"loss": 1.5009,
"step": 342
},
{
"epoch": 0.43036386449184444,
"grad_norm": 30.142953872680664,
"learning_rate": 6.3430352779831275e-06,
"loss": 1.3865,
"step": 343
},
{
"epoch": 0.4316185696361355,
"grad_norm": 31.03050994873047,
"learning_rate": 6.323475569561968e-06,
"loss": 1.5305,
"step": 344
},
{
"epoch": 0.4328732747804266,
"grad_norm": 31.472867965698242,
"learning_rate": 6.303894057292261e-06,
"loss": 1.5711,
"step": 345
},
{
"epoch": 0.4341279799247177,
"grad_norm": 34.335853576660156,
"learning_rate": 6.284291063773331e-06,
"loss": 1.5281,
"step": 346
},
{
"epoch": 0.4353826850690088,
"grad_norm": 36.837493896484375,
"learning_rate": 6.264666911958404e-06,
"loss": 1.5468,
"step": 347
},
{
"epoch": 0.4366373902132999,
"grad_norm": 33.03227996826172,
"learning_rate": 6.2450219251492795e-06,
"loss": 1.483,
"step": 348
},
{
"epoch": 0.437892095357591,
"grad_norm": 28.33861541748047,
"learning_rate": 6.225356426991007e-06,
"loss": 1.2866,
"step": 349
},
{
"epoch": 0.43914680050188204,
"grad_norm": 27.562910079956055,
"learning_rate": 6.205670741466555e-06,
"loss": 1.4045,
"step": 350
},
{
"epoch": 0.44040150564617314,
"grad_norm": 31.761911392211914,
"learning_rate": 6.185965192891472e-06,
"loss": 1.337,
"step": 351
},
{
"epoch": 0.44165621079046424,
"grad_norm": 35.49506378173828,
"learning_rate": 6.166240105908547e-06,
"loss": 1.6938,
"step": 352
},
{
"epoch": 0.44291091593475534,
"grad_norm": 53.732215881347656,
"learning_rate": 6.146495805482451e-06,
"loss": 1.5635,
"step": 353
},
{
"epoch": 0.44416562107904645,
"grad_norm": 29.330778121948242,
"learning_rate": 6.126732616894397e-06,
"loss": 1.5873,
"step": 354
},
{
"epoch": 0.4454203262233375,
"grad_norm": 30.75185203552246,
"learning_rate": 6.106950865736777e-06,
"loss": 1.4611,
"step": 355
},
{
"epoch": 0.4466750313676286,
"grad_norm": 34.61481857299805,
"learning_rate": 6.087150877907786e-06,
"loss": 1.5506,
"step": 356
},
{
"epoch": 0.4479297365119197,
"grad_norm": 36.45780563354492,
"learning_rate": 6.067332979606069e-06,
"loss": 1.5333,
"step": 357
},
{
"epoch": 0.4491844416562108,
"grad_norm": 43.751426696777344,
"learning_rate": 6.047497497325341e-06,
"loss": 1.5729,
"step": 358
},
{
"epoch": 0.4504391468005019,
"grad_norm": 30.756084442138672,
"learning_rate": 6.027644757849004e-06,
"loss": 1.4557,
"step": 359
},
{
"epoch": 0.451693851944793,
"grad_norm": 30.46338653564453,
"learning_rate": 6.007775088244769e-06,
"loss": 1.3311,
"step": 360
},
{
"epoch": 0.45294855708908405,
"grad_norm": 29.494077682495117,
"learning_rate": 5.987888815859266e-06,
"loss": 1.3893,
"step": 361
},
{
"epoch": 0.45420326223337515,
"grad_norm": 30.151817321777344,
"learning_rate": 5.967986268312651e-06,
"loss": 1.346,
"step": 362
},
{
"epoch": 0.45545796737766625,
"grad_norm": 35.56706237792969,
"learning_rate": 5.948067773493205e-06,
"loss": 1.5986,
"step": 363
},
{
"epoch": 0.45671267252195735,
"grad_norm": 26.097820281982422,
"learning_rate": 5.928133659551939e-06,
"loss": 1.3859,
"step": 364
},
{
"epoch": 0.45796737766624845,
"grad_norm": 28.94278335571289,
"learning_rate": 5.908184254897183e-06,
"loss": 1.5139,
"step": 365
},
{
"epoch": 0.4592220828105395,
"grad_norm": 36.553123474121094,
"learning_rate": 5.888219888189176e-06,
"loss": 1.4892,
"step": 366
},
{
"epoch": 0.4604767879548306,
"grad_norm": 106.10436248779297,
"learning_rate": 5.8682408883346535e-06,
"loss": 1.4375,
"step": 367
},
{
"epoch": 0.4617314930991217,
"grad_norm": 42.712303161621094,
"learning_rate": 5.848247584481424e-06,
"loss": 1.431,
"step": 368
},
{
"epoch": 0.4629861982434128,
"grad_norm": 37.82698059082031,
"learning_rate": 5.828240306012957e-06,
"loss": 1.5441,
"step": 369
},
{
"epoch": 0.4642409033877039,
"grad_norm": 35.159000396728516,
"learning_rate": 5.808219382542941e-06,
"loss": 1.4638,
"step": 370
},
{
"epoch": 0.465495608531995,
"grad_norm": 28.512142181396484,
"learning_rate": 5.788185143909868e-06,
"loss": 1.4615,
"step": 371
},
{
"epoch": 0.46675031367628605,
"grad_norm": 32.28644943237305,
"learning_rate": 5.768137920171593e-06,
"loss": 1.4778,
"step": 372
},
{
"epoch": 0.46800501882057716,
"grad_norm": 30.508554458618164,
"learning_rate": 5.74807804159989e-06,
"loss": 1.656,
"step": 373
},
{
"epoch": 0.46925972396486826,
"grad_norm": 31.334104537963867,
"learning_rate": 5.728005838675026e-06,
"loss": 1.3335,
"step": 374
},
{
"epoch": 0.47051442910915936,
"grad_norm": 30.219167709350586,
"learning_rate": 5.7079216420803e-06,
"loss": 1.468,
"step": 375
},
{
"epoch": 0.47176913425345046,
"grad_norm": 40.787261962890625,
"learning_rate": 5.68782578269661e-06,
"loss": 1.5705,
"step": 376
},
{
"epoch": 0.4730238393977415,
"grad_norm": 36.666656494140625,
"learning_rate": 5.66771859159699e-06,
"loss": 1.5139,
"step": 377
},
{
"epoch": 0.4742785445420326,
"grad_norm": 33.556617736816406,
"learning_rate": 5.647600400041163e-06,
"loss": 1.3386,
"step": 378
},
{
"epoch": 0.4755332496863237,
"grad_norm": 28.310293197631836,
"learning_rate": 5.6274715394700805e-06,
"loss": 1.4892,
"step": 379
},
{
"epoch": 0.4767879548306148,
"grad_norm": 30.385696411132812,
"learning_rate": 5.6073323415004635e-06,
"loss": 1.4074,
"step": 380
},
{
"epoch": 0.4780426599749059,
"grad_norm": 30.94135856628418,
"learning_rate": 5.587183137919332e-06,
"loss": 1.3804,
"step": 381
},
{
"epoch": 0.479297365119197,
"grad_norm": 25.842451095581055,
"learning_rate": 5.567024260678559e-06,
"loss": 1.3756,
"step": 382
},
{
"epoch": 0.48055207026348806,
"grad_norm": 24.24115753173828,
"learning_rate": 5.546856041889374e-06,
"loss": 1.3217,
"step": 383
},
{
"epoch": 0.48180677540777916,
"grad_norm": 29.69972801208496,
"learning_rate": 5.526678813816912e-06,
"loss": 1.3114,
"step": 384
},
{
"epoch": 0.48306148055207027,
"grad_norm": 40.6950569152832,
"learning_rate": 5.5064929088747324e-06,
"loss": 1.6083,
"step": 385
},
{
"epoch": 0.48431618569636137,
"grad_norm": 37.67729949951172,
"learning_rate": 5.486298659619346e-06,
"loss": 1.5827,
"step": 386
},
{
"epoch": 0.48557089084065247,
"grad_norm": 38.3140754699707,
"learning_rate": 5.46609639874473e-06,
"loss": 1.3942,
"step": 387
},
{
"epoch": 0.4868255959849435,
"grad_norm": 33.37904739379883,
"learning_rate": 5.445886459076848e-06,
"loss": 1.5518,
"step": 388
},
{
"epoch": 0.4880803011292346,
"grad_norm": 30.683101654052734,
"learning_rate": 5.425669173568179e-06,
"loss": 1.3667,
"step": 389
},
{
"epoch": 0.4893350062735257,
"grad_norm": 38.90886306762695,
"learning_rate": 5.405444875292213e-06,
"loss": 1.6388,
"step": 390
},
{
"epoch": 0.4905897114178168,
"grad_norm": 32.49534606933594,
"learning_rate": 5.385213897437975e-06,
"loss": 1.3725,
"step": 391
},
{
"epoch": 0.4918444165621079,
"grad_norm": 31.765207290649414,
"learning_rate": 5.364976573304538e-06,
"loss": 1.4513,
"step": 392
},
{
"epoch": 0.493099121706399,
"grad_norm": 34.01384735107422,
"learning_rate": 5.344733236295525e-06,
"loss": 1.3848,
"step": 393
},
{
"epoch": 0.49435382685069007,
"grad_norm": 36.31550216674805,
"learning_rate": 5.324484219913621e-06,
"loss": 1.3873,
"step": 394
},
{
"epoch": 0.49560853199498117,
"grad_norm": 30.318265914916992,
"learning_rate": 5.30422985775507e-06,
"loss": 1.5321,
"step": 395
},
{
"epoch": 0.4968632371392723,
"grad_norm": 30.169464111328125,
"learning_rate": 5.283970483504198e-06,
"loss": 1.3799,
"step": 396
},
{
"epoch": 0.4981179422835634,
"grad_norm": 31.82530975341797,
"learning_rate": 5.263706430927895e-06,
"loss": 1.5295,
"step": 397
},
{
"epoch": 0.4993726474278545,
"grad_norm": 36.714996337890625,
"learning_rate": 5.243438033870126e-06,
"loss": 1.4037,
"step": 398
},
{
"epoch": 0.5006273525721455,
"grad_norm": 33.54505157470703,
"learning_rate": 5.223165626246432e-06,
"loss": 1.521,
"step": 399
},
{
"epoch": 0.5006273525721455,
"eval_loss": 1.436629295349121,
"eval_runtime": 6.0522,
"eval_samples_per_second": 110.869,
"eval_steps_per_second": 6.94,
"step": 399
},
{
"epoch": 0.5018820577164367,
"grad_norm": 30.569034576416016,
"learning_rate": 5.202889542038428e-06,
"loss": 1.3634,
"step": 400
},
{
"epoch": 0.5031367628607277,
"grad_norm": 28.09290885925293,
"learning_rate": 5.182610115288296e-06,
"loss": 1.4243,
"step": 401
},
{
"epoch": 0.5043914680050188,
"grad_norm": 31.013883590698242,
"learning_rate": 5.162327680093284e-06,
"loss": 1.5255,
"step": 402
},
{
"epoch": 0.5056461731493099,
"grad_norm": 28.622833251953125,
"learning_rate": 5.142042570600212e-06,
"loss": 1.143,
"step": 403
},
{
"epoch": 0.506900878293601,
"grad_norm": 34.083290100097656,
"learning_rate": 5.121755120999949e-06,
"loss": 1.4854,
"step": 404
},
{
"epoch": 0.5081555834378921,
"grad_norm": 29.883394241333008,
"learning_rate": 5.101465665521919e-06,
"loss": 1.2494,
"step": 405
},
{
"epoch": 0.5094102885821832,
"grad_norm": 36.8629035949707,
"learning_rate": 5.081174538428596e-06,
"loss": 1.5055,
"step": 406
},
{
"epoch": 0.5106649937264742,
"grad_norm": 39.23841094970703,
"learning_rate": 5.060882074009988e-06,
"loss": 1.41,
"step": 407
},
{
"epoch": 0.5119196988707654,
"grad_norm": 42.195274353027344,
"learning_rate": 5.04058860657814e-06,
"loss": 1.5589,
"step": 408
},
{
"epoch": 0.5131744040150564,
"grad_norm": 32.830596923828125,
"learning_rate": 5.020294470461615e-06,
"loss": 1.3412,
"step": 409
},
{
"epoch": 0.5144291091593476,
"grad_norm": 49.16096496582031,
"learning_rate": 5e-06,
"loss": 1.5255,
"step": 410
},
{
"epoch": 0.5156838143036386,
"grad_norm": 29.00592613220215,
"learning_rate": 4.979705529538385e-06,
"loss": 1.4311,
"step": 411
},
{
"epoch": 0.5169385194479298,
"grad_norm": 39.06101608276367,
"learning_rate": 4.959411393421863e-06,
"loss": 1.3708,
"step": 412
},
{
"epoch": 0.5181932245922208,
"grad_norm": 34.09449768066406,
"learning_rate": 4.939117925990013e-06,
"loss": 1.484,
"step": 413
},
{
"epoch": 0.5194479297365119,
"grad_norm": 35.57181167602539,
"learning_rate": 4.918825461571405e-06,
"loss": 1.3226,
"step": 414
},
{
"epoch": 0.520702634880803,
"grad_norm": 29.180233001708984,
"learning_rate": 4.8985343344780815e-06,
"loss": 1.6168,
"step": 415
},
{
"epoch": 0.5219573400250941,
"grad_norm": 25.967992782592773,
"learning_rate": 4.8782448790000525e-06,
"loss": 1.4807,
"step": 416
},
{
"epoch": 0.5232120451693852,
"grad_norm": 31.979293823242188,
"learning_rate": 4.857957429399788e-06,
"loss": 1.4218,
"step": 417
},
{
"epoch": 0.5244667503136763,
"grad_norm": 30.151277542114258,
"learning_rate": 4.837672319906717e-06,
"loss": 1.4075,
"step": 418
},
{
"epoch": 0.5257214554579673,
"grad_norm": 40.19000244140625,
"learning_rate": 4.817389884711706e-06,
"loss": 1.6472,
"step": 419
},
{
"epoch": 0.5269761606022585,
"grad_norm": 28.63579559326172,
"learning_rate": 4.797110457961575e-06,
"loss": 1.1942,
"step": 420
},
{
"epoch": 0.5282308657465495,
"grad_norm": 36.74559020996094,
"learning_rate": 4.7768343737535694e-06,
"loss": 1.5179,
"step": 421
},
{
"epoch": 0.5294855708908407,
"grad_norm": 30.191770553588867,
"learning_rate": 4.756561966129875e-06,
"loss": 1.2881,
"step": 422
},
{
"epoch": 0.5307402760351317,
"grad_norm": 31.707502365112305,
"learning_rate": 4.736293569072108e-06,
"loss": 1.3801,
"step": 423
},
{
"epoch": 0.5319949811794228,
"grad_norm": 25.902997970581055,
"learning_rate": 4.716029516495803e-06,
"loss": 1.3326,
"step": 424
},
{
"epoch": 0.533249686323714,
"grad_norm": 42.108238220214844,
"learning_rate": 4.695770142244931e-06,
"loss": 1.529,
"step": 425
},
{
"epoch": 0.534504391468005,
"grad_norm": 31.789140701293945,
"learning_rate": 4.6755157800863826e-06,
"loss": 1.3478,
"step": 426
},
{
"epoch": 0.5357590966122961,
"grad_norm": 27.96792984008789,
"learning_rate": 4.655266763704476e-06,
"loss": 1.397,
"step": 427
},
{
"epoch": 0.5370138017565872,
"grad_norm": 31.803890228271484,
"learning_rate": 4.635023426695462e-06,
"loss": 1.4011,
"step": 428
},
{
"epoch": 0.5382685069008782,
"grad_norm": 35.10597610473633,
"learning_rate": 4.614786102562026e-06,
"loss": 1.4848,
"step": 429
},
{
"epoch": 0.5395232120451694,
"grad_norm": 31.621994018554688,
"learning_rate": 4.594555124707789e-06,
"loss": 1.3346,
"step": 430
},
{
"epoch": 0.5407779171894604,
"grad_norm": 33.457908630371094,
"learning_rate": 4.574330826431822e-06,
"loss": 1.3045,
"step": 431
},
{
"epoch": 0.5420326223337516,
"grad_norm": 31.1467342376709,
"learning_rate": 4.554113540923153e-06,
"loss": 1.4343,
"step": 432
},
{
"epoch": 0.5432873274780426,
"grad_norm": 31.287960052490234,
"learning_rate": 4.533903601255272e-06,
"loss": 1.3903,
"step": 433
},
{
"epoch": 0.5445420326223338,
"grad_norm": 26.70494842529297,
"learning_rate": 4.513701340380655e-06,
"loss": 1.3482,
"step": 434
},
{
"epoch": 0.5457967377666249,
"grad_norm": 44.05613327026367,
"learning_rate": 4.493507091125269e-06,
"loss": 1.5986,
"step": 435
},
{
"epoch": 0.5470514429109159,
"grad_norm": 29.704072952270508,
"learning_rate": 4.473321186183091e-06,
"loss": 1.3137,
"step": 436
},
{
"epoch": 0.548306148055207,
"grad_norm": 29.141984939575195,
"learning_rate": 4.4531439581106295e-06,
"loss": 1.478,
"step": 437
},
{
"epoch": 0.5495608531994981,
"grad_norm": 34.73693084716797,
"learning_rate": 4.432975739321444e-06,
"loss": 1.5629,
"step": 438
},
{
"epoch": 0.5508155583437893,
"grad_norm": 33.1425666809082,
"learning_rate": 4.412816862080668e-06,
"loss": 1.3101,
"step": 439
},
{
"epoch": 0.5520702634880803,
"grad_norm": 31.933034896850586,
"learning_rate": 4.392667658499539e-06,
"loss": 1.3371,
"step": 440
},
{
"epoch": 0.5533249686323714,
"grad_norm": 30.45763397216797,
"learning_rate": 4.37252846052992e-06,
"loss": 1.3671,
"step": 441
},
{
"epoch": 0.5545796737766625,
"grad_norm": 42.91053009033203,
"learning_rate": 4.352399599958837e-06,
"loss": 1.4992,
"step": 442
},
{
"epoch": 0.5558343789209536,
"grad_norm": 36.65143585205078,
"learning_rate": 4.332281408403011e-06,
"loss": 1.4589,
"step": 443
},
{
"epoch": 0.5570890840652447,
"grad_norm": 38.462398529052734,
"learning_rate": 4.312174217303391e-06,
"loss": 1.2266,
"step": 444
},
{
"epoch": 0.5583437892095358,
"grad_norm": 31.30473518371582,
"learning_rate": 4.292078357919701e-06,
"loss": 1.4476,
"step": 445
},
{
"epoch": 0.5595984943538268,
"grad_norm": 35.10082244873047,
"learning_rate": 4.271994161324977e-06,
"loss": 1.4988,
"step": 446
},
{
"epoch": 0.560853199498118,
"grad_norm": 32.5116081237793,
"learning_rate": 4.2519219584001106e-06,
"loss": 1.4988,
"step": 447
},
{
"epoch": 0.562107904642409,
"grad_norm": 29.34661102294922,
"learning_rate": 4.231862079828408e-06,
"loss": 1.4725,
"step": 448
},
{
"epoch": 0.5633626097867002,
"grad_norm": 36.072879791259766,
"learning_rate": 4.2118148560901325e-06,
"loss": 1.4334,
"step": 449
},
{
"epoch": 0.5646173149309912,
"grad_norm": 30.869470596313477,
"learning_rate": 4.19178061745706e-06,
"loss": 1.3606,
"step": 450
},
{
"epoch": 0.5658720200752823,
"grad_norm": 29.298429489135742,
"learning_rate": 4.171759693987046e-06,
"loss": 1.2983,
"step": 451
},
{
"epoch": 0.5671267252195734,
"grad_norm": 24.67900276184082,
"learning_rate": 4.151752415518577e-06,
"loss": 1.2631,
"step": 452
},
{
"epoch": 0.5683814303638645,
"grad_norm": 33.28513717651367,
"learning_rate": 4.131759111665349e-06,
"loss": 1.3843,
"step": 453
},
{
"epoch": 0.5696361355081556,
"grad_norm": 34.13528823852539,
"learning_rate": 4.111780111810826e-06,
"loss": 1.4529,
"step": 454
},
{
"epoch": 0.5708908406524467,
"grad_norm": 28.38991355895996,
"learning_rate": 4.091815745102818e-06,
"loss": 1.5154,
"step": 455
},
{
"epoch": 0.5721455457967378,
"grad_norm": 26.64844512939453,
"learning_rate": 4.071866340448062e-06,
"loss": 1.3302,
"step": 456
},
{
"epoch": 0.5734002509410289,
"grad_norm": 37.00432205200195,
"learning_rate": 4.051932226506797e-06,
"loss": 1.3327,
"step": 457
},
{
"epoch": 0.5746549560853199,
"grad_norm": 27.36146354675293,
"learning_rate": 4.032013731687351e-06,
"loss": 1.361,
"step": 458
},
{
"epoch": 0.5759096612296111,
"grad_norm": 32.78675842285156,
"learning_rate": 4.0121111841407345e-06,
"loss": 1.4741,
"step": 459
},
{
"epoch": 0.5771643663739021,
"grad_norm": 37.97308349609375,
"learning_rate": 3.992224911755234e-06,
"loss": 1.5363,
"step": 460
},
{
"epoch": 0.5784190715181933,
"grad_norm": 31.34197235107422,
"learning_rate": 3.9723552421509975e-06,
"loss": 1.2434,
"step": 461
},
{
"epoch": 0.5796737766624843,
"grad_norm": 36.909828186035156,
"learning_rate": 3.95250250267466e-06,
"loss": 1.3956,
"step": 462
},
{
"epoch": 0.5809284818067754,
"grad_norm": 47.24994659423828,
"learning_rate": 3.932667020393933e-06,
"loss": 1.3312,
"step": 463
},
{
"epoch": 0.5821831869510665,
"grad_norm": 35.684608459472656,
"learning_rate": 3.912849122092216e-06,
"loss": 1.4447,
"step": 464
},
{
"epoch": 0.5834378920953576,
"grad_norm": 36.601715087890625,
"learning_rate": 3.8930491342632235e-06,
"loss": 1.4177,
"step": 465
},
{
"epoch": 0.5846925972396487,
"grad_norm": 28.328744888305664,
"learning_rate": 3.873267383105604e-06,
"loss": 1.3929,
"step": 466
},
{
"epoch": 0.5859473023839398,
"grad_norm": 32.12102127075195,
"learning_rate": 3.853504194517551e-06,
"loss": 1.4941,
"step": 467
},
{
"epoch": 0.5872020075282308,
"grad_norm": 32.12097930908203,
"learning_rate": 3.833759894091456e-06,
"loss": 1.3292,
"step": 468
},
{
"epoch": 0.588456712672522,
"grad_norm": 26.0775146484375,
"learning_rate": 3.814034807108529e-06,
"loss": 1.3233,
"step": 469
},
{
"epoch": 0.589711417816813,
"grad_norm": 26.92903709411621,
"learning_rate": 3.7943292585334464e-06,
"loss": 1.3575,
"step": 470
},
{
"epoch": 0.5909661229611042,
"grad_norm": 35.65913772583008,
"learning_rate": 3.774643573008995e-06,
"loss": 1.3416,
"step": 471
},
{
"epoch": 0.5922208281053952,
"grad_norm": 44.53237533569336,
"learning_rate": 3.754978074850722e-06,
"loss": 1.6346,
"step": 472
},
{
"epoch": 0.5934755332496863,
"grad_norm": 33.18136978149414,
"learning_rate": 3.7353330880415963e-06,
"loss": 1.5085,
"step": 473
},
{
"epoch": 0.5947302383939774,
"grad_norm": 31.07672882080078,
"learning_rate": 3.7157089362266695e-06,
"loss": 1.3839,
"step": 474
},
{
"epoch": 0.5959849435382685,
"grad_norm": 29.932600021362305,
"learning_rate": 3.6961059427077407e-06,
"loss": 1.4774,
"step": 475
},
{
"epoch": 0.5972396486825596,
"grad_norm": 27.480052947998047,
"learning_rate": 3.6765244304380323e-06,
"loss": 1.2551,
"step": 476
},
{
"epoch": 0.5984943538268507,
"grad_norm": 39.4902458190918,
"learning_rate": 3.656964722016875e-06,
"loss": 1.3972,
"step": 477
},
{
"epoch": 0.5997490589711418,
"grad_norm": 36.17951583862305,
"learning_rate": 3.6374271396843797e-06,
"loss": 1.2946,
"step": 478
},
{
"epoch": 0.6010037641154329,
"grad_norm": 30.92720603942871,
"learning_rate": 3.617912005316142e-06,
"loss": 1.2169,
"step": 479
},
{
"epoch": 0.6022584692597239,
"grad_norm": 34.092063903808594,
"learning_rate": 3.598419640417938e-06,
"loss": 1.3757,
"step": 480
},
{
"epoch": 0.6035131744040151,
"grad_norm": 27.944690704345703,
"learning_rate": 3.578950366120414e-06,
"loss": 1.2427,
"step": 481
},
{
"epoch": 0.6047678795483061,
"grad_norm": 36.29844665527344,
"learning_rate": 3.5595045031738123e-06,
"loss": 1.3915,
"step": 482
},
{
"epoch": 0.6060225846925973,
"grad_norm": 36.75183868408203,
"learning_rate": 3.540082371942682e-06,
"loss": 1.4398,
"step": 483
},
{
"epoch": 0.6072772898368883,
"grad_norm": 28.854524612426758,
"learning_rate": 3.5206842924005934e-06,
"loss": 1.3392,
"step": 484
},
{
"epoch": 0.6085319949811794,
"grad_norm": 32.42161560058594,
"learning_rate": 3.5013105841248794e-06,
"loss": 1.5482,
"step": 485
},
{
"epoch": 0.6097867001254705,
"grad_norm": 38.66543960571289,
"learning_rate": 3.481961566291358e-06,
"loss": 1.4572,
"step": 486
},
{
"epoch": 0.6110414052697616,
"grad_norm": 37.27582550048828,
"learning_rate": 3.462637557669084e-06,
"loss": 1.3017,
"step": 487
},
{
"epoch": 0.6122961104140527,
"grad_norm": 28.435178756713867,
"learning_rate": 3.443338876615092e-06,
"loss": 1.3203,
"step": 488
},
{
"epoch": 0.6135508155583438,
"grad_norm": 33.752044677734375,
"learning_rate": 3.424065841069152e-06,
"loss": 1.5739,
"step": 489
},
{
"epoch": 0.6148055207026348,
"grad_norm": 34.22273635864258,
"learning_rate": 3.4048187685485312e-06,
"loss": 1.4068,
"step": 490
},
{
"epoch": 0.616060225846926,
"grad_norm": 54.36898422241211,
"learning_rate": 3.3855979761427705e-06,
"loss": 1.3019,
"step": 491
},
{
"epoch": 0.617314930991217,
"grad_norm": 32.61660385131836,
"learning_rate": 3.3664037805084428e-06,
"loss": 1.2823,
"step": 492
},
{
"epoch": 0.6185696361355082,
"grad_norm": 34.06522750854492,
"learning_rate": 3.347236497863957e-06,
"loss": 1.3678,
"step": 493
},
{
"epoch": 0.6198243412797992,
"grad_norm": 29.604419708251953,
"learning_rate": 3.3280964439843377e-06,
"loss": 1.3285,
"step": 494
},
{
"epoch": 0.6210790464240903,
"grad_norm": 33.45100021362305,
"learning_rate": 3.308983934196018e-06,
"loss": 1.422,
"step": 495
},
{
"epoch": 0.6223337515683814,
"grad_norm": 33.3889274597168,
"learning_rate": 3.289899283371657e-06,
"loss": 1.3114,
"step": 496
},
{
"epoch": 0.6235884567126725,
"grad_norm": 30.00410270690918,
"learning_rate": 3.2708428059249437e-06,
"loss": 1.3216,
"step": 497
},
{
"epoch": 0.6248431618569636,
"grad_norm": 41.03053283691406,
"learning_rate": 3.2518148158054186e-06,
"loss": 1.4942,
"step": 498
},
{
"epoch": 0.6260978670012547,
"grad_norm": 46.363258361816406,
"learning_rate": 3.2328156264933043e-06,
"loss": 1.6328,
"step": 499
},
{
"epoch": 0.6273525721455459,
"grad_norm": 37.64637756347656,
"learning_rate": 3.2138455509943365e-06,
"loss": 1.3816,
"step": 500
},
{
"epoch": 0.6286072772898369,
"grad_norm": 46.19404602050781,
"learning_rate": 3.194904901834613e-06,
"loss": 1.5756,
"step": 501
},
{
"epoch": 0.6298619824341279,
"grad_norm": 26.028804779052734,
"learning_rate": 3.17599399105544e-06,
"loss": 1.314,
"step": 502
},
{
"epoch": 0.6311166875784191,
"grad_norm": 31.624303817749023,
"learning_rate": 3.1571131302081916e-06,
"loss": 1.3178,
"step": 503
},
{
"epoch": 0.6323713927227101,
"grad_norm": 35.267478942871094,
"learning_rate": 3.138262630349182e-06,
"loss": 1.5758,
"step": 504
},
{
"epoch": 0.6336260978670013,
"grad_norm": 30.934772491455078,
"learning_rate": 3.1194428020345375e-06,
"loss": 1.4725,
"step": 505
},
{
"epoch": 0.6348808030112923,
"grad_norm": 28.47898292541504,
"learning_rate": 3.1006539553150727e-06,
"loss": 1.3188,
"step": 506
},
{
"epoch": 0.6361355081555834,
"grad_norm": 38.18532943725586,
"learning_rate": 3.081896399731202e-06,
"loss": 1.2228,
"step": 507
},
{
"epoch": 0.6373902132998746,
"grad_norm": 35.62003707885742,
"learning_rate": 3.063170444307821e-06,
"loss": 1.6133,
"step": 508
},
{
"epoch": 0.6386449184441656,
"grad_norm": 58.091861724853516,
"learning_rate": 3.044476397549221e-06,
"loss": 1.3338,
"step": 509
},
{
"epoch": 0.6398996235884568,
"grad_norm": 31.276124954223633,
"learning_rate": 3.02581456743401e-06,
"loss": 1.1924,
"step": 510
},
{
"epoch": 0.6411543287327478,
"grad_norm": 36.98395538330078,
"learning_rate": 3.0071852614100427e-06,
"loss": 1.3475,
"step": 511
},
{
"epoch": 0.6424090338770388,
"grad_norm": 33.80880355834961,
"learning_rate": 2.9885887863893394e-06,
"loss": 1.2211,
"step": 512
},
{
"epoch": 0.64366373902133,
"grad_norm": 37.08169174194336,
"learning_rate": 2.9700254487430448e-06,
"loss": 1.3388,
"step": 513
},
{
"epoch": 0.644918444165621,
"grad_norm": 30.51959228515625,
"learning_rate": 2.9514955542963775e-06,
"loss": 1.4277,
"step": 514
},
{
"epoch": 0.6461731493099122,
"grad_norm": 31.10744285583496,
"learning_rate": 2.9329994083235857e-06,
"loss": 1.2503,
"step": 515
},
{
"epoch": 0.6474278544542033,
"grad_norm": 32.857383728027344,
"learning_rate": 2.9145373155429263e-06,
"loss": 1.4776,
"step": 516
},
{
"epoch": 0.6486825595984943,
"grad_norm": 36.374961853027344,
"learning_rate": 2.896109580111634e-06,
"loss": 1.2288,
"step": 517
},
{
"epoch": 0.6499372647427855,
"grad_norm": 26.020505905151367,
"learning_rate": 2.8777165056209256e-06,
"loss": 1.2806,
"step": 518
},
{
"epoch": 0.6511919698870765,
"grad_norm": 31.82769775390625,
"learning_rate": 2.8593583950909833e-06,
"loss": 1.3725,
"step": 519
},
{
"epoch": 0.6524466750313677,
"grad_norm": 36.6817741394043,
"learning_rate": 2.8410355509659682e-06,
"loss": 1.2934,
"step": 520
},
{
"epoch": 0.6537013801756587,
"grad_norm": 46.93891525268555,
"learning_rate": 2.8227482751090445e-06,
"loss": 1.4673,
"step": 521
},
{
"epoch": 0.6549560853199499,
"grad_norm": 41.38336181640625,
"learning_rate": 2.8044968687973956e-06,
"loss": 1.4611,
"step": 522
},
{
"epoch": 0.6562107904642409,
"grad_norm": 37.399681091308594,
"learning_rate": 2.786281632717264e-06,
"loss": 1.2811,
"step": 523
},
{
"epoch": 0.657465495608532,
"grad_norm": 44.295719146728516,
"learning_rate": 2.7681028669590038e-06,
"loss": 1.3587,
"step": 524
},
{
"epoch": 0.6587202007528231,
"grad_norm": 33.356292724609375,
"learning_rate": 2.749960871012129e-06,
"loss": 1.4634,
"step": 525
},
{
"epoch": 0.6599749058971142,
"grad_norm": 38.98143005371094,
"learning_rate": 2.73185594376038e-06,
"loss": 1.4382,
"step": 526
},
{
"epoch": 0.6612296110414053,
"grad_norm": 30.759475708007812,
"learning_rate": 2.7137883834768076e-06,
"loss": 1.3081,
"step": 527
},
{
"epoch": 0.6624843161856964,
"grad_norm": 37.871238708496094,
"learning_rate": 2.6957584878188496e-06,
"loss": 1.3886,
"step": 528
},
{
"epoch": 0.6637390213299874,
"grad_norm": 49.197872161865234,
"learning_rate": 2.6777665538234292e-06,
"loss": 1.5503,
"step": 529
},
{
"epoch": 0.6649937264742786,
"grad_norm": 37.15614700317383,
"learning_rate": 2.6598128779020693e-06,
"loss": 1.3044,
"step": 530
},
{
"epoch": 0.6662484316185696,
"grad_norm": 31.275415420532227,
"learning_rate": 2.641897755835997e-06,
"loss": 1.397,
"step": 531
},
{
"epoch": 0.6675031367628608,
"grad_norm": 41.38181686401367,
"learning_rate": 2.6240214827712794e-06,
"loss": 1.4281,
"step": 532
},
{
"epoch": 0.6687578419071518,
"grad_norm": 39.80350875854492,
"learning_rate": 2.6061843532139563e-06,
"loss": 1.4107,
"step": 533
},
{
"epoch": 0.6700125470514429,
"grad_norm": 40.21477508544922,
"learning_rate": 2.5883866610251906e-06,
"loss": 1.4339,
"step": 534
},
{
"epoch": 0.671267252195734,
"grad_norm": 43.72838592529297,
"learning_rate": 2.5706286994164315e-06,
"loss": 1.5603,
"step": 535
},
{
"epoch": 0.6725219573400251,
"grad_norm": 27.070802688598633,
"learning_rate": 2.5529107609445737e-06,
"loss": 1.4321,
"step": 536
},
{
"epoch": 0.6737766624843162,
"grad_norm": 41.055633544921875,
"learning_rate": 2.5352331375071437e-06,
"loss": 1.4914,
"step": 537
},
{
"epoch": 0.6750313676286073,
"grad_norm": 39.451602935791016,
"learning_rate": 2.5175961203374954e-06,
"loss": 1.4453,
"step": 538
},
{
"epoch": 0.6762860727728983,
"grad_norm": 38.11553955078125,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.3918,
"step": 539
},
{
"epoch": 0.6775407779171895,
"grad_norm": 30.756338119506836,
"learning_rate": 2.4824450663852716e-06,
"loss": 1.1408,
"step": 540
},
{
"epoch": 0.6787954830614805,
"grad_norm": 31.51823616027832,
"learning_rate": 2.464931608705384e-06,
"loss": 1.5483,
"step": 541
},
{
"epoch": 0.6800501882057717,
"grad_norm": 28.151769638061523,
"learning_rate": 2.447459915489106e-06,
"loss": 1.2619,
"step": 542
},
{
"epoch": 0.6813048933500627,
"grad_norm": 34.87588119506836,
"learning_rate": 2.430030274577151e-06,
"loss": 1.3653,
"step": 543
},
{
"epoch": 0.6825595984943539,
"grad_norm": 44.73030090332031,
"learning_rate": 2.4126429731174372e-06,
"loss": 1.4503,
"step": 544
},
{
"epoch": 0.6838143036386449,
"grad_norm": 35.88227462768555,
"learning_rate": 2.3952982975603494e-06,
"loss": 1.3246,
"step": 545
},
{
"epoch": 0.685069008782936,
"grad_norm": 27.695951461791992,
"learning_rate": 2.3779965336540237e-06,
"loss": 1.3869,
"step": 546
},
{
"epoch": 0.6863237139272271,
"grad_norm": 37.88958740234375,
"learning_rate": 2.3607379664396414e-06,
"loss": 1.4772,
"step": 547
},
{
"epoch": 0.6875784190715182,
"grad_norm": 30.21925926208496,
"learning_rate": 2.343522880246734e-06,
"loss": 1.3563,
"step": 548
},
{
"epoch": 0.6888331242158093,
"grad_norm": 41.6002197265625,
"learning_rate": 2.3263515586884935e-06,
"loss": 1.3695,
"step": 549
},
{
"epoch": 0.6900878293601004,
"grad_norm": 29.012378692626953,
"learning_rate": 2.3092242846571034e-06,
"loss": 1.3925,
"step": 550
},
{
"epoch": 0.6913425345043914,
"grad_norm": 28.30169105529785,
"learning_rate": 2.2921413403190774e-06,
"loss": 1.3324,
"step": 551
},
{
"epoch": 0.6925972396486826,
"grad_norm": 30.30564308166504,
"learning_rate": 2.275103007110616e-06,
"loss": 1.3319,
"step": 552
},
{
"epoch": 0.6938519447929736,
"grad_norm": 32.01078796386719,
"learning_rate": 2.25810956573296e-06,
"loss": 1.2561,
"step": 553
},
{
"epoch": 0.6951066499372648,
"grad_norm": 45.61001205444336,
"learning_rate": 2.2411612961477704e-06,
"loss": 1.4322,
"step": 554
},
{
"epoch": 0.6963613550815558,
"grad_norm": 39.38789749145508,
"learning_rate": 2.224258477572524e-06,
"loss": 1.2698,
"step": 555
},
{
"epoch": 0.6976160602258469,
"grad_norm": 41.91701126098633,
"learning_rate": 2.2074013884758993e-06,
"loss": 1.4422,
"step": 556
},
{
"epoch": 0.698870765370138,
"grad_norm": 32.67595291137695,
"learning_rate": 2.190590306573198e-06,
"loss": 1.2315,
"step": 557
},
{
"epoch": 0.7001254705144291,
"grad_norm": 33.57855224609375,
"learning_rate": 2.17382550882177e-06,
"loss": 1.2939,
"step": 558
},
{
"epoch": 0.7013801756587202,
"grad_norm": 30.53522491455078,
"learning_rate": 2.1571072714164445e-06,
"loss": 1.3556,
"step": 559
},
{
"epoch": 0.7026348808030113,
"grad_norm": 33.44630432128906,
"learning_rate": 2.140435869784986e-06,
"loss": 1.3701,
"step": 560
},
{
"epoch": 0.7038895859473023,
"grad_norm": 34.59889221191406,
"learning_rate": 2.1238115785835512e-06,
"loss": 1.5211,
"step": 561
},
{
"epoch": 0.7051442910915935,
"grad_norm": 42.23357009887695,
"learning_rate": 2.1072346716921733e-06,
"loss": 1.2913,
"step": 562
},
{
"epoch": 0.7063989962358845,
"grad_norm": 32.22030258178711,
"learning_rate": 2.0907054222102367e-06,
"loss": 1.3462,
"step": 563
},
{
"epoch": 0.7076537013801757,
"grad_norm": 39.91384506225586,
"learning_rate": 2.0742241024519886e-06,
"loss": 1.3211,
"step": 564
},
{
"epoch": 0.7089084065244667,
"grad_norm": 41.389461517333984,
"learning_rate": 2.0577909839420468e-06,
"loss": 1.3882,
"step": 565
},
{
"epoch": 0.7101631116687579,
"grad_norm": 25.932300567626953,
"learning_rate": 2.0414063374109326e-06,
"loss": 1.2911,
"step": 566
},
{
"epoch": 0.7114178168130489,
"grad_norm": 40.37273025512695,
"learning_rate": 2.0250704327906025e-06,
"loss": 1.3346,
"step": 567
},
{
"epoch": 0.71267252195734,
"grad_norm": 33.203975677490234,
"learning_rate": 2.0087835392100034e-06,
"loss": 1.3206,
"step": 568
},
{
"epoch": 0.7139272271016311,
"grad_norm": 25.78790283203125,
"learning_rate": 1.9925459249906488e-06,
"loss": 1.2016,
"step": 569
},
{
"epoch": 0.7151819322459222,
"grad_norm": 26.151403427124023,
"learning_rate": 1.9763578576421816e-06,
"loss": 1.3088,
"step": 570
},
{
"epoch": 0.7164366373902133,
"grad_norm": 40.70786666870117,
"learning_rate": 1.9602196038579774e-06,
"loss": 1.2366,
"step": 571
},
{
"epoch": 0.7176913425345044,
"grad_norm": 32.47188949584961,
"learning_rate": 1.944131429510754e-06,
"loss": 1.3264,
"step": 572
},
{
"epoch": 0.7189460476787954,
"grad_norm": 44.57042694091797,
"learning_rate": 1.9280935996481792e-06,
"loss": 1.3883,
"step": 573
},
{
"epoch": 0.7202007528230866,
"grad_norm": 37.86323165893555,
"learning_rate": 1.9121063784885135e-06,
"loss": 1.2686,
"step": 574
},
{
"epoch": 0.7214554579673776,
"grad_norm": 28.20488739013672,
"learning_rate": 1.8961700294162578e-06,
"loss": 1.3424,
"step": 575
},
{
"epoch": 0.7227101631116688,
"grad_norm": 80.7864761352539,
"learning_rate": 1.880284814977807e-06,
"loss": 1.4263,
"step": 576
},
{
"epoch": 0.7239648682559598,
"grad_norm": 47.082122802734375,
"learning_rate": 1.8644509968771302e-06,
"loss": 1.3611,
"step": 577
},
{
"epoch": 0.7252195734002509,
"grad_norm": 27.525779724121094,
"learning_rate": 1.8486688359714567e-06,
"loss": 1.1818,
"step": 578
},
{
"epoch": 0.726474278544542,
"grad_norm": 26.097383499145508,
"learning_rate": 1.832938592266984e-06,
"loss": 1.4285,
"step": 579
},
{
"epoch": 0.7277289836888331,
"grad_norm": 27.29695701599121,
"learning_rate": 1.8172605249145848e-06,
"loss": 1.2213,
"step": 580
},
{
"epoch": 0.7289836888331243,
"grad_norm": 43.18733215332031,
"learning_rate": 1.8016348922055448e-06,
"loss": 1.3866,
"step": 581
},
{
"epoch": 0.7302383939774153,
"grad_norm": 30.83635139465332,
"learning_rate": 1.7860619515673034e-06,
"loss": 1.2583,
"step": 582
},
{
"epoch": 0.7314930991217063,
"grad_norm": 38.65605163574219,
"learning_rate": 1.7705419595592193e-06,
"loss": 1.4949,
"step": 583
},
{
"epoch": 0.7327478042659975,
"grad_norm": 33.9451789855957,
"learning_rate": 1.7550751718683339e-06,
"loss": 1.4502,
"step": 584
},
{
"epoch": 0.7340025094102886,
"grad_norm": 32.3410530090332,
"learning_rate": 1.7396618433051648e-06,
"loss": 1.3073,
"step": 585
},
{
"epoch": 0.7352572145545797,
"grad_norm": 31.831172943115234,
"learning_rate": 1.7243022277995109e-06,
"loss": 1.1989,
"step": 586
},
{
"epoch": 0.7365119196988708,
"grad_norm": 36.86290740966797,
"learning_rate": 1.7089965783962608e-06,
"loss": 1.4668,
"step": 587
},
{
"epoch": 0.7377666248431619,
"grad_norm": 34.344600677490234,
"learning_rate": 1.6937451472512284e-06,
"loss": 1.3803,
"step": 588
},
{
"epoch": 0.739021329987453,
"grad_norm": 27.322994232177734,
"learning_rate": 1.6785481856270042e-06,
"loss": 1.2354,
"step": 589
},
{
"epoch": 0.740276035131744,
"grad_norm": 44.57414245605469,
"learning_rate": 1.6634059438888034e-06,
"loss": 1.5863,
"step": 590
},
{
"epoch": 0.7415307402760352,
"grad_norm": 33.31477737426758,
"learning_rate": 1.6483186715003523e-06,
"loss": 1.4086,
"step": 591
},
{
"epoch": 0.7427854454203262,
"grad_norm": 33.885536193847656,
"learning_rate": 1.633286617019771e-06,
"loss": 1.4022,
"step": 592
},
{
"epoch": 0.7440401505646174,
"grad_norm": 43.636802673339844,
"learning_rate": 1.618310028095486e-06,
"loss": 1.403,
"step": 593
},
{
"epoch": 0.7452948557089084,
"grad_norm": 38.1976432800293,
"learning_rate": 1.6033891514621436e-06,
"loss": 1.375,
"step": 594
},
{
"epoch": 0.7465495608531995,
"grad_norm": 27.386051177978516,
"learning_rate": 1.5885242329365448e-06,
"loss": 1.2411,
"step": 595
},
{
"epoch": 0.7478042659974906,
"grad_norm": 32.94865036010742,
"learning_rate": 1.5737155174136042e-06,
"loss": 1.3973,
"step": 596
},
{
"epoch": 0.7490589711417817,
"grad_norm": 52.85768127441406,
"learning_rate": 1.5589632488623053e-06,
"loss": 1.3857,
"step": 597
},
{
"epoch": 0.7503136762860728,
"grad_norm": 30.37677001953125,
"learning_rate": 1.5442676703216851e-06,
"loss": 1.2986,
"step": 598
},
{
"epoch": 0.7515683814303639,
"grad_norm": 50.629112243652344,
"learning_rate": 1.5296290238968303e-06,
"loss": 1.4606,
"step": 599
},
{
"epoch": 0.7528230865746549,
"grad_norm": 75.81658172607422,
"learning_rate": 1.5150475507548933e-06,
"loss": 1.4354,
"step": 600
},
{
"epoch": 0.7540777917189461,
"grad_norm": 32.35127639770508,
"learning_rate": 1.500523491121108e-06,
"loss": 1.4572,
"step": 601
},
{
"epoch": 0.7553324968632371,
"grad_norm": 36.757484436035156,
"learning_rate": 1.4860570842748412e-06,
"loss": 1.3798,
"step": 602
},
{
"epoch": 0.7565872020075283,
"grad_norm": 39.54582977294922,
"learning_rate": 1.47164856854565e-06,
"loss": 1.4334,
"step": 603
},
{
"epoch": 0.7578419071518193,
"grad_norm": 30.180776596069336,
"learning_rate": 1.4572981813093507e-06,
"loss": 1.4914,
"step": 604
},
{
"epoch": 0.7590966122961104,
"grad_norm": 55.5819091796875,
"learning_rate": 1.4430061589841122e-06,
"loss": 1.3051,
"step": 605
},
{
"epoch": 0.7603513174404015,
"grad_norm": 41.72428894042969,
"learning_rate": 1.4287727370265558e-06,
"loss": 1.5724,
"step": 606
},
{
"epoch": 0.7616060225846926,
"grad_norm": 30.067726135253906,
"learning_rate": 1.4145981499278877e-06,
"loss": 1.2012,
"step": 607
},
{
"epoch": 0.7628607277289837,
"grad_norm": 35.68577194213867,
"learning_rate": 1.4004826312100218e-06,
"loss": 1.375,
"step": 608
},
{
"epoch": 0.7641154328732748,
"grad_norm": 34.37779998779297,
"learning_rate": 1.386426413421738e-06,
"loss": 1.4803,
"step": 609
},
{
"epoch": 0.7653701380175659,
"grad_norm": 28.35356330871582,
"learning_rate": 1.3724297281348591e-06,
"loss": 1.0709,
"step": 610
},
{
"epoch": 0.766624843161857,
"grad_norm": 63.945228576660156,
"learning_rate": 1.3584928059404207e-06,
"loss": 1.3223,
"step": 611
},
{
"epoch": 0.767879548306148,
"grad_norm": 37.977333068847656,
"learning_rate": 1.3446158764448842e-06,
"loss": 1.3541,
"step": 612
},
{
"epoch": 0.7691342534504392,
"grad_norm": 33.97459411621094,
"learning_rate": 1.3307991682663463e-06,
"loss": 1.2762,
"step": 613
},
{
"epoch": 0.7703889585947302,
"grad_norm": 52.56448745727539,
"learning_rate": 1.3170429090307824e-06,
"loss": 1.4249,
"step": 614
},
{
"epoch": 0.7716436637390214,
"grad_norm": 29.552059173583984,
"learning_rate": 1.303347325368285e-06,
"loss": 1.3487,
"step": 615
},
{
"epoch": 0.7728983688833124,
"grad_norm": 52.34573745727539,
"learning_rate": 1.2897126429093354e-06,
"loss": 1.29,
"step": 616
},
{
"epoch": 0.7741530740276035,
"grad_norm": 38.19261932373047,
"learning_rate": 1.2761390862810907e-06,
"loss": 1.4146,
"step": 617
},
{
"epoch": 0.7754077791718946,
"grad_norm": 36.244651794433594,
"learning_rate": 1.2626268791036766e-06,
"loss": 1.4714,
"step": 618
},
{
"epoch": 0.7766624843161857,
"grad_norm": 41.59754180908203,
"learning_rate": 1.2491762439865034e-06,
"loss": 1.2052,
"step": 619
},
{
"epoch": 0.7779171894604768,
"grad_norm": 32.61091232299805,
"learning_rate": 1.235787402524603e-06,
"loss": 1.2954,
"step": 620
},
{
"epoch": 0.7791718946047679,
"grad_norm": 30.722808837890625,
"learning_rate": 1.2224605752949786e-06,
"loss": 1.2545,
"step": 621
},
{
"epoch": 0.7804265997490589,
"grad_norm": 36.57342529296875,
"learning_rate": 1.2091959818529636e-06,
"loss": 1.2536,
"step": 622
},
{
"epoch": 0.7816813048933501,
"grad_norm": 45.92577362060547,
"learning_rate": 1.1959938407286099e-06,
"loss": 1.3089,
"step": 623
},
{
"epoch": 0.7829360100376411,
"grad_norm": 31.191242218017578,
"learning_rate": 1.182854369423091e-06,
"loss": 1.2477,
"step": 624
},
{
"epoch": 0.7841907151819323,
"grad_norm": 31.34370231628418,
"learning_rate": 1.1697777844051105e-06,
"loss": 1.3789,
"step": 625
},
{
"epoch": 0.7854454203262233,
"grad_norm": 27.42989730834961,
"learning_rate": 1.1567643011073393e-06,
"loss": 1.2446,
"step": 626
},
{
"epoch": 0.7867001254705144,
"grad_norm": 31.601276397705078,
"learning_rate": 1.143814133922872e-06,
"loss": 1.453,
"step": 627
},
{
"epoch": 0.7879548306148055,
"grad_norm": 42.06584548950195,
"learning_rate": 1.1309274962016854e-06,
"loss": 1.2825,
"step": 628
},
{
"epoch": 0.7892095357590966,
"grad_norm": 36.16788864135742,
"learning_rate": 1.1181046002471292e-06,
"loss": 1.3807,
"step": 629
},
{
"epoch": 0.7904642409033877,
"grad_norm": 35.88719177246094,
"learning_rate": 1.1053456573124272e-06,
"loss": 1.1951,
"step": 630
},
{
"epoch": 0.7917189460476788,
"grad_norm": 43.55876541137695,
"learning_rate": 1.0926508775971995e-06,
"loss": 1.3084,
"step": 631
},
{
"epoch": 0.7929736511919699,
"grad_norm": 38.98108673095703,
"learning_rate": 1.0800204702439937e-06,
"loss": 1.336,
"step": 632
},
{
"epoch": 0.794228356336261,
"grad_norm": 34.15788650512695,
"learning_rate": 1.0674546433348453e-06,
"loss": 1.4309,
"step": 633
},
{
"epoch": 0.795483061480552,
"grad_norm": 42.34593963623047,
"learning_rate": 1.0549536038878432e-06,
"loss": 1.3815,
"step": 634
},
{
"epoch": 0.7967377666248432,
"grad_norm": 33.58256530761719,
"learning_rate": 1.04251755785373e-06,
"loss": 1.2034,
"step": 635
},
{
"epoch": 0.7979924717691342,
"grad_norm": 41.538753509521484,
"learning_rate": 1.0301467101124956e-06,
"loss": 1.3423,
"step": 636
},
{
"epoch": 0.7992471769134254,
"grad_norm": 42.10636901855469,
"learning_rate": 1.0178412644700093e-06,
"loss": 1.3916,
"step": 637
},
{
"epoch": 0.8005018820577164,
"grad_norm": 31.18490219116211,
"learning_rate": 1.0056014236546647e-06,
"loss": 1.1455,
"step": 638
},
{
"epoch": 0.8017565872020075,
"grad_norm": 32.616031646728516,
"learning_rate": 9.934273893140335e-07,
"loss": 1.3136,
"step": 639
},
{
"epoch": 0.8030112923462986,
"grad_norm": 41.29079818725586,
"learning_rate": 9.813193620115446e-07,
"loss": 1.2788,
"step": 640
},
{
"epoch": 0.8042659974905897,
"grad_norm": 39.024993896484375,
"learning_rate": 9.692775412231863e-07,
"loss": 1.3029,
"step": 641
},
{
"epoch": 0.8055207026348808,
"grad_norm": 40.532737731933594,
"learning_rate": 9.573021253342114e-07,
"loss": 1.3518,
"step": 642
},
{
"epoch": 0.8067754077791719,
"grad_norm": 42.95549011230469,
"learning_rate": 9.453933116358715e-07,
"loss": 1.4456,
"step": 643
},
{
"epoch": 0.8080301129234629,
"grad_norm": 30.134597778320312,
"learning_rate": 9.335512963221732e-07,
"loss": 1.2561,
"step": 644
},
{
"epoch": 0.8092848180677541,
"grad_norm": 42.78569412231445,
"learning_rate": 9.21776274486636e-07,
"loss": 1.3378,
"step": 645
},
{
"epoch": 0.8105395232120451,
"grad_norm": 54.95227813720703,
"learning_rate": 9.100684401190829e-07,
"loss": 1.3858,
"step": 646
},
{
"epoch": 0.8117942283563363,
"grad_norm": 42.90878677368164,
"learning_rate": 8.984279861024453e-07,
"loss": 1.2899,
"step": 647
},
{
"epoch": 0.8130489335006273,
"grad_norm": 53.56229019165039,
"learning_rate": 8.868551042095852e-07,
"loss": 1.468,
"step": 648
},
{
"epoch": 0.8143036386449184,
"grad_norm": 31.682039260864258,
"learning_rate": 8.753499851001341e-07,
"loss": 1.1707,
"step": 649
},
{
"epoch": 0.8155583437892095,
"grad_norm": 31.241701126098633,
"learning_rate": 8.639128183173517e-07,
"loss": 1.1829,
"step": 650
},
{
"epoch": 0.8168130489335006,
"grad_norm": 33.625938415527344,
"learning_rate": 8.525437922850033e-07,
"loss": 1.3418,
"step": 651
},
{
"epoch": 0.8180677540777918,
"grad_norm": 30.763322830200195,
"learning_rate": 8.412430943042616e-07,
"loss": 1.3651,
"step": 652
},
{
"epoch": 0.8193224592220828,
"grad_norm": 48.34621810913086,
"learning_rate": 8.30010910550611e-07,
"loss": 1.3246,
"step": 653
},
{
"epoch": 0.820577164366374,
"grad_norm": 35.97224426269531,
"learning_rate": 8.188474260707857e-07,
"loss": 1.422,
"step": 654
},
{
"epoch": 0.821831869510665,
"grad_norm": 31.350204467773438,
"learning_rate": 8.077528247797234e-07,
"loss": 1.3197,
"step": 655
},
{
"epoch": 0.823086574654956,
"grad_norm": 39.3220329284668,
"learning_rate": 7.967272894575312e-07,
"loss": 1.3164,
"step": 656
},
{
"epoch": 0.8243412797992472,
"grad_norm": 34.87789535522461,
"learning_rate": 7.857710017464737e-07,
"loss": 1.3422,
"step": 657
},
{
"epoch": 0.8255959849435383,
"grad_norm": 39.69428634643555,
"learning_rate": 7.748841421479875e-07,
"loss": 1.2374,
"step": 658
},
{
"epoch": 0.8268506900878294,
"grad_norm": 40.43376541137695,
"learning_rate": 7.640668900196985e-07,
"loss": 1.3143,
"step": 659
},
{
"epoch": 0.8281053952321205,
"grad_norm": 28.951221466064453,
"learning_rate": 7.533194235724728e-07,
"loss": 1.315,
"step": 660
},
{
"epoch": 0.8293601003764115,
"grad_norm": 56.01127243041992,
"learning_rate": 7.426419198674773e-07,
"loss": 1.3279,
"step": 661
},
{
"epoch": 0.8306148055207027,
"grad_norm": 36.56144332885742,
"learning_rate": 7.320345548132679e-07,
"loss": 1.2427,
"step": 662
},
{
"epoch": 0.8318695106649937,
"grad_norm": 34.64320373535156,
"learning_rate": 7.214975031628856e-07,
"loss": 1.3805,
"step": 663
},
{
"epoch": 0.8331242158092849,
"grad_norm": 42.90142059326172,
"learning_rate": 7.110309385109804e-07,
"loss": 1.3778,
"step": 664
},
{
"epoch": 0.8343789209535759,
"grad_norm": 33.45329284667969,
"learning_rate": 7.006350332909495e-07,
"loss": 1.3461,
"step": 665
},
{
"epoch": 0.835633626097867,
"grad_norm": 39.53373718261719,
"learning_rate": 6.903099587721024e-07,
"loss": 1.372,
"step": 666
},
{
"epoch": 0.8368883312421581,
"grad_norm": 26.866334915161133,
"learning_rate": 6.800558850568295e-07,
"loss": 1.1701,
"step": 667
},
{
"epoch": 0.8381430363864492,
"grad_norm": 35.01183319091797,
"learning_rate": 6.698729810778065e-07,
"loss": 1.2913,
"step": 668
},
{
"epoch": 0.8393977415307403,
"grad_norm": 26.15965461730957,
"learning_rate": 6.597614145952136e-07,
"loss": 1.1659,
"step": 669
},
{
"epoch": 0.8406524466750314,
"grad_norm": 27.10162925720215,
"learning_rate": 6.497213521939638e-07,
"loss": 1.176,
"step": 670
},
{
"epoch": 0.8419071518193224,
"grad_norm": 39.48128128051758,
"learning_rate": 6.397529592809615e-07,
"loss": 1.4855,
"step": 671
},
{
"epoch": 0.8431618569636136,
"grad_norm": 45.1597785949707,
"learning_rate": 6.298564000823848e-07,
"loss": 1.2702,
"step": 672
},
{
"epoch": 0.8444165621079046,
"grad_norm": 59.02643585205078,
"learning_rate": 6.20031837640967e-07,
"loss": 1.3335,
"step": 673
},
{
"epoch": 0.8456712672521958,
"grad_norm": 33.48893737792969,
"learning_rate": 6.102794338133195e-07,
"loss": 1.1215,
"step": 674
},
{
"epoch": 0.8469259723964868,
"grad_norm": 33.40549850463867,
"learning_rate": 6.005993492672657e-07,
"loss": 1.3049,
"step": 675
},
{
"epoch": 0.848180677540778,
"grad_norm": 28.336149215698242,
"learning_rate": 5.909917434791884e-07,
"loss": 1.2866,
"step": 676
},
{
"epoch": 0.849435382685069,
"grad_norm": 31.5575008392334,
"learning_rate": 5.814567747314049e-07,
"loss": 1.1839,
"step": 677
},
{
"epoch": 0.8506900878293601,
"grad_norm": 30.665040969848633,
"learning_rate": 5.719946001095617e-07,
"loss": 1.3647,
"step": 678
},
{
"epoch": 0.8519447929736512,
"grad_norm": 38.09904098510742,
"learning_rate": 5.626053755000421e-07,
"loss": 1.3963,
"step": 679
},
{
"epoch": 0.8531994981179423,
"grad_norm": 62.874881744384766,
"learning_rate": 5.532892555874059e-07,
"loss": 1.2852,
"step": 680
},
{
"epoch": 0.8544542032622334,
"grad_norm": 31.233694076538086,
"learning_rate": 5.440463938518304e-07,
"loss": 1.487,
"step": 681
},
{
"epoch": 0.8557089084065245,
"grad_norm": 34.371585845947266,
"learning_rate": 5.348769425665884e-07,
"loss": 1.3499,
"step": 682
},
{
"epoch": 0.8569636135508155,
"grad_norm": 40.928802490234375,
"learning_rate": 5.25781052795541e-07,
"loss": 1.494,
"step": 683
},
{
"epoch": 0.8582183186951067,
"grad_norm": 47.68248748779297,
"learning_rate": 5.167588743906432e-07,
"loss": 1.2565,
"step": 684
},
{
"epoch": 0.8594730238393977,
"grad_norm": 31.525768280029297,
"learning_rate": 5.078105559894791e-07,
"loss": 1.2186,
"step": 685
},
{
"epoch": 0.8607277289836889,
"grad_norm": 41.63323211669922,
"learning_rate": 4.989362450128133e-07,
"loss": 1.3934,
"step": 686
},
{
"epoch": 0.8619824341279799,
"grad_norm": 29.7374324798584,
"learning_rate": 4.901360876621597e-07,
"loss": 1.2498,
"step": 687
},
{
"epoch": 0.863237139272271,
"grad_norm": 38.2042350769043,
"learning_rate": 4.814102289173733e-07,
"loss": 1.1372,
"step": 688
},
{
"epoch": 0.8644918444165621,
"grad_norm": 33.84709930419922,
"learning_rate": 4.727588125342669e-07,
"loss": 1.218,
"step": 689
},
{
"epoch": 0.8657465495608532,
"grad_norm": 39.36479568481445,
"learning_rate": 4.6418198104223434e-07,
"loss": 1.3434,
"step": 690
},
{
"epoch": 0.8670012547051443,
"grad_norm": 45.70726776123047,
"learning_rate": 4.5567987574190677e-07,
"loss": 1.3344,
"step": 691
},
{
"epoch": 0.8682559598494354,
"grad_norm": 42.92964172363281,
"learning_rate": 4.4725263670282905e-07,
"loss": 1.3247,
"step": 692
},
{
"epoch": 0.8695106649937264,
"grad_norm": 33.368629455566406,
"learning_rate": 4.3890040276114044e-07,
"loss": 1.3195,
"step": 693
},
{
"epoch": 0.8707653701380176,
"grad_norm": 43.9223518371582,
"learning_rate": 4.306233115173009e-07,
"loss": 1.3844,
"step": 694
},
{
"epoch": 0.8720200752823086,
"grad_norm": 40.18341064453125,
"learning_rate": 4.224214993338149e-07,
"loss": 1.3651,
"step": 695
},
{
"epoch": 0.8732747804265998,
"grad_norm": 38.75429916381836,
"learning_rate": 4.1429510133298714e-07,
"loss": 1.3685,
"step": 696
},
{
"epoch": 0.8745294855708908,
"grad_norm": 41.714378356933594,
"learning_rate": 4.062442513947007e-07,
"loss": 1.4269,
"step": 697
},
{
"epoch": 0.875784190715182,
"grad_norm": 29.522842407226562,
"learning_rate": 3.9826908215420344e-07,
"loss": 1.1375,
"step": 698
},
{
"epoch": 0.877038895859473,
"grad_norm": 28.621906280517578,
"learning_rate": 3.903697249999289e-07,
"loss": 1.3684,
"step": 699
},
{
"epoch": 0.8782936010037641,
"grad_norm": 78.60023498535156,
"learning_rate": 3.825463100713317e-07,
"loss": 1.3113,
"step": 700
},
{
"epoch": 0.8795483061480552,
"grad_norm": 34.123355865478516,
"learning_rate": 3.747989662567403e-07,
"loss": 1.4122,
"step": 701
},
{
"epoch": 0.8808030112923463,
"grad_norm": 34.434959411621094,
"learning_rate": 3.671278211912338e-07,
"loss": 1.4044,
"step": 702
},
{
"epoch": 0.8820577164366374,
"grad_norm": 43.3989372253418,
"learning_rate": 3.595330012545445e-07,
"loss": 1.3849,
"step": 703
},
{
"epoch": 0.8833124215809285,
"grad_norm": 51.71344757080078,
"learning_rate": 3.520146315689693e-07,
"loss": 1.4736,
"step": 704
},
{
"epoch": 0.8845671267252195,
"grad_norm": 37.74956130981445,
"learning_rate": 3.445728359973094e-07,
"loss": 1.5021,
"step": 705
},
{
"epoch": 0.8858218318695107,
"grad_norm": 38.12771224975586,
"learning_rate": 3.372077371408361e-07,
"loss": 1.3782,
"step": 706
},
{
"epoch": 0.8870765370138017,
"grad_norm": 45.82014465332031,
"learning_rate": 3.299194563372604e-07,
"loss": 1.4072,
"step": 707
},
{
"epoch": 0.8883312421580929,
"grad_norm": 41.57502746582031,
"learning_rate": 3.22708113658744e-07,
"loss": 1.1852,
"step": 708
},
{
"epoch": 0.8895859473023839,
"grad_norm": 40.33243179321289,
"learning_rate": 3.1557382790991686e-07,
"loss": 1.2315,
"step": 709
},
{
"epoch": 0.890840652446675,
"grad_norm": 50.13658142089844,
"learning_rate": 3.085167166259162e-07,
"loss": 1.5278,
"step": 710
},
{
"epoch": 0.8920953575909661,
"grad_norm": 43.55479431152344,
"learning_rate": 3.015368960704584e-07,
"loss": 1.214,
"step": 711
},
{
"epoch": 0.8933500627352572,
"grad_norm": 40.6564826965332,
"learning_rate": 2.9463448123391634e-07,
"loss": 1.2893,
"step": 712
},
{
"epoch": 0.8946047678795483,
"grad_norm": 36.575809478759766,
"learning_rate": 2.878095858314278e-07,
"loss": 1.2348,
"step": 713
},
{
"epoch": 0.8958594730238394,
"grad_norm": 43.1509895324707,
"learning_rate": 2.810623223010245e-07,
"loss": 1.2692,
"step": 714
},
{
"epoch": 0.8971141781681304,
"grad_norm": 30.058103561401367,
"learning_rate": 2.743928018017744e-07,
"loss": 1.2322,
"step": 715
},
{
"epoch": 0.8983688833124216,
"grad_norm": 29.974342346191406,
"learning_rate": 2.67801134211953e-07,
"loss": 1.1901,
"step": 716
},
{
"epoch": 0.8996235884567126,
"grad_norm": 35.170406341552734,
"learning_rate": 2.612874281272371e-07,
"loss": 1.2897,
"step": 717
},
{
"epoch": 0.9008782936010038,
"grad_norm": 36.794464111328125,
"learning_rate": 2.548517908589077e-07,
"loss": 1.4094,
"step": 718
},
{
"epoch": 0.9021329987452948,
"grad_norm": 34.6309700012207,
"learning_rate": 2.4849432843208786e-07,
"loss": 1.2453,
"step": 719
},
{
"epoch": 0.903387703889586,
"grad_norm": 41.007938385009766,
"learning_rate": 2.422151455839955e-07,
"loss": 1.479,
"step": 720
},
{
"epoch": 0.904642409033877,
"grad_norm": 35.56821823120117,
"learning_rate": 2.3601434576221548e-07,
"loss": 1.2376,
"step": 721
},
{
"epoch": 0.9058971141781681,
"grad_norm": 40.16046905517578,
"learning_rate": 2.2989203112299685e-07,
"loss": 1.3773,
"step": 722
},
{
"epoch": 0.9071518193224593,
"grad_norm": 31.831424713134766,
"learning_rate": 2.2384830252957068e-07,
"loss": 1.2387,
"step": 723
},
{
"epoch": 0.9084065244667503,
"grad_norm": 102.80229187011719,
"learning_rate": 2.178832595504854e-07,
"loss": 1.3606,
"step": 724
},
{
"epoch": 0.9096612296110415,
"grad_norm": 37.90886688232422,
"learning_rate": 2.1199700045797077e-07,
"loss": 1.4478,
"step": 725
},
{
"epoch": 0.9109159347553325,
"grad_norm": 36.04559326171875,
"learning_rate": 2.0618962222631434e-07,
"loss": 1.4465,
"step": 726
},
{
"epoch": 0.9121706398996235,
"grad_norm": 45.584922790527344,
"learning_rate": 2.0046122053026697e-07,
"loss": 1.3702,
"step": 727
},
{
"epoch": 0.9134253450439147,
"grad_norm": 39.56161117553711,
"learning_rate": 1.9481188974346698e-07,
"loss": 1.2966,
"step": 728
},
{
"epoch": 0.9146800501882058,
"grad_norm": 48.845314025878906,
"learning_rate": 1.8924172293688148e-07,
"loss": 1.3017,
"step": 729
},
{
"epoch": 0.9159347553324969,
"grad_norm": 35.209503173828125,
"learning_rate": 1.8375081187727683e-07,
"loss": 1.2948,
"step": 730
},
{
"epoch": 0.917189460476788,
"grad_norm": 35.617698669433594,
"learning_rate": 1.7833924702570725e-07,
"loss": 1.1993,
"step": 731
},
{
"epoch": 0.918444165621079,
"grad_norm": 29.306623458862305,
"learning_rate": 1.7300711753601985e-07,
"loss": 1.2074,
"step": 732
},
{
"epoch": 0.9196988707653702,
"grad_norm": 34.39566421508789,
"learning_rate": 1.677545112533896e-07,
"loss": 1.3316,
"step": 733
},
{
"epoch": 0.9209535759096612,
"grad_norm": 36.989356994628906,
"learning_rate": 1.6258151471287397e-07,
"loss": 1.3134,
"step": 734
},
{
"epoch": 0.9222082810539524,
"grad_norm": 48.13298034667969,
"learning_rate": 1.5748821313798124e-07,
"loss": 1.3963,
"step": 735
},
{
"epoch": 0.9234629861982434,
"grad_norm": 39.777278900146484,
"learning_rate": 1.5247469043927153e-07,
"loss": 1.3866,
"step": 736
},
{
"epoch": 0.9247176913425345,
"grad_norm": 31.973005294799805,
"learning_rate": 1.4754102921297363e-07,
"loss": 1.2392,
"step": 737
},
{
"epoch": 0.9259723964868256,
"grad_norm": 31.995790481567383,
"learning_rate": 1.4268731073962094e-07,
"loss": 1.2198,
"step": 738
},
{
"epoch": 0.9272271016311167,
"grad_norm": 33.672569274902344,
"learning_rate": 1.3791361498271704e-07,
"loss": 1.3004,
"step": 739
},
{
"epoch": 0.9284818067754078,
"grad_norm": 31.81163787841797,
"learning_rate": 1.3322002058741678e-07,
"loss": 1.3826,
"step": 740
},
{
"epoch": 0.9297365119196989,
"grad_norm": 32.36835479736328,
"learning_rate": 1.2860660487922616e-07,
"loss": 1.4068,
"step": 741
},
{
"epoch": 0.93099121706399,
"grad_norm": 43.015193939208984,
"learning_rate": 1.240734438627361e-07,
"loss": 1.381,
"step": 742
},
{
"epoch": 0.9322459222082811,
"grad_norm": 44.727230072021484,
"learning_rate": 1.196206122203647e-07,
"loss": 1.3348,
"step": 743
},
{
"epoch": 0.9335006273525721,
"grad_norm": 29.804079055786133,
"learning_rate": 1.1524818331112853e-07,
"loss": 1.2291,
"step": 744
},
{
"epoch": 0.9347553324968633,
"grad_norm": 46.379451751708984,
"learning_rate": 1.1095622916943494e-07,
"loss": 1.4644,
"step": 745
},
{
"epoch": 0.9360100376411543,
"grad_norm": 31.480005264282227,
"learning_rate": 1.0674482050389457e-07,
"loss": 1.2402,
"step": 746
},
{
"epoch": 0.9372647427854455,
"grad_norm": 25.78557586669922,
"learning_rate": 1.0261402669615505e-07,
"loss": 1.3798,
"step": 747
},
{
"epoch": 0.9385194479297365,
"grad_norm": 41.28335189819336,
"learning_rate": 9.856391579976032e-08,
"loss": 1.3066,
"step": 748
},
{
"epoch": 0.9397741530740276,
"grad_norm": 42.25539779663086,
"learning_rate": 9.459455453902866e-08,
"loss": 1.3258,
"step": 749
},
{
"epoch": 0.9410288582183187,
"grad_norm": 39.32608413696289,
"learning_rate": 9.070600830795251e-08,
"loss": 1.3086,
"step": 750
},
{
"epoch": 0.9422835633626098,
"grad_norm": 30.92926025390625,
"learning_rate": 8.68983411691221e-08,
"loss": 1.1993,
"step": 751
},
{
"epoch": 0.9435382685069009,
"grad_norm": 33.10255813598633,
"learning_rate": 8.317161585266964e-08,
"loss": 1.1975,
"step": 752
},
{
"epoch": 0.944792973651192,
"grad_norm": 42.903900146484375,
"learning_rate": 7.952589375523567e-08,
"loss": 1.1978,
"step": 753
},
{
"epoch": 0.946047678795483,
"grad_norm": 28.121461868286133,
"learning_rate": 7.59612349389599e-08,
"loss": 1.222,
"step": 754
},
{
"epoch": 0.9473023839397742,
"grad_norm": 35.83945083618164,
"learning_rate": 7.247769813048644e-08,
"loss": 1.2472,
"step": 755
},
{
"epoch": 0.9485570890840652,
"grad_norm": 37.46073532104492,
"learning_rate": 6.907534072000177e-08,
"loss": 1.5055,
"step": 756
},
{
"epoch": 0.9498117942283564,
"grad_norm": 27.392000198364258,
"learning_rate": 6.575421876028721e-08,
"loss": 1.1948,
"step": 757
},
{
"epoch": 0.9510664993726474,
"grad_norm": 51.311744689941406,
"learning_rate": 6.251438696579293e-08,
"loss": 1.3754,
"step": 758
},
{
"epoch": 0.9523212045169385,
"grad_norm": 36.097373962402344,
"learning_rate": 5.935589871174208e-08,
"loss": 1.1822,
"step": 759
},
{
"epoch": 0.9535759096612296,
"grad_norm": 32.62606430053711,
"learning_rate": 5.627880603324532e-08,
"loss": 1.271,
"step": 760
},
{
"epoch": 0.9548306148055207,
"grad_norm": 37.016719818115234,
"learning_rate": 5.3283159624448745e-08,
"loss": 1.1878,
"step": 761
},
{
"epoch": 0.9560853199498118,
"grad_norm": 37.270118713378906,
"learning_rate": 5.0369008837696244e-08,
"loss": 1.2704,
"step": 762
},
{
"epoch": 0.9573400250941029,
"grad_norm": 37.098854064941406,
"learning_rate": 4.753640168271456e-08,
"loss": 1.2812,
"step": 763
},
{
"epoch": 0.958594730238394,
"grad_norm": 44.55942916870117,
"learning_rate": 4.478538482582617e-08,
"loss": 1.2366,
"step": 764
},
{
"epoch": 0.9598494353826851,
"grad_norm": 30.313488006591797,
"learning_rate": 4.211600358917989e-08,
"loss": 1.2731,
"step": 765
},
{
"epoch": 0.9611041405269761,
"grad_norm": 52.65010452270508,
"learning_rate": 3.9528301950000345e-08,
"loss": 1.4277,
"step": 766
},
{
"epoch": 0.9623588456712673,
"grad_norm": 34.024227142333984,
"learning_rate": 3.702232253986804e-08,
"loss": 1.3047,
"step": 767
},
{
"epoch": 0.9636135508155583,
"grad_norm": 49.82564163208008,
"learning_rate": 3.4598106644014863e-08,
"loss": 1.2943,
"step": 768
},
{
"epoch": 0.9648682559598495,
"grad_norm": 42.5301513671875,
"learning_rate": 3.2255694200643003e-08,
"loss": 1.3643,
"step": 769
},
{
"epoch": 0.9661229611041405,
"grad_norm": 36.81052017211914,
"learning_rate": 2.9995123800270476e-08,
"loss": 1.4252,
"step": 770
},
{
"epoch": 0.9673776662484316,
"grad_norm": 35.52188491821289,
"learning_rate": 2.7816432685091598e-08,
"loss": 1.35,
"step": 771
},
{
"epoch": 0.9686323713927227,
"grad_norm": 30.83523941040039,
"learning_rate": 2.5719656748364184e-08,
"loss": 1.2627,
"step": 772
},
{
"epoch": 0.9698870765370138,
"grad_norm": 29.04794692993164,
"learning_rate": 2.370483053382111e-08,
"loss": 1.2903,
"step": 773
},
{
"epoch": 0.9711417816813049,
"grad_norm": 36.21467208862305,
"learning_rate": 2.177198723509688e-08,
"loss": 1.3589,
"step": 774
},
{
"epoch": 0.972396486825596,
"grad_norm": 30.13644790649414,
"learning_rate": 1.992115869518474e-08,
"loss": 1.2922,
"step": 775
},
{
"epoch": 0.973651191969887,
"grad_norm": 50.431663513183594,
"learning_rate": 1.8152375405909305e-08,
"loss": 1.2573,
"step": 776
},
{
"epoch": 0.9749058971141782,
"grad_norm": 50.13302230834961,
"learning_rate": 1.6465666507425314e-08,
"loss": 1.4401,
"step": 777
},
{
"epoch": 0.9761606022584692,
"grad_norm": 46.383636474609375,
"learning_rate": 1.4861059787736886e-08,
"loss": 1.424,
"step": 778
},
{
"epoch": 0.9774153074027604,
"grad_norm": 34.33049011230469,
"learning_rate": 1.333858168224178e-08,
"loss": 1.2715,
"step": 779
},
{
"epoch": 0.9786700125470514,
"grad_norm": 42.03940963745117,
"learning_rate": 1.1898257273292857e-08,
"loss": 1.2918,
"step": 780
},
{
"epoch": 0.9799247176913425,
"grad_norm": 42.43777847290039,
"learning_rate": 1.0540110289786742e-08,
"loss": 1.5214,
"step": 781
},
{
"epoch": 0.9811794228356336,
"grad_norm": 31.801700592041016,
"learning_rate": 9.264163106774138e-09,
"loss": 1.2777,
"step": 782
},
{
"epoch": 0.9824341279799247,
"grad_norm": 49.655391693115234,
"learning_rate": 8.07043674508623e-09,
"loss": 1.2324,
"step": 783
},
{
"epoch": 0.9836888331242158,
"grad_norm": 37.17424011230469,
"learning_rate": 6.958950870994963e-09,
"loss": 1.2559,
"step": 784
},
{
"epoch": 0.9849435382685069,
"grad_norm": 33.83037567138672,
"learning_rate": 5.929723795884967e-09,
"loss": 1.2658,
"step": 785
},
{
"epoch": 0.986198243412798,
"grad_norm": 49.56622314453125,
"learning_rate": 4.982772475951026e-09,
"loss": 1.2301,
"step": 786
},
{
"epoch": 0.9874529485570891,
"grad_norm": 71.51993560791016,
"learning_rate": 4.1181125119221785e-09,
"loss": 1.4287,
"step": 787
},
{
"epoch": 0.9887076537013801,
"grad_norm": 37.62562942504883,
"learning_rate": 3.3357581488030476e-09,
"loss": 1.4585,
"step": 788
},
{
"epoch": 0.9899623588456713,
"grad_norm": 44.091552734375,
"learning_rate": 2.635722275638464e-09,
"loss": 1.5654,
"step": 789
},
{
"epoch": 0.9912170639899623,
"grad_norm": 67.96106719970703,
"learning_rate": 2.0180164253008614e-09,
"loss": 1.3665,
"step": 790
},
{
"epoch": 0.9924717691342535,
"grad_norm": 38.18610763549805,
"learning_rate": 1.4826507743032071e-09,
"loss": 1.1607,
"step": 791
},
{
"epoch": 0.9937264742785445,
"grad_norm": 36.40510940551758,
"learning_rate": 1.029634142627467e-09,
"loss": 1.2769,
"step": 792
},
{
"epoch": 0.9949811794228356,
"grad_norm": 34.85893630981445,
"learning_rate": 6.589739935819461e-10,
"loss": 1.3029,
"step": 793
},
{
"epoch": 0.9962358845671268,
"grad_norm": 36.350643157958984,
"learning_rate": 3.7067643367749707e-10,
"loss": 1.2861,
"step": 794
},
{
"epoch": 0.9974905897114178,
"grad_norm": 38.7654914855957,
"learning_rate": 1.6474621252704494e-10,
"loss": 1.1653,
"step": 795
},
{
"epoch": 0.998745294855709,
"grad_norm": 72.9517822265625,
"learning_rate": 4.118672276620661e-11,
"loss": 1.3579,
"step": 796
},
{
"epoch": 1.0,
"grad_norm": 33.991390228271484,
"learning_rate": 0.0,
"loss": 1.346,
"step": 797
}
],
"logging_steps": 1,
"max_steps": 797,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4921722755088384.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}