{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.337349397590361,
"eval_steps": 50,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007228915662650603,
"grad_norm": 3.171796668248988,
"learning_rate": 1.3333333333333334e-06,
"loss": 11.2137,
"step": 1
},
{
"epoch": 0.014457831325301205,
"grad_norm": 3.1707739710439076,
"learning_rate": 2.666666666666667e-06,
"loss": 11.2137,
"step": 2
},
{
"epoch": 0.021686746987951807,
"grad_norm": 3.2027941867217407,
"learning_rate": 4.000000000000001e-06,
"loss": 11.2128,
"step": 3
},
{
"epoch": 0.02891566265060241,
"grad_norm": 3.165506125068552,
"learning_rate": 5.333333333333334e-06,
"loss": 11.1895,
"step": 4
},
{
"epoch": 0.03614457831325301,
"grad_norm": 3.193452742356146,
"learning_rate": 6.666666666666667e-06,
"loss": 11.1119,
"step": 5
},
{
"epoch": 0.043373493975903614,
"grad_norm": 3.225997111904869,
"learning_rate": 8.000000000000001e-06,
"loss": 11.0683,
"step": 6
},
{
"epoch": 0.05060240963855422,
"grad_norm": 3.730074542647314,
"learning_rate": 9.333333333333334e-06,
"loss": 10.74,
"step": 7
},
{
"epoch": 0.05783132530120482,
"grad_norm": 4.035219081617531,
"learning_rate": 1.0666666666666667e-05,
"loss": 10.6055,
"step": 8
},
{
"epoch": 0.06506024096385542,
"grad_norm": 9.65258338136702,
"learning_rate": 1.2e-05,
"loss": 10.0028,
"step": 9
},
{
"epoch": 0.07228915662650602,
"grad_norm": 8.31667671170884,
"learning_rate": 1.3333333333333333e-05,
"loss": 9.8408,
"step": 10
},
{
"epoch": 0.07951807228915662,
"grad_norm": 4.991403421247793,
"learning_rate": 1.4666666666666666e-05,
"loss": 9.5849,
"step": 11
},
{
"epoch": 0.08674698795180723,
"grad_norm": 22.736881861641823,
"learning_rate": 1.6000000000000003e-05,
"loss": 10.3364,
"step": 12
},
{
"epoch": 0.09397590361445783,
"grad_norm": 6.702516595093613,
"learning_rate": 1.7333333333333336e-05,
"loss": 9.9409,
"step": 13
},
{
"epoch": 0.10120481927710843,
"grad_norm": 6.489246651194166,
"learning_rate": 1.866666666666667e-05,
"loss": 9.541,
"step": 14
},
{
"epoch": 0.10843373493975904,
"grad_norm": 6.0911333019039,
"learning_rate": 2e-05,
"loss": 8.9292,
"step": 15
},
{
"epoch": 0.11566265060240964,
"grad_norm": 19.10698802756055,
"learning_rate": 1.9999994461641828e-05,
"loss": 8.8768,
"step": 16
},
{
"epoch": 0.12289156626506025,
"grad_norm": 23.248522636875954,
"learning_rate": 1.9999977846573447e-05,
"loss": 9.075,
"step": 17
},
{
"epoch": 0.13012048192771083,
"grad_norm": 16.744432299141884,
"learning_rate": 1.9999950154813253e-05,
"loss": 8.8426,
"step": 18
},
{
"epoch": 0.13734939759036144,
"grad_norm": 8.068279939163723,
"learning_rate": 1.999991138639193e-05,
"loss": 8.464,
"step": 19
},
{
"epoch": 0.14457831325301204,
"grad_norm": 5.015885612913215,
"learning_rate": 1.9999861541352416e-05,
"loss": 8.3351,
"step": 20
},
{
"epoch": 0.15180722891566265,
"grad_norm": 8.22497767561685,
"learning_rate": 1.9999800619749922e-05,
"loss": 8.2269,
"step": 21
},
{
"epoch": 0.15903614457831325,
"grad_norm": 6.117420631713467,
"learning_rate": 1.9999728621651928e-05,
"loss": 8.0756,
"step": 22
},
{
"epoch": 0.16626506024096385,
"grad_norm": 12.16982281732316,
"learning_rate": 1.999964554713819e-05,
"loss": 8.1024,
"step": 23
},
{
"epoch": 0.17349397590361446,
"grad_norm": 7.456329076047914,
"learning_rate": 1.9999551396300722e-05,
"loss": 8.0019,
"step": 24
},
{
"epoch": 0.18072289156626506,
"grad_norm": 4.440613166100985,
"learning_rate": 1.9999446169243816e-05,
"loss": 7.8293,
"step": 25
},
{
"epoch": 0.18795180722891566,
"grad_norm": 6.566054149827827,
"learning_rate": 1.9999329866084025e-05,
"loss": 7.8169,
"step": 26
},
{
"epoch": 0.19518072289156627,
"grad_norm": 5.278515612282176,
"learning_rate": 1.9999202486950177e-05,
"loss": 7.7384,
"step": 27
},
{
"epoch": 0.20240963855421687,
"grad_norm": 3.95795043459971,
"learning_rate": 1.9999064031983366e-05,
"loss": 7.6547,
"step": 28
},
{
"epoch": 0.20963855421686747,
"grad_norm": 3.127753435885174,
"learning_rate": 1.9998914501336956e-05,
"loss": 7.5462,
"step": 29
},
{
"epoch": 0.21686746987951808,
"grad_norm": 4.438751294210153,
"learning_rate": 1.9998753895176576e-05,
"loss": 7.49,
"step": 30
},
{
"epoch": 0.22409638554216868,
"grad_norm": 4.273352272968494,
"learning_rate": 1.9998582213680123e-05,
"loss": 7.4468,
"step": 31
},
{
"epoch": 0.23132530120481928,
"grad_norm": 4.039956440849618,
"learning_rate": 1.999839945703777e-05,
"loss": 7.3883,
"step": 32
},
{
"epoch": 0.2385542168674699,
"grad_norm": 3.46365331288089,
"learning_rate": 1.9998205625451942e-05,
"loss": 7.3413,
"step": 33
},
{
"epoch": 0.2457831325301205,
"grad_norm": 3.761224558374574,
"learning_rate": 1.9998000719137353e-05,
"loss": 7.2896,
"step": 34
},
{
"epoch": 0.25301204819277107,
"grad_norm": 2.186432630643423,
"learning_rate": 1.999778473832096e-05,
"loss": 7.2291,
"step": 35
},
{
"epoch": 0.26024096385542167,
"grad_norm": 2.362614951337317,
"learning_rate": 1.9997557683242002e-05,
"loss": 7.2061,
"step": 36
},
{
"epoch": 0.2674698795180723,
"grad_norm": 2.98365102790646,
"learning_rate": 1.9997319554151988e-05,
"loss": 7.1366,
"step": 37
},
{
"epoch": 0.2746987951807229,
"grad_norm": 2.4721645367609306,
"learning_rate": 1.999707035131468e-05,
"loss": 7.1196,
"step": 38
},
{
"epoch": 0.2819277108433735,
"grad_norm": 2.5862742956183595,
"learning_rate": 1.9996810075006118e-05,
"loss": 7.078,
"step": 39
},
{
"epoch": 0.2891566265060241,
"grad_norm": 1.761307023407293,
"learning_rate": 1.9996538725514597e-05,
"loss": 7.0406,
"step": 40
},
{
"epoch": 0.2963855421686747,
"grad_norm": 2.000320378345402,
"learning_rate": 1.9996256303140688e-05,
"loss": 6.994,
"step": 41
},
{
"epoch": 0.3036144578313253,
"grad_norm": 1.746136060053765,
"learning_rate": 1.9995962808197217e-05,
"loss": 6.9823,
"step": 42
},
{
"epoch": 0.3108433734939759,
"grad_norm": 2.221450310426094,
"learning_rate": 1.9995658241009287e-05,
"loss": 6.9416,
"step": 43
},
{
"epoch": 0.3180722891566265,
"grad_norm": 2.1286116939303565,
"learning_rate": 1.9995342601914253e-05,
"loss": 6.8975,
"step": 44
},
{
"epoch": 0.3253012048192771,
"grad_norm": 3.9934233459602155,
"learning_rate": 1.999501589126174e-05,
"loss": 6.8784,
"step": 45
},
{
"epoch": 0.3325301204819277,
"grad_norm": 2.3348260466914788,
"learning_rate": 1.9994678109413638e-05,
"loss": 6.8568,
"step": 46
},
{
"epoch": 0.3397590361445783,
"grad_norm": 2.354485208468061,
"learning_rate": 1.9994329256744098e-05,
"loss": 6.8399,
"step": 47
},
{
"epoch": 0.3469879518072289,
"grad_norm": 2.1060678792501606,
"learning_rate": 1.9993969333639533e-05,
"loss": 6.8025,
"step": 48
},
{
"epoch": 0.3542168674698795,
"grad_norm": 1.968952895510157,
"learning_rate": 1.999359834049862e-05,
"loss": 6.7626,
"step": 49
},
{
"epoch": 0.3614457831325301,
"grad_norm": 3.1844501997779573,
"learning_rate": 1.9993216277732302e-05,
"loss": 6.7535,
"step": 50
},
{
"epoch": 0.3686746987951807,
"grad_norm": 1.4004142529003338,
"learning_rate": 1.999282314576377e-05,
"loss": 6.7236,
"step": 51
},
{
"epoch": 0.3759036144578313,
"grad_norm": 2.2511518742674075,
"learning_rate": 1.9992418945028494e-05,
"loss": 6.7025,
"step": 52
},
{
"epoch": 0.38313253012048193,
"grad_norm": 2.017984757576541,
"learning_rate": 1.999200367597419e-05,
"loss": 6.6952,
"step": 53
},
{
"epoch": 0.39036144578313253,
"grad_norm": 2.693725440825055,
"learning_rate": 1.9991577339060842e-05,
"loss": 6.6671,
"step": 54
},
{
"epoch": 0.39759036144578314,
"grad_norm": 2.336687191513332,
"learning_rate": 1.999113993476069e-05,
"loss": 6.6279,
"step": 55
},
{
"epoch": 0.40481927710843374,
"grad_norm": 1.626156137507855,
"learning_rate": 1.9990691463558235e-05,
"loss": 6.6054,
"step": 56
},
{
"epoch": 0.41204819277108434,
"grad_norm": 4.710443054060896,
"learning_rate": 1.999023192595024e-05,
"loss": 6.6323,
"step": 57
},
{
"epoch": 0.41927710843373495,
"grad_norm": 3.1631374303832787,
"learning_rate": 1.9989761322445714e-05,
"loss": 6.6045,
"step": 58
},
{
"epoch": 0.42650602409638555,
"grad_norm": 2.705558784445874,
"learning_rate": 1.9989279653565937e-05,
"loss": 6.5761,
"step": 59
},
{
"epoch": 0.43373493975903615,
"grad_norm": 2.4029383370840343,
"learning_rate": 1.9988786919844437e-05,
"loss": 6.5353,
"step": 60
},
{
"epoch": 0.44096385542168676,
"grad_norm": 4.839935196054632,
"learning_rate": 1.9988283121827004e-05,
"loss": 6.5426,
"step": 61
},
{
"epoch": 0.44819277108433736,
"grad_norm": 2.756332508287445,
"learning_rate": 1.9987768260071676e-05,
"loss": 6.5024,
"step": 62
},
{
"epoch": 0.45542168674698796,
"grad_norm": 3.1831689088661688,
"learning_rate": 1.9987242335148757e-05,
"loss": 6.4839,
"step": 63
},
{
"epoch": 0.46265060240963857,
"grad_norm": 2.4835416495638625,
"learning_rate": 1.9986705347640797e-05,
"loss": 6.4779,
"step": 64
},
{
"epoch": 0.46987951807228917,
"grad_norm": 3.5329316160565254,
"learning_rate": 1.9986157298142595e-05,
"loss": 6.4812,
"step": 65
},
{
"epoch": 0.4771084337349398,
"grad_norm": 5.2048663013264616,
"learning_rate": 1.998559818726122e-05,
"loss": 6.4407,
"step": 66
},
{
"epoch": 0.4843373493975904,
"grad_norm": 3.354789297443166,
"learning_rate": 1.9985028015615977e-05,
"loss": 6.403,
"step": 67
},
{
"epoch": 0.491566265060241,
"grad_norm": 2.157900918992127,
"learning_rate": 1.9984446783838432e-05,
"loss": 6.3751,
"step": 68
},
{
"epoch": 0.4987951807228916,
"grad_norm": 4.357115916572051,
"learning_rate": 1.9983854492572394e-05,
"loss": 6.4063,
"step": 69
},
{
"epoch": 0.5060240963855421,
"grad_norm": 2.201495820105759,
"learning_rate": 1.9983251142473935e-05,
"loss": 6.3544,
"step": 70
},
{
"epoch": 0.5132530120481927,
"grad_norm": 3.560124480900461,
"learning_rate": 1.9982636734211363e-05,
"loss": 6.3406,
"step": 71
},
{
"epoch": 0.5204819277108433,
"grad_norm": 3.566386015017448,
"learning_rate": 1.9982011268465246e-05,
"loss": 6.3305,
"step": 72
},
{
"epoch": 0.5277108433734939,
"grad_norm": 3.1131013450231375,
"learning_rate": 1.9981374745928388e-05,
"loss": 6.3004,
"step": 73
},
{
"epoch": 0.5349397590361445,
"grad_norm": 2.233793810037388,
"learning_rate": 1.998072716730585e-05,
"loss": 6.2761,
"step": 74
},
{
"epoch": 0.5421686746987951,
"grad_norm": 2.092535316219654,
"learning_rate": 1.9980068533314937e-05,
"loss": 6.2389,
"step": 75
},
{
"epoch": 0.5493975903614458,
"grad_norm": 2.45842268354128,
"learning_rate": 1.9979398844685197e-05,
"loss": 6.2151,
"step": 76
},
{
"epoch": 0.5566265060240964,
"grad_norm": 2.434137746198376,
"learning_rate": 1.9978718102158426e-05,
"loss": 6.2081,
"step": 77
},
{
"epoch": 0.563855421686747,
"grad_norm": 1.9402942665250358,
"learning_rate": 1.9978026306488665e-05,
"loss": 6.1746,
"step": 78
},
{
"epoch": 0.5710843373493976,
"grad_norm": 2.212680204205858,
"learning_rate": 1.99773234584422e-05,
"loss": 6.1698,
"step": 79
},
{
"epoch": 0.5783132530120482,
"grad_norm": 2.867414145507553,
"learning_rate": 1.9976609558797545e-05,
"loss": 6.1731,
"step": 80
},
{
"epoch": 0.5855421686746988,
"grad_norm": 1.6677690785418402,
"learning_rate": 1.9975884608345476e-05,
"loss": 6.131,
"step": 81
},
{
"epoch": 0.5927710843373494,
"grad_norm": 2.283628912711049,
"learning_rate": 1.9975148607888996e-05,
"loss": 6.0977,
"step": 82
},
{
"epoch": 0.6,
"grad_norm": 1.694395360167115,
"learning_rate": 1.9974401558243355e-05,
"loss": 6.0956,
"step": 83
},
{
"epoch": 0.6072289156626506,
"grad_norm": 2.1339705194766396,
"learning_rate": 1.9973643460236032e-05,
"loss": 6.108,
"step": 84
},
{
"epoch": 0.6144578313253012,
"grad_norm": 1.8551400305063734,
"learning_rate": 1.9972874314706755e-05,
"loss": 6.0656,
"step": 85
},
{
"epoch": 0.6216867469879518,
"grad_norm": 1.5313963607117285,
"learning_rate": 1.9972094122507485e-05,
"loss": 6.0186,
"step": 86
},
{
"epoch": 0.6289156626506024,
"grad_norm": 2.740724894685539,
"learning_rate": 1.997130288450242e-05,
"loss": 6.0288,
"step": 87
},
{
"epoch": 0.636144578313253,
"grad_norm": 2.0020638006423455,
"learning_rate": 1.997050060156799e-05,
"loss": 5.9922,
"step": 88
},
{
"epoch": 0.6433734939759036,
"grad_norm": 2.0686044074960948,
"learning_rate": 1.9969687274592863e-05,
"loss": 5.9676,
"step": 89
},
{
"epoch": 0.6506024096385542,
"grad_norm": 2.5446753330179055,
"learning_rate": 1.9968862904477936e-05,
"loss": 5.9557,
"step": 90
},
{
"epoch": 0.6578313253012048,
"grad_norm": 1.2820834374222696,
"learning_rate": 1.9968027492136337e-05,
"loss": 5.9011,
"step": 91
},
{
"epoch": 0.6650602409638554,
"grad_norm": 3.091054744667501,
"learning_rate": 1.9967181038493435e-05,
"loss": 5.9168,
"step": 92
},
{
"epoch": 0.672289156626506,
"grad_norm": 2.072802836305613,
"learning_rate": 1.9966323544486817e-05,
"loss": 5.8987,
"step": 93
},
{
"epoch": 0.6795180722891566,
"grad_norm": 1.946281703099331,
"learning_rate": 1.9965455011066313e-05,
"loss": 5.8606,
"step": 94
},
{
"epoch": 0.6867469879518072,
"grad_norm": 3.9574158066669143,
"learning_rate": 1.9964575439193966e-05,
"loss": 5.8466,
"step": 95
},
{
"epoch": 0.6939759036144578,
"grad_norm": 2.8374569157108525,
"learning_rate": 1.9963684829844053e-05,
"loss": 5.8101,
"step": 96
},
{
"epoch": 0.7012048192771084,
"grad_norm": 2.481663771923425,
"learning_rate": 1.996278318400308e-05,
"loss": 5.7891,
"step": 97
},
{
"epoch": 0.708433734939759,
"grad_norm": 2.628963140065436,
"learning_rate": 1.9961870502669772e-05,
"loss": 5.778,
"step": 98
},
{
"epoch": 0.7156626506024096,
"grad_norm": 1.7537754862303554,
"learning_rate": 1.996094678685508e-05,
"loss": 5.7059,
"step": 99
},
{
"epoch": 0.7228915662650602,
"grad_norm": 2.9054329151341802,
"learning_rate": 1.996001203758218e-05,
"loss": 5.7243,
"step": 100
},
{
"epoch": 0.7301204819277108,
"grad_norm": 2.335259221423995,
"learning_rate": 1.9959066255886465e-05,
"loss": 5.7074,
"step": 101
},
{
"epoch": 0.7373493975903614,
"grad_norm": 1.503090373097313,
"learning_rate": 1.9958109442815553e-05,
"loss": 5.6759,
"step": 102
},
{
"epoch": 0.744578313253012,
"grad_norm": 3.6022806083901573,
"learning_rate": 1.9957141599429277e-05,
"loss": 5.671,
"step": 103
},
{
"epoch": 0.7518072289156627,
"grad_norm": 2.0559454839399285,
"learning_rate": 1.995616272679969e-05,
"loss": 5.6682,
"step": 104
},
{
"epoch": 0.7590361445783133,
"grad_norm": 2.320827355274334,
"learning_rate": 1.995517282601106e-05,
"loss": 5.6525,
"step": 105
},
{
"epoch": 0.7662650602409639,
"grad_norm": 1.7559053764640538,
"learning_rate": 1.9954171898159876e-05,
"loss": 5.5938,
"step": 106
},
{
"epoch": 0.7734939759036145,
"grad_norm": 2.955031141899699,
"learning_rate": 1.9953159944354832e-05,
"loss": 5.5713,
"step": 107
},
{
"epoch": 0.7807228915662651,
"grad_norm": 1.6374927792829403,
"learning_rate": 1.9952136965716846e-05,
"loss": 5.5735,
"step": 108
},
{
"epoch": 0.7879518072289157,
"grad_norm": 2.291268667751327,
"learning_rate": 1.995110296337904e-05,
"loss": 5.5624,
"step": 109
},
{
"epoch": 0.7951807228915663,
"grad_norm": 1.889173374799646,
"learning_rate": 1.9950057938486745e-05,
"loss": 5.5267,
"step": 110
},
{
"epoch": 0.8024096385542169,
"grad_norm": 2.1937875261971054,
"learning_rate": 1.9949001892197514e-05,
"loss": 5.4858,
"step": 111
},
{
"epoch": 0.8096385542168675,
"grad_norm": 1.4888370843663468,
"learning_rate": 1.9947934825681092e-05,
"loss": 5.4787,
"step": 112
},
{
"epoch": 0.8168674698795181,
"grad_norm": 2.8633836455244195,
"learning_rate": 1.9946856740119442e-05,
"loss": 5.4862,
"step": 113
},
{
"epoch": 0.8240963855421687,
"grad_norm": 2.4482360353083337,
"learning_rate": 1.994576763670673e-05,
"loss": 5.4919,
"step": 114
},
{
"epoch": 0.8313253012048193,
"grad_norm": 1.7268002558296573,
"learning_rate": 1.994466751664932e-05,
"loss": 5.4446,
"step": 115
},
{
"epoch": 0.8385542168674699,
"grad_norm": 2.186838960349522,
"learning_rate": 1.9943556381165785e-05,
"loss": 5.4286,
"step": 116
},
{
"epoch": 0.8457831325301205,
"grad_norm": 1.7901714148433048,
"learning_rate": 1.99424342314869e-05,
"loss": 5.4139,
"step": 117
},
{
"epoch": 0.8530120481927711,
"grad_norm": 1.7585907215388294,
"learning_rate": 1.994130106885564e-05,
"loss": 5.3935,
"step": 118
},
{
"epoch": 0.8602409638554217,
"grad_norm": 1.8017706034631977,
"learning_rate": 1.9940156894527178e-05,
"loss": 5.3549,
"step": 119
},
{
"epoch": 0.8674698795180723,
"grad_norm": 1.851212819912402,
"learning_rate": 1.993900170976888e-05,
"loss": 5.3467,
"step": 120
},
{
"epoch": 0.8746987951807229,
"grad_norm": 1.5118926301562392,
"learning_rate": 1.993783551586031e-05,
"loss": 5.3375,
"step": 121
},
{
"epoch": 0.8819277108433735,
"grad_norm": 1.7153595550375194,
"learning_rate": 1.993665831409323e-05,
"loss": 5.2921,
"step": 122
},
{
"epoch": 0.8891566265060241,
"grad_norm": 2.074317448041623,
"learning_rate": 1.9935470105771596e-05,
"loss": 5.2889,
"step": 123
},
{
"epoch": 0.8963855421686747,
"grad_norm": 1.3244813318443867,
"learning_rate": 1.9934270892211548e-05,
"loss": 5.2638,
"step": 124
},
{
"epoch": 0.9036144578313253,
"grad_norm": 3.321959023030634,
"learning_rate": 1.9933060674741422e-05,
"loss": 5.3037,
"step": 125
},
{
"epoch": 0.9108433734939759,
"grad_norm": 2.0640131331656213,
"learning_rate": 1.9931839454701744e-05,
"loss": 5.289,
"step": 126
},
{
"epoch": 0.9180722891566265,
"grad_norm": 2.1456889879912757,
"learning_rate": 1.993060723344522e-05,
"loss": 5.2088,
"step": 127
},
{
"epoch": 0.9253012048192771,
"grad_norm": 2.0596746709249167,
"learning_rate": 1.992936401233675e-05,
"loss": 5.2139,
"step": 128
},
{
"epoch": 0.9325301204819277,
"grad_norm": 1.9248162588441524,
"learning_rate": 1.9928109792753417e-05,
"loss": 5.2246,
"step": 129
},
{
"epoch": 0.9397590361445783,
"grad_norm": 1.4891364526870876,
"learning_rate": 1.9926844576084483e-05,
"loss": 5.1677,
"step": 130
},
{
"epoch": 0.946987951807229,
"grad_norm": 1.5932598961826778,
"learning_rate": 1.9925568363731388e-05,
"loss": 5.1616,
"step": 131
},
{
"epoch": 0.9542168674698795,
"grad_norm": 1.7385768882199546,
"learning_rate": 1.9924281157107762e-05,
"loss": 5.1549,
"step": 132
},
{
"epoch": 0.9614457831325302,
"grad_norm": 1.6773180106543204,
"learning_rate": 1.99229829576394e-05,
"loss": 5.1407,
"step": 133
},
{
"epoch": 0.9686746987951808,
"grad_norm": 1.1426123121276215,
"learning_rate": 1.9921673766764292e-05,
"loss": 5.121,
"step": 134
},
{
"epoch": 0.9759036144578314,
"grad_norm": 2.474738495686583,
"learning_rate": 1.992035358593258e-05,
"loss": 5.1344,
"step": 135
},
{
"epoch": 0.983132530120482,
"grad_norm": 1.6451131370612986,
"learning_rate": 1.9919022416606596e-05,
"loss": 5.1402,
"step": 136
},
{
"epoch": 0.9903614457831326,
"grad_norm": 2.9204003742023374,
"learning_rate": 1.991768026026084e-05,
"loss": 5.1172,
"step": 137
},
{
"epoch": 0.9975903614457832,
"grad_norm": 2.1702223253759776,
"learning_rate": 1.991632711838198e-05,
"loss": 5.1175,
"step": 138
},
{
"epoch": 1.0048192771084337,
"grad_norm": 1.496695700102801,
"learning_rate": 1.9914962992468854e-05,
"loss": 5.0598,
"step": 139
},
{
"epoch": 1.0120481927710843,
"grad_norm": 3.3192304356055917,
"learning_rate": 1.991358788403246e-05,
"loss": 5.0917,
"step": 140
},
{
"epoch": 1.0192771084337349,
"grad_norm": 2.613171654160906,
"learning_rate": 1.991220179459597e-05,
"loss": 5.0876,
"step": 141
},
{
"epoch": 1.0265060240963855,
"grad_norm": 2.021270169187327,
"learning_rate": 1.991080472569472e-05,
"loss": 5.0376,
"step": 142
},
{
"epoch": 1.033734939759036,
"grad_norm": 1.7874754394384833,
"learning_rate": 1.9909396678876197e-05,
"loss": 5.0297,
"step": 143
},
{
"epoch": 1.0409638554216867,
"grad_norm": 2.1666942328948693,
"learning_rate": 1.9907977655700056e-05,
"loss": 5.021,
"step": 144
},
{
"epoch": 1.0481927710843373,
"grad_norm": 1.4974174294265452,
"learning_rate": 1.990654765773811e-05,
"loss": 4.9784,
"step": 145
},
{
"epoch": 1.0554216867469879,
"grad_norm": 2.4030846737304903,
"learning_rate": 1.990510668657433e-05,
"loss": 5.0143,
"step": 146
},
{
"epoch": 1.0626506024096385,
"grad_norm": 1.5847142291975158,
"learning_rate": 1.9903654743804833e-05,
"loss": 4.9717,
"step": 147
},
{
"epoch": 1.069879518072289,
"grad_norm": 2.2293042123928872,
"learning_rate": 1.9902191831037898e-05,
"loss": 4.975,
"step": 148
},
{
"epoch": 1.0771084337349397,
"grad_norm": 1.3107443058211161,
"learning_rate": 1.9900717949893952e-05,
"loss": 4.9554,
"step": 149
},
{
"epoch": 1.0843373493975903,
"grad_norm": 2.98633810723938,
"learning_rate": 1.9899233102005573e-05,
"loss": 4.9822,
"step": 150
},
{
"epoch": 1.091566265060241,
"grad_norm": 2.2485470075652567,
"learning_rate": 1.989773728901748e-05,
"loss": 4.9836,
"step": 151
},
{
"epoch": 1.0987951807228915,
"grad_norm": 1.6406575235273002,
"learning_rate": 1.9896230512586548e-05,
"loss": 4.9658,
"step": 152
},
{
"epoch": 1.106024096385542,
"grad_norm": 2.042584221681109,
"learning_rate": 1.9894712774381787e-05,
"loss": 4.9253,
"step": 153
},
{
"epoch": 1.1132530120481927,
"grad_norm": 2.027178425201193,
"learning_rate": 1.9893184076084356e-05,
"loss": 4.9186,
"step": 154
},
{
"epoch": 1.1204819277108433,
"grad_norm": 1.432938791816063,
"learning_rate": 1.9891644419387545e-05,
"loss": 4.915,
"step": 155
},
{
"epoch": 1.127710843373494,
"grad_norm": 1.8296615722980554,
"learning_rate": 1.989009380599679e-05,
"loss": 4.8821,
"step": 156
},
{
"epoch": 1.1349397590361445,
"grad_norm": 1.8064088509012493,
"learning_rate": 1.988853223762967e-05,
"loss": 4.9103,
"step": 157
},
{
"epoch": 1.1421686746987951,
"grad_norm": 1.6601625863719773,
"learning_rate": 1.988695971601588e-05,
"loss": 4.902,
"step": 158
},
{
"epoch": 1.1493975903614457,
"grad_norm": 1.4310915855021005,
"learning_rate": 1.9885376242897257e-05,
"loss": 4.8543,
"step": 159
},
{
"epoch": 1.1566265060240963,
"grad_norm": 1.6199867111977333,
"learning_rate": 1.9883781820027777e-05,
"loss": 4.8642,
"step": 160
},
{
"epoch": 1.163855421686747,
"grad_norm": 1.60087290545569,
"learning_rate": 1.988217644917353e-05,
"loss": 4.861,
"step": 161
},
{
"epoch": 1.1710843373493975,
"grad_norm": 1.4310791832649268,
"learning_rate": 1.9880560132112743e-05,
"loss": 4.8449,
"step": 162
},
{
"epoch": 1.1783132530120481,
"grad_norm": 1.5116431119020965,
"learning_rate": 1.9878932870635766e-05,
"loss": 4.8333,
"step": 163
},
{
"epoch": 1.1855421686746987,
"grad_norm": 1.3107772455570288,
"learning_rate": 1.9877294666545067e-05,
"loss": 4.8205,
"step": 164
},
{
"epoch": 1.1927710843373494,
"grad_norm": 1.590059839480383,
"learning_rate": 1.987564552165524e-05,
"loss": 4.7843,
"step": 165
},
{
"epoch": 1.2,
"grad_norm": 1.2940053194060677,
"learning_rate": 1.9873985437792992e-05,
"loss": 4.7775,
"step": 166
},
{
"epoch": 1.2072289156626506,
"grad_norm": 1.5649565551472995,
"learning_rate": 1.987231441679716e-05,
"loss": 4.7764,
"step": 167
},
{
"epoch": 1.2144578313253012,
"grad_norm": 1.364758540526031,
"learning_rate": 1.987063246051868e-05,
"loss": 4.7742,
"step": 168
},
{
"epoch": 1.2216867469879518,
"grad_norm": 1.9001350592348814,
"learning_rate": 1.9868939570820608e-05,
"loss": 4.7813,
"step": 169
},
{
"epoch": 1.2289156626506024,
"grad_norm": 1.4234469610031761,
"learning_rate": 1.9867235749578108e-05,
"loss": 4.7641,
"step": 170
},
{
"epoch": 1.236144578313253,
"grad_norm": 2.1776918892703225,
"learning_rate": 1.9865520998678458e-05,
"loss": 4.7818,
"step": 171
},
{
"epoch": 1.2433734939759036,
"grad_norm": 1.3890011845463408,
"learning_rate": 1.986379532002104e-05,
"loss": 4.7704,
"step": 172
},
{
"epoch": 1.2506024096385542,
"grad_norm": 2.0487165142274306,
"learning_rate": 1.9862058715517332e-05,
"loss": 4.7802,
"step": 173
},
{
"epoch": 1.2578313253012048,
"grad_norm": 1.2355025181764954,
"learning_rate": 1.9860311187090927e-05,
"loss": 4.7402,
"step": 174
},
{
"epoch": 1.2650602409638554,
"grad_norm": 1.9249359849876109,
"learning_rate": 1.9858552736677516e-05,
"loss": 4.7306,
"step": 175
},
{
"epoch": 1.272289156626506,
"grad_norm": 2.139696984414974,
"learning_rate": 1.985678336622488e-05,
"loss": 4.717,
"step": 176
},
{
"epoch": 1.2795180722891566,
"grad_norm": 1.0388000431735693,
"learning_rate": 1.9855003077692898e-05,
"loss": 4.6619,
"step": 177
},
{
"epoch": 1.2867469879518072,
"grad_norm": 1.5186233110203489,
"learning_rate": 1.9853211873053547e-05,
"loss": 4.6737,
"step": 178
},
{
"epoch": 1.2939759036144578,
"grad_norm": 0.9525342243672005,
"learning_rate": 1.98514097542909e-05,
"loss": 4.6861,
"step": 179
},
{
"epoch": 1.3012048192771084,
"grad_norm": 1.689627739569883,
"learning_rate": 1.984959672340111e-05,
"loss": 4.6936,
"step": 180
},
{
"epoch": 1.308433734939759,
"grad_norm": 1.7822859896216436,
"learning_rate": 1.984777278239241e-05,
"loss": 4.6881,
"step": 181
},
{
"epoch": 1.3156626506024096,
"grad_norm": 0.9953258914708404,
"learning_rate": 1.984593793328514e-05,
"loss": 4.6258,
"step": 182
},
{
"epoch": 1.3228915662650602,
"grad_norm": 2.5510924952211647,
"learning_rate": 1.9844092178111703e-05,
"loss": 4.6827,
"step": 183
},
{
"epoch": 1.3301204819277108,
"grad_norm": 1.4564971326104093,
"learning_rate": 1.9842235518916592e-05,
"loss": 4.675,
"step": 184
},
{
"epoch": 1.3373493975903614,
"grad_norm": 1.3183469685608156,
"learning_rate": 1.984036795775638e-05,
"loss": 4.6577,
"step": 185
},
{
"epoch": 1.344578313253012,
"grad_norm": 1.6076666014762455,
"learning_rate": 1.9838489496699703e-05,
"loss": 4.6344,
"step": 186
},
{
"epoch": 1.3518072289156626,
"grad_norm": 1.4334408175522528,
"learning_rate": 1.983660013782729e-05,
"loss": 4.6371,
"step": 187
},
{
"epoch": 1.3590361445783132,
"grad_norm": 1.0276575588288683,
"learning_rate": 1.983469988323192e-05,
"loss": 4.5858,
"step": 188
},
{
"epoch": 1.3662650602409638,
"grad_norm": 1.2524191668366011,
"learning_rate": 1.9832788735018452e-05,
"loss": 4.6238,
"step": 189
},
{
"epoch": 1.3734939759036144,
"grad_norm": 1.111924655995767,
"learning_rate": 1.9830866695303817e-05,
"loss": 4.6196,
"step": 190
},
{
"epoch": 1.380722891566265,
"grad_norm": 1.4587811465856404,
"learning_rate": 1.9828933766216998e-05,
"loss": 4.6152,
"step": 191
},
{
"epoch": 1.3879518072289156,
"grad_norm": 1.3517246312720717,
"learning_rate": 1.982698994989905e-05,
"loss": 4.5879,
"step": 192
},
{
"epoch": 1.3951807228915662,
"grad_norm": 1.0781361858523884,
"learning_rate": 1.982503524850308e-05,
"loss": 4.5935,
"step": 193
},
{
"epoch": 1.4024096385542169,
"grad_norm": 1.3983940494845462,
"learning_rate": 1.982306966419426e-05,
"loss": 4.567,
"step": 194
},
{
"epoch": 1.4096385542168675,
"grad_norm": 1.0951708644481744,
"learning_rate": 1.9821093199149806e-05,
"loss": 4.5285,
"step": 195
},
{
"epoch": 1.416867469879518,
"grad_norm": 1.5337453436245976,
"learning_rate": 1.9819105855558996e-05,
"loss": 4.5419,
"step": 196
},
{
"epoch": 1.4240963855421687,
"grad_norm": 1.0266826762343255,
"learning_rate": 1.981710763562315e-05,
"loss": 4.5681,
"step": 197
},
{
"epoch": 1.4313253012048193,
"grad_norm": 1.7564922932399614,
"learning_rate": 1.9815098541555645e-05,
"loss": 4.5582,
"step": 198
},
{
"epoch": 1.4385542168674699,
"grad_norm": 1.1072497618442172,
"learning_rate": 1.9813078575581898e-05,
"loss": 4.5167,
"step": 199
},
{
"epoch": 1.4457831325301205,
"grad_norm": 1.566828226827763,
"learning_rate": 1.981104773993936e-05,
"loss": 4.5037,
"step": 200
},
{
"epoch": 1.453012048192771,
"grad_norm": 1.2269950037429016,
"learning_rate": 1.9809006036877537e-05,
"loss": 4.5214,
"step": 201
},
{
"epoch": 1.4602409638554217,
"grad_norm": 1.2192495763855802,
"learning_rate": 1.9806953468657966e-05,
"loss": 4.5067,
"step": 202
},
{
"epoch": 1.4674698795180723,
"grad_norm": 1.0133073397884125,
"learning_rate": 1.9804890037554215e-05,
"loss": 4.5097,
"step": 203
},
{
"epoch": 1.4746987951807229,
"grad_norm": 1.1081042205578755,
"learning_rate": 1.9802815745851888e-05,
"loss": 4.4802,
"step": 204
},
{
"epoch": 1.4819277108433735,
"grad_norm": 1.052790913749799,
"learning_rate": 1.980073059584862e-05,
"loss": 4.4943,
"step": 205
},
{
"epoch": 1.489156626506024,
"grad_norm": 1.0752814528281058,
"learning_rate": 1.9798634589854073e-05,
"loss": 4.4714,
"step": 206
},
{
"epoch": 1.4963855421686747,
"grad_norm": 1.4338817481695876,
"learning_rate": 1.9796527730189936e-05,
"loss": 4.4653,
"step": 207
},
{
"epoch": 1.5036144578313253,
"grad_norm": 0.9863500898479299,
"learning_rate": 1.9794410019189913e-05,
"loss": 4.4539,
"step": 208
},
{
"epoch": 1.510843373493976,
"grad_norm": 1.2842052237396306,
"learning_rate": 1.9792281459199737e-05,
"loss": 4.4653,
"step": 209
},
{
"epoch": 1.5180722891566265,
"grad_norm": 1.4247627004514936,
"learning_rate": 1.9790142052577148e-05,
"loss": 4.4411,
"step": 210
},
{
"epoch": 1.5253012048192771,
"grad_norm": 1.2800317446823677,
"learning_rate": 1.9787991801691907e-05,
"loss": 4.43,
"step": 211
},
{
"epoch": 1.5325301204819277,
"grad_norm": 1.288596540985055,
"learning_rate": 1.9785830708925792e-05,
"loss": 4.4493,
"step": 212
},
{
"epoch": 1.5397590361445783,
"grad_norm": 1.2727314825477154,
"learning_rate": 1.978365877667258e-05,
"loss": 4.4233,
"step": 213
},
{
"epoch": 1.546987951807229,
"grad_norm": 1.0980274594078798,
"learning_rate": 1.9781476007338058e-05,
"loss": 4.3968,
"step": 214
},
{
"epoch": 1.5542168674698795,
"grad_norm": 1.2067021264618283,
"learning_rate": 1.977928240334002e-05,
"loss": 4.4214,
"step": 215
},
{
"epoch": 1.5614457831325301,
"grad_norm": 1.2126474757588954,
"learning_rate": 1.9777077967108254e-05,
"loss": 4.4098,
"step": 216
},
{
"epoch": 1.5686746987951807,
"grad_norm": 1.4690386666975572,
"learning_rate": 1.9774862701084555e-05,
"loss": 4.4268,
"step": 217
},
{
"epoch": 1.5759036144578313,
"grad_norm": 1.0488849882671147,
"learning_rate": 1.9772636607722714e-05,
"loss": 4.3894,
"step": 218
},
{
"epoch": 1.583132530120482,
"grad_norm": 1.630809721223867,
"learning_rate": 1.9770399689488506e-05,
"loss": 4.4133,
"step": 219
},
{
"epoch": 1.5903614457831325,
"grad_norm": 1.113109729713893,
"learning_rate": 1.9768151948859705e-05,
"loss": 4.3772,
"step": 220
},
{
"epoch": 1.5975903614457831,
"grad_norm": 1.5423275320082017,
"learning_rate": 1.9765893388326064e-05,
"loss": 4.3785,
"step": 221
},
{
"epoch": 1.6048192771084338,
"grad_norm": 1.0807052768227303,
"learning_rate": 1.9763624010389334e-05,
"loss": 4.3694,
"step": 222
},
{
"epoch": 1.6120481927710844,
"grad_norm": 1.235992463133922,
"learning_rate": 1.9761343817563235e-05,
"loss": 4.359,
"step": 223
},
{
"epoch": 1.619277108433735,
"grad_norm": 1.5186454860312188,
"learning_rate": 1.9759052812373478e-05,
"loss": 4.3792,
"step": 224
},
{
"epoch": 1.6265060240963856,
"grad_norm": 1.2019163259646208,
"learning_rate": 1.9756750997357738e-05,
"loss": 4.3688,
"step": 225
},
{
"epoch": 1.6337349397590362,
"grad_norm": 1.7333794728755505,
"learning_rate": 1.9754438375065673e-05,
"loss": 4.3512,
"step": 226
},
{
"epoch": 1.6409638554216868,
"grad_norm": 1.1407460567282055,
"learning_rate": 1.9752114948058908e-05,
"loss": 4.3802,
"step": 227
},
{
"epoch": 1.6481927710843374,
"grad_norm": 1.6157840392044631,
"learning_rate": 1.974978071891104e-05,
"loss": 4.3511,
"step": 228
},
{
"epoch": 1.655421686746988,
"grad_norm": 1.0476544634861773,
"learning_rate": 1.9747435690207625e-05,
"loss": 4.3221,
"step": 229
},
{
"epoch": 1.6626506024096386,
"grad_norm": 1.697500843790948,
"learning_rate": 1.9745079864546184e-05,
"loss": 4.3312,
"step": 230
},
{
"epoch": 1.6698795180722892,
"grad_norm": 1.3904195390686156,
"learning_rate": 1.9742713244536205e-05,
"loss": 4.2968,
"step": 231
},
{
"epoch": 1.6771084337349398,
"grad_norm": 1.5290382901012387,
"learning_rate": 1.9740335832799118e-05,
"loss": 4.319,
"step": 232
},
{
"epoch": 1.6843373493975904,
"grad_norm": 1.279841205474429,
"learning_rate": 1.973794763196832e-05,
"loss": 4.3077,
"step": 233
},
{
"epoch": 1.691566265060241,
"grad_norm": 1.3872098276085976,
"learning_rate": 1.973554864468915e-05,
"loss": 4.3165,
"step": 234
},
{
"epoch": 1.6987951807228916,
"grad_norm": 1.2522587215309555,
"learning_rate": 1.97331388736189e-05,
"loss": 4.2679,
"step": 235
},
{
"epoch": 1.7060240963855422,
"grad_norm": 1.3360665273295562,
"learning_rate": 1.9730718321426804e-05,
"loss": 4.306,
"step": 236
},
{
"epoch": 1.7132530120481928,
"grad_norm": 1.6653558400826634,
"learning_rate": 1.972828699079404e-05,
"loss": 4.2908,
"step": 237
},
{
"epoch": 1.7204819277108434,
"grad_norm": 1.057771248044181,
"learning_rate": 1.972584488441372e-05,
"loss": 4.2358,
"step": 238
},
{
"epoch": 1.727710843373494,
"grad_norm": 1.690770002741558,
"learning_rate": 1.9723392004990904e-05,
"loss": 4.2696,
"step": 239
},
{
"epoch": 1.7349397590361446,
"grad_norm": 1.3303504757518965,
"learning_rate": 1.972092835524257e-05,
"loss": 4.2581,
"step": 240
},
{
"epoch": 1.7421686746987952,
"grad_norm": 1.5896442171678875,
"learning_rate": 1.9718453937897637e-05,
"loss": 4.2744,
"step": 241
},
{
"epoch": 1.7493975903614458,
"grad_norm": 1.2953058262004782,
"learning_rate": 1.9715968755696946e-05,
"loss": 4.2906,
"step": 242
},
{
"epoch": 1.7566265060240964,
"grad_norm": 1.1295293666533506,
"learning_rate": 1.971347281139326e-05,
"loss": 4.2681,
"step": 243
},
{
"epoch": 1.763855421686747,
"grad_norm": 1.7488116684794741,
"learning_rate": 1.971096610775127e-05,
"loss": 4.2374,
"step": 244
},
{
"epoch": 1.7710843373493976,
"grad_norm": 0.9787269896385271,
"learning_rate": 1.9708448647547575e-05,
"loss": 4.2927,
"step": 245
},
{
"epoch": 1.7783132530120482,
"grad_norm": 1.9264724538555262,
"learning_rate": 1.9705920433570696e-05,
"loss": 4.2781,
"step": 246
},
{
"epoch": 1.7855421686746988,
"grad_norm": 1.1677065487324365,
"learning_rate": 1.970338146862107e-05,
"loss": 4.2401,
"step": 247
},
{
"epoch": 1.7927710843373494,
"grad_norm": 1.5709729392003717,
"learning_rate": 1.9700831755511028e-05,
"loss": 4.2468,
"step": 248
},
{
"epoch": 1.8,
"grad_norm": 0.971212117208314,
"learning_rate": 1.969827129706482e-05,
"loss": 4.2163,
"step": 249
},
{
"epoch": 1.8072289156626506,
"grad_norm": 1.4724427784286502,
"learning_rate": 1.9695700096118594e-05,
"loss": 4.2263,
"step": 250
},
{
"epoch": 1.8144578313253013,
"grad_norm": 1.2968473120262307,
"learning_rate": 1.9693118155520394e-05,
"loss": 4.2356,
"step": 251
},
{
"epoch": 1.8216867469879519,
"grad_norm": 1.3911380380341898,
"learning_rate": 1.9690525478130162e-05,
"loss": 4.2298,
"step": 252
},
{
"epoch": 1.8289156626506025,
"grad_norm": 0.8098318647581148,
"learning_rate": 1.9687922066819734e-05,
"loss": 4.1948,
"step": 253
},
{
"epoch": 1.836144578313253,
"grad_norm": 1.1659829550163594,
"learning_rate": 1.9685307924472835e-05,
"loss": 4.1959,
"step": 254
},
{
"epoch": 1.8433734939759037,
"grad_norm": 1.1602925584411976,
"learning_rate": 1.9682683053985073e-05,
"loss": 4.1684,
"step": 255
},
{
"epoch": 1.8506024096385543,
"grad_norm": 1.643102304487686,
"learning_rate": 1.968004745826395e-05,
"loss": 4.2193,
"step": 256
},
{
"epoch": 1.8578313253012049,
"grad_norm": 1.0061759340188303,
"learning_rate": 1.9677401140228834e-05,
"loss": 4.1585,
"step": 257
},
{
"epoch": 1.8650602409638555,
"grad_norm": 1.4204336218376243,
"learning_rate": 1.967474410281098e-05,
"loss": 4.2211,
"step": 258
},
{
"epoch": 1.872289156626506,
"grad_norm": 1.2381003435701177,
"learning_rate": 1.9672076348953507e-05,
"loss": 4.164,
"step": 259
},
{
"epoch": 1.8795180722891565,
"grad_norm": 1.3142604064597698,
"learning_rate": 1.966939788161142e-05,
"loss": 4.1279,
"step": 260
},
{
"epoch": 1.886746987951807,
"grad_norm": 1.5490808076132756,
"learning_rate": 1.9666708703751576e-05,
"loss": 4.1989,
"step": 261
},
{
"epoch": 1.8939759036144577,
"grad_norm": 1.6637523057931343,
"learning_rate": 1.96640088183527e-05,
"loss": 4.1386,
"step": 262
},
{
"epoch": 1.9012048192771083,
"grad_norm": 1.1845491376199697,
"learning_rate": 1.9661298228405377e-05,
"loss": 4.1487,
"step": 263
},
{
"epoch": 1.9084337349397589,
"grad_norm": 1.263908380935535,
"learning_rate": 1.9658576936912057e-05,
"loss": 4.1302,
"step": 264
},
{
"epoch": 1.9156626506024095,
"grad_norm": 1.289931147920118,
"learning_rate": 1.9655844946887035e-05,
"loss": 4.1303,
"step": 265
},
{
"epoch": 1.92289156626506,
"grad_norm": 1.0861244271331052,
"learning_rate": 1.9653102261356456e-05,
"loss": 4.1042,
"step": 266
},
{
"epoch": 1.9301204819277107,
"grad_norm": 1.4855987153795662,
"learning_rate": 1.9650348883358313e-05,
"loss": 4.1239,
"step": 267
},
{
"epoch": 1.9373493975903613,
"grad_norm": 1.1356728975830408,
"learning_rate": 1.9647584815942455e-05,
"loss": 4.1402,
"step": 268
},
{
"epoch": 1.944578313253012,
"grad_norm": 1.4987833232332588,
"learning_rate": 1.964481006217055e-05,
"loss": 4.1223,
"step": 269
},
{
"epoch": 1.9518072289156625,
"grad_norm": 1.3698683946332046,
"learning_rate": 1.9642024625116117e-05,
"loss": 4.0846,
"step": 270
},
{
"epoch": 1.959036144578313,
"grad_norm": 1.4713817649962406,
"learning_rate": 1.963922850786451e-05,
"loss": 4.0943,
"step": 271
},
{
"epoch": 1.9662650602409637,
"grad_norm": 1.6158687103746316,
"learning_rate": 1.963642171351291e-05,
"loss": 4.1092,
"step": 272
},
{
"epoch": 1.9734939759036143,
"grad_norm": 1.3112900282213475,
"learning_rate": 1.963360424517031e-05,
"loss": 4.1197,
"step": 273
},
{
"epoch": 1.980722891566265,
"grad_norm": 1.3165917322818186,
"learning_rate": 1.9630776105957553e-05,
"loss": 4.0985,
"step": 274
},
{
"epoch": 1.9879518072289155,
"grad_norm": 1.2679890312035356,
"learning_rate": 1.9627937299007286e-05,
"loss": 4.0918,
"step": 275
},
{
"epoch": 1.9951807228915661,
"grad_norm": 1.3006507811283787,
"learning_rate": 1.962508782746397e-05,
"loss": 4.1095,
"step": 276
},
{
"epoch": 2.0024096385542167,
"grad_norm": 1.4124431865354987,
"learning_rate": 1.9622227694483893e-05,
"loss": 4.0901,
"step": 277
},
{
"epoch": 2.0096385542168673,
"grad_norm": 1.2819799804016005,
"learning_rate": 1.9619356903235132e-05,
"loss": 4.047,
"step": 278
},
{
"epoch": 2.016867469879518,
"grad_norm": 1.5597905663515192,
"learning_rate": 1.961647545689759e-05,
"loss": 4.0477,
"step": 279
},
{
"epoch": 2.0240963855421685,
"grad_norm": 0.9423133582923033,
"learning_rate": 1.961358335866296e-05,
"loss": 4.0491,
"step": 280
},
{
"epoch": 2.031325301204819,
"grad_norm": 1.8743296089578012,
"learning_rate": 1.9610680611734737e-05,
"loss": 4.0293,
"step": 281
},
{
"epoch": 2.0385542168674697,
"grad_norm": 0.9891080430136323,
"learning_rate": 1.960776721932821e-05,
"loss": 4.0302,
"step": 282
},
{
"epoch": 2.0457831325301203,
"grad_norm": 2.1075583542099157,
"learning_rate": 1.9604843184670462e-05,
"loss": 4.0533,
"step": 283
},
{
"epoch": 2.053012048192771,
"grad_norm": 1.3836722891539663,
"learning_rate": 1.9601908511000362e-05,
"loss": 4.0804,
"step": 284
},
{
"epoch": 2.0602409638554215,
"grad_norm": 1.4934238633799322,
"learning_rate": 1.959896320156857e-05,
"loss": 4.0569,
"step": 285
},
{
"epoch": 2.067469879518072,
"grad_norm": 1.1757415076562812,
"learning_rate": 1.959600725963752e-05,
"loss": 3.9952,
"step": 286
},
{
"epoch": 2.0746987951807228,
"grad_norm": 1.5693536638143577,
"learning_rate": 1.959304068848142e-05,
"loss": 4.0489,
"step": 287
},
{
"epoch": 2.0819277108433734,
"grad_norm": 1.224291668036312,
"learning_rate": 1.9590063491386262e-05,
"loss": 4.0471,
"step": 288
},
{
"epoch": 2.089156626506024,
"grad_norm": 1.1290270781594147,
"learning_rate": 1.95870756716498e-05,
"loss": 4.0212,
"step": 289
},
{
"epoch": 2.0963855421686746,
"grad_norm": 1.894574752880013,
"learning_rate": 1.958407723258156e-05,
"loss": 4.0215,
"step": 290
},
{
"epoch": 2.103614457831325,
"grad_norm": 1.4980896883131283,
"learning_rate": 1.9581068177502826e-05,
"loss": 4.014,
"step": 291
},
{
"epoch": 2.1108433734939758,
"grad_norm": 1.4071538350431236,
"learning_rate": 1.9578048509746643e-05,
"loss": 3.9862,
"step": 292
},
{
"epoch": 2.1180722891566264,
"grad_norm": 1.3369928398838042,
"learning_rate": 1.9575018232657806e-05,
"loss": 4.0087,
"step": 293
},
{
"epoch": 2.125301204819277,
"grad_norm": 1.3725504314220875,
"learning_rate": 1.9571977349592878e-05,
"loss": 3.9819,
"step": 294
},
{
"epoch": 2.1325301204819276,
"grad_norm": 1.074883668514762,
"learning_rate": 1.9568925863920155e-05,
"loss": 3.9878,
"step": 295
},
{
"epoch": 2.139759036144578,
"grad_norm": 1.4617619796119823,
"learning_rate": 1.9565863779019675e-05,
"loss": 4.0011,
"step": 296
},
{
"epoch": 2.146987951807229,
"grad_norm": 1.295064754737287,
"learning_rate": 1.9562791098283226e-05,
"loss": 3.9857,
"step": 297
},
{
"epoch": 2.1542168674698794,
"grad_norm": 1.7198029396122985,
"learning_rate": 1.9559707825114334e-05,
"loss": 3.995,
"step": 298
},
{
"epoch": 2.16144578313253,
"grad_norm": 1.1877879145202288,
"learning_rate": 1.9556613962928244e-05,
"loss": 3.9518,
"step": 299
},
{
"epoch": 2.1686746987951806,
"grad_norm": 1.4041536368701515,
"learning_rate": 1.955350951515195e-05,
"loss": 3.9694,
"step": 300
},
{
"epoch": 2.175903614457831,
"grad_norm": 1.4632978460765673,
"learning_rate": 1.9550394485224154e-05,
"loss": 4.0008,
"step": 301
},
{
"epoch": 2.183132530120482,
"grad_norm": 1.179001588622081,
"learning_rate": 1.9547268876595286e-05,
"loss": 3.9742,
"step": 302
},
{
"epoch": 2.1903614457831324,
"grad_norm": 1.3424766923175933,
"learning_rate": 1.9544132692727497e-05,
"loss": 3.9582,
"step": 303
},
{
"epoch": 2.197590361445783,
"grad_norm": 1.44537054552251,
"learning_rate": 1.954098593709465e-05,
"loss": 3.9669,
"step": 304
},
{
"epoch": 2.2048192771084336,
"grad_norm": 1.2894995735637325,
"learning_rate": 1.9537828613182314e-05,
"loss": 3.9352,
"step": 305
},
{
"epoch": 2.212048192771084,
"grad_norm": 1.2191740632517898,
"learning_rate": 1.9534660724487766e-05,
"loss": 3.9462,
"step": 306
},
{
"epoch": 2.219277108433735,
"grad_norm": 1.2922608641871975,
"learning_rate": 1.953148227451999e-05,
"loss": 3.968,
"step": 307
},
{
"epoch": 2.2265060240963854,
"grad_norm": 1.4031391626334404,
"learning_rate": 1.952829326679966e-05,
"loss": 3.9203,
"step": 308
},
{
"epoch": 2.233734939759036,
"grad_norm": 1.3797106514946174,
"learning_rate": 1.9525093704859156e-05,
"loss": 3.9411,
"step": 309
},
{
"epoch": 2.2409638554216866,
"grad_norm": 1.2240686755095442,
"learning_rate": 1.9521883592242537e-05,
"loss": 3.8945,
"step": 310
},
{
"epoch": 2.2481927710843372,
"grad_norm": 1.359674903926122,
"learning_rate": 1.9518662932505554e-05,
"loss": 3.9592,
"step": 311
},
{
"epoch": 2.255421686746988,
"grad_norm": 0.8861545685501412,
"learning_rate": 1.9515431729215642e-05,
"loss": 3.9198,
"step": 312
},
{
"epoch": 2.2626506024096384,
"grad_norm": 1.486723607802486,
"learning_rate": 1.951218998595192e-05,
"loss": 3.9507,
"step": 313
},
{
"epoch": 2.269879518072289,
"grad_norm": 1.313871908419903,
"learning_rate": 1.950893770630516e-05,
"loss": 3.9217,
"step": 314
},
{
"epoch": 2.2771084337349397,
"grad_norm": 0.8836336304647858,
"learning_rate": 1.950567489387783e-05,
"loss": 3.9043,
"step": 315
},
{
"epoch": 2.2843373493975903,
"grad_norm": 1.1630419252591093,
"learning_rate": 1.9502401552284053e-05,
"loss": 3.9174,
"step": 316
},
{
"epoch": 2.291566265060241,
"grad_norm": 1.6191540917865768,
"learning_rate": 1.9499117685149617e-05,
"loss": 3.9032,
"step": 317
},
{
"epoch": 2.2987951807228915,
"grad_norm": 1.046430539809095,
"learning_rate": 1.9495823296111968e-05,
"loss": 3.9224,
"step": 318
},
{
"epoch": 2.306024096385542,
"grad_norm": 2.0316286942343846,
"learning_rate": 1.9492518388820204e-05,
"loss": 3.935,
"step": 319
},
{
"epoch": 2.3132530120481927,
"grad_norm": 0.7503307570466232,
"learning_rate": 1.9489202966935084e-05,
"loss": 3.8877,
"step": 320
},
{
"epoch": 2.3204819277108433,
"grad_norm": 2.195779504301335,
"learning_rate": 1.9485877034129e-05,
"loss": 3.9413,
"step": 321
},
{
"epoch": 2.327710843373494,
"grad_norm": 0.9316398409981106,
"learning_rate": 1.9482540594086e-05,
"loss": 3.8999,
"step": 322
},
{
"epoch": 2.3349397590361445,
"grad_norm": 1.869511574600292,
"learning_rate": 1.9479193650501756e-05,
"loss": 3.9171,
"step": 323
},
{
"epoch": 2.342168674698795,
"grad_norm": 1.1547734133448195,
"learning_rate": 1.947583620708359e-05,
"loss": 3.9078,
"step": 324
},
{
"epoch": 2.3493975903614457,
"grad_norm": 1.7976523987062796,
"learning_rate": 1.947246826755044e-05,
"loss": 3.9188,
"step": 325
},
{
"epoch": 2.3566265060240963,
"grad_norm": 1.3943533699571777,
"learning_rate": 1.9469089835632883e-05,
"loss": 3.8771,
"step": 326
},
{
"epoch": 2.363855421686747,
"grad_norm": 1.326917276516545,
"learning_rate": 1.946570091507311e-05,
"loss": 3.8827,
"step": 327
},
{
"epoch": 2.3710843373493975,
"grad_norm": 1.2430788981224146,
"learning_rate": 1.9462301509624933e-05,
"loss": 3.8734,
"step": 328
},
{
"epoch": 2.378313253012048,
"grad_norm": 1.5379553907088528,
"learning_rate": 1.9458891623053776e-05,
"loss": 3.9014,
"step": 329
},
{
"epoch": 2.3855421686746987,
"grad_norm": 1.2479691146217833,
"learning_rate": 1.945547125913667e-05,
"loss": 3.9109,
"step": 330
},
{
"epoch": 2.3927710843373493,
"grad_norm": 1.2398397651517286,
"learning_rate": 1.9452040421662263e-05,
"loss": 3.8697,
"step": 331
},
{
"epoch": 2.4,
"grad_norm": 1.1754621111943704,
"learning_rate": 1.9448599114430793e-05,
"loss": 3.8795,
"step": 332
},
{
"epoch": 2.4072289156626505,
"grad_norm": 1.3452310894880555,
"learning_rate": 1.9445147341254094e-05,
"loss": 3.8632,
"step": 333
},
{
"epoch": 2.414457831325301,
"grad_norm": 1.3656366008762484,
"learning_rate": 1.9441685105955605e-05,
"loss": 3.8541,
"step": 334
},
{
"epoch": 2.4216867469879517,
"grad_norm": 1.033093624385257,
"learning_rate": 1.943821241237034e-05,
"loss": 3.8198,
"step": 335
},
{
"epoch": 2.4289156626506023,
"grad_norm": 1.290188236529091,
"learning_rate": 1.94347292643449e-05,
"loss": 3.8341,
"step": 336
},
{
"epoch": 2.436144578313253,
"grad_norm": 1.2189948090841016,
"learning_rate": 1.943123566573748e-05,
"loss": 3.8599,
"step": 337
},
{
"epoch": 2.4433734939759035,
"grad_norm": 1.3522120356287177,
"learning_rate": 1.9427731620417832e-05,
"loss": 3.8676,
"step": 338
},
{
"epoch": 2.450602409638554,
"grad_norm": 1.293234164144659,
"learning_rate": 1.9424217132267287e-05,
"loss": 3.8229,
"step": 339
},
{
"epoch": 2.4578313253012047,
"grad_norm": 1.2407110746826315,
"learning_rate": 1.9420692205178753e-05,
"loss": 3.8536,
"step": 340
},
{
"epoch": 2.4650602409638553,
"grad_norm": 1.1831514740273819,
"learning_rate": 1.941715684305668e-05,
"loss": 3.8319,
"step": 341
},
{
"epoch": 2.472289156626506,
"grad_norm": 1.4909103135486783,
"learning_rate": 1.9413611049817098e-05,
"loss": 3.8408,
"step": 342
},
{
"epoch": 2.4795180722891565,
"grad_norm": 1.4697296716221555,
"learning_rate": 1.9410054829387577e-05,
"loss": 3.8168,
"step": 343
},
{
"epoch": 2.486746987951807,
"grad_norm": 1.3134233365354873,
"learning_rate": 1.940648818570724e-05,
"loss": 3.8229,
"step": 344
},
{
"epoch": 2.4939759036144578,
"grad_norm": 1.2481322084400526,
"learning_rate": 1.9402911122726756e-05,
"loss": 3.8271,
"step": 345
},
{
"epoch": 2.5012048192771084,
"grad_norm": 1.2353282699128232,
"learning_rate": 1.9399323644408343e-05,
"loss": 3.8442,
"step": 346
},
{
"epoch": 2.508433734939759,
"grad_norm": 1.3879510352014255,
"learning_rate": 1.9395725754725743e-05,
"loss": 3.8101,
"step": 347
},
{
"epoch": 2.5156626506024096,
"grad_norm": 1.2661019202378452,
"learning_rate": 1.939211745766424e-05,
"loss": 3.8343,
"step": 348
},
{
"epoch": 2.52289156626506,
"grad_norm": 1.4094304789617342,
"learning_rate": 1.938849875722064e-05,
"loss": 3.8225,
"step": 349
},
{
"epoch": 2.5301204819277108,
"grad_norm": 1.0676824726351448,
"learning_rate": 1.9384869657403277e-05,
"loss": 3.8148,
"step": 350
},
{
"epoch": 2.5373493975903614,
"grad_norm": 1.3083681717217666,
"learning_rate": 1.9381230162232e-05,
"loss": 3.8434,
"step": 351
},
{
"epoch": 2.544578313253012,
"grad_norm": 1.2325764419019432,
"learning_rate": 1.9377580275738173e-05,
"loss": 3.812,
"step": 352
},
{
"epoch": 2.5518072289156626,
"grad_norm": 1.3648434313844788,
"learning_rate": 1.9373920001964675e-05,
"loss": 3.8195,
"step": 353
},
{
"epoch": 2.559036144578313,
"grad_norm": 1.5083792667684395,
"learning_rate": 1.9370249344965884e-05,
"loss": 3.8391,
"step": 354
},
{
"epoch": 2.566265060240964,
"grad_norm": 1.0291949716161193,
"learning_rate": 1.9366568308807685e-05,
"loss": 3.8017,
"step": 355
},
{
"epoch": 2.5734939759036144,
"grad_norm": 1.5685196716178862,
"learning_rate": 1.9362876897567463e-05,
"loss": 3.8024,
"step": 356
},
{
"epoch": 2.580722891566265,
"grad_norm": 1.2942627306283834,
"learning_rate": 1.9359175115334077e-05,
"loss": 3.7885,
"step": 357
},
{
"epoch": 2.5879518072289156,
"grad_norm": 1.4645373990690937,
"learning_rate": 1.9355462966207895e-05,
"loss": 3.7797,
"step": 358
},
{
"epoch": 2.595180722891566,
"grad_norm": 1.2398186764780104,
"learning_rate": 1.9351740454300757e-05,
"loss": 3.7539,
"step": 359
},
{
"epoch": 2.602409638554217,
"grad_norm": 1.6059301916580724,
"learning_rate": 1.9348007583735985e-05,
"loss": 3.7835,
"step": 360
},
{
"epoch": 2.6096385542168674,
"grad_norm": 1.1561865019785784,
"learning_rate": 1.934426435864837e-05,
"loss": 3.777,
"step": 361
},
{
"epoch": 2.616867469879518,
"grad_norm": 1.6980059293390584,
"learning_rate": 1.9340510783184183e-05,
"loss": 3.7765,
"step": 362
},
{
"epoch": 2.6240963855421686,
"grad_norm": 1.0123067416195255,
"learning_rate": 1.9336746861501147e-05,
"loss": 3.7598,
"step": 363
},
{
"epoch": 2.6313253012048192,
"grad_norm": 1.673455267873693,
"learning_rate": 1.9332972597768457e-05,
"loss": 3.7805,
"step": 364
},
{
"epoch": 2.63855421686747,
"grad_norm": 1.0701597206020481,
"learning_rate": 1.9329187996166747e-05,
"loss": 3.7862,
"step": 365
},
{
"epoch": 2.6457831325301204,
"grad_norm": 1.7611784860019106,
"learning_rate": 1.9325393060888124e-05,
"loss": 3.7598,
"step": 366
},
{
"epoch": 2.653012048192771,
"grad_norm": 1.2398458219597617,
"learning_rate": 1.932158779613613e-05,
"loss": 3.768,
"step": 367
},
{
"epoch": 2.6602409638554216,
"grad_norm": 1.3462209809802463,
"learning_rate": 1.931777220612574e-05,
"loss": 3.7391,
"step": 368
},
{
"epoch": 2.6674698795180722,
"grad_norm": 1.3822586150711442,
"learning_rate": 1.9313946295083383e-05,
"loss": 3.7948,
"step": 369
},
{
"epoch": 2.674698795180723,
"grad_norm": 1.2896937317956254,
"learning_rate": 1.9310110067246905e-05,
"loss": 3.7446,
"step": 370
},
{
"epoch": 2.6819277108433734,
"grad_norm": 1.597748609201892,
"learning_rate": 1.9306263526865593e-05,
"loss": 3.7899,
"step": 371
},
{
"epoch": 2.689156626506024,
"grad_norm": 0.9887103404024649,
"learning_rate": 1.930240667820015e-05,
"loss": 3.7663,
"step": 372
},
{
"epoch": 2.6963855421686747,
"grad_norm": 1.479034639605643,
"learning_rate": 1.9298539525522698e-05,
"loss": 3.7601,
"step": 373
},
{
"epoch": 2.7036144578313253,
"grad_norm": 1.4800888826961993,
"learning_rate": 1.9294662073116768e-05,
"loss": 3.7695,
"step": 374
},
{
"epoch": 2.710843373493976,
"grad_norm": 1.2534099267746543,
"learning_rate": 1.9290774325277305e-05,
"loss": 3.747,
"step": 375
},
{
"epoch": 2.7180722891566265,
"grad_norm": 1.3056444769450586,
"learning_rate": 1.9286876286310663e-05,
"loss": 3.7368,
"step": 376
},
{
"epoch": 2.725301204819277,
"grad_norm": 1.1856236202724628,
"learning_rate": 1.9282967960534584e-05,
"loss": 3.7167,
"step": 377
},
{
"epoch": 2.7325301204819277,
"grad_norm": 1.151548092933736,
"learning_rate": 1.927904935227821e-05,
"loss": 3.7442,
"step": 378
},
{
"epoch": 2.7397590361445783,
"grad_norm": 1.8119770126998607,
"learning_rate": 1.927512046588207e-05,
"loss": 3.7437,
"step": 379
},
{
"epoch": 2.746987951807229,
"grad_norm": 1.2134322654005893,
"learning_rate": 1.9271181305698084e-05,
"loss": 3.7225,
"step": 380
},
{
"epoch": 2.7542168674698795,
"grad_norm": 1.6877025454737837,
"learning_rate": 1.926723187608955e-05,
"loss": 3.7445,
"step": 381
},
{
"epoch": 2.76144578313253,
"grad_norm": 1.1011325902712465,
"learning_rate": 1.9263272181431133e-05,
"loss": 3.7481,
"step": 382
},
{
"epoch": 2.7686746987951807,
"grad_norm": 1.5507791869887935,
"learning_rate": 1.9259302226108878e-05,
"loss": 3.7363,
"step": 383
},
{
"epoch": 2.7759036144578313,
"grad_norm": 1.1252933329680082,
"learning_rate": 1.9255322014520193e-05,
"loss": 3.7281,
"step": 384
},
{
"epoch": 2.783132530120482,
"grad_norm": 1.7448884917100538,
"learning_rate": 1.9251331551073843e-05,
"loss": 3.7347,
"step": 385
},
{
"epoch": 2.7903614457831325,
"grad_norm": 1.3167522856552092,
"learning_rate": 1.9247330840189948e-05,
"loss": 3.7262,
"step": 386
},
{
"epoch": 2.797590361445783,
"grad_norm": 1.2302348129154204,
"learning_rate": 1.924331988629999e-05,
"loss": 3.7248,
"step": 387
},
{
"epoch": 2.8048192771084337,
"grad_norm": 1.1716288708997376,
"learning_rate": 1.9239298693846783e-05,
"loss": 3.6978,
"step": 388
},
{
"epoch": 2.8120481927710843,
"grad_norm": 1.3724544034995316,
"learning_rate": 1.923526726728449e-05,
"loss": 3.7314,
"step": 389
},
{
"epoch": 2.819277108433735,
"grad_norm": 1.4480412852065119,
"learning_rate": 1.923122561107861e-05,
"loss": 3.7551,
"step": 390
},
{
"epoch": 2.8265060240963855,
"grad_norm": 1.004068176105684,
"learning_rate": 1.9227173729705962e-05,
"loss": 3.665,
"step": 391
},
{
"epoch": 2.833734939759036,
"grad_norm": 1.499032232612859,
"learning_rate": 1.9223111627654713e-05,
"loss": 3.7162,
"step": 392
},
{
"epoch": 2.8409638554216867,
"grad_norm": 1.3213003849841365,
"learning_rate": 1.921903930942433e-05,
"loss": 3.6653,
"step": 393
},
{
"epoch": 2.8481927710843373,
"grad_norm": 1.0692128354096737,
"learning_rate": 1.9214956779525605e-05,
"loss": 3.6982,
"step": 394
},
{
"epoch": 2.855421686746988,
"grad_norm": 1.6726385792325693,
"learning_rate": 1.9210864042480645e-05,
"loss": 3.6977,
"step": 395
},
{
"epoch": 2.8626506024096385,
"grad_norm": 1.2084768043267635,
"learning_rate": 1.920676110282285e-05,
"loss": 3.7059,
"step": 396
},
{
"epoch": 2.869879518072289,
"grad_norm": 1.1973676731876703,
"learning_rate": 1.9202647965096936e-05,
"loss": 3.6892,
"step": 397
},
{
"epoch": 2.8771084337349397,
"grad_norm": 1.7061815264302218,
"learning_rate": 1.919852463385891e-05,
"loss": 3.7393,
"step": 398
},
{
"epoch": 2.8843373493975903,
"grad_norm": 0.9603314248247711,
"learning_rate": 1.9194391113676066e-05,
"loss": 3.7312,
"step": 399
},
{
"epoch": 2.891566265060241,
"grad_norm": 1.2381539449289038,
"learning_rate": 1.9190247409126993e-05,
"loss": 3.6477,
"step": 400
},
{
"epoch": 2.8987951807228916,
"grad_norm": 1.4897933906972847,
"learning_rate": 1.9186093524801546e-05,
"loss": 3.6784,
"step": 401
},
{
"epoch": 2.906024096385542,
"grad_norm": 1.1132373553731765,
"learning_rate": 1.918192946530087e-05,
"loss": 3.6536,
"step": 402
},
{
"epoch": 2.9132530120481928,
"grad_norm": 1.3263129892692505,
"learning_rate": 1.917775523523737e-05,
"loss": 3.6956,
"step": 403
},
{
"epoch": 2.9204819277108434,
"grad_norm": 1.0680026924503008,
"learning_rate": 1.9173570839234735e-05,
"loss": 3.6553,
"step": 404
},
{
"epoch": 2.927710843373494,
"grad_norm": 1.4754538436442506,
"learning_rate": 1.916937628192789e-05,
"loss": 3.6432,
"step": 405
},
{
"epoch": 2.9349397590361446,
"grad_norm": 1.2479137486432985,
"learning_rate": 1.916517156796303e-05,
"loss": 3.6492,
"step": 406
},
{
"epoch": 2.942168674698795,
"grad_norm": 1.3365195210042529,
"learning_rate": 1.91609567019976e-05,
"loss": 3.6622,
"step": 407
},
{
"epoch": 2.9493975903614458,
"grad_norm": 1.2544818586873212,
"learning_rate": 1.9156731688700284e-05,
"loss": 3.6819,
"step": 408
},
{
"epoch": 2.9566265060240964,
"grad_norm": 1.2814859527796207,
"learning_rate": 1.9152496532751014e-05,
"loss": 3.6563,
"step": 409
},
{
"epoch": 2.963855421686747,
"grad_norm": 1.3272064954264093,
"learning_rate": 1.9148251238840947e-05,
"loss": 3.6581,
"step": 410
},
{
"epoch": 2.9710843373493976,
"grad_norm": 1.171014354368878,
"learning_rate": 1.9143995811672477e-05,
"loss": 3.637,
"step": 411
},
{
"epoch": 2.978313253012048,
"grad_norm": 1.7001876017529212,
"learning_rate": 1.913973025595922e-05,
"loss": 3.6431,
"step": 412
},
{
"epoch": 2.985542168674699,
"grad_norm": 1.0699675183248216,
"learning_rate": 1.913545457642601e-05,
"loss": 3.6085,
"step": 413
},
{
"epoch": 2.9927710843373494,
"grad_norm": 1.1416693812329064,
"learning_rate": 1.9131168777808898e-05,
"loss": 3.656,
"step": 414
},
{
"epoch": 3.0,
"grad_norm": 1.394313876963253,
"learning_rate": 1.9126872864855142e-05,
"loss": 3.625,
"step": 415
},
{
"epoch": 3.0072289156626506,
"grad_norm": 1.1937015014005639,
"learning_rate": 1.91225668423232e-05,
"loss": 3.6024,
"step": 416
},
{
"epoch": 3.014457831325301,
"grad_norm": 1.3445114911170963,
"learning_rate": 1.9118250714982733e-05,
"loss": 3.6236,
"step": 417
},
{
"epoch": 3.021686746987952,
"grad_norm": 1.7350644891144746,
"learning_rate": 1.911392448761459e-05,
"loss": 3.6218,
"step": 418
},
{
"epoch": 3.0289156626506024,
"grad_norm": 1.047365051789017,
"learning_rate": 1.910958816501081e-05,
"loss": 3.6287,
"step": 419
},
{
"epoch": 3.036144578313253,
"grad_norm": 1.7055691742758698,
"learning_rate": 1.9105241751974624e-05,
"loss": 3.5951,
"step": 420
},
{
"epoch": 3.0433734939759036,
"grad_norm": 1.0364220044160384,
"learning_rate": 1.910088525332042e-05,
"loss": 3.5828,
"step": 421
},
{
"epoch": 3.0506024096385542,
"grad_norm": 1.94317043809307,
"learning_rate": 1.9096518673873773e-05,
"loss": 3.6355,
"step": 422
},
{
"epoch": 3.057831325301205,
"grad_norm": 1.096628442648723,
"learning_rate": 1.9092142018471415e-05,
"loss": 3.5654,
"step": 423
},
{
"epoch": 3.0650602409638554,
"grad_norm": 1.8943159764104194,
"learning_rate": 1.9087755291961246e-05,
"loss": 3.5896,
"step": 424
},
{
"epoch": 3.072289156626506,
"grad_norm": 1.4605008938774497,
"learning_rate": 1.9083358499202323e-05,
"loss": 3.6443,
"step": 425
},
{
"epoch": 3.0795180722891566,
"grad_norm": 1.199002982548527,
"learning_rate": 1.9078951645064837e-05,
"loss": 3.6156,
"step": 426
},
{
"epoch": 3.0867469879518072,
"grad_norm": 1.3737048491463308,
"learning_rate": 1.9074534734430147e-05,
"loss": 3.5969,
"step": 427
},
{
"epoch": 3.093975903614458,
"grad_norm": 1.1507142428566965,
"learning_rate": 1.9070107772190734e-05,
"loss": 3.6112,
"step": 428
},
{
"epoch": 3.1012048192771084,
"grad_norm": 1.4622624757116582,
"learning_rate": 1.906567076325022e-05,
"loss": 3.6137,
"step": 429
},
{
"epoch": 3.108433734939759,
"grad_norm": 1.3627701895130038,
"learning_rate": 1.9061223712523352e-05,
"loss": 3.5957,
"step": 430
},
{
"epoch": 3.1156626506024097,
"grad_norm": 1.0700877019696842,
"learning_rate": 1.9056766624936003e-05,
"loss": 3.5863,
"step": 431
},
{
"epoch": 3.1228915662650603,
"grad_norm": 1.5955208448573346,
"learning_rate": 1.9052299505425164e-05,
"loss": 3.587,
"step": 432
},
{
"epoch": 3.130120481927711,
"grad_norm": 1.3565000081916603,
"learning_rate": 1.9047822358938933e-05,
"loss": 3.6409,
"step": 433
},
{
"epoch": 3.1373493975903615,
"grad_norm": 0.8840912550967165,
"learning_rate": 1.904333519043652e-05,
"loss": 3.5647,
"step": 434
},
{
"epoch": 3.144578313253012,
"grad_norm": 1.3572343135153577,
"learning_rate": 1.903883800488824e-05,
"loss": 3.6011,
"step": 435
},
{
"epoch": 3.1518072289156627,
"grad_norm": 1.2429040148925297,
"learning_rate": 1.9034330807275484e-05,
"loss": 3.5839,
"step": 436
},
{
"epoch": 3.1590361445783133,
"grad_norm": 1.4800542337109683,
"learning_rate": 1.9029813602590763e-05,
"loss": 3.5555,
"step": 437
},
{
"epoch": 3.166265060240964,
"grad_norm": 1.6482800246580644,
"learning_rate": 1.9025286395837647e-05,
"loss": 3.5884,
"step": 438
},
{
"epoch": 3.1734939759036145,
"grad_norm": 1.0501732415023106,
"learning_rate": 1.9020749192030795e-05,
"loss": 3.5781,
"step": 439
},
{
"epoch": 3.180722891566265,
"grad_norm": 1.5328073640134885,
"learning_rate": 1.9016201996195943e-05,
"loss": 3.5548,
"step": 440
},
{
"epoch": 3.1879518072289157,
"grad_norm": 1.118507184933988,
"learning_rate": 1.9011644813369886e-05,
"loss": 3.5596,
"step": 441
},
{
"epoch": 3.1951807228915663,
"grad_norm": 1.2284264310864856,
"learning_rate": 1.900707764860049e-05,
"loss": 3.5657,
"step": 442
},
{
"epoch": 3.202409638554217,
"grad_norm": 1.5003452370811305,
"learning_rate": 1.900250050694667e-05,
"loss": 3.5662,
"step": 443
},
{
"epoch": 3.2096385542168675,
"grad_norm": 1.2853868444977676,
"learning_rate": 1.8997913393478404e-05,
"loss": 3.5712,
"step": 444
},
{
"epoch": 3.216867469879518,
"grad_norm": 1.5925034895130508,
"learning_rate": 1.8993316313276694e-05,
"loss": 3.5776,
"step": 445
},
{
"epoch": 3.2240963855421687,
"grad_norm": 0.9811616125028083,
"learning_rate": 1.8988709271433607e-05,
"loss": 3.5428,
"step": 446
},
{
"epoch": 3.2313253012048193,
"grad_norm": 1.4478536151263606,
"learning_rate": 1.898409227305223e-05,
"loss": 3.5508,
"step": 447
},
{
"epoch": 3.23855421686747,
"grad_norm": 1.2477828448918005,
"learning_rate": 1.8979465323246676e-05,
"loss": 3.5611,
"step": 448
},
{
"epoch": 3.2457831325301205,
"grad_norm": 1.5858711129317897,
"learning_rate": 1.8974828427142093e-05,
"loss": 3.5629,
"step": 449
},
{
"epoch": 3.253012048192771,
"grad_norm": 1.0950035377608183,
"learning_rate": 1.8970181589874637e-05,
"loss": 3.5406,
"step": 450
},
{
"epoch": 3.2602409638554217,
"grad_norm": 1.5415115226682654,
"learning_rate": 1.8965524816591477e-05,
"loss": 3.5606,
"step": 451
},
{
"epoch": 3.2674698795180723,
"grad_norm": 1.1487484478951757,
"learning_rate": 1.8960858112450785e-05,
"loss": 3.5166,
"step": 452
},
{
"epoch": 3.274698795180723,
"grad_norm": 1.6420967703083962,
"learning_rate": 1.8956181482621746e-05,
"loss": 3.528,
"step": 453
},
{
"epoch": 3.2819277108433735,
"grad_norm": 0.8980303432743649,
"learning_rate": 1.895149493228452e-05,
"loss": 3.5287,
"step": 454
},
{
"epoch": 3.289156626506024,
"grad_norm": 1.3483910988264465,
"learning_rate": 1.894679846663027e-05,
"loss": 3.5731,
"step": 455
},
{
"epoch": 3.2963855421686747,
"grad_norm": 1.491314706003945,
"learning_rate": 1.894209209086114e-05,
"loss": 3.5231,
"step": 456
},
{
"epoch": 3.3036144578313253,
"grad_norm": 1.2270834009390768,
"learning_rate": 1.8937375810190247e-05,
"loss": 3.5386,
"step": 457
},
{
"epoch": 3.310843373493976,
"grad_norm": 1.1777132337252105,
"learning_rate": 1.8932649629841685e-05,
"loss": 3.5111,
"step": 458
},
{
"epoch": 3.3180722891566266,
"grad_norm": 1.3783327375707983,
"learning_rate": 1.8927913555050503e-05,
"loss": 3.5123,
"step": 459
},
{
"epoch": 3.325301204819277,
"grad_norm": 0.842194772816192,
"learning_rate": 1.8923167591062723e-05,
"loss": 3.5513,
"step": 460
},
{
"epoch": 3.3325301204819278,
"grad_norm": 1.3331154478098168,
"learning_rate": 1.891841174313531e-05,
"loss": 3.5017,
"step": 461
},
{
"epoch": 3.3397590361445784,
"grad_norm": 1.5139276628552356,
"learning_rate": 1.8913646016536184e-05,
"loss": 3.5145,
"step": 462
},
{
"epoch": 3.346987951807229,
"grad_norm": 1.1149277809499802,
"learning_rate": 1.8908870416544202e-05,
"loss": 3.5004,
"step": 463
},
{
"epoch": 3.3542168674698796,
"grad_norm": 1.4898157461888148,
"learning_rate": 1.890408494844917e-05,
"loss": 3.5329,
"step": 464
},
{
"epoch": 3.36144578313253,
"grad_norm": 0.9549742540395183,
"learning_rate": 1.8899289617551803e-05,
"loss": 3.5187,
"step": 465
},
{
"epoch": 3.3686746987951808,
"grad_norm": 1.7805688799594914,
"learning_rate": 1.8894484429163764e-05,
"loss": 3.5207,
"step": 466
},
{
"epoch": 3.3759036144578314,
"grad_norm": 1.2106962876906113,
"learning_rate": 1.8889669388607615e-05,
"loss": 3.514,
"step": 467
},
{
"epoch": 3.383132530120482,
"grad_norm": 1.6763747112024485,
"learning_rate": 1.8884844501216845e-05,
"loss": 3.5147,
"step": 468
},
{
"epoch": 3.3903614457831326,
"grad_norm": 1.031632991874722,
"learning_rate": 1.8880009772335843e-05,
"loss": 3.5116,
"step": 469
},
{
"epoch": 3.397590361445783,
"grad_norm": 1.8307621258883875,
"learning_rate": 1.8875165207319902e-05,
"loss": 3.5397,
"step": 470
},
{
"epoch": 3.404819277108434,
"grad_norm": 1.2852387998358652,
"learning_rate": 1.8870310811535208e-05,
"loss": 3.5036,
"step": 471
},
{
"epoch": 3.4120481927710844,
"grad_norm": 1.406175754082574,
"learning_rate": 1.886544659035884e-05,
"loss": 3.5133,
"step": 472
},
{
"epoch": 3.419277108433735,
"grad_norm": 1.4308043920793336,
"learning_rate": 1.8860572549178755e-05,
"loss": 3.5003,
"step": 473
},
{
"epoch": 3.4265060240963856,
"grad_norm": 1.161805161480239,
"learning_rate": 1.8855688693393788e-05,
"loss": 3.5061,
"step": 474
},
{
"epoch": 3.433734939759036,
"grad_norm": 1.3056601266916144,
"learning_rate": 1.8850795028413658e-05,
"loss": 3.4847,
"step": 475
},
{
"epoch": 3.440963855421687,
"grad_norm": 1.5521465688683451,
"learning_rate": 1.8845891559658925e-05,
"loss": 3.496,
"step": 476
},
{
"epoch": 3.4481927710843374,
"grad_norm": 1.1070668015919285,
"learning_rate": 1.884097829256103e-05,
"loss": 3.5009,
"step": 477
},
{
"epoch": 3.455421686746988,
"grad_norm": 1.3329315835553603,
"learning_rate": 1.883605523256226e-05,
"loss": 3.4943,
"step": 478
},
{
"epoch": 3.4626506024096386,
"grad_norm": 1.5538331615869527,
"learning_rate": 1.883112238511575e-05,
"loss": 3.4837,
"step": 479
},
{
"epoch": 3.4698795180722892,
"grad_norm": 1.092373343461448,
"learning_rate": 1.882617975568547e-05,
"loss": 3.4926,
"step": 480
},
{
"epoch": 3.47710843373494,
"grad_norm": 1.874379766491242,
"learning_rate": 1.8821227349746235e-05,
"loss": 3.4899,
"step": 481
},
{
"epoch": 3.4843373493975904,
"grad_norm": 0.9923448678530435,
"learning_rate": 1.881626517278368e-05,
"loss": 3.4728,
"step": 482
},
{
"epoch": 3.491566265060241,
"grad_norm": 1.2266774334839128,
"learning_rate": 1.881129323029427e-05,
"loss": 3.4892,
"step": 483
},
{
"epoch": 3.4987951807228916,
"grad_norm": 1.5096151923346919,
"learning_rate": 1.8806311527785287e-05,
"loss": 3.5182,
"step": 484
},
{
"epoch": 3.5060240963855422,
"grad_norm": 1.1571956260425034,
"learning_rate": 1.880132007077482e-05,
"loss": 3.4708,
"step": 485
},
{
"epoch": 3.513253012048193,
"grad_norm": 1.648444159789046,
"learning_rate": 1.8796318864791763e-05,
"loss": 3.4952,
"step": 486
},
{
"epoch": 3.5204819277108435,
"grad_norm": 1.093285229500888,
"learning_rate": 1.879130791537581e-05,
"loss": 3.4968,
"step": 487
},
{
"epoch": 3.527710843373494,
"grad_norm": 1.6721766885025302,
"learning_rate": 1.8786287228077453e-05,
"loss": 3.4886,
"step": 488
},
{
"epoch": 3.5349397590361447,
"grad_norm": 1.0756493747079099,
"learning_rate": 1.8781256808457954e-05,
"loss": 3.4859,
"step": 489
},
{
"epoch": 3.5421686746987953,
"grad_norm": 1.5147018364957974,
"learning_rate": 1.8776216662089373e-05,
"loss": 3.4958,
"step": 490
},
{
"epoch": 3.549397590361446,
"grad_norm": 1.3158687997204326,
"learning_rate": 1.8771166794554544e-05,
"loss": 3.5135,
"step": 491
},
{
"epoch": 3.5566265060240965,
"grad_norm": 1.519976844773512,
"learning_rate": 1.8766107211447046e-05,
"loss": 3.4772,
"step": 492
},
{
"epoch": 3.563855421686747,
"grad_norm": 1.1141857128087669,
"learning_rate": 1.8761037918371248e-05,
"loss": 3.4985,
"step": 493
},
{
"epoch": 3.5710843373493977,
"grad_norm": 1.4826112621775518,
"learning_rate": 1.875595892094226e-05,
"loss": 3.4615,
"step": 494
},
{
"epoch": 3.5783132530120483,
"grad_norm": 1.4123308003869872,
"learning_rate": 1.875087022478594e-05,
"loss": 3.464,
"step": 495
},
{
"epoch": 3.585542168674699,
"grad_norm": 1.290374316443633,
"learning_rate": 1.8745771835538893e-05,
"loss": 3.4795,
"step": 496
},
{
"epoch": 3.5927710843373495,
"grad_norm": 1.2931656378131897,
"learning_rate": 1.8740663758848463e-05,
"loss": 3.4812,
"step": 497
},
{
"epoch": 3.6,
"grad_norm": 1.023375443616966,
"learning_rate": 1.873554600037272e-05,
"loss": 3.471,
"step": 498
},
{
"epoch": 3.6072289156626507,
"grad_norm": 1.8081312761107482,
"learning_rate": 1.873041856578046e-05,
"loss": 3.4738,
"step": 499
},
{
"epoch": 3.6144578313253013,
"grad_norm": 1.0288627689254883,
"learning_rate": 1.8725281460751198e-05,
"loss": 3.4398,
"step": 500
},
{
"epoch": 3.621686746987952,
"grad_norm": 1.5832162058004138,
"learning_rate": 1.8720134690975157e-05,
"loss": 3.4424,
"step": 501
},
{
"epoch": 3.6289156626506025,
"grad_norm": 1.3177178077765743,
"learning_rate": 1.8714978262153268e-05,
"loss": 3.4589,
"step": 502
},
{
"epoch": 3.636144578313253,
"grad_norm": 1.0363297002224263,
"learning_rate": 1.8709812179997162e-05,
"loss": 3.4862,
"step": 503
},
{
"epoch": 3.6433734939759037,
"grad_norm": 1.4795903294105097,
"learning_rate": 1.8704636450229164e-05,
"loss": 3.4501,
"step": 504
},
{
"epoch": 3.6506024096385543,
"grad_norm": 1.3966788968436568,
"learning_rate": 1.869945107858228e-05,
"loss": 3.4638,
"step": 505
},
{
"epoch": 3.657831325301205,
"grad_norm": 1.1325789827684314,
"learning_rate": 1.8694256070800202e-05,
"loss": 3.4599,
"step": 506
},
{
"epoch": 3.6650602409638555,
"grad_norm": 1.5981674695873835,
"learning_rate": 1.868905143263729e-05,
"loss": 3.4762,
"step": 507
},
{
"epoch": 3.672289156626506,
"grad_norm": 1.1727596332947487,
"learning_rate": 1.8683837169858573e-05,
"loss": 3.4592,
"step": 508
},
{
"epoch": 3.6795180722891567,
"grad_norm": 1.3675155550509581,
"learning_rate": 1.8678613288239745e-05,
"loss": 3.4125,
"step": 509
},
{
"epoch": 3.6867469879518073,
"grad_norm": 1.4041957545233883,
"learning_rate": 1.867337979356715e-05,
"loss": 3.4293,
"step": 510
},
{
"epoch": 3.693975903614458,
"grad_norm": 1.3560113123509387,
"learning_rate": 1.866813669163778e-05,
"loss": 3.4508,
"step": 511
},
{
"epoch": 3.7012048192771085,
"grad_norm": 1.1545842984692207,
"learning_rate": 1.8662883988259276e-05,
"loss": 3.4248,
"step": 512
},
{
"epoch": 3.708433734939759,
"grad_norm": 1.2992676256754951,
"learning_rate": 1.86576216892499e-05,
"loss": 3.4531,
"step": 513
},
{
"epoch": 3.7156626506024097,
"grad_norm": 1.8499772386187696,
"learning_rate": 1.8652349800438558e-05,
"loss": 3.4513,
"step": 514
},
{
"epoch": 3.7228915662650603,
"grad_norm": 1.02374620550504,
"learning_rate": 1.8647068327664774e-05,
"loss": 3.4277,
"step": 515
},
{
"epoch": 3.730120481927711,
"grad_norm": 1.6630005224882898,
"learning_rate": 1.8641777276778677e-05,
"loss": 3.4379,
"step": 516
},
{
"epoch": 3.7373493975903616,
"grad_norm": 1.0239537219245924,
"learning_rate": 1.8636476653641015e-05,
"loss": 3.441,
"step": 517
},
{
"epoch": 3.744578313253012,
"grad_norm": 1.452754740516667,
"learning_rate": 1.8631166464123147e-05,
"loss": 3.4094,
"step": 518
},
{
"epoch": 3.7518072289156628,
"grad_norm": 1.1575160234867263,
"learning_rate": 1.8625846714107013e-05,
"loss": 3.4469,
"step": 519
},
{
"epoch": 3.7590361445783134,
"grad_norm": 1.8402168349048114,
"learning_rate": 1.8620517409485148e-05,
"loss": 3.4152,
"step": 520
},
{
"epoch": 3.766265060240964,
"grad_norm": 0.9286315933247561,
"learning_rate": 1.8615178556160675e-05,
"loss": 3.4106,
"step": 521
},
{
"epoch": 3.7734939759036146,
"grad_norm": 1.7068898144657225,
"learning_rate": 1.8609830160047283e-05,
"loss": 3.4277,
"step": 522
},
{
"epoch": 3.780722891566265,
"grad_norm": 1.3543309281771512,
"learning_rate": 1.8604472227069253e-05,
"loss": 3.4126,
"step": 523
},
{
"epoch": 3.787951807228916,
"grad_norm": 0.8836248770013484,
"learning_rate": 1.8599104763161402e-05,
"loss": 3.3996,
"step": 524
},
{
"epoch": 3.7951807228915664,
"grad_norm": 1.104673956712585,
"learning_rate": 1.8593727774269122e-05,
"loss": 3.4098,
"step": 525
},
{
"epoch": 3.802409638554217,
"grad_norm": 1.3914713972311137,
"learning_rate": 1.858834126634835e-05,
"loss": 3.4337,
"step": 526
},
{
"epoch": 3.8096385542168676,
"grad_norm": 1.4164106405394201,
"learning_rate": 1.8582945245365574e-05,
"loss": 3.4169,
"step": 527
},
{
"epoch": 3.816867469879518,
"grad_norm": 1.3207121109028854,
"learning_rate": 1.8577539717297806e-05,
"loss": 3.4247,
"step": 528
},
{
"epoch": 3.824096385542169,
"grad_norm": 1.4858975363520965,
"learning_rate": 1.85721246881326e-05,
"loss": 3.4251,
"step": 529
},
{
"epoch": 3.8313253012048194,
"grad_norm": 1.0773614825733793,
"learning_rate": 1.8566700163868027e-05,
"loss": 3.438,
"step": 530
},
{
"epoch": 3.83855421686747,
"grad_norm": 1.5240572802175603,
"learning_rate": 1.856126615051268e-05,
"loss": 3.3856,
"step": 531
},
{
"epoch": 3.8457831325301206,
"grad_norm": 1.212618647930279,
"learning_rate": 1.8555822654085665e-05,
"loss": 3.3794,
"step": 532
},
{
"epoch": 3.853012048192771,
"grad_norm": 1.3089580322466006,
"learning_rate": 1.8550369680616584e-05,
"loss": 3.3856,
"step": 533
},
{
"epoch": 3.860240963855422,
"grad_norm": 1.294624288123662,
"learning_rate": 1.854490723614554e-05,
"loss": 3.4072,
"step": 534
},
{
"epoch": 3.8674698795180724,
"grad_norm": 1.4162512393718127,
"learning_rate": 1.8539435326723135e-05,
"loss": 3.3985,
"step": 535
},
{
"epoch": 3.874698795180723,
"grad_norm": 1.5646417878212386,
"learning_rate": 1.853395395841044e-05,
"loss": 3.4295,
"step": 536
},
{
"epoch": 3.8819277108433736,
"grad_norm": 0.986716262091348,
"learning_rate": 1.852846313727902e-05,
"loss": 3.379,
"step": 537
},
{
"epoch": 3.8891566265060242,
"grad_norm": 1.4476864853119071,
"learning_rate": 1.852296286941089e-05,
"loss": 3.4318,
"step": 538
},
{
"epoch": 3.896385542168675,
"grad_norm": 1.336789866057074,
"learning_rate": 1.851745316089855e-05,
"loss": 3.3884,
"step": 539
},
{
"epoch": 3.9036144578313254,
"grad_norm": 1.5594590122697878,
"learning_rate": 1.851193401784495e-05,
"loss": 3.4078,
"step": 540
},
{
"epoch": 3.910843373493976,
"grad_norm": 1.217821216453777,
"learning_rate": 1.8506405446363476e-05,
"loss": 3.3646,
"step": 541
},
{
"epoch": 3.9180722891566266,
"grad_norm": 1.5654916960235976,
"learning_rate": 1.8500867452577978e-05,
"loss": 3.4169,
"step": 542
},
{
"epoch": 3.9253012048192772,
"grad_norm": 1.0290965394229443,
"learning_rate": 1.849532004262274e-05,
"loss": 3.4067,
"step": 543
},
{
"epoch": 3.932530120481928,
"grad_norm": 1.4367792614004797,
"learning_rate": 1.8489763222642457e-05,
"loss": 3.3943,
"step": 544
},
{
"epoch": 3.9397590361445785,
"grad_norm": 1.038088226531718,
"learning_rate": 1.848419699879227e-05,
"loss": 3.3621,
"step": 545
},
{
"epoch": 3.946987951807229,
"grad_norm": 1.6594075535521433,
"learning_rate": 1.8478621377237724e-05,
"loss": 3.3758,
"step": 546
},
{
"epoch": 3.9542168674698797,
"grad_norm": 1.100843923400257,
"learning_rate": 1.847303636415478e-05,
"loss": 3.4082,
"step": 547
},
{
"epoch": 3.9614457831325303,
"grad_norm": 1.30799271344962,
"learning_rate": 1.8467441965729796e-05,
"loss": 3.3983,
"step": 548
},
{
"epoch": 3.968674698795181,
"grad_norm": 1.0064869104717655,
"learning_rate": 1.846183818815953e-05,
"loss": 3.3768,
"step": 549
},
{
"epoch": 3.9759036144578315,
"grad_norm": 1.9019272157678964,
"learning_rate": 1.845622503765113e-05,
"loss": 3.3945,
"step": 550
},
{
"epoch": 3.983132530120482,
"grad_norm": 0.8834203829363729,
"learning_rate": 1.8450602520422113e-05,
"loss": 3.3691,
"step": 551
},
{
"epoch": 3.9903614457831327,
"grad_norm": 1.5879949736683259,
"learning_rate": 1.8444970642700396e-05,
"loss": 3.3763,
"step": 552
},
{
"epoch": 3.9975903614457833,
"grad_norm": 1.417799256266747,
"learning_rate": 1.843932941072424e-05,
"loss": 3.3897,
"step": 553
},
{
"epoch": 4.004819277108433,
"grad_norm": 1.268057538537551,
"learning_rate": 1.8433678830742284e-05,
"loss": 3.3272,
"step": 554
},
{
"epoch": 4.0120481927710845,
"grad_norm": 1.484719882690862,
"learning_rate": 1.842801890901351e-05,
"loss": 3.367,
"step": 555
},
{
"epoch": 4.019277108433735,
"grad_norm": 1.2643656531559433,
"learning_rate": 1.842234965180725e-05,
"loss": 3.3467,
"step": 556
},
{
"epoch": 4.026506024096386,
"grad_norm": 1.7452752799634084,
"learning_rate": 1.841667106540319e-05,
"loss": 3.3538,
"step": 557
},
{
"epoch": 4.033734939759036,
"grad_norm": 1.079426848592131,
"learning_rate": 1.8410983156091334e-05,
"loss": 3.3437,
"step": 558
},
{
"epoch": 4.040963855421687,
"grad_norm": 1.8083444736473766,
"learning_rate": 1.8405285930172016e-05,
"loss": 3.3492,
"step": 559
},
{
"epoch": 4.048192771084337,
"grad_norm": 1.2699753552523099,
"learning_rate": 1.8399579393955893e-05,
"loss": 3.3249,
"step": 560
},
{
"epoch": 4.055421686746988,
"grad_norm": 1.400246092211567,
"learning_rate": 1.839386355376393e-05,
"loss": 3.3767,
"step": 561
},
{
"epoch": 4.062650602409638,
"grad_norm": 1.244251569866778,
"learning_rate": 1.838813841592741e-05,
"loss": 3.2968,
"step": 562
},
{
"epoch": 4.069879518072289,
"grad_norm": 1.5584023465055492,
"learning_rate": 1.8382403986787895e-05,
"loss": 3.3499,
"step": 563
},
{
"epoch": 4.0771084337349395,
"grad_norm": 1.0765855271037599,
"learning_rate": 1.8376660272697257e-05,
"loss": 3.3528,
"step": 564
},
{
"epoch": 4.0843373493975905,
"grad_norm": 1.5570491879229293,
"learning_rate": 1.837090728001764e-05,
"loss": 3.3208,
"step": 565
},
{
"epoch": 4.091566265060241,
"grad_norm": 1.157602436803469,
"learning_rate": 1.8365145015121477e-05,
"loss": 3.3533,
"step": 566
},
{
"epoch": 4.098795180722892,
"grad_norm": 1.6099172851284775,
"learning_rate": 1.835937348439146e-05,
"loss": 3.3203,
"step": 567
},
{
"epoch": 4.106024096385542,
"grad_norm": 1.5490215318502463,
"learning_rate": 1.835359269422055e-05,
"loss": 3.3612,
"step": 568
},
{
"epoch": 4.113253012048193,
"grad_norm": 0.8911556352082233,
"learning_rate": 1.8347802651011965e-05,
"loss": 3.2801,
"step": 569
},
{
"epoch": 4.120481927710843,
"grad_norm": 1.6677145666770425,
"learning_rate": 1.834200336117918e-05,
"loss": 3.3293,
"step": 570
},
{
"epoch": 4.127710843373494,
"grad_norm": 1.3171442358917278,
"learning_rate": 1.8336194831145888e-05,
"loss": 3.3275,
"step": 571
},
{
"epoch": 4.134939759036144,
"grad_norm": 1.1299184058969571,
"learning_rate": 1.8330377067346045e-05,
"loss": 3.3105,
"step": 572
},
{
"epoch": 4.142168674698795,
"grad_norm": 1.2163200316102576,
"learning_rate": 1.8324550076223818e-05,
"loss": 3.3086,
"step": 573
},
{
"epoch": 4.1493975903614455,
"grad_norm": 1.4252511237340142,
"learning_rate": 1.83187138642336e-05,
"loss": 3.3388,
"step": 574
},
{
"epoch": 4.156626506024097,
"grad_norm": 1.3132028148730224,
"learning_rate": 1.8312868437840002e-05,
"loss": 3.3009,
"step": 575
},
{
"epoch": 4.163855421686747,
"grad_norm": 1.1629824196249723,
"learning_rate": 1.8307013803517834e-05,
"loss": 3.2775,
"step": 576
},
{
"epoch": 4.171084337349398,
"grad_norm": 1.5011239726999521,
"learning_rate": 1.8301149967752104e-05,
"loss": 3.3308,
"step": 577
},
{
"epoch": 4.178313253012048,
"grad_norm": 1.4397390142744335,
"learning_rate": 1.8295276937038023e-05,
"loss": 3.2927,
"step": 578
},
{
"epoch": 4.185542168674699,
"grad_norm": 1.2349876041301024,
"learning_rate": 1.8289394717880977e-05,
"loss": 3.3274,
"step": 579
},
{
"epoch": 4.192771084337349,
"grad_norm": 1.3027251020995771,
"learning_rate": 1.8283503316796536e-05,
"loss": 3.3363,
"step": 580
},
{
"epoch": 4.2,
"grad_norm": 1.2802765485296743,
"learning_rate": 1.8277602740310433e-05,
"loss": 3.3138,
"step": 581
},
{
"epoch": 4.20722891566265,
"grad_norm": 1.5672452547403777,
"learning_rate": 1.8271692994958576e-05,
"loss": 3.311,
"step": 582
},
{
"epoch": 4.214457831325301,
"grad_norm": 1.2976733074790296,
"learning_rate": 1.8265774087287016e-05,
"loss": 3.2846,
"step": 583
},
{
"epoch": 4.2216867469879515,
"grad_norm": 1.4144637634643196,
"learning_rate": 1.825984602385196e-05,
"loss": 3.3297,
"step": 584
},
{
"epoch": 4.228915662650603,
"grad_norm": 1.1749959337917693,
"learning_rate": 1.8253908811219764e-05,
"loss": 3.3001,
"step": 585
},
{
"epoch": 4.236144578313253,
"grad_norm": 1.4039885634229308,
"learning_rate": 1.82479624559669e-05,
"loss": 3.3219,
"step": 586
},
{
"epoch": 4.243373493975904,
"grad_norm": 1.212164693294027,
"learning_rate": 1.8242006964679978e-05,
"loss": 3.2915,
"step": 587
},
{
"epoch": 4.250602409638554,
"grad_norm": 1.2485625997266088,
"learning_rate": 1.8236042343955734e-05,
"loss": 3.2814,
"step": 588
},
{
"epoch": 4.257831325301205,
"grad_norm": 1.2830218516375618,
"learning_rate": 1.8230068600401e-05,
"loss": 3.2821,
"step": 589
},
{
"epoch": 4.265060240963855,
"grad_norm": 1.366157342834461,
"learning_rate": 1.822408574063273e-05,
"loss": 3.2994,
"step": 590
},
{
"epoch": 4.272289156626506,
"grad_norm": 1.3861702655842194,
"learning_rate": 1.8218093771277965e-05,
"loss": 3.292,
"step": 591
},
{
"epoch": 4.279518072289156,
"grad_norm": 1.2538740015004524,
"learning_rate": 1.8212092698973837e-05,
"loss": 3.3082,
"step": 592
},
{
"epoch": 4.286746987951807,
"grad_norm": 1.3134983549799732,
"learning_rate": 1.820608253036757e-05,
"loss": 3.2543,
"step": 593
},
{
"epoch": 4.293975903614458,
"grad_norm": 1.0874207166004457,
"learning_rate": 1.820006327211645e-05,
"loss": 3.2682,
"step": 594
},
{
"epoch": 4.301204819277109,
"grad_norm": 1.3562305588386105,
"learning_rate": 1.8194034930887842e-05,
"loss": 3.2865,
"step": 595
},
{
"epoch": 4.308433734939759,
"grad_norm": 1.1232485868536217,
"learning_rate": 1.818799751335917e-05,
"loss": 3.2844,
"step": 596
},
{
"epoch": 4.31566265060241,
"grad_norm": 1.4011009500618496,
"learning_rate": 1.8181951026217908e-05,
"loss": 3.3233,
"step": 597
},
{
"epoch": 4.32289156626506,
"grad_norm": 1.0020226275410233,
"learning_rate": 1.8175895476161577e-05,
"loss": 3.3181,
"step": 598
},
{
"epoch": 4.330120481927711,
"grad_norm": 1.3938263912157418,
"learning_rate": 1.8169830869897743e-05,
"loss": 3.2752,
"step": 599
},
{
"epoch": 4.337349397590361,
"grad_norm": 1.259453956832135,
"learning_rate": 1.8163757214143993e-05,
"loss": 3.3074,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 22,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1816727679664128.0,
"train_batch_size": 19,
"trial_name": null,
"trial_params": null
}