Commit 9fafd94 (verified) by hosseinbv: Uploading /ephemeral/hossein/output/newData-progressive-yoco-tiny-llama-CDL-17
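The file below is the trainer_state.json that the Hugging Face Trainer writes alongside each checkpoint; its "log_history" array holds one entry per logged training step with the loss, gradient norm, and learning rate. As a minimal sketch of how such a file can be inspected (assuming it has been downloaded locally as trainer_state.json — the filename and the plotting choices are illustrative, not part of this upload), the loss and learning-rate curves can be recovered like this:

import json

import matplotlib.pyplot as plt

# Load the trainer state written by transformers.Trainer at checkpoint time.
# The filename is an assumption; point it at wherever the file was saved.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each training entry in log_history carries "step", "loss", "grad_norm",
# and "learning_rate"; evaluation entries (if any) use "eval_*" keys instead.
train_logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in train_logs]
losses = [entry["loss"] for entry in train_logs]
lrs = [entry["learning_rate"] for entry in train_logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
plt.show()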
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 839,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011918951132300357,
"grad_norm": 4.142117453158603,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.2717,
"step": 1
},
{
"epoch": 0.0023837902264600714,
"grad_norm": 4.146443379436291,
"learning_rate": 4.000000000000001e-06,
"loss": 2.2673,
"step": 2
},
{
"epoch": 0.003575685339690107,
"grad_norm": 4.284702076234231,
"learning_rate": 6e-06,
"loss": 2.2178,
"step": 3
},
{
"epoch": 0.004767580452920143,
"grad_norm": 4.104853059576584,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2543,
"step": 4
},
{
"epoch": 0.0059594755661501785,
"grad_norm": 3.213179411663954,
"learning_rate": 1e-05,
"loss": 2.2236,
"step": 5
},
{
"epoch": 0.007151370679380214,
"grad_norm": 1.9110723795461666,
"learning_rate": 9.99996452624688e-06,
"loss": 2.2198,
"step": 6
},
{
"epoch": 0.00834326579261025,
"grad_norm": 1.7059526754812628,
"learning_rate": 9.999858105490868e-06,
"loss": 2.1753,
"step": 7
},
{
"epoch": 0.009535160905840286,
"grad_norm": 3.5843950702527754,
"learning_rate": 9.999680739242022e-06,
"loss": 2.1457,
"step": 8
},
{
"epoch": 0.010727056019070322,
"grad_norm": 3.0684912624884877,
"learning_rate": 9.999432430017084e-06,
"loss": 2.1636,
"step": 9
},
{
"epoch": 0.011918951132300357,
"grad_norm": 2.7608335284827397,
"learning_rate": 9.999113181339437e-06,
"loss": 2.1402,
"step": 10
},
{
"epoch": 0.013110846245530394,
"grad_norm": 2.2876952453740844,
"learning_rate": 9.99872299773906e-06,
"loss": 2.1377,
"step": 11
},
{
"epoch": 0.014302741358760428,
"grad_norm": 2.189713354043771,
"learning_rate": 9.998261884752463e-06,
"loss": 2.1297,
"step": 12
},
{
"epoch": 0.015494636471990465,
"grad_norm": 1.8497797869361814,
"learning_rate": 9.99772984892261e-06,
"loss": 2.1145,
"step": 13
},
{
"epoch": 0.0166865315852205,
"grad_norm": 1.4328123407133309,
"learning_rate": 9.997126897798826e-06,
"loss": 2.1099,
"step": 14
},
{
"epoch": 0.017878426698450536,
"grad_norm": 1.110872364131768,
"learning_rate": 9.996453039936682e-06,
"loss": 2.1171,
"step": 15
},
{
"epoch": 0.01907032181168057,
"grad_norm": 0.8634715903240043,
"learning_rate": 9.995708284897889e-06,
"loss": 2.0905,
"step": 16
},
{
"epoch": 0.02026221692491061,
"grad_norm": 0.7193294200124174,
"learning_rate": 9.994892643250147e-06,
"loss": 2.1082,
"step": 17
},
{
"epoch": 0.021454112038140644,
"grad_norm": 0.6321366046233599,
"learning_rate": 9.994006126567006e-06,
"loss": 2.0998,
"step": 18
},
{
"epoch": 0.02264600715137068,
"grad_norm": 0.5966553161988273,
"learning_rate": 9.993048747427696e-06,
"loss": 2.1107,
"step": 19
},
{
"epoch": 0.023837902264600714,
"grad_norm": 0.5609275352376858,
"learning_rate": 9.99202051941695e-06,
"loss": 2.1192,
"step": 20
},
{
"epoch": 0.025029797377830752,
"grad_norm": 0.5426385429041977,
"learning_rate": 9.990921457124807e-06,
"loss": 2.0809,
"step": 21
},
{
"epoch": 0.026221692491060787,
"grad_norm": 0.5394723771045817,
"learning_rate": 9.989751576146413e-06,
"loss": 2.0889,
"step": 22
},
{
"epoch": 0.027413587604290822,
"grad_norm": 0.5125643780082518,
"learning_rate": 9.9885108930818e-06,
"loss": 2.1025,
"step": 23
},
{
"epoch": 0.028605482717520857,
"grad_norm": 0.49224477502997804,
"learning_rate": 9.98719942553564e-06,
"loss": 2.0945,
"step": 24
},
{
"epoch": 0.029797377830750895,
"grad_norm": 0.45404804144183336,
"learning_rate": 9.985817192117001e-06,
"loss": 2.0985,
"step": 25
},
{
"epoch": 0.03098927294398093,
"grad_norm": 0.4619837784227509,
"learning_rate": 9.984364212439089e-06,
"loss": 2.1044,
"step": 26
},
{
"epoch": 0.03218116805721097,
"grad_norm": 0.43728216047255375,
"learning_rate": 9.982840507118959e-06,
"loss": 2.1109,
"step": 27
},
{
"epoch": 0.033373063170441,
"grad_norm": 0.4106377846709857,
"learning_rate": 9.98124609777723e-06,
"loss": 2.1169,
"step": 28
},
{
"epoch": 0.03456495828367104,
"grad_norm": 0.3844482158912619,
"learning_rate": 9.979581007037776e-06,
"loss": 2.084,
"step": 29
},
{
"epoch": 0.03575685339690107,
"grad_norm": 0.3615382921612817,
"learning_rate": 9.977845258527403e-06,
"loss": 2.0578,
"step": 30
},
{
"epoch": 0.03694874851013111,
"grad_norm": 0.3265425379192771,
"learning_rate": 9.976038876875519e-06,
"loss": 2.095,
"step": 31
},
{
"epoch": 0.03814064362336114,
"grad_norm": 0.3249807533911439,
"learning_rate": 9.974161887713775e-06,
"loss": 2.1091,
"step": 32
},
{
"epoch": 0.03933253873659118,
"grad_norm": 0.3258627805281391,
"learning_rate": 9.972214317675713e-06,
"loss": 2.0844,
"step": 33
},
{
"epoch": 0.04052443384982122,
"grad_norm": 0.3115021940469844,
"learning_rate": 9.970196194396383e-06,
"loss": 2.1214,
"step": 34
},
{
"epoch": 0.041716328963051254,
"grad_norm": 0.29359663265683655,
"learning_rate": 9.968107546511942e-06,
"loss": 2.1068,
"step": 35
},
{
"epoch": 0.04290822407628129,
"grad_norm": 0.2914709352436855,
"learning_rate": 9.965948403659267e-06,
"loss": 2.1358,
"step": 36
},
{
"epoch": 0.04410011918951132,
"grad_norm": 0.2801925528264536,
"learning_rate": 9.963718796475516e-06,
"loss": 2.0968,
"step": 37
},
{
"epoch": 0.04529201430274136,
"grad_norm": 0.2777208098847368,
"learning_rate": 9.961418756597703e-06,
"loss": 2.1118,
"step": 38
},
{
"epoch": 0.04648390941597139,
"grad_norm": 0.2627526369156367,
"learning_rate": 9.959048316662246e-06,
"loss": 2.1084,
"step": 39
},
{
"epoch": 0.04767580452920143,
"grad_norm": 0.27582542868025006,
"learning_rate": 9.956607510304508e-06,
"loss": 2.0636,
"step": 40
},
{
"epoch": 0.04886769964243146,
"grad_norm": 0.252514387355515,
"learning_rate": 9.95409637215831e-06,
"loss": 2.0842,
"step": 41
},
{
"epoch": 0.050059594755661505,
"grad_norm": 0.270250047965773,
"learning_rate": 9.951514937855455e-06,
"loss": 2.0476,
"step": 42
},
{
"epoch": 0.05125148986889154,
"grad_norm": 0.2626133865266297,
"learning_rate": 9.948863244025202e-06,
"loss": 2.0777,
"step": 43
},
{
"epoch": 0.052443384982121574,
"grad_norm": 0.25699273367046915,
"learning_rate": 9.94614132829377e-06,
"loss": 2.0944,
"step": 44
},
{
"epoch": 0.05363528009535161,
"grad_norm": 0.27340926527011333,
"learning_rate": 9.943349229283781e-06,
"loss": 2.0887,
"step": 45
},
{
"epoch": 0.054827175208581644,
"grad_norm": 0.24251549192329058,
"learning_rate": 9.94048698661373e-06,
"loss": 2.1024,
"step": 46
},
{
"epoch": 0.05601907032181168,
"grad_norm": 0.2575396666859324,
"learning_rate": 9.937554640897414e-06,
"loss": 2.083,
"step": 47
},
{
"epoch": 0.057210965435041714,
"grad_norm": 0.23558811037647728,
"learning_rate": 9.934552233743353e-06,
"loss": 2.0819,
"step": 48
},
{
"epoch": 0.058402860548271755,
"grad_norm": 0.24407061789389187,
"learning_rate": 9.931479807754209e-06,
"loss": 2.0793,
"step": 49
},
{
"epoch": 0.05959475566150179,
"grad_norm": 0.23931691434033772,
"learning_rate": 9.928337406526172e-06,
"loss": 2.1159,
"step": 50
},
{
"epoch": 0.060786650774731825,
"grad_norm": 0.23796062784470082,
"learning_rate": 9.925125074648352e-06,
"loss": 2.0824,
"step": 51
},
{
"epoch": 0.06197854588796186,
"grad_norm": 0.23466916513360747,
"learning_rate": 9.921842857702132e-06,
"loss": 2.0734,
"step": 52
},
{
"epoch": 0.0631704410011919,
"grad_norm": 0.23455835316060827,
"learning_rate": 9.918490802260538e-06,
"loss": 2.073,
"step": 53
},
{
"epoch": 0.06436233611442194,
"grad_norm": 0.2383847191126797,
"learning_rate": 9.915068955887564e-06,
"loss": 2.0621,
"step": 54
},
{
"epoch": 0.06555423122765197,
"grad_norm": 0.23851986543002354,
"learning_rate": 9.911577367137499e-06,
"loss": 2.0672,
"step": 55
},
{
"epoch": 0.066746126340882,
"grad_norm": 0.24283914455886954,
"learning_rate": 9.90801608555425e-06,
"loss": 2.0869,
"step": 56
},
{
"epoch": 0.06793802145411204,
"grad_norm": 0.24281061248826022,
"learning_rate": 9.904385161670626e-06,
"loss": 2.0755,
"step": 57
},
{
"epoch": 0.06912991656734208,
"grad_norm": 0.25365576984515253,
"learning_rate": 9.900684647007624e-06,
"loss": 2.097,
"step": 58
},
{
"epoch": 0.07032181168057211,
"grad_norm": 0.23883096980549337,
"learning_rate": 9.896914594073703e-06,
"loss": 2.0865,
"step": 59
},
{
"epoch": 0.07151370679380215,
"grad_norm": 0.25353821590332437,
"learning_rate": 9.893075056364034e-06,
"loss": 2.0597,
"step": 60
},
{
"epoch": 0.07270560190703218,
"grad_norm": 0.25666970441346676,
"learning_rate": 9.889166088359742e-06,
"loss": 2.0685,
"step": 61
},
{
"epoch": 0.07389749702026222,
"grad_norm": 0.25019996159388774,
"learning_rate": 9.885187745527132e-06,
"loss": 2.047,
"step": 62
},
{
"epoch": 0.07508939213349225,
"grad_norm": 0.25291578137222365,
"learning_rate": 9.881140084316907e-06,
"loss": 2.0874,
"step": 63
},
{
"epoch": 0.07628128724672228,
"grad_norm": 0.2636139470370503,
"learning_rate": 9.87702316216336e-06,
"loss": 2.0761,
"step": 64
},
{
"epoch": 0.07747318235995232,
"grad_norm": 0.267225445079766,
"learning_rate": 9.87283703748356e-06,
"loss": 2.0632,
"step": 65
},
{
"epoch": 0.07866507747318235,
"grad_norm": 0.26637209411345025,
"learning_rate": 9.868581769676532e-06,
"loss": 2.0465,
"step": 66
},
{
"epoch": 0.07985697258641239,
"grad_norm": 0.28338796894764773,
"learning_rate": 9.864257419122404e-06,
"loss": 2.0543,
"step": 67
},
{
"epoch": 0.08104886769964244,
"grad_norm": 0.26975885676108347,
"learning_rate": 9.859864047181551e-06,
"loss": 2.0612,
"step": 68
},
{
"epoch": 0.08224076281287247,
"grad_norm": 0.2782261191514193,
"learning_rate": 9.855401716193733e-06,
"loss": 2.0466,
"step": 69
},
{
"epoch": 0.08343265792610251,
"grad_norm": 0.3833695594063663,
"learning_rate": 9.850870489477198e-06,
"loss": 2.0592,
"step": 70
},
{
"epoch": 0.08462455303933254,
"grad_norm": 0.32898931613326715,
"learning_rate": 9.846270431327793e-06,
"loss": 2.0498,
"step": 71
},
{
"epoch": 0.08581644815256258,
"grad_norm": 0.3256809126198457,
"learning_rate": 9.841601607018052e-06,
"loss": 2.071,
"step": 72
},
{
"epoch": 0.08700834326579261,
"grad_norm": 0.3451979168625637,
"learning_rate": 9.83686408279626e-06,
"loss": 2.0497,
"step": 73
},
{
"epoch": 0.08820023837902265,
"grad_norm": 0.3114795827025238,
"learning_rate": 9.832057925885526e-06,
"loss": 2.034,
"step": 74
},
{
"epoch": 0.08939213349225268,
"grad_norm": 0.33202022850009677,
"learning_rate": 9.827183204482818e-06,
"loss": 2.0324,
"step": 75
},
{
"epoch": 0.09058402860548272,
"grad_norm": 0.36121743903908576,
"learning_rate": 9.822239987757999e-06,
"loss": 2.0491,
"step": 76
},
{
"epoch": 0.09177592371871275,
"grad_norm": 0.33897099215895576,
"learning_rate": 9.817228345852853e-06,
"loss": 2.043,
"step": 77
},
{
"epoch": 0.09296781883194279,
"grad_norm": 0.37620997544394924,
"learning_rate": 9.812148349880076e-06,
"loss": 2.0364,
"step": 78
},
{
"epoch": 0.09415971394517282,
"grad_norm": 0.33013400624562517,
"learning_rate": 9.807000071922279e-06,
"loss": 2.0375,
"step": 79
},
{
"epoch": 0.09535160905840286,
"grad_norm": 0.3453355696988705,
"learning_rate": 9.801783585030959e-06,
"loss": 2.0357,
"step": 80
},
{
"epoch": 0.09654350417163289,
"grad_norm": 0.36374461914208417,
"learning_rate": 9.79649896322546e-06,
"loss": 2.0358,
"step": 81
},
{
"epoch": 0.09773539928486293,
"grad_norm": 0.3949186996890207,
"learning_rate": 9.791146281491935e-06,
"loss": 2.0576,
"step": 82
},
{
"epoch": 0.09892729439809297,
"grad_norm": 0.5347002177369765,
"learning_rate": 9.785725615782262e-06,
"loss": 2.0536,
"step": 83
},
{
"epoch": 0.10011918951132301,
"grad_norm": 0.7315743014064646,
"learning_rate": 9.780237043012988e-06,
"loss": 2.0382,
"step": 84
},
{
"epoch": 0.10131108462455304,
"grad_norm": 0.8662501044705759,
"learning_rate": 9.774680641064223e-06,
"loss": 2.0254,
"step": 85
},
{
"epoch": 0.10250297973778308,
"grad_norm": 1.0007184298369625,
"learning_rate": 9.769056488778538e-06,
"loss": 2.0507,
"step": 86
},
{
"epoch": 0.10369487485101311,
"grad_norm": 0.6169288025962048,
"learning_rate": 9.76336466595985e-06,
"loss": 2.0605,
"step": 87
},
{
"epoch": 0.10488676996424315,
"grad_norm": 0.4654550573627205,
"learning_rate": 9.757605253372283e-06,
"loss": 2.0064,
"step": 88
},
{
"epoch": 0.10607866507747318,
"grad_norm": 0.7568695915967251,
"learning_rate": 9.751778332739033e-06,
"loss": 2.0206,
"step": 89
},
{
"epoch": 0.10727056019070322,
"grad_norm": 0.7116290914175033,
"learning_rate": 9.745883986741196e-06,
"loss": 2.0276,
"step": 90
},
{
"epoch": 0.10846245530393325,
"grad_norm": 0.4534995774251596,
"learning_rate": 9.739922299016601e-06,
"loss": 2.0372,
"step": 91
},
{
"epoch": 0.10965435041716329,
"grad_norm": 0.5202368477526325,
"learning_rate": 9.733893354158628e-06,
"loss": 2.0281,
"step": 92
},
{
"epoch": 0.11084624553039332,
"grad_norm": 0.6041387580680142,
"learning_rate": 9.727797237714991e-06,
"loss": 2.0148,
"step": 93
},
{
"epoch": 0.11203814064362336,
"grad_norm": 0.45293705742449053,
"learning_rate": 9.721634036186545e-06,
"loss": 2.0175,
"step": 94
},
{
"epoch": 0.11323003575685339,
"grad_norm": 0.4793331693139531,
"learning_rate": 9.715403837026046e-06,
"loss": 2.0328,
"step": 95
},
{
"epoch": 0.11442193087008343,
"grad_norm": 0.5140021564481994,
"learning_rate": 9.709106728636913e-06,
"loss": 2.0143,
"step": 96
},
{
"epoch": 0.11561382598331346,
"grad_norm": 0.5507096777112976,
"learning_rate": 9.702742800371972e-06,
"loss": 2.0451,
"step": 97
},
{
"epoch": 0.11680572109654351,
"grad_norm": 0.48239551212919374,
"learning_rate": 9.69631214253219e-06,
"loss": 2.0241,
"step": 98
},
{
"epoch": 0.11799761620977355,
"grad_norm": 0.48328110314880524,
"learning_rate": 9.689814846365399e-06,
"loss": 2.003,
"step": 99
},
{
"epoch": 0.11918951132300358,
"grad_norm": 0.5285205749893114,
"learning_rate": 9.68325100406499e-06,
"loss": 2.0333,
"step": 100
},
{
"epoch": 0.12038140643623362,
"grad_norm": 0.6086649755855322,
"learning_rate": 9.676620708768608e-06,
"loss": 2.0468,
"step": 101
},
{
"epoch": 0.12157330154946365,
"grad_norm": 0.6688243861727331,
"learning_rate": 9.669924054556836e-06,
"loss": 2.0052,
"step": 102
},
{
"epoch": 0.12276519666269368,
"grad_norm": 0.7098729745438024,
"learning_rate": 9.663161136451862e-06,
"loss": 2.0201,
"step": 103
},
{
"epoch": 0.12395709177592372,
"grad_norm": 0.7561634464725003,
"learning_rate": 9.656332050416118e-06,
"loss": 2.005,
"step": 104
},
{
"epoch": 0.12514898688915377,
"grad_norm": 0.7900403950856617,
"learning_rate": 9.64943689335093e-06,
"loss": 2.0312,
"step": 105
},
{
"epoch": 0.1263408820023838,
"grad_norm": 0.7565057129935538,
"learning_rate": 9.642475763095134e-06,
"loss": 2.034,
"step": 106
},
{
"epoch": 0.12753277711561384,
"grad_norm": 0.6925499516277225,
"learning_rate": 9.635448758423703e-06,
"loss": 2.0172,
"step": 107
},
{
"epoch": 0.12872467222884387,
"grad_norm": 0.5469713913154514,
"learning_rate": 9.628355979046325e-06,
"loss": 2.0306,
"step": 108
},
{
"epoch": 0.1299165673420739,
"grad_norm": 0.532846103968638,
"learning_rate": 9.621197525606e-06,
"loss": 2.0313,
"step": 109
},
{
"epoch": 0.13110846245530394,
"grad_norm": 0.5107064338016527,
"learning_rate": 9.613973499677613e-06,
"loss": 2.0483,
"step": 110
},
{
"epoch": 0.13230035756853398,
"grad_norm": 0.6341330772425801,
"learning_rate": 9.606684003766493e-06,
"loss": 2.0222,
"step": 111
},
{
"epoch": 0.133492252681764,
"grad_norm": 0.5976219627881748,
"learning_rate": 9.599329141306946e-06,
"loss": 2.0074,
"step": 112
},
{
"epoch": 0.13468414779499405,
"grad_norm": 0.5847751917110514,
"learning_rate": 9.591909016660806e-06,
"loss": 2.0206,
"step": 113
},
{
"epoch": 0.13587604290822408,
"grad_norm": 0.6813522136748844,
"learning_rate": 9.584423735115938e-06,
"loss": 2.0178,
"step": 114
},
{
"epoch": 0.13706793802145412,
"grad_norm": 0.767208353338879,
"learning_rate": 9.576873402884756e-06,
"loss": 1.9957,
"step": 115
},
{
"epoch": 0.13825983313468415,
"grad_norm": 0.9969526484589852,
"learning_rate": 9.569258127102708e-06,
"loss": 2.0152,
"step": 116
},
{
"epoch": 0.1394517282479142,
"grad_norm": 1.14614371471204,
"learning_rate": 9.561578015826758e-06,
"loss": 2.0156,
"step": 117
},
{
"epoch": 0.14064362336114422,
"grad_norm": 0.693890999588814,
"learning_rate": 9.553833178033856e-06,
"loss": 2.015,
"step": 118
},
{
"epoch": 0.14183551847437426,
"grad_norm": 0.6655521940110969,
"learning_rate": 9.546023723619387e-06,
"loss": 2.0357,
"step": 119
},
{
"epoch": 0.1430274135876043,
"grad_norm": 0.8792717595050646,
"learning_rate": 9.538149763395611e-06,
"loss": 2.0057,
"step": 120
},
{
"epoch": 0.14421930870083433,
"grad_norm": 0.9075837339408256,
"learning_rate": 9.530211409090104e-06,
"loss": 2.0324,
"step": 121
},
{
"epoch": 0.14541120381406436,
"grad_norm": 0.8920741190168875,
"learning_rate": 9.522208773344147e-06,
"loss": 1.9948,
"step": 122
},
{
"epoch": 0.1466030989272944,
"grad_norm": 0.8574927630149499,
"learning_rate": 9.514141969711155e-06,
"loss": 2.019,
"step": 123
},
{
"epoch": 0.14779499404052443,
"grad_norm": 0.6343463765213274,
"learning_rate": 9.506011112655045e-06,
"loss": 2.0193,
"step": 124
},
{
"epoch": 0.14898688915375446,
"grad_norm": 0.5630972285804464,
"learning_rate": 9.497816317548625e-06,
"loss": 2.0057,
"step": 125
},
{
"epoch": 0.1501787842669845,
"grad_norm": 0.7579610388968056,
"learning_rate": 9.489557700671948e-06,
"loss": 2.0315,
"step": 126
},
{
"epoch": 0.15137067938021453,
"grad_norm": 0.6850629250779653,
"learning_rate": 9.481235379210671e-06,
"loss": 2.001,
"step": 127
},
{
"epoch": 0.15256257449344457,
"grad_norm": 0.5362542526140824,
"learning_rate": 9.472849471254386e-06,
"loss": 2.0316,
"step": 128
},
{
"epoch": 0.1537544696066746,
"grad_norm": 0.608628527433765,
"learning_rate": 9.46440009579494e-06,
"loss": 2.035,
"step": 129
},
{
"epoch": 0.15494636471990464,
"grad_norm": 0.5093840827042088,
"learning_rate": 9.455887372724761e-06,
"loss": 2.0273,
"step": 130
},
{
"epoch": 0.15613825983313467,
"grad_norm": 0.646651425294055,
"learning_rate": 9.447311422835141e-06,
"loss": 2.0337,
"step": 131
},
{
"epoch": 0.1573301549463647,
"grad_norm": 0.6171589347028325,
"learning_rate": 9.438672367814532e-06,
"loss": 2.0111,
"step": 132
},
{
"epoch": 0.15852205005959474,
"grad_norm": 0.607124578385374,
"learning_rate": 9.429970330246817e-06,
"loss": 2.0207,
"step": 133
},
{
"epoch": 0.15971394517282478,
"grad_norm": 0.6668755869782658,
"learning_rate": 9.421205433609568e-06,
"loss": 2.0174,
"step": 134
},
{
"epoch": 0.16090584028605484,
"grad_norm": 0.7092639336616874,
"learning_rate": 9.412377802272296e-06,
"loss": 2.0061,
"step": 135
},
{
"epoch": 0.16209773539928488,
"grad_norm": 0.7386024648965732,
"learning_rate": 9.40348756149469e-06,
"loss": 2.0126,
"step": 136
},
{
"epoch": 0.1632896305125149,
"grad_norm": 0.6374704813920733,
"learning_rate": 9.39453483742483e-06,
"loss": 2.0176,
"step": 137
},
{
"epoch": 0.16448152562574495,
"grad_norm": 0.514905378407023,
"learning_rate": 9.385519757097405e-06,
"loss": 2.0055,
"step": 138
},
{
"epoch": 0.16567342073897498,
"grad_norm": 0.625583671688313,
"learning_rate": 9.376442448431911e-06,
"loss": 2.0109,
"step": 139
},
{
"epoch": 0.16686531585220502,
"grad_norm": 0.6190722916976653,
"learning_rate": 9.367303040230828e-06,
"loss": 1.9939,
"step": 140
},
{
"epoch": 0.16805721096543505,
"grad_norm": 0.5659222906567583,
"learning_rate": 9.358101662177804e-06,
"loss": 2.0111,
"step": 141
},
{
"epoch": 0.16924910607866508,
"grad_norm": 0.6584496167747385,
"learning_rate": 9.348838444835798e-06,
"loss": 2.0185,
"step": 142
},
{
"epoch": 0.17044100119189512,
"grad_norm": 0.5257356541865075,
"learning_rate": 9.33951351964525e-06,
"loss": 2.0167,
"step": 143
},
{
"epoch": 0.17163289630512515,
"grad_norm": 0.5343239683640106,
"learning_rate": 9.330127018922195e-06,
"loss": 2.0058,
"step": 144
},
{
"epoch": 0.1728247914183552,
"grad_norm": 0.5602849015914332,
"learning_rate": 9.320679075856396e-06,
"loss": 1.9952,
"step": 145
},
{
"epoch": 0.17401668653158522,
"grad_norm": 0.509174624093658,
"learning_rate": 9.311169824509454e-06,
"loss": 2.0035,
"step": 146
},
{
"epoch": 0.17520858164481526,
"grad_norm": 0.6065116610936728,
"learning_rate": 9.301599399812904e-06,
"loss": 1.9989,
"step": 147
},
{
"epoch": 0.1764004767580453,
"grad_norm": 0.6025058237653309,
"learning_rate": 9.291967937566297e-06,
"loss": 2.015,
"step": 148
},
{
"epoch": 0.17759237187127533,
"grad_norm": 0.5966629218921442,
"learning_rate": 9.28227557443528e-06,
"loss": 1.9871,
"step": 149
},
{
"epoch": 0.17878426698450536,
"grad_norm": 0.6244177338742471,
"learning_rate": 9.272522447949652e-06,
"loss": 1.9916,
"step": 150
},
{
"epoch": 0.1799761620977354,
"grad_norm": 0.522440075076418,
"learning_rate": 9.262708696501412e-06,
"loss": 1.9997,
"step": 151
},
{
"epoch": 0.18116805721096543,
"grad_norm": 0.5640728239700662,
"learning_rate": 9.252834459342801e-06,
"loss": 2.003,
"step": 152
},
{
"epoch": 0.18235995232419547,
"grad_norm": 0.6822460944537364,
"learning_rate": 9.242899876584317e-06,
"loss": 2.0198,
"step": 153
},
{
"epoch": 0.1835518474374255,
"grad_norm": 0.6013920222643127,
"learning_rate": 9.232905089192733e-06,
"loss": 1.983,
"step": 154
},
{
"epoch": 0.18474374255065554,
"grad_norm": 0.6210431332187637,
"learning_rate": 9.222850238989104e-06,
"loss": 1.9815,
"step": 155
},
{
"epoch": 0.18593563766388557,
"grad_norm": 0.5536506251912162,
"learning_rate": 9.21273546864673e-06,
"loss": 1.9943,
"step": 156
},
{
"epoch": 0.1871275327771156,
"grad_norm": 0.5108824250251738,
"learning_rate": 9.202560921689165e-06,
"loss": 1.9875,
"step": 157
},
{
"epoch": 0.18831942789034564,
"grad_norm": 0.6703972638895684,
"learning_rate": 9.192326742488153e-06,
"loss": 2.0054,
"step": 158
},
{
"epoch": 0.18951132300357568,
"grad_norm": 0.6911385466049688,
"learning_rate": 9.182033076261591e-06,
"loss": 2.013,
"step": 159
},
{
"epoch": 0.1907032181168057,
"grad_norm": 0.8228547705270176,
"learning_rate": 9.171680069071472e-06,
"loss": 2.0079,
"step": 160
},
{
"epoch": 0.19189511323003575,
"grad_norm": 0.8318482910273874,
"learning_rate": 9.161267867821802e-06,
"loss": 2.0116,
"step": 161
},
{
"epoch": 0.19308700834326578,
"grad_norm": 0.6993770001635832,
"learning_rate": 9.150796620256526e-06,
"loss": 2.0104,
"step": 162
},
{
"epoch": 0.19427890345649582,
"grad_norm": 0.6963815969965594,
"learning_rate": 9.140266474957421e-06,
"loss": 1.9932,
"step": 163
},
{
"epoch": 0.19547079856972585,
"grad_norm": 0.687540193587627,
"learning_rate": 9.129677581342e-06,
"loss": 1.9844,
"step": 164
},
{
"epoch": 0.1966626936829559,
"grad_norm": 0.6315324748513748,
"learning_rate": 9.11903008966138e-06,
"loss": 1.9964,
"step": 165
},
{
"epoch": 0.19785458879618595,
"grad_norm": 0.5152807583074759,
"learning_rate": 9.10832415099816e-06,
"loss": 2.0027,
"step": 166
},
{
"epoch": 0.19904648390941598,
"grad_norm": 0.4708357523523462,
"learning_rate": 9.097559917264268e-06,
"loss": 2.007,
"step": 167
},
{
"epoch": 0.20023837902264602,
"grad_norm": 0.5659309675022438,
"learning_rate": 9.086737541198812e-06,
"loss": 2.0065,
"step": 168
},
{
"epoch": 0.20143027413587605,
"grad_norm": 0.5973723979176943,
"learning_rate": 9.07585717636591e-06,
"loss": 1.9963,
"step": 169
},
{
"epoch": 0.2026221692491061,
"grad_norm": 0.612759197221063,
"learning_rate": 9.064918977152517e-06,
"loss": 2.0189,
"step": 170
},
{
"epoch": 0.20381406436233612,
"grad_norm": 0.6368297841192448,
"learning_rate": 9.053923098766218e-06,
"loss": 1.9996,
"step": 171
},
{
"epoch": 0.20500595947556616,
"grad_norm": 0.6267340913957593,
"learning_rate": 9.042869697233046e-06,
"loss": 2.0081,
"step": 172
},
{
"epoch": 0.2061978545887962,
"grad_norm": 0.5997679592985574,
"learning_rate": 9.031758929395259e-06,
"loss": 2.0087,
"step": 173
},
{
"epoch": 0.20738974970202623,
"grad_norm": 0.6540359851514235,
"learning_rate": 9.020590952909105e-06,
"loss": 1.9862,
"step": 174
},
{
"epoch": 0.20858164481525626,
"grad_norm": 0.6304008000188193,
"learning_rate": 9.009365926242603e-06,
"loss": 1.9845,
"step": 175
},
{
"epoch": 0.2097735399284863,
"grad_norm": 0.49409981260012525,
"learning_rate": 8.998084008673284e-06,
"loss": 1.9865,
"step": 176
},
{
"epoch": 0.21096543504171633,
"grad_norm": 0.428992104451379,
"learning_rate": 8.986745360285933e-06,
"loss": 1.9775,
"step": 177
},
{
"epoch": 0.21215733015494637,
"grad_norm": 0.4544484558085694,
"learning_rate": 8.975350141970312e-06,
"loss": 1.9974,
"step": 178
},
{
"epoch": 0.2133492252681764,
"grad_norm": 0.47713373163398903,
"learning_rate": 8.963898515418885e-06,
"loss": 1.9986,
"step": 179
},
{
"epoch": 0.21454112038140644,
"grad_norm": 0.5128102686619308,
"learning_rate": 8.952390643124524e-06,
"loss": 1.9926,
"step": 180
},
{
"epoch": 0.21573301549463647,
"grad_norm": 0.49123637812302784,
"learning_rate": 8.940826688378196e-06,
"loss": 2.0068,
"step": 181
},
{
"epoch": 0.2169249106078665,
"grad_norm": 0.4670667432350283,
"learning_rate": 8.929206815266653e-06,
"loss": 1.998,
"step": 182
},
{
"epoch": 0.21811680572109654,
"grad_norm": 0.5026402806403492,
"learning_rate": 8.917531188670096e-06,
"loss": 2.0023,
"step": 183
},
{
"epoch": 0.21930870083432658,
"grad_norm": 0.5146023032179888,
"learning_rate": 8.905799974259845e-06,
"loss": 1.9917,
"step": 184
},
{
"epoch": 0.2205005959475566,
"grad_norm": 0.500813938615368,
"learning_rate": 8.89401333849598e-06,
"loss": 2.0046,
"step": 185
},
{
"epoch": 0.22169249106078665,
"grad_norm": 0.5241153656092717,
"learning_rate": 8.882171448624988e-06,
"loss": 2.004,
"step": 186
},
{
"epoch": 0.22288438617401668,
"grad_norm": 0.5455210954026811,
"learning_rate": 8.870274472677376e-06,
"loss": 2.0136,
"step": 187
},
{
"epoch": 0.22407628128724671,
"grad_norm": 0.6182614320674238,
"learning_rate": 8.8583225794653e-06,
"loss": 1.9745,
"step": 188
},
{
"epoch": 0.22526817640047675,
"grad_norm": 0.7203972482184511,
"learning_rate": 8.846315938580163e-06,
"loss": 1.9876,
"step": 189
},
{
"epoch": 0.22646007151370678,
"grad_norm": 0.7651134846710912,
"learning_rate": 8.834254720390214e-06,
"loss": 2.0039,
"step": 190
},
{
"epoch": 0.22765196662693682,
"grad_norm": 0.717395085062428,
"learning_rate": 8.82213909603812e-06,
"loss": 1.9923,
"step": 191
},
{
"epoch": 0.22884386174016685,
"grad_norm": 0.6974046079010195,
"learning_rate": 8.80996923743855e-06,
"loss": 1.9902,
"step": 192
},
{
"epoch": 0.2300357568533969,
"grad_norm": 0.5749230359569363,
"learning_rate": 8.797745317275727e-06,
"loss": 2.0077,
"step": 193
},
{
"epoch": 0.23122765196662692,
"grad_norm": 0.47763377533604173,
"learning_rate": 8.78546750900098e-06,
"loss": 2.0175,
"step": 194
},
{
"epoch": 0.232419547079857,
"grad_norm": 0.4868384029481758,
"learning_rate": 8.773135986830289e-06,
"loss": 1.9817,
"step": 195
},
{
"epoch": 0.23361144219308702,
"grad_norm": 0.5411631589460403,
"learning_rate": 8.760750925741799e-06,
"loss": 2.0191,
"step": 196
},
{
"epoch": 0.23480333730631706,
"grad_norm": 0.5991085184799008,
"learning_rate": 8.748312501473351e-06,
"loss": 1.9872,
"step": 197
},
{
"epoch": 0.2359952324195471,
"grad_norm": 0.6561276515835338,
"learning_rate": 8.735820890519981e-06,
"loss": 1.9851,
"step": 198
},
{
"epoch": 0.23718712753277713,
"grad_norm": 0.7063577334823914,
"learning_rate": 8.723276270131422e-06,
"loss": 1.9897,
"step": 199
},
{
"epoch": 0.23837902264600716,
"grad_norm": 0.7581038228065401,
"learning_rate": 8.710678818309576e-06,
"loss": 2.0025,
"step": 200
},
{
"epoch": 0.2395709177592372,
"grad_norm": 0.7115966613137586,
"learning_rate": 8.698028713806005e-06,
"loss": 2.0004,
"step": 201
},
{
"epoch": 0.24076281287246723,
"grad_norm": 0.5976258958997295,
"learning_rate": 8.68532613611938e-06,
"loss": 2.018,
"step": 202
},
{
"epoch": 0.24195470798569726,
"grad_norm": 0.43540172054622217,
"learning_rate": 8.672571265492944e-06,
"loss": 1.9989,
"step": 203
},
{
"epoch": 0.2431466030989273,
"grad_norm": 0.5216426023045612,
"learning_rate": 8.659764282911948e-06,
"loss": 1.9866,
"step": 204
},
{
"epoch": 0.24433849821215733,
"grad_norm": 0.6613860116484914,
"learning_rate": 8.64690537010109e-06,
"loss": 2.0061,
"step": 205
},
{
"epoch": 0.24553039332538737,
"grad_norm": 0.7138301888755583,
"learning_rate": 8.63399470952193e-06,
"loss": 2.0107,
"step": 206
},
{
"epoch": 0.2467222884386174,
"grad_norm": 0.7998521068632918,
"learning_rate": 8.621032484370299e-06,
"loss": 1.9856,
"step": 207
},
{
"epoch": 0.24791418355184744,
"grad_norm": 0.6733799007638906,
"learning_rate": 8.60801887857371e-06,
"loss": 1.9789,
"step": 208
},
{
"epoch": 0.24910607866507747,
"grad_norm": 0.4890141413650463,
"learning_rate": 8.594954076788736e-06,
"loss": 1.9966,
"step": 209
},
{
"epoch": 0.25029797377830754,
"grad_norm": 0.510254285654425,
"learning_rate": 8.5818382643984e-06,
"loss": 2.0033,
"step": 210
},
{
"epoch": 0.25148986889153757,
"grad_norm": 0.6736096737562903,
"learning_rate": 8.56867162750954e-06,
"loss": 1.9882,
"step": 211
},
{
"epoch": 0.2526817640047676,
"grad_norm": 0.688224238343655,
"learning_rate": 8.555454352950161e-06,
"loss": 1.9826,
"step": 212
},
{
"epoch": 0.25387365911799764,
"grad_norm": 0.5310568361772406,
"learning_rate": 8.542186628266801e-06,
"loss": 2.018,
"step": 213
},
{
"epoch": 0.2550655542312277,
"grad_norm": 0.4622700149348845,
"learning_rate": 8.528868641721857e-06,
"loss": 1.9873,
"step": 214
},
{
"epoch": 0.2562574493444577,
"grad_norm": 0.44850296625902714,
"learning_rate": 8.515500582290914e-06,
"loss": 1.9738,
"step": 215
},
{
"epoch": 0.25744934445768775,
"grad_norm": 0.5800104445256365,
"learning_rate": 8.502082639660068e-06,
"loss": 2.0033,
"step": 216
},
{
"epoch": 0.2586412395709178,
"grad_norm": 0.5571007121924001,
"learning_rate": 8.488615004223233e-06,
"loss": 2.0097,
"step": 217
},
{
"epoch": 0.2598331346841478,
"grad_norm": 0.5363110521997889,
"learning_rate": 8.475097867079437e-06,
"loss": 1.9826,
"step": 218
},
{
"epoch": 0.26102502979737785,
"grad_norm": 0.46575794642736956,
"learning_rate": 8.461531420030117e-06,
"loss": 2.0129,
"step": 219
},
{
"epoch": 0.2622169249106079,
"grad_norm": 0.40917886114681945,
"learning_rate": 8.44791585557639e-06,
"loss": 2.0047,
"step": 220
},
{
"epoch": 0.2634088200238379,
"grad_norm": 0.428624008942813,
"learning_rate": 8.434251366916323e-06,
"loss": 1.9781,
"step": 221
},
{
"epoch": 0.26460071513706795,
"grad_norm": 0.4571746297128128,
"learning_rate": 8.420538147942196e-06,
"loss": 1.9844,
"step": 222
},
{
"epoch": 0.265792610250298,
"grad_norm": 0.47157884654181986,
"learning_rate": 8.406776393237748e-06,
"loss": 1.9985,
"step": 223
},
{
"epoch": 0.266984505363528,
"grad_norm": 0.46012310079193414,
"learning_rate": 8.392966298075413e-06,
"loss": 1.9945,
"step": 224
},
{
"epoch": 0.26817640047675806,
"grad_norm": 0.4551526365374971,
"learning_rate": 8.379108058413553e-06,
"loss": 1.9778,
"step": 225
},
{
"epoch": 0.2693682955899881,
"grad_norm": 0.4810916725254239,
"learning_rate": 8.36520187089368e-06,
"loss": 1.9814,
"step": 226
},
{
"epoch": 0.27056019070321813,
"grad_norm": 0.46258784460873204,
"learning_rate": 8.351247932837655e-06,
"loss": 1.9719,
"step": 227
},
{
"epoch": 0.27175208581644816,
"grad_norm": 0.45411997594863557,
"learning_rate": 8.337246442244902e-06,
"loss": 1.9753,
"step": 228
},
{
"epoch": 0.2729439809296782,
"grad_norm": 0.43996967181045016,
"learning_rate": 8.32319759778959e-06,
"loss": 2.0033,
"step": 229
},
{
"epoch": 0.27413587604290823,
"grad_norm": 0.507769478588206,
"learning_rate": 8.309101598817812e-06,
"loss": 2.0024,
"step": 230
},
{
"epoch": 0.27532777115613827,
"grad_norm": 0.48069601950891877,
"learning_rate": 8.294958645344766e-06,
"loss": 1.9824,
"step": 231
},
{
"epoch": 0.2765196662693683,
"grad_norm": 0.5157028595077698,
"learning_rate": 8.280768938051909e-06,
"loss": 1.9699,
"step": 232
},
{
"epoch": 0.27771156138259834,
"grad_norm": 0.579814229455722,
"learning_rate": 8.266532678284103e-06,
"loss": 1.984,
"step": 233
},
{
"epoch": 0.2789034564958284,
"grad_norm": 0.627324817155187,
"learning_rate": 8.252250068046784e-06,
"loss": 1.9861,
"step": 234
},
{
"epoch": 0.2800953516090584,
"grad_norm": 0.593805814527224,
"learning_rate": 8.23792131000306e-06,
"loss": 1.9693,
"step": 235
},
{
"epoch": 0.28128724672228844,
"grad_norm": 0.6552471095231857,
"learning_rate": 8.223546607470863e-06,
"loss": 1.9862,
"step": 236
},
{
"epoch": 0.2824791418355185,
"grad_norm": 0.6028562723069028,
"learning_rate": 8.209126164420056e-06,
"loss": 1.981,
"step": 237
},
{
"epoch": 0.2836710369487485,
"grad_norm": 0.5873677146224183,
"learning_rate": 8.19466018546953e-06,
"loss": 1.9967,
"step": 238
},
{
"epoch": 0.28486293206197855,
"grad_norm": 0.5279550914843492,
"learning_rate": 8.18014887588431e-06,
"loss": 1.9836,
"step": 239
},
{
"epoch": 0.2860548271752086,
"grad_norm": 0.5159083129491098,
"learning_rate": 8.165592441572648e-06,
"loss": 1.9906,
"step": 240
},
{
"epoch": 0.2872467222884386,
"grad_norm": 0.5540993574066266,
"learning_rate": 8.150991089083081e-06,
"loss": 1.9953,
"step": 241
},
{
"epoch": 0.28843861740166865,
"grad_norm": 0.6125101838648868,
"learning_rate": 8.13634502560152e-06,
"loss": 2.0038,
"step": 242
},
{
"epoch": 0.2896305125148987,
"grad_norm": 0.5519571584252633,
"learning_rate": 8.1216544589483e-06,
"loss": 1.9983,
"step": 243
},
{
"epoch": 0.2908224076281287,
"grad_norm": 0.544350413761365,
"learning_rate": 8.106919597575238e-06,
"loss": 1.9718,
"step": 244
},
{
"epoch": 0.29201430274135876,
"grad_norm": 0.5664660915352969,
"learning_rate": 8.092140650562665e-06,
"loss": 1.9671,
"step": 245
},
{
"epoch": 0.2932061978545888,
"grad_norm": 0.6296577119121265,
"learning_rate": 8.07731782761647e-06,
"loss": 1.9881,
"step": 246
},
{
"epoch": 0.2943980929678188,
"grad_norm": 0.4926647346394942,
"learning_rate": 8.062451339065116e-06,
"loss": 1.9609,
"step": 247
},
{
"epoch": 0.29558998808104886,
"grad_norm": 0.4624410592429987,
"learning_rate": 8.047541395856661e-06,
"loss": 1.9974,
"step": 248
},
{
"epoch": 0.2967818831942789,
"grad_norm": 0.559079602861405,
"learning_rate": 8.032588209555765e-06,
"loss": 1.999,
"step": 249
},
{
"epoch": 0.29797377830750893,
"grad_norm": 0.5257803282078808,
"learning_rate": 8.017591992340682e-06,
"loss": 1.99,
"step": 250
},
{
"epoch": 0.29916567342073896,
"grad_norm": 0.4532797658436555,
"learning_rate": 8.002552957000254e-06,
"loss": 1.9961,
"step": 251
},
{
"epoch": 0.300357568533969,
"grad_norm": 0.4967793482713224,
"learning_rate": 7.987471316930892e-06,
"loss": 1.9859,
"step": 252
},
{
"epoch": 0.30154946364719903,
"grad_norm": 0.5216037784287865,
"learning_rate": 7.972347286133549e-06,
"loss": 1.9775,
"step": 253
},
{
"epoch": 0.30274135876042907,
"grad_norm": 0.44165364383086597,
"learning_rate": 7.957181079210676e-06,
"loss": 1.9834,
"step": 254
},
{
"epoch": 0.3039332538736591,
"grad_norm": 0.4525734716636921,
"learning_rate": 7.941972911363187e-06,
"loss": 1.9834,
"step": 255
},
{
"epoch": 0.30512514898688914,
"grad_norm": 0.4399784793186879,
"learning_rate": 7.926722998387398e-06,
"loss": 1.9883,
"step": 256
},
{
"epoch": 0.3063170441001192,
"grad_norm": 0.4302293917353196,
"learning_rate": 7.911431556671967e-06,
"loss": 1.9888,
"step": 257
},
{
"epoch": 0.3075089392133492,
"grad_norm": 0.5077527400267277,
"learning_rate": 7.896098803194828e-06,
"loss": 1.9814,
"step": 258
},
{
"epoch": 0.30870083432657924,
"grad_norm": 0.5455522386411445,
"learning_rate": 7.880724955520105e-06,
"loss": 2.0022,
"step": 259
},
{
"epoch": 0.3098927294398093,
"grad_norm": 0.4734204507402147,
"learning_rate": 7.865310231795026e-06,
"loss": 1.9883,
"step": 260
},
{
"epoch": 0.3110846245530393,
"grad_norm": 0.46463402034819734,
"learning_rate": 7.849854850746834e-06,
"loss": 1.9871,
"step": 261
},
{
"epoch": 0.31227651966626935,
"grad_norm": 0.48102107314994796,
"learning_rate": 7.83435903167968e-06,
"loss": 1.9817,
"step": 262
},
{
"epoch": 0.3134684147794994,
"grad_norm": 0.49443270213282037,
"learning_rate": 7.818822994471504e-06,
"loss": 1.9726,
"step": 263
},
{
"epoch": 0.3146603098927294,
"grad_norm": 0.5141146391688594,
"learning_rate": 7.80324695957093e-06,
"loss": 1.9843,
"step": 264
},
{
"epoch": 0.31585220500595945,
"grad_norm": 0.48124603321709436,
"learning_rate": 7.78763114799412e-06,
"loss": 1.9713,
"step": 265
},
{
"epoch": 0.3170441001191895,
"grad_norm": 0.4573264323307654,
"learning_rate": 7.771975781321655e-06,
"loss": 1.9855,
"step": 266
},
{
"epoch": 0.3182359952324195,
"grad_norm": 0.497648183015366,
"learning_rate": 7.75628108169538e-06,
"loss": 1.9857,
"step": 267
},
{
"epoch": 0.31942789034564956,
"grad_norm": 0.5260277669621191,
"learning_rate": 7.740547271815253e-06,
"loss": 1.9867,
"step": 268
},
{
"epoch": 0.3206197854588796,
"grad_norm": 0.5443051292540823,
"learning_rate": 7.72477457493619e-06,
"loss": 1.9742,
"step": 269
},
{
"epoch": 0.3218116805721097,
"grad_norm": 0.4269306335257882,
"learning_rate": 7.70896321486489e-06,
"loss": 1.9768,
"step": 270
},
{
"epoch": 0.3230035756853397,
"grad_norm": 0.42010336549578936,
"learning_rate": 7.693113415956674e-06,
"loss": 1.9799,
"step": 271
},
{
"epoch": 0.32419547079856975,
"grad_norm": 0.46762767407360706,
"learning_rate": 7.677225403112277e-06,
"loss": 1.9843,
"step": 272
},
{
"epoch": 0.3253873659117998,
"grad_norm": 0.5057942342132519,
"learning_rate": 7.661299401774677e-06,
"loss": 1.9828,
"step": 273
},
{
"epoch": 0.3265792610250298,
"grad_norm": 0.5952967303729245,
"learning_rate": 7.645335637925897e-06,
"loss": 1.9796,
"step": 274
},
{
"epoch": 0.32777115613825986,
"grad_norm": 0.5273900870276448,
"learning_rate": 7.629334338083774e-06,
"loss": 1.9766,
"step": 275
},
{
"epoch": 0.3289630512514899,
"grad_norm": 0.45567977499071444,
"learning_rate": 7.6132957292987795e-06,
"loss": 1.9617,
"step": 276
},
{
"epoch": 0.3301549463647199,
"grad_norm": 0.5179821998771547,
"learning_rate": 7.597220039150768e-06,
"loss": 1.9863,
"step": 277
},
{
"epoch": 0.33134684147794996,
"grad_norm": 0.5651280024042905,
"learning_rate": 7.58110749574577e-06,
"loss": 1.9821,
"step": 278
},
{
"epoch": 0.33253873659118,
"grad_norm": 0.46901304611627237,
"learning_rate": 7.564958327712735e-06,
"loss": 1.9798,
"step": 279
},
{
"epoch": 0.33373063170441003,
"grad_norm": 0.4359604972801817,
"learning_rate": 7.5487727642003075e-06,
"loss": 1.9789,
"step": 280
},
{
"epoch": 0.33492252681764006,
"grad_norm": 0.4691874050085417,
"learning_rate": 7.532551034873558e-06,
"loss": 1.9858,
"step": 281
},
{
"epoch": 0.3361144219308701,
"grad_norm": 0.49078578351565005,
"learning_rate": 7.516293369910737e-06,
"loss": 1.9905,
"step": 282
},
{
"epoch": 0.33730631704410013,
"grad_norm": 0.4810414634759214,
"learning_rate": 7.500000000000001e-06,
"loss": 1.9757,
"step": 283
},
{
"epoch": 0.33849821215733017,
"grad_norm": 0.4004089110467056,
"learning_rate": 7.483671156336142e-06,
"loss": 1.9743,
"step": 284
},
{
"epoch": 0.3396901072705602,
"grad_norm": 0.48370804553795343,
"learning_rate": 7.467307070617309e-06,
"loss": 1.9882,
"step": 285
},
{
"epoch": 0.34088200238379024,
"grad_norm": 0.3916208994505171,
"learning_rate": 7.4509079750417154e-06,
"loss": 1.9906,
"step": 286
},
{
"epoch": 0.3420738974970203,
"grad_norm": 0.4440622088562717,
"learning_rate": 7.43447410230435e-06,
"loss": 1.9756,
"step": 287
},
{
"epoch": 0.3432657926102503,
"grad_norm": 0.4151369125535769,
"learning_rate": 7.418005685593669e-06,
"loss": 1.98,
"step": 288
},
{
"epoch": 0.34445768772348034,
"grad_norm": 0.42888099521221656,
"learning_rate": 7.4015029585882925e-06,
"loss": 1.9597,
"step": 289
},
{
"epoch": 0.3456495828367104,
"grad_norm": 0.4031068379998817,
"learning_rate": 7.384966155453686e-06,
"loss": 1.9909,
"step": 290
},
{
"epoch": 0.3468414779499404,
"grad_norm": 0.4288403976952624,
"learning_rate": 7.368395510838838e-06,
"loss": 1.9715,
"step": 291
},
{
"epoch": 0.34803337306317045,
"grad_norm": 0.4047372419449946,
"learning_rate": 7.351791259872929e-06,
"loss": 1.9933,
"step": 292
},
{
"epoch": 0.3492252681764005,
"grad_norm": 0.42040782221308876,
"learning_rate": 7.335153638162005e-06,
"loss": 1.9875,
"step": 293
},
{
"epoch": 0.3504171632896305,
"grad_norm": 0.40151800416240474,
"learning_rate": 7.318482881785612e-06,
"loss": 1.9827,
"step": 294
},
{
"epoch": 0.35160905840286055,
"grad_norm": 0.40534989415691614,
"learning_rate": 7.301779227293475e-06,
"loss": 1.9899,
"step": 295
},
{
"epoch": 0.3528009535160906,
"grad_norm": 0.41437334261849135,
"learning_rate": 7.285042911702116e-06,
"loss": 1.9761,
"step": 296
},
{
"epoch": 0.3539928486293206,
"grad_norm": 0.43461149682609845,
"learning_rate": 7.268274172491508e-06,
"loss": 2.0009,
"step": 297
},
{
"epoch": 0.35518474374255066,
"grad_norm": 0.42255392024397564,
"learning_rate": 7.251473247601698e-06,
"loss": 1.9805,
"step": 298
},
{
"epoch": 0.3563766388557807,
"grad_norm": 0.44303489088588954,
"learning_rate": 7.234640375429427e-06,
"loss": 1.9824,
"step": 299
},
{
"epoch": 0.3575685339690107,
"grad_norm": 0.43499397642762283,
"learning_rate": 7.217775794824759e-06,
"loss": 1.9785,
"step": 300
},
{
"epoch": 0.35876042908224076,
"grad_norm": 0.4208326930599362,
"learning_rate": 7.200879745087681e-06,
"loss": 1.994,
"step": 301
},
{
"epoch": 0.3599523241954708,
"grad_norm": 0.4452902733869807,
"learning_rate": 7.183952465964711e-06,
"loss": 1.9741,
"step": 302
},
{
"epoch": 0.36114421930870083,
"grad_norm": 0.4764827599963297,
"learning_rate": 7.166994197645497e-06,
"loss": 1.9826,
"step": 303
},
{
"epoch": 0.36233611442193087,
"grad_norm": 0.4460964876445021,
"learning_rate": 7.150005180759411e-06,
"loss": 1.9808,
"step": 304
},
{
"epoch": 0.3635280095351609,
"grad_norm": 0.42052492138452646,
"learning_rate": 7.132985656372126e-06,
"loss": 1.9652,
"step": 305
},
{
"epoch": 0.36471990464839094,
"grad_norm": 0.3578650107792017,
"learning_rate": 7.115935865982205e-06,
"loss": 2.0037,
"step": 306
},
{
"epoch": 0.36591179976162097,
"grad_norm": 0.4213839735073625,
"learning_rate": 7.098856051517673e-06,
"loss": 1.9983,
"step": 307
},
{
"epoch": 0.367103694874851,
"grad_norm": 0.41798689890135715,
"learning_rate": 7.0817464553325764e-06,
"loss": 1.9833,
"step": 308
},
{
"epoch": 0.36829558998808104,
"grad_norm": 0.46301273631831313,
"learning_rate": 7.064607320203552e-06,
"loss": 1.9785,
"step": 309
},
{
"epoch": 0.3694874851013111,
"grad_norm": 0.3853768039281196,
"learning_rate": 7.047438889326377e-06,
"loss": 1.9953,
"step": 310
},
{
"epoch": 0.3706793802145411,
"grad_norm": 0.39106836774943315,
"learning_rate": 7.030241406312528e-06,
"loss": 1.9908,
"step": 311
},
{
"epoch": 0.37187127532777114,
"grad_norm": 0.3557595574168793,
"learning_rate": 7.013015115185706e-06,
"loss": 1.9711,
"step": 312
},
{
"epoch": 0.3730631704410012,
"grad_norm": 0.462884994313804,
"learning_rate": 6.9957602603783944e-06,
"loss": 2.0036,
"step": 313
},
{
"epoch": 0.3742550655542312,
"grad_norm": 0.42933967393666006,
"learning_rate": 6.978477086728375e-06,
"loss": 1.9843,
"step": 314
},
{
"epoch": 0.37544696066746125,
"grad_norm": 0.43775594546905017,
"learning_rate": 6.961165839475262e-06,
"loss": 1.9799,
"step": 315
},
{
"epoch": 0.3766388557806913,
"grad_norm": 0.40786517623408314,
"learning_rate": 6.9438267642570216e-06,
"loss": 1.9674,
"step": 316
},
{
"epoch": 0.3778307508939213,
"grad_norm": 0.3812009351969576,
"learning_rate": 6.926460107106483e-06,
"loss": 1.9835,
"step": 317
},
{
"epoch": 0.37902264600715135,
"grad_norm": 0.43023083569572035,
"learning_rate": 6.909066114447847e-06,
"loss": 1.9843,
"step": 318
},
{
"epoch": 0.3802145411203814,
"grad_norm": 0.4055444095073271,
"learning_rate": 6.891645033093196e-06,
"loss": 1.9802,
"step": 319
},
{
"epoch": 0.3814064362336114,
"grad_norm": 0.43023837992568775,
"learning_rate": 6.874197110238986e-06,
"loss": 1.9756,
"step": 320
},
{
"epoch": 0.38259833134684146,
"grad_norm": 0.4061991284550457,
"learning_rate": 6.8567225934625385e-06,
"loss": 1.9793,
"step": 321
},
{
"epoch": 0.3837902264600715,
"grad_norm": 0.46263343121001,
"learning_rate": 6.8392217307185325e-06,
"loss": 1.9888,
"step": 322
},
{
"epoch": 0.38498212157330153,
"grad_norm": 0.5183393565092786,
"learning_rate": 6.8216947703354815e-06,
"loss": 1.9678,
"step": 323
},
{
"epoch": 0.38617401668653156,
"grad_norm": 0.4914054711777072,
"learning_rate": 6.804141961012213e-06,
"loss": 1.9774,
"step": 324
},
{
"epoch": 0.3873659117997616,
"grad_norm": 0.38775497500354755,
"learning_rate": 6.786563551814333e-06,
"loss": 1.9843,
"step": 325
},
{
"epoch": 0.38855780691299163,
"grad_norm": 0.4175239392741797,
"learning_rate": 6.7689597921707065e-06,
"loss": 1.9812,
"step": 326
},
{
"epoch": 0.38974970202622167,
"grad_norm": 0.5074081729621598,
"learning_rate": 6.7513309318698975e-06,
"loss": 1.9673,
"step": 327
},
{
"epoch": 0.3909415971394517,
"grad_norm": 0.5759724338089542,
"learning_rate": 6.733677221056645e-06,
"loss": 1.9595,
"step": 328
},
{
"epoch": 0.39213349225268174,
"grad_norm": 0.45858283981603526,
"learning_rate": 6.715998910228296e-06,
"loss": 1.979,
"step": 329
},
{
"epoch": 0.3933253873659118,
"grad_norm": 0.39590782238976335,
"learning_rate": 6.698296250231271e-06,
"loss": 1.981,
"step": 330
},
{
"epoch": 0.39451728247914186,
"grad_norm": 0.5514883543457016,
"learning_rate": 6.68056949225748e-06,
"loss": 1.9754,
"step": 331
},
{
"epoch": 0.3957091775923719,
"grad_norm": 0.5367006385906758,
"learning_rate": 6.6628188878407806e-06,
"loss": 1.9688,
"step": 332
},
{
"epoch": 0.39690107270560193,
"grad_norm": 0.4563028045170266,
"learning_rate": 6.645044688853396e-06,
"loss": 1.9792,
"step": 333
},
{
"epoch": 0.39809296781883197,
"grad_norm": 0.4705275885547744,
"learning_rate": 6.627247147502343e-06,
"loss": 1.9751,
"step": 334
},
{
"epoch": 0.399284862932062,
"grad_norm": 0.39053085326929393,
"learning_rate": 6.609426516325859e-06,
"loss": 1.9809,
"step": 335
},
{
"epoch": 0.40047675804529204,
"grad_norm": 0.46336889396641767,
"learning_rate": 6.591583048189812e-06,
"loss": 1.9819,
"step": 336
},
{
"epoch": 0.40166865315852207,
"grad_norm": 0.41312116285494427,
"learning_rate": 6.573716996284114e-06,
"loss": 1.9956,
"step": 337
},
{
"epoch": 0.4028605482717521,
"grad_norm": 0.4261033537644772,
"learning_rate": 6.555828614119132e-06,
"loss": 1.9864,
"step": 338
},
{
"epoch": 0.40405244338498214,
"grad_norm": 0.5571802621996744,
"learning_rate": 6.537918155522089e-06,
"loss": 1.9881,
"step": 339
},
{
"epoch": 0.4052443384982122,
"grad_norm": 0.42763390364122206,
"learning_rate": 6.519985874633454e-06,
"loss": 1.981,
"step": 340
},
{
"epoch": 0.4064362336114422,
"grad_norm": 0.41484190699219026,
"learning_rate": 6.502032025903356e-06,
"loss": 1.9641,
"step": 341
},
{
"epoch": 0.40762812872467225,
"grad_norm": 0.3838791164718351,
"learning_rate": 6.484056864087948e-06,
"loss": 1.9709,
"step": 342
},
{
"epoch": 0.4088200238379023,
"grad_norm": 0.4023689175266171,
"learning_rate": 6.4660606442458155e-06,
"loss": 1.9713,
"step": 343
},
{
"epoch": 0.4100119189511323,
"grad_norm": 0.4336427044211903,
"learning_rate": 6.4480436217343366e-06,
"loss": 1.9534,
"step": 344
},
{
"epoch": 0.41120381406436235,
"grad_norm": 0.37598773624858467,
"learning_rate": 6.430006052206083e-06,
"loss": 1.9603,
"step": 345
},
{
"epoch": 0.4123957091775924,
"grad_norm": 0.43416807891817494,
"learning_rate": 6.411948191605164e-06,
"loss": 1.9787,
"step": 346
},
{
"epoch": 0.4135876042908224,
"grad_norm": 0.3977800151758,
"learning_rate": 6.393870296163616e-06,
"loss": 1.9916,
"step": 347
},
{
"epoch": 0.41477949940405245,
"grad_norm": 0.4008696555982334,
"learning_rate": 6.375772622397762e-06,
"loss": 1.9804,
"step": 348
},
{
"epoch": 0.4159713945172825,
"grad_norm": 0.3451532285909086,
"learning_rate": 6.357655427104562e-06,
"loss": 1.9663,
"step": 349
},
{
"epoch": 0.4171632896305125,
"grad_norm": 0.4341428658767691,
"learning_rate": 6.339518967357985e-06,
"loss": 1.9744,
"step": 350
},
{
"epoch": 0.41835518474374256,
"grad_norm": 0.37680689737786904,
"learning_rate": 6.321363500505348e-06,
"loss": 1.994,
"step": 351
},
{
"epoch": 0.4195470798569726,
"grad_norm": 0.36788506489233713,
"learning_rate": 6.3031892841636685e-06,
"loss": 1.9847,
"step": 352
},
{
"epoch": 0.42073897497020263,
"grad_norm": 0.38396929856917666,
"learning_rate": 6.284996576216014e-06,
"loss": 1.9722,
"step": 353
},
{
"epoch": 0.42193087008343266,
"grad_norm": 0.3512841210948969,
"learning_rate": 6.266785634807838e-06,
"loss": 1.9504,
"step": 354
},
{
"epoch": 0.4231227651966627,
"grad_norm": 0.3841371233710849,
"learning_rate": 6.248556718343314e-06,
"loss": 1.9997,
"step": 355
},
{
"epoch": 0.42431466030989273,
"grad_norm": 0.41345223603319187,
"learning_rate": 6.230310085481677e-06,
"loss": 1.9754,
"step": 356
},
{
"epoch": 0.42550655542312277,
"grad_norm": 0.36115831056461284,
"learning_rate": 6.212045995133543e-06,
"loss": 1.9735,
"step": 357
},
{
"epoch": 0.4266984505363528,
"grad_norm": 0.37667258015583416,
"learning_rate": 6.193764706457249e-06,
"loss": 1.9669,
"step": 358
},
{
"epoch": 0.42789034564958284,
"grad_norm": 0.34439222602136627,
"learning_rate": 6.175466478855161e-06,
"loss": 1.9788,
"step": 359
},
{
"epoch": 0.42908224076281287,
"grad_norm": 0.406649190145765,
"learning_rate": 6.157151571970005e-06,
"loss": 1.9868,
"step": 360
},
{
"epoch": 0.4302741358760429,
"grad_norm": 0.37410746997126837,
"learning_rate": 6.13882024568117e-06,
"loss": 1.9588,
"step": 361
},
{
"epoch": 0.43146603098927294,
"grad_norm": 0.38935925565712926,
"learning_rate": 6.1204727601010396e-06,
"loss": 1.978,
"step": 362
},
{
"epoch": 0.432657926102503,
"grad_norm": 0.3934047570972324,
"learning_rate": 6.10210937557128e-06,
"loss": 1.9728,
"step": 363
},
{
"epoch": 0.433849821215733,
"grad_norm": 0.3740037082900391,
"learning_rate": 6.083730352659158e-06,
"loss": 1.9777,
"step": 364
},
{
"epoch": 0.43504171632896305,
"grad_norm": 0.3962866525803316,
"learning_rate": 6.065335952153846e-06,
"loss": 1.9753,
"step": 365
},
{
"epoch": 0.4362336114421931,
"grad_norm": 0.3703123980920405,
"learning_rate": 6.0469264350627075e-06,
"loss": 1.9685,
"step": 366
},
{
"epoch": 0.4374255065554231,
"grad_norm": 0.3772080775482272,
"learning_rate": 6.0285020626076115e-06,
"loss": 1.9918,
"step": 367
},
{
"epoch": 0.43861740166865315,
"grad_norm": 0.36096021522163296,
"learning_rate": 6.010063096221215e-06,
"loss": 1.9857,
"step": 368
},
{
"epoch": 0.4398092967818832,
"grad_norm": 0.4027363280332516,
"learning_rate": 5.991609797543253e-06,
"loss": 1.9772,
"step": 369
},
{
"epoch": 0.4410011918951132,
"grad_norm": 0.36449407433194586,
"learning_rate": 5.973142428416829e-06,
"loss": 1.9926,
"step": 370
},
{
"epoch": 0.44219308700834326,
"grad_norm": 0.41922022657177943,
"learning_rate": 5.954661250884704e-06,
"loss": 1.9851,
"step": 371
},
{
"epoch": 0.4433849821215733,
"grad_norm": 0.3957989777206615,
"learning_rate": 5.936166527185565e-06,
"loss": 1.9627,
"step": 372
},
{
"epoch": 0.4445768772348033,
"grad_norm": 0.39452398707557135,
"learning_rate": 5.91765851975032e-06,
"loss": 1.9876,
"step": 373
},
{
"epoch": 0.44576877234803336,
"grad_norm": 0.39493419711592515,
"learning_rate": 5.899137491198364e-06,
"loss": 1.9686,
"step": 374
},
{
"epoch": 0.4469606674612634,
"grad_norm": 0.4099934527801523,
"learning_rate": 5.880603704333851e-06,
"loss": 1.9534,
"step": 375
},
{
"epoch": 0.44815256257449343,
"grad_norm": 0.36964455654061956,
"learning_rate": 5.862057422141979e-06,
"loss": 1.9523,
"step": 376
},
{
"epoch": 0.44934445768772346,
"grad_norm": 0.3902869598970143,
"learning_rate": 5.843498907785236e-06,
"loss": 1.9554,
"step": 377
},
{
"epoch": 0.4505363528009535,
"grad_norm": 0.3969483119716555,
"learning_rate": 5.8249284245996905e-06,
"loss": 1.9907,
"step": 378
},
{
"epoch": 0.45172824791418353,
"grad_norm": 0.3960234150743317,
"learning_rate": 5.806346236091232e-06,
"loss": 1.9906,
"step": 379
},
{
"epoch": 0.45292014302741357,
"grad_norm": 0.3810498242078963,
"learning_rate": 5.78775260593185e-06,
"loss": 1.9612,
"step": 380
},
{
"epoch": 0.4541120381406436,
"grad_norm": 0.385855393557767,
"learning_rate": 5.769147797955882e-06,
"loss": 1.9736,
"step": 381
},
{
"epoch": 0.45530393325387364,
"grad_norm": 0.34406815893035153,
"learning_rate": 5.7505320761562735e-06,
"loss": 1.9864,
"step": 382
},
{
"epoch": 0.4564958283671037,
"grad_norm": 0.37884788683749326,
"learning_rate": 5.731905704680834e-06,
"loss": 1.9878,
"step": 383
},
{
"epoch": 0.4576877234803337,
"grad_norm": 0.38229911057814764,
"learning_rate": 5.713268947828484e-06,
"loss": 1.9677,
"step": 384
},
{
"epoch": 0.45887961859356374,
"grad_norm": 0.3930195565597414,
"learning_rate": 5.694622070045507e-06,
"loss": 1.9831,
"step": 385
},
{
"epoch": 0.4600715137067938,
"grad_norm": 0.35771899505040233,
"learning_rate": 5.6759653359218e-06,
"loss": 1.938,
"step": 386
},
{
"epoch": 0.4612634088200238,
"grad_norm": 0.3844248408562967,
"learning_rate": 5.657299010187116e-06,
"loss": 1.983,
"step": 387
},
{
"epoch": 0.46245530393325385,
"grad_norm": 0.374339760496431,
"learning_rate": 5.638623357707304e-06,
"loss": 1.9696,
"step": 388
},
{
"epoch": 0.4636471990464839,
"grad_norm": 0.4187861158867821,
"learning_rate": 5.6199386434805615e-06,
"loss": 1.9678,
"step": 389
},
{
"epoch": 0.464839094159714,
"grad_norm": 0.37470925657624427,
"learning_rate": 5.601245132633662e-06,
"loss": 1.9708,
"step": 390
},
{
"epoch": 0.466030989272944,
"grad_norm": 0.43682382668647773,
"learning_rate": 5.582543090418203e-06,
"loss": 1.9742,
"step": 391
},
{
"epoch": 0.46722288438617404,
"grad_norm": 0.38062716223853055,
"learning_rate": 5.563832782206835e-06,
"loss": 1.956,
"step": 392
},
{
"epoch": 0.4684147794994041,
"grad_norm": 0.39166492023793653,
"learning_rate": 5.5451144734895e-06,
"loss": 1.9479,
"step": 393
},
{
"epoch": 0.4696066746126341,
"grad_norm": 0.45740493772589974,
"learning_rate": 5.526388429869663e-06,
"loss": 1.9757,
"step": 394
},
{
"epoch": 0.47079856972586415,
"grad_norm": 0.3532441760302746,
"learning_rate": 5.507654917060541e-06,
"loss": 1.9774,
"step": 395
},
{
"epoch": 0.4719904648390942,
"grad_norm": 0.4162677343329253,
"learning_rate": 5.48891420088134e-06,
"loss": 1.9837,
"step": 396
},
{
"epoch": 0.4731823599523242,
"grad_norm": 0.4634604848492295,
"learning_rate": 5.470166547253476e-06,
"loss": 1.9923,
"step": 397
},
{
"epoch": 0.47437425506555425,
"grad_norm": 0.4001952153469404,
"learning_rate": 5.451412222196801e-06,
"loss": 1.969,
"step": 398
},
{
"epoch": 0.4755661501787843,
"grad_norm": 0.4117431494583168,
"learning_rate": 5.432651491825837e-06,
"loss": 1.9609,
"step": 399
},
{
"epoch": 0.4767580452920143,
"grad_norm": 0.4367947660920832,
"learning_rate": 5.4138846223459895e-06,
"loss": 1.9621,
"step": 400
},
{
"epoch": 0.47794994040524436,
"grad_norm": 0.34963770890851276,
"learning_rate": 5.395111880049775e-06,
"loss": 1.9564,
"step": 401
},
{
"epoch": 0.4791418355184744,
"grad_norm": 0.4080401962751008,
"learning_rate": 5.376333531313046e-06,
"loss": 1.9689,
"step": 402
},
{
"epoch": 0.4803337306317044,
"grad_norm": 0.39779512665663647,
"learning_rate": 5.3575498425912046e-06,
"loss": 1.9752,
"step": 403
},
{
"epoch": 0.48152562574493446,
"grad_norm": 0.3494078316294088,
"learning_rate": 5.338761080415425e-06,
"loss": 1.988,
"step": 404
},
{
"epoch": 0.4827175208581645,
"grad_norm": 0.38403810675465305,
"learning_rate": 5.319967511388871e-06,
"loss": 1.9849,
"step": 405
},
{
"epoch": 0.48390941597139453,
"grad_norm": 0.41925050485912146,
"learning_rate": 5.301169402182915e-06,
"loss": 1.9744,
"step": 406
},
{
"epoch": 0.48510131108462456,
"grad_norm": 0.3659050285550682,
"learning_rate": 5.28236701953335e-06,
"loss": 1.9594,
"step": 407
},
{
"epoch": 0.4862932061978546,
"grad_norm": 0.3779979519911562,
"learning_rate": 5.263560630236611e-06,
"loss": 1.969,
"step": 408
},
{
"epoch": 0.48748510131108463,
"grad_norm": 0.4051001024185403,
"learning_rate": 5.244750501145977e-06,
"loss": 1.9758,
"step": 409
},
{
"epoch": 0.48867699642431467,
"grad_norm": 0.3580954206397942,
"learning_rate": 5.225936899167803e-06,
"loss": 1.9712,
"step": 410
},
{
"epoch": 0.4898688915375447,
"grad_norm": 0.37492205319973293,
"learning_rate": 5.207120091257715e-06,
"loss": 1.9924,
"step": 411
},
{
"epoch": 0.49106078665077474,
"grad_norm": 0.3787755420296742,
"learning_rate": 5.188300344416834e-06,
"loss": 1.9607,
"step": 412
},
{
"epoch": 0.4922526817640048,
"grad_norm": 0.3594245434434773,
"learning_rate": 5.169477925687981e-06,
"loss": 1.9596,
"step": 413
},
{
"epoch": 0.4934445768772348,
"grad_norm": 0.4048509843155868,
"learning_rate": 5.15065310215189e-06,
"loss": 1.9811,
"step": 414
},
{
"epoch": 0.49463647199046484,
"grad_norm": 0.33930841548544644,
"learning_rate": 5.1318261409234185e-06,
"loss": 1.9785,
"step": 415
},
{
"epoch": 0.4958283671036949,
"grad_norm": 0.3971904008450457,
"learning_rate": 5.112997309147753e-06,
"loss": 1.9538,
"step": 416
},
{
"epoch": 0.4970202622169249,
"grad_norm": 0.4109703239083303,
"learning_rate": 5.094166873996632e-06,
"loss": 1.9442,
"step": 417
},
{
"epoch": 0.49821215733015495,
"grad_norm": 0.35849090963357355,
"learning_rate": 5.075335102664533e-06,
"loss": 1.9611,
"step": 418
},
{
"epoch": 0.499404052443385,
"grad_norm": 0.3315925723712266,
"learning_rate": 5.0565022623649e-06,
"loss": 1.9507,
"step": 419
},
{
"epoch": 0.5005959475566151,
"grad_norm": 0.40128345634186274,
"learning_rate": 5.037668620326343e-06,
"loss": 1.9965,
"step": 420
},
{
"epoch": 0.5017878426698451,
"grad_norm": 0.34631267401835186,
"learning_rate": 5.018834443788855e-06,
"loss": 1.9739,
"step": 421
},
{
"epoch": 0.5029797377830751,
"grad_norm": 0.37750605356600553,
"learning_rate": 5e-06,
"loss": 1.9577,
"step": 422
},
{
"epoch": 0.5041716328963052,
"grad_norm": 0.325413886379343,
"learning_rate": 4.9811655562111465e-06,
"loss": 1.964,
"step": 423
},
{
"epoch": 0.5053635280095352,
"grad_norm": 0.37792660484449137,
"learning_rate": 4.9623313796736575e-06,
"loss": 1.9834,
"step": 424
},
{
"epoch": 0.5065554231227652,
"grad_norm": 0.3212926587032829,
"learning_rate": 4.943497737635103e-06,
"loss": 1.9652,
"step": 425
},
{
"epoch": 0.5077473182359953,
"grad_norm": 0.3666539973322088,
"learning_rate": 4.9246648973354704e-06,
"loss": 1.9898,
"step": 426
},
{
"epoch": 0.5089392133492253,
"grad_norm": 0.3470498382172804,
"learning_rate": 4.905833126003371e-06,
"loss": 1.986,
"step": 427
},
{
"epoch": 0.5101311084624554,
"grad_norm": 0.3509551861996659,
"learning_rate": 4.887002690852249e-06,
"loss": 1.9765,
"step": 428
},
{
"epoch": 0.5113230035756854,
"grad_norm": 0.33773403719361406,
"learning_rate": 4.868173859076585e-06,
"loss": 1.9514,
"step": 429
},
{
"epoch": 0.5125148986889154,
"grad_norm": 0.33839162767720193,
"learning_rate": 4.849346897848111e-06,
"loss": 1.9671,
"step": 430
},
{
"epoch": 0.5137067938021455,
"grad_norm": 0.34429335199030947,
"learning_rate": 4.830522074312019e-06,
"loss": 1.9739,
"step": 431
},
{
"epoch": 0.5148986889153755,
"grad_norm": 0.35288845889112397,
"learning_rate": 4.811699655583167e-06,
"loss": 1.9912,
"step": 432
},
{
"epoch": 0.5160905840286055,
"grad_norm": 0.3461629113067177,
"learning_rate": 4.792879908742285e-06,
"loss": 1.9484,
"step": 433
},
{
"epoch": 0.5172824791418356,
"grad_norm": 0.3196675261690019,
"learning_rate": 4.774063100832199e-06,
"loss": 1.9688,
"step": 434
},
{
"epoch": 0.5184743742550656,
"grad_norm": 0.3392521279527564,
"learning_rate": 4.755249498854024e-06,
"loss": 1.9506,
"step": 435
},
{
"epoch": 0.5196662693682956,
"grad_norm": 0.3457448380814436,
"learning_rate": 4.736439369763391e-06,
"loss": 1.9743,
"step": 436
},
{
"epoch": 0.5208581644815257,
"grad_norm": 0.33206346746993015,
"learning_rate": 4.717632980466652e-06,
"loss": 1.9593,
"step": 437
},
{
"epoch": 0.5220500595947557,
"grad_norm": 0.3528235654912419,
"learning_rate": 4.698830597817087e-06,
"loss": 1.9665,
"step": 438
},
{
"epoch": 0.5232419547079857,
"grad_norm": 0.3556856155018991,
"learning_rate": 4.680032488611131e-06,
"loss": 1.9799,
"step": 439
},
{
"epoch": 0.5244338498212158,
"grad_norm": 0.32848860913310046,
"learning_rate": 4.661238919584578e-06,
"loss": 1.9803,
"step": 440
},
{
"epoch": 0.5256257449344458,
"grad_norm": 0.32687019162828723,
"learning_rate": 4.642450157408798e-06,
"loss": 1.9428,
"step": 441
},
{
"epoch": 0.5268176400476758,
"grad_norm": 0.3453824430384208,
"learning_rate": 4.623666468686956e-06,
"loss": 1.9822,
"step": 442
},
{
"epoch": 0.5280095351609059,
"grad_norm": 0.37003751019358744,
"learning_rate": 4.6048881199502265e-06,
"loss": 1.9483,
"step": 443
},
{
"epoch": 0.5292014302741359,
"grad_norm": 0.4147946881041239,
"learning_rate": 4.586115377654014e-06,
"loss": 1.9617,
"step": 444
},
{
"epoch": 0.5303933253873659,
"grad_norm": 0.3574077732974426,
"learning_rate": 4.567348508174164e-06,
"loss": 1.9583,
"step": 445
},
{
"epoch": 0.531585220500596,
"grad_norm": 0.40825647248866936,
"learning_rate": 4.548587777803198e-06,
"loss": 1.9804,
"step": 446
},
{
"epoch": 0.532777115613826,
"grad_norm": 0.37613255907704796,
"learning_rate": 4.529833452746526e-06,
"loss": 1.9927,
"step": 447
},
{
"epoch": 0.533969010727056,
"grad_norm": 0.594095776694764,
"learning_rate": 4.5110857991186606e-06,
"loss": 1.9719,
"step": 448
},
{
"epoch": 0.5351609058402861,
"grad_norm": 0.3717370719647907,
"learning_rate": 4.49234508293946e-06,
"loss": 1.9593,
"step": 449
},
{
"epoch": 0.5363528009535161,
"grad_norm": 0.3648799549586229,
"learning_rate": 4.47361157013034e-06,
"loss": 1.967,
"step": 450
},
{
"epoch": 0.5375446960667462,
"grad_norm": 0.33995726389121855,
"learning_rate": 4.454885526510501e-06,
"loss": 1.9753,
"step": 451
},
{
"epoch": 0.5387365911799762,
"grad_norm": 0.39251871849389397,
"learning_rate": 4.436167217793167e-06,
"loss": 1.9818,
"step": 452
},
{
"epoch": 0.5399284862932062,
"grad_norm": 0.3199928030279707,
"learning_rate": 4.417456909581798e-06,
"loss": 1.9552,
"step": 453
},
{
"epoch": 0.5411203814064363,
"grad_norm": 0.3669027651321596,
"learning_rate": 4.398754867366339e-06,
"loss": 1.9775,
"step": 454
},
{
"epoch": 0.5423122765196663,
"grad_norm": 0.3436432164393003,
"learning_rate": 4.38006135651944e-06,
"loss": 1.9772,
"step": 455
},
{
"epoch": 0.5435041716328963,
"grad_norm": 0.361749523201955,
"learning_rate": 4.361376642292698e-06,
"loss": 1.9683,
"step": 456
},
{
"epoch": 0.5446960667461264,
"grad_norm": 0.39560797233498957,
"learning_rate": 4.3427009898128865e-06,
"loss": 1.9671,
"step": 457
},
{
"epoch": 0.5458879618593564,
"grad_norm": 0.3602620583029035,
"learning_rate": 4.3240346640782014e-06,
"loss": 1.9944,
"step": 458
},
{
"epoch": 0.5470798569725864,
"grad_norm": 0.36119141344790967,
"learning_rate": 4.305377929954495e-06,
"loss": 1.9761,
"step": 459
},
{
"epoch": 0.5482717520858165,
"grad_norm": 0.3678120966781157,
"learning_rate": 4.286731052171518e-06,
"loss": 1.958,
"step": 460
},
{
"epoch": 0.5494636471990465,
"grad_norm": 0.3617639507211402,
"learning_rate": 4.268094295319167e-06,
"loss": 1.9813,
"step": 461
},
{
"epoch": 0.5506555423122765,
"grad_norm": 0.3608243381659533,
"learning_rate": 4.249467923843728e-06,
"loss": 1.9641,
"step": 462
},
{
"epoch": 0.5518474374255066,
"grad_norm": 0.36278415417064125,
"learning_rate": 4.23085220204412e-06,
"loss": 1.9709,
"step": 463
},
{
"epoch": 0.5530393325387366,
"grad_norm": 0.3588218797888413,
"learning_rate": 4.212247394068151e-06,
"loss": 1.9626,
"step": 464
},
{
"epoch": 0.5542312276519666,
"grad_norm": 0.33036367069937955,
"learning_rate": 4.19365376390877e-06,
"loss": 1.9832,
"step": 465
},
{
"epoch": 0.5554231227651967,
"grad_norm": 0.4074815848531431,
"learning_rate": 4.175071575400311e-06,
"loss": 1.9776,
"step": 466
},
{
"epoch": 0.5566150178784267,
"grad_norm": 0.34881797295660344,
"learning_rate": 4.1565010922147644e-06,
"loss": 1.957,
"step": 467
},
{
"epoch": 0.5578069129916567,
"grad_norm": 0.37520529930498075,
"learning_rate": 4.137942577858023e-06,
"loss": 1.975,
"step": 468
},
{
"epoch": 0.5589988081048868,
"grad_norm": 0.3630248536367911,
"learning_rate": 4.11939629566615e-06,
"loss": 1.9608,
"step": 469
},
{
"epoch": 0.5601907032181168,
"grad_norm": 0.33352062979381114,
"learning_rate": 4.100862508801639e-06,
"loss": 1.968,
"step": 470
},
{
"epoch": 0.5613825983313468,
"grad_norm": 0.3586501055080032,
"learning_rate": 4.082341480249681e-06,
"loss": 1.951,
"step": 471
},
{
"epoch": 0.5625744934445769,
"grad_norm": 0.32010968502296533,
"learning_rate": 4.063833472814437e-06,
"loss": 1.9712,
"step": 472
},
{
"epoch": 0.5637663885578069,
"grad_norm": 0.35914935623294864,
"learning_rate": 4.045338749115299e-06,
"loss": 1.9451,
"step": 473
},
{
"epoch": 0.564958283671037,
"grad_norm": 0.3215562828423304,
"learning_rate": 4.026857571583173e-06,
"loss": 1.9914,
"step": 474
},
{
"epoch": 0.566150178784267,
"grad_norm": 0.34368077323068136,
"learning_rate": 4.008390202456748e-06,
"loss": 1.9602,
"step": 475
},
{
"epoch": 0.567342073897497,
"grad_norm": 0.33832066930772653,
"learning_rate": 3.989936903778785e-06,
"loss": 1.9604,
"step": 476
},
{
"epoch": 0.5685339690107271,
"grad_norm": 0.3169050597259224,
"learning_rate": 3.971497937392388e-06,
"loss": 2.0011,
"step": 477
},
{
"epoch": 0.5697258641239571,
"grad_norm": 0.33977687443341886,
"learning_rate": 3.953073564937293e-06,
"loss": 1.9483,
"step": 478
},
{
"epoch": 0.5709177592371871,
"grad_norm": 0.31238016120669476,
"learning_rate": 3.934664047846157e-06,
"loss": 1.967,
"step": 479
},
{
"epoch": 0.5721096543504172,
"grad_norm": 0.3251899985092587,
"learning_rate": 3.916269647340843e-06,
"loss": 1.958,
"step": 480
},
{
"epoch": 0.5733015494636472,
"grad_norm": 0.34188581574139687,
"learning_rate": 3.897890624428721e-06,
"loss": 1.97,
"step": 481
},
{
"epoch": 0.5744934445768772,
"grad_norm": 0.3179020258722567,
"learning_rate": 3.879527239898962e-06,
"loss": 1.9713,
"step": 482
},
{
"epoch": 0.5756853396901073,
"grad_norm": 0.35414300160209977,
"learning_rate": 3.86117975431883e-06,
"loss": 1.9387,
"step": 483
},
{
"epoch": 0.5768772348033373,
"grad_norm": 0.296876137502102,
"learning_rate": 3.8428484280299975e-06,
"loss": 1.9918,
"step": 484
},
{
"epoch": 0.5780691299165673,
"grad_norm": 0.35059849213295274,
"learning_rate": 3.8245335211448404e-06,
"loss": 1.9622,
"step": 485
},
{
"epoch": 0.5792610250297974,
"grad_norm": 0.2899536086006706,
"learning_rate": 3.8062352935427526e-06,
"loss": 1.9727,
"step": 486
},
{
"epoch": 0.5804529201430274,
"grad_norm": 0.3244862339368592,
"learning_rate": 3.787954004866459e-06,
"loss": 1.9829,
"step": 487
},
{
"epoch": 0.5816448152562574,
"grad_norm": 0.31060144753736796,
"learning_rate": 3.769689914518326e-06,
"loss": 1.9743,
"step": 488
},
{
"epoch": 0.5828367103694875,
"grad_norm": 0.3081671121318371,
"learning_rate": 3.751443281656688e-06,
"loss": 1.9716,
"step": 489
},
{
"epoch": 0.5840286054827175,
"grad_norm": 0.28679657845355666,
"learning_rate": 3.733214365192162e-06,
"loss": 1.9836,
"step": 490
},
{
"epoch": 0.5852205005959475,
"grad_norm": 0.31077612486695794,
"learning_rate": 3.715003423783986e-06,
"loss": 1.9894,
"step": 491
},
{
"epoch": 0.5864123957091776,
"grad_norm": 0.2980657403471547,
"learning_rate": 3.696810715836332e-06,
"loss": 1.9712,
"step": 492
},
{
"epoch": 0.5876042908224076,
"grad_norm": 0.28507782391437864,
"learning_rate": 3.6786364994946543e-06,
"loss": 1.9652,
"step": 493
},
{
"epoch": 0.5887961859356377,
"grad_norm": 0.3076841882401857,
"learning_rate": 3.660481032642016e-06,
"loss": 1.9756,
"step": 494
},
{
"epoch": 0.5899880810488677,
"grad_norm": 0.2873213364073368,
"learning_rate": 3.6423445728954393e-06,
"loss": 1.9702,
"step": 495
},
{
"epoch": 0.5911799761620977,
"grad_norm": 0.30064962474416257,
"learning_rate": 3.6242273776022396e-06,
"loss": 1.9798,
"step": 496
},
{
"epoch": 0.5923718712753278,
"grad_norm": 0.30016520129470653,
"learning_rate": 3.6061297038363853e-06,
"loss": 1.9708,
"step": 497
},
{
"epoch": 0.5935637663885578,
"grad_norm": 0.3186216715211957,
"learning_rate": 3.5880518083948377e-06,
"loss": 1.9786,
"step": 498
},
{
"epoch": 0.5947556615017878,
"grad_norm": 0.3093775837624005,
"learning_rate": 3.5699939477939183e-06,
"loss": 1.9585,
"step": 499
},
{
"epoch": 0.5959475566150179,
"grad_norm": 0.28193348662211454,
"learning_rate": 3.5519563782656642e-06,
"loss": 1.9738,
"step": 500
},
{
"epoch": 0.5971394517282479,
"grad_norm": 0.32328773490671,
"learning_rate": 3.533939355754188e-06,
"loss": 1.9619,
"step": 501
},
{
"epoch": 0.5983313468414779,
"grad_norm": 0.30291671495352485,
"learning_rate": 3.5159431359120545e-06,
"loss": 1.9651,
"step": 502
},
{
"epoch": 0.599523241954708,
"grad_norm": 0.3080909269221942,
"learning_rate": 3.497967974096647e-06,
"loss": 1.9783,
"step": 503
},
{
"epoch": 0.600715137067938,
"grad_norm": 0.32314557640507674,
"learning_rate": 3.4800141253665463e-06,
"loss": 1.9657,
"step": 504
},
{
"epoch": 0.601907032181168,
"grad_norm": 0.29346056048517033,
"learning_rate": 3.4620818444779126e-06,
"loss": 1.9787,
"step": 505
},
{
"epoch": 0.6030989272943981,
"grad_norm": 0.3110390571856809,
"learning_rate": 3.4441713858808684e-06,
"loss": 1.9414,
"step": 506
},
{
"epoch": 0.6042908224076281,
"grad_norm": 0.31467381689979457,
"learning_rate": 3.426283003715886e-06,
"loss": 1.9619,
"step": 507
},
{
"epoch": 0.6054827175208581,
"grad_norm": 0.2969133354888754,
"learning_rate": 3.4084169518101896e-06,
"loss": 1.9604,
"step": 508
},
{
"epoch": 0.6066746126340882,
"grad_norm": 0.3184238842438653,
"learning_rate": 3.3905734836741415e-06,
"loss": 1.953,
"step": 509
},
{
"epoch": 0.6078665077473182,
"grad_norm": 0.2969150683168432,
"learning_rate": 3.3727528524976583e-06,
"loss": 1.9664,
"step": 510
},
{
"epoch": 0.6090584028605482,
"grad_norm": 0.33154057267330567,
"learning_rate": 3.354955311146606e-06,
"loss": 1.9776,
"step": 511
},
{
"epoch": 0.6102502979737783,
"grad_norm": 0.30901718720421373,
"learning_rate": 3.3371811121592203e-06,
"loss": 1.9917,
"step": 512
},
{
"epoch": 0.6114421930870083,
"grad_norm": 0.3212832298222802,
"learning_rate": 3.3194305077425215e-06,
"loss": 1.9928,
"step": 513
},
{
"epoch": 0.6126340882002383,
"grad_norm": 0.34130767861666084,
"learning_rate": 3.3017037497687303e-06,
"loss": 1.9501,
"step": 514
},
{
"epoch": 0.6138259833134684,
"grad_norm": 0.2919077388333617,
"learning_rate": 3.2840010897717045e-06,
"loss": 1.9657,
"step": 515
},
{
"epoch": 0.6150178784266984,
"grad_norm": 0.3277066008449366,
"learning_rate": 3.2663227789433573e-06,
"loss": 1.9602,
"step": 516
},
{
"epoch": 0.6162097735399285,
"grad_norm": 0.2903404769911658,
"learning_rate": 3.2486690681301046e-06,
"loss": 1.959,
"step": 517
},
{
"epoch": 0.6174016686531585,
"grad_norm": 0.284277433828357,
"learning_rate": 3.2310402078292956e-06,
"loss": 1.9718,
"step": 518
},
{
"epoch": 0.6185935637663885,
"grad_norm": 0.3258141085919218,
"learning_rate": 3.2134364481856663e-06,
"loss": 1.9612,
"step": 519
},
{
"epoch": 0.6197854588796186,
"grad_norm": 0.285408156114209,
"learning_rate": 3.1958580389877876e-06,
"loss": 1.9747,
"step": 520
},
{
"epoch": 0.6209773539928486,
"grad_norm": 0.3071499624906975,
"learning_rate": 3.178305229664519e-06,
"loss": 1.9781,
"step": 521
},
{
"epoch": 0.6221692491060786,
"grad_norm": 0.29430716274498264,
"learning_rate": 3.1607782692814683e-06,
"loss": 1.9785,
"step": 522
},
{
"epoch": 0.6233611442193087,
"grad_norm": 0.29446694445491767,
"learning_rate": 3.1432774065374628e-06,
"loss": 1.9651,
"step": 523
},
{
"epoch": 0.6245530393325387,
"grad_norm": 0.2868927792141283,
"learning_rate": 3.125802889761016e-06,
"loss": 1.9604,
"step": 524
},
{
"epoch": 0.6257449344457687,
"grad_norm": 0.3075894856023552,
"learning_rate": 3.1083549669068048e-06,
"loss": 1.981,
"step": 525
},
{
"epoch": 0.6269368295589988,
"grad_norm": 0.30553317063832414,
"learning_rate": 3.090933885552155e-06,
"loss": 1.968,
"step": 526
},
{
"epoch": 0.6281287246722288,
"grad_norm": 0.2883247866247332,
"learning_rate": 3.073539892893519e-06,
"loss": 1.9647,
"step": 527
},
{
"epoch": 0.6293206197854588,
"grad_norm": 0.3093327452992941,
"learning_rate": 3.0561732357429797e-06,
"loss": 1.9691,
"step": 528
},
{
"epoch": 0.6305125148986889,
"grad_norm": 0.2944434342418357,
"learning_rate": 3.0388341605247385e-06,
"loss": 1.9756,
"step": 529
},
{
"epoch": 0.6317044100119189,
"grad_norm": 0.3231077122645434,
"learning_rate": 3.021522913271627e-06,
"loss": 1.9774,
"step": 530
},
{
"epoch": 0.6328963051251489,
"grad_norm": 0.2937937539093132,
"learning_rate": 3.0042397396216076e-06,
"loss": 1.9813,
"step": 531
},
{
"epoch": 0.634088200238379,
"grad_norm": 0.33747028062165074,
"learning_rate": 2.9869848848142957e-06,
"loss": 1.9817,
"step": 532
},
{
"epoch": 0.635280095351609,
"grad_norm": 0.27860436170886715,
"learning_rate": 2.969758593687475e-06,
"loss": 1.995,
"step": 533
},
{
"epoch": 0.636471990464839,
"grad_norm": 0.2686660592261799,
"learning_rate": 2.952561110673623e-06,
"loss": 2.004,
"step": 534
},
{
"epoch": 0.6376638855780691,
"grad_norm": 0.3171126513844146,
"learning_rate": 2.9353926797964495e-06,
"loss": 1.9675,
"step": 535
},
{
"epoch": 0.6388557806912991,
"grad_norm": 0.26076405849359174,
"learning_rate": 2.9182535446674244e-06,
"loss": 1.9606,
"step": 536
},
{
"epoch": 0.6400476758045291,
"grad_norm": 0.311798441596794,
"learning_rate": 2.9011439484823287e-06,
"loss": 1.9566,
"step": 537
},
{
"epoch": 0.6412395709177592,
"grad_norm": 0.2667721525695941,
"learning_rate": 2.8840641340177955e-06,
"loss": 1.9571,
"step": 538
},
{
"epoch": 0.6424314660309892,
"grad_norm": 0.29165327528369395,
"learning_rate": 2.8670143436278757e-06,
"loss": 1.9648,
"step": 539
},
{
"epoch": 0.6436233611442194,
"grad_norm": 0.29487930858334793,
"learning_rate": 2.84999481924059e-06,
"loss": 1.9499,
"step": 540
},
{
"epoch": 0.6448152562574494,
"grad_norm": 0.31540084878211927,
"learning_rate": 2.8330058023545027e-06,
"loss": 1.9658,
"step": 541
},
{
"epoch": 0.6460071513706794,
"grad_norm": 0.2789685559518471,
"learning_rate": 2.8160475340352913e-06,
"loss": 1.9638,
"step": 542
},
{
"epoch": 0.6471990464839095,
"grad_norm": 0.28954283549505694,
"learning_rate": 2.799120254912321e-06,
"loss": 1.964,
"step": 543
},
{
"epoch": 0.6483909415971395,
"grad_norm": 0.29043220060176517,
"learning_rate": 2.7822242051752425e-06,
"loss": 1.9457,
"step": 544
},
{
"epoch": 0.6495828367103695,
"grad_norm": 0.268629176168656,
"learning_rate": 2.765359624570574e-06,
"loss": 1.9753,
"step": 545
},
{
"epoch": 0.6507747318235996,
"grad_norm": 0.29396871373699995,
"learning_rate": 2.7485267523983038e-06,
"loss": 1.9803,
"step": 546
},
{
"epoch": 0.6519666269368296,
"grad_norm": 0.2938578682137881,
"learning_rate": 2.731725827508494e-06,
"loss": 1.9559,
"step": 547
},
{
"epoch": 0.6531585220500596,
"grad_norm": 0.26444066496746194,
"learning_rate": 2.714957088297886e-06,
"loss": 1.9621,
"step": 548
},
{
"epoch": 0.6543504171632897,
"grad_norm": 0.2898176558803259,
"learning_rate": 2.6982207727065252e-06,
"loss": 1.9551,
"step": 549
},
{
"epoch": 0.6555423122765197,
"grad_norm": 0.3003676611598843,
"learning_rate": 2.681517118214389e-06,
"loss": 1.9841,
"step": 550
},
{
"epoch": 0.6567342073897497,
"grad_norm": 0.2592919375869367,
"learning_rate": 2.664846361837997e-06,
"loss": 1.976,
"step": 551
},
{
"epoch": 0.6579261025029798,
"grad_norm": 0.3266565084733632,
"learning_rate": 2.6482087401270705e-06,
"loss": 1.9564,
"step": 552
},
{
"epoch": 0.6591179976162098,
"grad_norm": 0.2995845038649281,
"learning_rate": 2.6316044891611633e-06,
"loss": 1.969,
"step": 553
},
{
"epoch": 0.6603098927294399,
"grad_norm": 0.2804027081600714,
"learning_rate": 2.6150338445463146e-06,
"loss": 1.9693,
"step": 554
},
{
"epoch": 0.6615017878426699,
"grad_norm": 0.27698419373196886,
"learning_rate": 2.5984970414117096e-06,
"loss": 1.9788,
"step": 555
},
{
"epoch": 0.6626936829558999,
"grad_norm": 0.31032114395815213,
"learning_rate": 2.5819943144063326e-06,
"loss": 1.9741,
"step": 556
},
{
"epoch": 0.66388557806913,
"grad_norm": 0.28800726045711933,
"learning_rate": 2.565525897695651e-06,
"loss": 1.9507,
"step": 557
},
{
"epoch": 0.66507747318236,
"grad_norm": 0.29802393651993614,
"learning_rate": 2.549092024958285e-06,
"loss": 1.9664,
"step": 558
},
{
"epoch": 0.66626936829559,
"grad_norm": 0.2982356345030979,
"learning_rate": 2.532692929382692e-06,
"loss": 1.9789,
"step": 559
},
{
"epoch": 0.6674612634088201,
"grad_norm": 0.2803035272437382,
"learning_rate": 2.51632884366386e-06,
"loss": 1.9609,
"step": 560
},
{
"epoch": 0.6686531585220501,
"grad_norm": 0.29369752020144174,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.9665,
"step": 561
},
{
"epoch": 0.6698450536352801,
"grad_norm": 0.2692763488935535,
"learning_rate": 2.4837066300892647e-06,
"loss": 1.9775,
"step": 562
},
{
"epoch": 0.6710369487485102,
"grad_norm": 0.2640671578025783,
"learning_rate": 2.4674489651264433e-06,
"loss": 1.9621,
"step": 563
},
{
"epoch": 0.6722288438617402,
"grad_norm": 0.2968222691817008,
"learning_rate": 2.4512272357996937e-06,
"loss": 1.956,
"step": 564
},
{
"epoch": 0.6734207389749702,
"grad_norm": 0.3011250889616646,
"learning_rate": 2.4350416722872657e-06,
"loss": 1.9775,
"step": 565
},
{
"epoch": 0.6746126340882003,
"grad_norm": 0.27706203721849776,
"learning_rate": 2.418892504254231e-06,
"loss": 1.9858,
"step": 566
},
{
"epoch": 0.6758045292014303,
"grad_norm": 0.2886529947325675,
"learning_rate": 2.402779960849232e-06,
"loss": 1.9778,
"step": 567
},
{
"epoch": 0.6769964243146603,
"grad_norm": 0.32555422289644703,
"learning_rate": 2.3867042707012234e-06,
"loss": 1.9652,
"step": 568
},
{
"epoch": 0.6781883194278904,
"grad_norm": 0.2728774574387877,
"learning_rate": 2.3706656619162278e-06,
"loss": 1.9556,
"step": 569
},
{
"epoch": 0.6793802145411204,
"grad_norm": 0.29791540079606743,
"learning_rate": 2.3546643620741054e-06,
"loss": 1.9665,
"step": 570
},
{
"epoch": 0.6805721096543504,
"grad_norm": 0.28429090975445814,
"learning_rate": 2.3387005982253218e-06,
"loss": 1.9947,
"step": 571
},
{
"epoch": 0.6817640047675805,
"grad_norm": 0.2933689275167632,
"learning_rate": 2.322774596887726e-06,
"loss": 1.9811,
"step": 572
},
{
"epoch": 0.6829558998808105,
"grad_norm": 0.27022852014602733,
"learning_rate": 2.3068865840433286e-06,
"loss": 1.9643,
"step": 573
},
{
"epoch": 0.6841477949940405,
"grad_norm": 0.27566403732559813,
"learning_rate": 2.29103678513511e-06,
"loss": 1.9494,
"step": 574
},
{
"epoch": 0.6853396901072706,
"grad_norm": 0.2813649305654506,
"learning_rate": 2.275225425063813e-06,
"loss": 1.9596,
"step": 575
},
{
"epoch": 0.6865315852205006,
"grad_norm": 0.28703149945139833,
"learning_rate": 2.259452728184749e-06,
"loss": 1.9674,
"step": 576
},
{
"epoch": 0.6877234803337307,
"grad_norm": 0.278836772705952,
"learning_rate": 2.2437189183046236e-06,
"loss": 1.9683,
"step": 577
},
{
"epoch": 0.6889153754469607,
"grad_norm": 0.3180141045052597,
"learning_rate": 2.2280242186783473e-06,
"loss": 1.9588,
"step": 578
},
{
"epoch": 0.6901072705601907,
"grad_norm": 0.2622104807864232,
"learning_rate": 2.21236885200588e-06,
"loss": 1.9587,
"step": 579
},
{
"epoch": 0.6912991656734208,
"grad_norm": 0.28789973870105057,
"learning_rate": 2.1967530404290702e-06,
"loss": 1.9827,
"step": 580
},
{
"epoch": 0.6924910607866508,
"grad_norm": 0.2821097592933177,
"learning_rate": 2.1811770055284968e-06,
"loss": 2.0036,
"step": 581
},
{
"epoch": 0.6936829558998808,
"grad_norm": 0.24946555260466954,
"learning_rate": 2.1656409683203216e-06,
"loss": 1.9897,
"step": 582
},
{
"epoch": 0.6948748510131109,
"grad_norm": 0.29219706494149983,
"learning_rate": 2.1501451492531664e-06,
"loss": 1.9703,
"step": 583
},
{
"epoch": 0.6960667461263409,
"grad_norm": 0.26827509295364377,
"learning_rate": 2.134689768204975e-06,
"loss": 1.9539,
"step": 584
},
{
"epoch": 0.6972586412395709,
"grad_norm": 0.30266646603465935,
"learning_rate": 2.1192750444798982e-06,
"loss": 1.986,
"step": 585
},
{
"epoch": 0.698450536352801,
"grad_norm": 0.2718431001798245,
"learning_rate": 2.103901196805173e-06,
"loss": 1.9738,
"step": 586
},
{
"epoch": 0.699642431466031,
"grad_norm": 0.2687110838757682,
"learning_rate": 2.0885684433280336e-06,
"loss": 1.9494,
"step": 587
},
{
"epoch": 0.700834326579261,
"grad_norm": 0.2776314528817648,
"learning_rate": 2.073277001612603e-06,
"loss": 1.9529,
"step": 588
},
{
"epoch": 0.7020262216924911,
"grad_norm": 0.25980607862615657,
"learning_rate": 2.058027088636814e-06,
"loss": 1.9529,
"step": 589
},
{
"epoch": 0.7032181168057211,
"grad_norm": 0.2801681412198667,
"learning_rate": 2.042818920789326e-06,
"loss": 1.9688,
"step": 590
},
{
"epoch": 0.7044100119189511,
"grad_norm": 0.2631013529820137,
"learning_rate": 2.0276527138664537e-06,
"loss": 1.9363,
"step": 591
},
{
"epoch": 0.7056019070321812,
"grad_norm": 0.2528230435660016,
"learning_rate": 2.012528683069109e-06,
"loss": 1.9542,
"step": 592
},
{
"epoch": 0.7067938021454112,
"grad_norm": 0.2473972746312196,
"learning_rate": 1.9974470429997482e-06,
"loss": 1.9962,
"step": 593
},
{
"epoch": 0.7079856972586412,
"grad_norm": 0.284941379850682,
"learning_rate": 1.98240800765932e-06,
"loss": 1.9447,
"step": 594
},
{
"epoch": 0.7091775923718713,
"grad_norm": 0.2621960635197473,
"learning_rate": 1.9674117904442364e-06,
"loss": 1.9812,
"step": 595
},
{
"epoch": 0.7103694874851013,
"grad_norm": 0.24858361697066161,
"learning_rate": 1.9524586041433393e-06,
"loss": 1.9562,
"step": 596
},
{
"epoch": 0.7115613825983313,
"grad_norm": 0.2669834824927238,
"learning_rate": 1.9375486609348842e-06,
"loss": 1.987,
"step": 597
},
{
"epoch": 0.7127532777115614,
"grad_norm": 0.26234172310570103,
"learning_rate": 1.9226821723835322e-06,
"loss": 1.9735,
"step": 598
},
{
"epoch": 0.7139451728247914,
"grad_norm": 0.25384961760334385,
"learning_rate": 1.907859349437336e-06,
"loss": 1.9831,
"step": 599
},
{
"epoch": 0.7151370679380215,
"grad_norm": 0.3104750369664491,
"learning_rate": 1.8930804024247635e-06,
"loss": 1.9714,
"step": 600
},
{
"epoch": 0.7163289630512515,
"grad_norm": 0.2458078645357097,
"learning_rate": 1.8783455410517004e-06,
"loss": 1.9468,
"step": 601
},
{
"epoch": 0.7175208581644815,
"grad_norm": 0.26529680805920836,
"learning_rate": 1.8636549743984815e-06,
"loss": 1.9593,
"step": 602
},
{
"epoch": 0.7187127532777116,
"grad_norm": 0.25080419801242315,
"learning_rate": 1.8490089109169218e-06,
"loss": 1.9808,
"step": 603
},
{
"epoch": 0.7199046483909416,
"grad_norm": 0.26413238202627376,
"learning_rate": 1.8344075584273547e-06,
"loss": 1.9487,
"step": 604
},
{
"epoch": 0.7210965435041716,
"grad_norm": 0.2674448281901473,
"learning_rate": 1.8198511241156902e-06,
"loss": 1.9598,
"step": 605
},
{
"epoch": 0.7222884386174017,
"grad_norm": 0.24083245686353985,
"learning_rate": 1.8053398145304723e-06,
"loss": 1.9662,
"step": 606
},
{
"epoch": 0.7234803337306317,
"grad_norm": 0.25961756440068884,
"learning_rate": 1.7908738355799454e-06,
"loss": 1.9868,
"step": 607
},
{
"epoch": 0.7246722288438617,
"grad_norm": 0.2784591415570306,
"learning_rate": 1.776453392529139e-06,
"loss": 1.9473,
"step": 608
},
{
"epoch": 0.7258641239570918,
"grad_norm": 0.23968494857480035,
"learning_rate": 1.7620786899969412e-06,
"loss": 1.9716,
"step": 609
},
{
"epoch": 0.7270560190703218,
"grad_norm": 0.23937998852690856,
"learning_rate": 1.747749931953217e-06,
"loss": 1.9635,
"step": 610
},
{
"epoch": 0.7282479141835518,
"grad_norm": 0.259732006086446,
"learning_rate": 1.7334673217158976e-06,
"loss": 1.9616,
"step": 611
},
{
"epoch": 0.7294398092967819,
"grad_norm": 0.25239102464142604,
"learning_rate": 1.719231061948094e-06,
"loss": 1.9656,
"step": 612
},
{
"epoch": 0.7306317044100119,
"grad_norm": 0.2550463812437055,
"learning_rate": 1.7050413546552347e-06,
"loss": 1.9784,
"step": 613
},
{
"epoch": 0.7318235995232419,
"grad_norm": 0.2535210200301375,
"learning_rate": 1.6908984011821883e-06,
"loss": 1.9847,
"step": 614
},
{
"epoch": 0.733015494636472,
"grad_norm": 0.24932432687921058,
"learning_rate": 1.6768024022104106e-06,
"loss": 1.972,
"step": 615
},
{
"epoch": 0.734207389749702,
"grad_norm": 0.2644613269238538,
"learning_rate": 1.6627535577550996e-06,
"loss": 1.9716,
"step": 616
},
{
"epoch": 0.735399284862932,
"grad_norm": 0.3944302146845491,
"learning_rate": 1.6487520671623469e-06,
"loss": 1.9595,
"step": 617
},
{
"epoch": 0.7365911799761621,
"grad_norm": 0.244722231687242,
"learning_rate": 1.6347981291063224e-06,
"loss": 1.9688,
"step": 618
},
{
"epoch": 0.7377830750893921,
"grad_norm": 0.2504826371525299,
"learning_rate": 1.6208919415864476e-06,
"loss": 1.9721,
"step": 619
},
{
"epoch": 0.7389749702026222,
"grad_norm": 0.2523790844757924,
"learning_rate": 1.6070337019245896e-06,
"loss": 1.9456,
"step": 620
},
{
"epoch": 0.7401668653158522,
"grad_norm": 0.26338811471433093,
"learning_rate": 1.5932236067622542e-06,
"loss": 1.9613,
"step": 621
},
{
"epoch": 0.7413587604290822,
"grad_norm": 0.25146034966929337,
"learning_rate": 1.5794618520578053e-06,
"loss": 1.981,
"step": 622
},
{
"epoch": 0.7425506555423123,
"grad_norm": 0.2478403982473681,
"learning_rate": 1.5657486330836786e-06,
"loss": 1.9263,
"step": 623
},
{
"epoch": 0.7437425506555423,
"grad_norm": 0.2536474779363047,
"learning_rate": 1.5520841444236118e-06,
"loss": 1.9789,
"step": 624
},
{
"epoch": 0.7449344457687723,
"grad_norm": 0.2615274746690614,
"learning_rate": 1.5384685799698839e-06,
"loss": 1.9783,
"step": 625
},
{
"epoch": 0.7461263408820024,
"grad_norm": 0.2679161856145564,
"learning_rate": 1.5249021329205638e-06,
"loss": 1.9513,
"step": 626
},
{
"epoch": 0.7473182359952324,
"grad_norm": 0.24553342227151687,
"learning_rate": 1.5113849957767685e-06,
"loss": 1.9711,
"step": 627
},
{
"epoch": 0.7485101311084624,
"grad_norm": 0.246019311870797,
"learning_rate": 1.4979173603399323e-06,
"loss": 1.9734,
"step": 628
},
{
"epoch": 0.7497020262216925,
"grad_norm": 0.25764970394173725,
"learning_rate": 1.4844994177090871e-06,
"loss": 1.9575,
"step": 629
},
{
"epoch": 0.7508939213349225,
"grad_norm": 0.2419520407437769,
"learning_rate": 1.4711313582781434e-06,
"loss": 1.9444,
"step": 630
},
{
"epoch": 0.7520858164481525,
"grad_norm": 0.2386706941133275,
"learning_rate": 1.4578133717331982e-06,
"loss": 1.9675,
"step": 631
},
{
"epoch": 0.7532777115613826,
"grad_norm": 0.251990632652635,
"learning_rate": 1.4445456470498392e-06,
"loss": 1.9571,
"step": 632
},
{
"epoch": 0.7544696066746126,
"grad_norm": 0.24481833940935246,
"learning_rate": 1.4313283724904632e-06,
"loss": 1.9538,
"step": 633
},
{
"epoch": 0.7556615017878426,
"grad_norm": 0.24576950539499237,
"learning_rate": 1.418161735601601e-06,
"loss": 1.9676,
"step": 634
},
{
"epoch": 0.7568533969010727,
"grad_norm": 0.24675237000023065,
"learning_rate": 1.4050459232112652e-06,
"loss": 1.9672,
"step": 635
},
{
"epoch": 0.7580452920143027,
"grad_norm": 0.2407161568341905,
"learning_rate": 1.3919811214262913e-06,
"loss": 1.9726,
"step": 636
},
{
"epoch": 0.7592371871275327,
"grad_norm": 0.23031407014507166,
"learning_rate": 1.378967515629701e-06,
"loss": 1.9768,
"step": 637
},
{
"epoch": 0.7604290822407628,
"grad_norm": 0.2345707206990765,
"learning_rate": 1.3660052904780707e-06,
"loss": 1.9517,
"step": 638
},
{
"epoch": 0.7616209773539928,
"grad_norm": 0.23677366971206826,
"learning_rate": 1.353094629898909e-06,
"loss": 1.9654,
"step": 639
},
{
"epoch": 0.7628128724672228,
"grad_norm": 0.24749335727794808,
"learning_rate": 1.3402357170880514e-06,
"loss": 1.9752,
"step": 640
},
{
"epoch": 0.7640047675804529,
"grad_norm": 0.23017419897906063,
"learning_rate": 1.3274287345070564e-06,
"loss": 1.9538,
"step": 641
},
{
"epoch": 0.7651966626936829,
"grad_norm": 0.24400711432750527,
"learning_rate": 1.3146738638806217e-06,
"loss": 1.9571,
"step": 642
},
{
"epoch": 0.766388557806913,
"grad_norm": 0.2322768595933808,
"learning_rate": 1.3019712861939964e-06,
"loss": 1.967,
"step": 643
},
{
"epoch": 0.767580452920143,
"grad_norm": 0.2448647193354467,
"learning_rate": 1.2893211816904243e-06,
"loss": 1.9702,
"step": 644
},
{
"epoch": 0.768772348033373,
"grad_norm": 0.2264734125461794,
"learning_rate": 1.2767237298685787e-06,
"loss": 1.9708,
"step": 645
},
{
"epoch": 0.7699642431466031,
"grad_norm": 0.24280998510060245,
"learning_rate": 1.26417910948002e-06,
"loss": 2.0062,
"step": 646
},
{
"epoch": 0.7711561382598331,
"grad_norm": 0.2599649290379438,
"learning_rate": 1.2516874985266508e-06,
"loss": 1.9641,
"step": 647
},
{
"epoch": 0.7723480333730631,
"grad_norm": 0.23209096205716762,
"learning_rate": 1.239249074258203e-06,
"loss": 1.9844,
"step": 648
},
{
"epoch": 0.7735399284862932,
"grad_norm": 0.2366200983286952,
"learning_rate": 1.2268640131697129e-06,
"loss": 1.9591,
"step": 649
},
{
"epoch": 0.7747318235995232,
"grad_norm": 0.22549692632142865,
"learning_rate": 1.2145324909990202e-06,
"loss": 1.9638,
"step": 650
},
{
"epoch": 0.7759237187127532,
"grad_norm": 0.2201422471843865,
"learning_rate": 1.202254682724276e-06,
"loss": 1.96,
"step": 651
},
{
"epoch": 0.7771156138259833,
"grad_norm": 0.23804071076564637,
"learning_rate": 1.190030762561452e-06,
"loss": 1.9429,
"step": 652
},
{
"epoch": 0.7783075089392133,
"grad_norm": 0.23445786497651513,
"learning_rate": 1.1778609039618804e-06,
"loss": 1.9441,
"step": 653
},
{
"epoch": 0.7794994040524433,
"grad_norm": 0.23319783177552136,
"learning_rate": 1.1657452796097879e-06,
"loss": 1.9561,
"step": 654
},
{
"epoch": 0.7806912991656734,
"grad_norm": 0.21246102421189209,
"learning_rate": 1.1536840614198376e-06,
"loss": 1.9552,
"step": 655
},
{
"epoch": 0.7818831942789034,
"grad_norm": 0.21558582464035986,
"learning_rate": 1.1416774205347015e-06,
"loss": 1.9535,
"step": 656
},
{
"epoch": 0.7830750893921334,
"grad_norm": 0.2478855415089653,
"learning_rate": 1.1297255273226254e-06,
"loss": 1.9648,
"step": 657
},
{
"epoch": 0.7842669845053635,
"grad_norm": 0.24079598014625692,
"learning_rate": 1.117828551375013e-06,
"loss": 1.9517,
"step": 658
},
{
"epoch": 0.7854588796185935,
"grad_norm": 0.22483152992478453,
"learning_rate": 1.1059866615040205e-06,
"loss": 1.9615,
"step": 659
},
{
"epoch": 0.7866507747318237,
"grad_norm": 0.21611761849037114,
"learning_rate": 1.094200025740157e-06,
"loss": 1.9544,
"step": 660
},
{
"epoch": 0.7878426698450537,
"grad_norm": 0.22680299546251373,
"learning_rate": 1.0824688113299054e-06,
"loss": 1.9656,
"step": 661
},
{
"epoch": 0.7890345649582837,
"grad_norm": 0.22651384710874864,
"learning_rate": 1.0707931847333487e-06,
"loss": 1.952,
"step": 662
},
{
"epoch": 0.7902264600715138,
"grad_norm": 0.22804104499330677,
"learning_rate": 1.0591733116218046e-06,
"loss": 1.9469,
"step": 663
},
{
"epoch": 0.7914183551847438,
"grad_norm": 0.23170987494579412,
"learning_rate": 1.0476093568754776e-06,
"loss": 1.9743,
"step": 664
},
{
"epoch": 0.7926102502979738,
"grad_norm": 0.22978004850491673,
"learning_rate": 1.036101484581117e-06,
"loss": 1.9595,
"step": 665
},
{
"epoch": 0.7938021454112039,
"grad_norm": 0.21260865957457795,
"learning_rate": 1.0246498580296903e-06,
"loss": 1.9656,
"step": 666
},
{
"epoch": 0.7949940405244339,
"grad_norm": 0.22425557844267943,
"learning_rate": 1.0132546397140687e-06,
"loss": 1.9755,
"step": 667
},
{
"epoch": 0.7961859356376639,
"grad_norm": 0.2266231438335908,
"learning_rate": 1.0019159913267156e-06,
"loss": 1.9871,
"step": 668
},
{
"epoch": 0.797377830750894,
"grad_norm": 0.21739761610592676,
"learning_rate": 9.90634073757397e-07,
"loss": 1.9599,
"step": 669
},
{
"epoch": 0.798569725864124,
"grad_norm": 0.22507089101888264,
"learning_rate": 9.794090470908962e-07,
"loss": 1.9703,
"step": 670
},
{
"epoch": 0.799761620977354,
"grad_norm": 0.2076814121868233,
"learning_rate": 9.68241070604743e-07,
"loss": 1.964,
"step": 671
},
{
"epoch": 0.8009535160905841,
"grad_norm": 0.23327916717788147,
"learning_rate": 9.571303027669548e-07,
"loss": 1.9825,
"step": 672
},
{
"epoch": 0.8021454112038141,
"grad_norm": 0.21841469332058575,
"learning_rate": 9.460769012337839e-07,
"loss": 1.9897,
"step": 673
},
{
"epoch": 0.8033373063170441,
"grad_norm": 0.22795437088618667,
"learning_rate": 9.350810228474855e-07,
"loss": 1.9548,
"step": 674
},
{
"epoch": 0.8045292014302742,
"grad_norm": 0.24461982798572574,
"learning_rate": 9.241428236340904e-07,
"loss": 1.971,
"step": 675
},
{
"epoch": 0.8057210965435042,
"grad_norm": 0.22693929127887172,
"learning_rate": 9.132624588011896e-07,
"loss": 1.9697,
"step": 676
},
{
"epoch": 0.8069129916567342,
"grad_norm": 0.22481042822198152,
"learning_rate": 9.024400827357344e-07,
"loss": 1.9729,
"step": 677
},
{
"epoch": 0.8081048867699643,
"grad_norm": 0.21859877558397856,
"learning_rate": 8.916758490018418e-07,
"loss": 1.9666,
"step": 678
},
{
"epoch": 0.8092967818831943,
"grad_norm": 0.2260921434511296,
"learning_rate": 8.809699103386204e-07,
"loss": 1.964,
"step": 679
},
{
"epoch": 0.8104886769964244,
"grad_norm": 0.20963128480459883,
"learning_rate": 8.703224186580012e-07,
"loss": 1.9969,
"step": 680
},
{
"epoch": 0.8116805721096544,
"grad_norm": 0.2204158197482051,
"learning_rate": 8.597335250425809e-07,
"loss": 1.9494,
"step": 681
},
{
"epoch": 0.8128724672228844,
"grad_norm": 0.22459531839550131,
"learning_rate": 8.492033797434762e-07,
"loss": 1.9473,
"step": 682
},
{
"epoch": 0.8140643623361145,
"grad_norm": 0.22674947382748567,
"learning_rate": 8.387321321781977e-07,
"loss": 1.9591,
"step": 683
},
{
"epoch": 0.8152562574493445,
"grad_norm": 0.23228573604473446,
"learning_rate": 8.283199309285284e-07,
"loss": 1.9622,
"step": 684
},
{
"epoch": 0.8164481525625745,
"grad_norm": 0.2287116546758065,
"learning_rate": 8.179669237384097e-07,
"loss": 1.971,
"step": 685
},
{
"epoch": 0.8176400476758046,
"grad_norm": 0.20888102799928682,
"learning_rate": 8.07673257511849e-07,
"loss": 1.9647,
"step": 686
},
{
"epoch": 0.8188319427890346,
"grad_norm": 0.23285143313735843,
"learning_rate": 7.97439078310836e-07,
"loss": 1.9475,
"step": 687
},
{
"epoch": 0.8200238379022646,
"grad_norm": 0.2306317998265742,
"learning_rate": 7.872645313532701e-07,
"loss": 1.9843,
"step": 688
},
{
"epoch": 0.8212157330154947,
"grad_norm": 0.22262551838654496,
"learning_rate": 7.771497610108981e-07,
"loss": 1.9715,
"step": 689
},
{
"epoch": 0.8224076281287247,
"grad_norm": 0.23849053028826073,
"learning_rate": 7.670949108072673e-07,
"loss": 1.944,
"step": 690
},
{
"epoch": 0.8235995232419547,
"grad_norm": 0.21447305264782468,
"learning_rate": 7.57100123415685e-07,
"loss": 1.9642,
"step": 691
},
{
"epoch": 0.8247914183551848,
"grad_norm": 0.2531000448062807,
"learning_rate": 7.471655406572003e-07,
"loss": 1.9447,
"step": 692
},
{
"epoch": 0.8259833134684148,
"grad_norm": 0.23002093940013935,
"learning_rate": 7.372913034985879e-07,
"loss": 1.9441,
"step": 693
},
{
"epoch": 0.8271752085816448,
"grad_norm": 0.21804664562427592,
"learning_rate": 7.274775520503491e-07,
"loss": 1.9494,
"step": 694
},
{
"epoch": 0.8283671036948749,
"grad_norm": 0.2180719845670406,
"learning_rate": 7.177244255647209e-07,
"loss": 1.9612,
"step": 695
},
{
"epoch": 0.8295589988081049,
"grad_norm": 0.2292893667744583,
"learning_rate": 7.080320624337039e-07,
"loss": 1.9631,
"step": 696
},
{
"epoch": 0.8307508939213349,
"grad_norm": 0.22411915203502086,
"learning_rate": 6.984006001870974e-07,
"loss": 1.9558,
"step": 697
},
{
"epoch": 0.831942789034565,
"grad_norm": 0.22749753478121626,
"learning_rate": 6.888301754905469e-07,
"loss": 1.9498,
"step": 698
},
{
"epoch": 0.833134684147795,
"grad_norm": 0.22227813746152136,
"learning_rate": 6.79320924143605e-07,
"loss": 1.9746,
"step": 699
},
{
"epoch": 0.834326579261025,
"grad_norm": 0.21559253597914696,
"learning_rate": 6.698729810778065e-07,
"loss": 1.9528,
"step": 700
},
{
"epoch": 0.8355184743742551,
"grad_norm": 0.22780943536330842,
"learning_rate": 6.604864803547511e-07,
"loss": 1.9803,
"step": 701
},
{
"epoch": 0.8367103694874851,
"grad_norm": 0.21085095301925635,
"learning_rate": 6.51161555164203e-07,
"loss": 1.973,
"step": 702
},
{
"epoch": 0.8379022646007152,
"grad_norm": 0.212543861677965,
"learning_rate": 6.418983378221988e-07,
"loss": 1.9623,
"step": 703
},
{
"epoch": 0.8390941597139452,
"grad_norm": 0.2115457183313653,
"learning_rate": 6.326969597691724e-07,
"loss": 1.9817,
"step": 704
},
{
"epoch": 0.8402860548271752,
"grad_norm": 0.21617807812407117,
"learning_rate": 6.235575515680898e-07,
"loss": 1.968,
"step": 705
},
{
"epoch": 0.8414779499404053,
"grad_norm": 0.21587790930172882,
"learning_rate": 6.144802429025948e-07,
"loss": 1.9549,
"step": 706
},
{
"epoch": 0.8426698450536353,
"grad_norm": 0.21797830405681992,
"learning_rate": 6.054651625751717e-07,
"loss": 1.9833,
"step": 707
},
{
"epoch": 0.8438617401668653,
"grad_norm": 0.22284253238842683,
"learning_rate": 5.965124385053112e-07,
"loss": 1.9498,
"step": 708
},
{
"epoch": 0.8450536352800954,
"grad_norm": 0.20628741944346807,
"learning_rate": 5.876221977277042e-07,
"loss": 1.9382,
"step": 709
},
{
"epoch": 0.8462455303933254,
"grad_norm": 0.22293588358500385,
"learning_rate": 5.787945663904332e-07,
"loss": 1.9773,
"step": 710
},
{
"epoch": 0.8474374255065554,
"grad_norm": 0.22508597630366683,
"learning_rate": 5.700296697531843e-07,
"loss": 1.9659,
"step": 711
},
{
"epoch": 0.8486293206197855,
"grad_norm": 0.22145576581778206,
"learning_rate": 5.613276321854699e-07,
"loss": 1.9536,
"step": 712
},
{
"epoch": 0.8498212157330155,
"grad_norm": 0.21697947016074837,
"learning_rate": 5.526885771648599e-07,
"loss": 1.9686,
"step": 713
},
{
"epoch": 0.8510131108462455,
"grad_norm": 0.2238437978274647,
"learning_rate": 5.441126272752395e-07,
"loss": 1.9654,
"step": 714
},
{
"epoch": 0.8522050059594756,
"grad_norm": 0.20552250633575228,
"learning_rate": 5.355999042050603e-07,
"loss": 1.9679,
"step": 715
},
{
"epoch": 0.8533969010727056,
"grad_norm": 0.20678965975151994,
"learning_rate": 5.271505287456153e-07,
"loss": 1.9695,
"step": 716
},
{
"epoch": 0.8545887961859356,
"grad_norm": 0.22026378225643617,
"learning_rate": 5.187646207893287e-07,
"loss": 1.9459,
"step": 717
},
{
"epoch": 0.8557806912991657,
"grad_norm": 0.21952615459392946,
"learning_rate": 5.104422993280522e-07,
"loss": 1.9583,
"step": 718
},
{
"epoch": 0.8569725864123957,
"grad_norm": 0.2103248912718566,
"learning_rate": 5.021836824513759e-07,
"loss": 1.9653,
"step": 719
},
{
"epoch": 0.8581644815256257,
"grad_norm": 0.21006195755848364,
"learning_rate": 4.939888873449567e-07,
"loss": 1.9688,
"step": 720
},
{
"epoch": 0.8593563766388558,
"grad_norm": 0.20402501881530985,
"learning_rate": 4.858580302888466e-07,
"loss": 1.9765,
"step": 721
},
{
"epoch": 0.8605482717520858,
"grad_norm": 0.20084862547322885,
"learning_rate": 4.777912266558532e-07,
"loss": 1.9761,
"step": 722
},
{
"epoch": 0.8617401668653158,
"grad_norm": 0.1988469838008945,
"learning_rate": 4.6978859090989703e-07,
"loss": 1.9694,
"step": 723
},
{
"epoch": 0.8629320619785459,
"grad_norm": 0.203864028540796,
"learning_rate": 4.618502366043881e-07,
"loss": 1.9775,
"step": 724
},
{
"epoch": 0.8641239570917759,
"grad_norm": 0.2145910427428764,
"learning_rate": 4.5397627638061604e-07,
"loss": 1.96,
"step": 725
},
{
"epoch": 0.865315852205006,
"grad_norm": 0.2050136901691872,
"learning_rate": 4.4616682196614636e-07,
"loss": 1.9623,
"step": 726
},
{
"epoch": 0.866507747318236,
"grad_norm": 0.20872576917324093,
"learning_rate": 4.3842198417324346e-07,
"loss": 1.9554,
"step": 727
},
{
"epoch": 0.867699642431466,
"grad_norm": 0.20606498295864895,
"learning_rate": 4.307418728972934e-07,
"loss": 1.9572,
"step": 728
},
{
"epoch": 0.8688915375446961,
"grad_norm": 0.20387463599521696,
"learning_rate": 4.2312659711524486e-07,
"loss": 1.9873,
"step": 729
},
{
"epoch": 0.8700834326579261,
"grad_norm": 0.20613519314048598,
"learning_rate": 4.1557626488406223e-07,
"loss": 1.9745,
"step": 730
},
{
"epoch": 0.8712753277711561,
"grad_norm": 0.2119361724107287,
"learning_rate": 4.080909833391944e-07,
"loss": 1.956,
"step": 731
},
{
"epoch": 0.8724672228843862,
"grad_norm": 0.21723399868985763,
"learning_rate": 4.0067085869305357e-07,
"loss": 1.9787,
"step": 732
},
{
"epoch": 0.8736591179976162,
"grad_norm": 0.206994355395634,
"learning_rate": 3.9331599623350815e-07,
"loss": 1.9593,
"step": 733
},
{
"epoch": 0.8748510131108462,
"grad_norm": 0.20592726537984876,
"learning_rate": 3.8602650032238675e-07,
"loss": 1.9687,
"step": 734
},
{
"epoch": 0.8760429082240763,
"grad_norm": 0.19758730236891384,
"learning_rate": 3.788024743940016e-07,
"loss": 1.9957,
"step": 735
},
{
"epoch": 0.8772348033373063,
"grad_norm": 0.20119012937681818,
"learning_rate": 3.71644020953677e-07,
"loss": 1.9908,
"step": 736
},
{
"epoch": 0.8784266984505363,
"grad_norm": 0.1987555097318407,
"learning_rate": 3.6455124157629805e-07,
"loss": 1.963,
"step": 737
},
{
"epoch": 0.8796185935637664,
"grad_norm": 0.20693505027292836,
"learning_rate": 3.575242369048665e-07,
"loss": 1.956,
"step": 738
},
{
"epoch": 0.8808104886769964,
"grad_norm": 0.20983712357706624,
"learning_rate": 3.505631066490728e-07,
"loss": 1.9719,
"step": 739
},
{
"epoch": 0.8820023837902264,
"grad_norm": 0.20291800945532407,
"learning_rate": 3.436679495838835e-07,
"loss": 1.9658,
"step": 740
},
{
"epoch": 0.8831942789034565,
"grad_norm": 0.20400470569172324,
"learning_rate": 3.3683886354813953e-07,
"loss": 1.9785,
"step": 741
},
{
"epoch": 0.8843861740166865,
"grad_norm": 0.20085471728798332,
"learning_rate": 3.300759454431657e-07,
"loss": 1.9534,
"step": 742
},
{
"epoch": 0.8855780691299165,
"grad_norm": 0.20101578952892968,
"learning_rate": 3.233792912313943e-07,
"loss": 1.9637,
"step": 743
},
{
"epoch": 0.8867699642431466,
"grad_norm": 0.194386663867366,
"learning_rate": 3.1674899593501175e-07,
"loss": 1.9718,
"step": 744
},
{
"epoch": 0.8879618593563766,
"grad_norm": 0.2022754658332612,
"learning_rate": 3.101851536346007e-07,
"loss": 1.9493,
"step": 745
},
{
"epoch": 0.8891537544696066,
"grad_norm": 0.19855072055520911,
"learning_rate": 3.0368785746780925e-07,
"loss": 1.9845,
"step": 746
},
{
"epoch": 0.8903456495828367,
"grad_norm": 0.20555522738375503,
"learning_rate": 2.9725719962802936e-07,
"loss": 1.9562,
"step": 747
},
{
"epoch": 0.8915375446960667,
"grad_norm": 0.20020380210201041,
"learning_rate": 2.9089327136308855e-07,
"loss": 1.9423,
"step": 748
},
{
"epoch": 0.8927294398092968,
"grad_norm": 0.21841772751668617,
"learning_rate": 2.8459616297395464e-07,
"loss": 1.9513,
"step": 749
},
{
"epoch": 0.8939213349225268,
"grad_norm": 0.20236116487467856,
"learning_rate": 2.7836596381345613e-07,
"loss": 1.9567,
"step": 750
},
{
"epoch": 0.8951132300357568,
"grad_norm": 0.20135449349261023,
"learning_rate": 2.722027622850104e-07,
"loss": 1.9645,
"step": 751
},
{
"epoch": 0.8963051251489869,
"grad_norm": 0.18378339190654983,
"learning_rate": 2.6610664584137413e-07,
"loss": 1.9556,
"step": 752
},
{
"epoch": 0.8974970202622169,
"grad_norm": 0.19531080180392058,
"learning_rate": 2.600777009833982e-07,
"loss": 1.9651,
"step": 753
},
{
"epoch": 0.8986889153754469,
"grad_norm": 0.19913934223741267,
"learning_rate": 2.541160132588044e-07,
"loss": 1.9903,
"step": 754
},
{
"epoch": 0.899880810488677,
"grad_norm": 0.19578044282345453,
"learning_rate": 2.482216672609677e-07,
"loss": 1.9826,
"step": 755
},
{
"epoch": 0.901072705601907,
"grad_norm": 0.19669019170528293,
"learning_rate": 2.423947466277177e-07,
"loss": 1.9608,
"step": 756
},
{
"epoch": 0.902264600715137,
"grad_norm": 0.20106014848036682,
"learning_rate": 2.3663533404015227e-07,
"loss": 1.9479,
"step": 757
},
{
"epoch": 0.9034564958283671,
"grad_norm": 0.19075058248139964,
"learning_rate": 2.3094351122146307e-07,
"loss": 1.9461,
"step": 758
},
{
"epoch": 0.9046483909415971,
"grad_norm": 0.1936347340988058,
"learning_rate": 2.2531935893577827e-07,
"loss": 1.9786,
"step": 759
},
{
"epoch": 0.9058402860548271,
"grad_norm": 0.1953475101067191,
"learning_rate": 2.1976295698701245e-07,
"loss": 1.9602,
"step": 760
},
{
"epoch": 0.9070321811680572,
"grad_norm": 0.2018953126999259,
"learning_rate": 2.142743842177386e-07,
"loss": 1.9589,
"step": 761
},
{
"epoch": 0.9082240762812872,
"grad_norm": 0.2413615126875786,
"learning_rate": 2.0885371850806691e-07,
"loss": 1.9761,
"step": 762
},
{
"epoch": 0.9094159713945172,
"grad_norm": 0.20022969875884347,
"learning_rate": 2.0350103677454047e-07,
"loss": 1.9589,
"step": 763
},
{
"epoch": 0.9106078665077473,
"grad_norm": 0.19437922719148687,
"learning_rate": 1.98216414969043e-07,
"loss": 1.9522,
"step": 764
},
{
"epoch": 0.9117997616209773,
"grad_norm": 0.19907577309780014,
"learning_rate": 1.9299992807772173e-07,
"loss": 1.9416,
"step": 765
},
{
"epoch": 0.9129916567342073,
"grad_norm": 0.20237562054567065,
"learning_rate": 1.8785165011992513e-07,
"loss": 1.9472,
"step": 766
},
{
"epoch": 0.9141835518474374,
"grad_norm": 0.19811233452806024,
"learning_rate": 1.8277165414714858e-07,
"loss": 1.9539,
"step": 767
},
{
"epoch": 0.9153754469606674,
"grad_norm": 0.19216150911802557,
"learning_rate": 1.7776001224200257e-07,
"loss": 1.9735,
"step": 768
},
{
"epoch": 0.9165673420738975,
"grad_norm": 0.20840514563527793,
"learning_rate": 1.7281679551718445e-07,
"loss": 1.9809,
"step": 769
},
{
"epoch": 0.9177592371871275,
"grad_norm": 0.20320307604353186,
"learning_rate": 1.6794207411447548e-07,
"loss": 1.9701,
"step": 770
},
{
"epoch": 0.9189511323003575,
"grad_norm": 0.1931262061968698,
"learning_rate": 1.6313591720374057e-07,
"loss": 1.9379,
"step": 771
},
{
"epoch": 0.9201430274135876,
"grad_norm": 0.1960925515484337,
"learning_rate": 1.583983929819488e-07,
"loss": 1.9537,
"step": 772
},
{
"epoch": 0.9213349225268176,
"grad_norm": 0.20033984414838282,
"learning_rate": 1.5372956867220678e-07,
"loss": 1.9524,
"step": 773
},
{
"epoch": 0.9225268176400476,
"grad_norm": 0.1969245380458675,
"learning_rate": 1.49129510522803e-07,
"loss": 1.9909,
"step": 774
},
{
"epoch": 0.9237187127532777,
"grad_norm": 0.19242733300556847,
"learning_rate": 1.445982838062676e-07,
"loss": 1.9672,
"step": 775
},
{
"epoch": 0.9249106078665077,
"grad_norm": 0.19349859868942168,
"learning_rate": 1.4013595281844872e-07,
"loss": 1.9694,
"step": 776
},
{
"epoch": 0.9261025029797377,
"grad_norm": 0.19361152253228486,
"learning_rate": 1.357425808775964e-07,
"loss": 1.982,
"step": 777
},
{
"epoch": 0.9272943980929678,
"grad_norm": 0.20619875373877572,
"learning_rate": 1.3141823032346736e-07,
"loss": 1.9625,
"step": 778
},
{
"epoch": 0.9284862932061978,
"grad_norm": 0.19945778231248415,
"learning_rate": 1.2716296251644e-07,
"loss": 1.9819,
"step": 779
},
{
"epoch": 0.929678188319428,
"grad_norm": 0.19580818822309443,
"learning_rate": 1.2297683783664138e-07,
"loss": 1.971,
"step": 780
},
{
"epoch": 0.930870083432658,
"grad_norm": 0.1969258751508119,
"learning_rate": 1.1885991568309385e-07,
"loss": 1.9684,
"step": 781
},
{
"epoch": 0.932061978545888,
"grad_norm": 0.19668726045406146,
"learning_rate": 1.1481225447286803e-07,
"loss": 1.9336,
"step": 782
},
{
"epoch": 0.933253873659118,
"grad_norm": 0.19002790593711985,
"learning_rate": 1.1083391164025903e-07,
"loss": 1.9776,
"step": 783
},
{
"epoch": 0.9344457687723481,
"grad_norm": 0.19965702113266218,
"learning_rate": 1.069249436359665e-07,
"loss": 1.982,
"step": 784
},
{
"epoch": 0.9356376638855781,
"grad_norm": 0.1916717569664899,
"learning_rate": 1.0308540592629756e-07,
"loss": 1.9611,
"step": 785
},
{
"epoch": 0.9368295589988082,
"grad_norm": 0.1958220628621678,
"learning_rate": 9.931535299237737e-08,
"loss": 1.9439,
"step": 786
},
{
"epoch": 0.9380214541120382,
"grad_norm": 0.19546126458914154,
"learning_rate": 9.561483832937535e-08,
"loss": 1.9596,
"step": 787
},
{
"epoch": 0.9392133492252682,
"grad_norm": 0.18627727476440192,
"learning_rate": 9.198391444575072e-08,
"loss": 1.977,
"step": 788
},
{
"epoch": 0.9404052443384983,
"grad_norm": 0.19214479226727124,
"learning_rate": 8.842263286250208e-08,
"loss": 1.9714,
"step": 789
},
{
"epoch": 0.9415971394517283,
"grad_norm": 0.19627451760011,
"learning_rate": 8.493104411243791e-08,
"loss": 1.9846,
"step": 790
},
{
"epoch": 0.9427890345649583,
"grad_norm": 0.189201378107075,
"learning_rate": 8.150919773946165e-08,
"loss": 1.9438,
"step": 791
},
{
"epoch": 0.9439809296781884,
"grad_norm": 0.1881449814121689,
"learning_rate": 7.81571422978672e-08,
"loss": 1.9758,
"step": 792
},
{
"epoch": 0.9451728247914184,
"grad_norm": 0.19461953430827816,
"learning_rate": 7.487492535164842e-08,
"loss": 1.9538,
"step": 793
},
{
"epoch": 0.9463647199046484,
"grad_norm": 0.19961269699233244,
"learning_rate": 7.166259347382854e-08,
"loss": 1.9861,
"step": 794
},
{
"epoch": 0.9475566150178785,
"grad_norm": 0.19603572773830497,
"learning_rate": 6.852019224579287e-08,
"loss": 1.954,
"step": 795
},
{
"epoch": 0.9487485101311085,
"grad_norm": 0.18552652214530319,
"learning_rate": 6.544776625664829e-08,
"loss": 1.9701,
"step": 796
},
{
"epoch": 0.9499404052443385,
"grad_norm": 0.18737364550182184,
"learning_rate": 6.244535910258697e-08,
"loss": 1.9507,
"step": 797
},
{
"epoch": 0.9511323003575686,
"grad_norm": 0.19128218729370722,
"learning_rate": 5.95130133862698e-08,
"loss": 1.963,
"step": 798
},
{
"epoch": 0.9523241954707986,
"grad_norm": 0.1857403939801107,
"learning_rate": 5.665077071621894e-08,
"loss": 1.9782,
"step": 799
},
{
"epoch": 0.9535160905840286,
"grad_norm": 0.18845875837578616,
"learning_rate": 5.3858671706230605e-08,
"loss": 1.9714,
"step": 800
},
{
"epoch": 0.9547079856972587,
"grad_norm": 0.191894557195297,
"learning_rate": 5.1136755974797724e-08,
"loss": 1.9802,
"step": 801
},
{
"epoch": 0.9558998808104887,
"grad_norm": 0.18884552490025192,
"learning_rate": 4.848506214454651e-08,
"loss": 1.9635,
"step": 802
},
{
"epoch": 0.9570917759237187,
"grad_norm": 0.1921418572611478,
"learning_rate": 4.590362784169022e-08,
"loss": 1.9863,
"step": 803
},
{
"epoch": 0.9582836710369488,
"grad_norm": 0.19545913000509813,
"learning_rate": 4.3392489695493475e-08,
"loss": 1.9582,
"step": 804
},
{
"epoch": 0.9594755661501788,
"grad_norm": 0.18809267333949384,
"learning_rate": 4.0951683337754345e-08,
"loss": 1.9486,
"step": 805
},
{
"epoch": 0.9606674612634089,
"grad_norm": 0.1993552148845141,
"learning_rate": 3.858124340229863e-08,
"loss": 1.9596,
"step": 806
},
{
"epoch": 0.9618593563766389,
"grad_norm": 0.19546359586133671,
"learning_rate": 3.628120352448583e-08,
"loss": 1.9635,
"step": 807
},
{
"epoch": 0.9630512514898689,
"grad_norm": 0.1965379164207019,
"learning_rate": 3.405159634073452e-08,
"loss": 1.9586,
"step": 808
},
{
"epoch": 0.964243146603099,
"grad_norm": 0.18935857235084785,
"learning_rate": 3.1892453488058803e-08,
"loss": 1.9854,
"step": 809
},
{
"epoch": 0.965435041716329,
"grad_norm": 0.19406372777092382,
"learning_rate": 2.9803805603619283e-08,
"loss": 1.9588,
"step": 810
},
{
"epoch": 0.966626936829559,
"grad_norm": 0.19087575145791982,
"learning_rate": 2.77856823242878e-08,
"loss": 1.9681,
"step": 811
},
{
"epoch": 0.9678188319427891,
"grad_norm": 0.20093522828177285,
"learning_rate": 2.5838112286226123e-08,
"loss": 1.9667,
"step": 812
},
{
"epoch": 0.9690107270560191,
"grad_norm": 0.18798772341602374,
"learning_rate": 2.39611231244824e-08,
"loss": 1.9722,
"step": 813
},
{
"epoch": 0.9702026221692491,
"grad_norm": 0.1848757446131922,
"learning_rate": 2.2154741472596996e-08,
"loss": 1.9578,
"step": 814
},
{
"epoch": 0.9713945172824792,
"grad_norm": 0.18755577360898026,
"learning_rate": 2.0418992962224495e-08,
"loss": 1.963,
"step": 815
},
{
"epoch": 0.9725864123957092,
"grad_norm": 0.18908507808905262,
"learning_rate": 1.8753902222770627e-08,
"loss": 1.9986,
"step": 816
},
{
"epoch": 0.9737783075089392,
"grad_norm": 0.1919401118801061,
"learning_rate": 1.7159492881041462e-08,
"loss": 1.9351,
"step": 817
},
{
"epoch": 0.9749702026221693,
"grad_norm": 0.1877371294012426,
"learning_rate": 1.563578756091144e-08,
"loss": 1.9486,
"step": 818
},
{
"epoch": 0.9761620977353993,
"grad_norm": 0.1915342862821692,
"learning_rate": 1.4182807882999194e-08,
"loss": 1.9647,
"step": 819
},
{
"epoch": 0.9773539928486293,
"grad_norm": 0.18698005671466014,
"learning_rate": 1.2800574464361115e-08,
"loss": 1.9578,
"step": 820
},
{
"epoch": 0.9785458879618594,
"grad_norm": 0.1894762036895136,
"learning_rate": 1.1489106918200487e-08,
"loss": 1.9497,
"step": 821
},
{
"epoch": 0.9797377830750894,
"grad_norm": 0.19184360419844976,
"learning_rate": 1.0248423853587154e-08,
"loss": 1.9767,
"step": 822
},
{
"epoch": 0.9809296781883194,
"grad_norm": 0.19532092856528233,
"learning_rate": 9.07854287519494e-09,
"loss": 1.9623,
"step": 823
},
{
"epoch": 0.9821215733015495,
"grad_norm": 0.19624970994803084,
"learning_rate": 7.979480583052423e-09,
"loss": 1.961,
"step": 824
},
{
"epoch": 0.9833134684147795,
"grad_norm": 0.1845931499070557,
"learning_rate": 6.951252572304224e-09,
"loss": 1.983,
"step": 825
},
{
"epoch": 0.9845053635280095,
"grad_norm": 0.18794485352405654,
"learning_rate": 5.993873432993957e-09,
"loss": 1.9616,
"step": 826
},
{
"epoch": 0.9856972586412396,
"grad_norm": 0.19891751745555894,
"learning_rate": 5.107356749853298e-09,
"loss": 1.9535,
"step": 827
},
{
"epoch": 0.9868891537544696,
"grad_norm": 0.19040177607162215,
"learning_rate": 4.291715102112126e-09,
"loss": 1.9726,
"step": 828
},
{
"epoch": 0.9880810488676997,
"grad_norm": 0.22336004861620634,
"learning_rate": 3.546960063319227e-09,
"loss": 1.966,
"step": 829
},
{
"epoch": 0.9892729439809297,
"grad_norm": 0.18822951819678269,
"learning_rate": 2.8731022011757593e-09,
"loss": 1.9966,
"step": 830
},
{
"epoch": 0.9904648390941597,
"grad_norm": 0.18927421286397889,
"learning_rate": 2.27015107739037e-09,
"loss": 1.9726,
"step": 831
},
{
"epoch": 0.9916567342073898,
"grad_norm": 0.19221873853139876,
"learning_rate": 1.7381152475376416e-09,
"loss": 1.9832,
"step": 832
},
{
"epoch": 0.9928486293206198,
"grad_norm": 0.1904281270223511,
"learning_rate": 1.2770022609409628e-09,
"loss": 1.9563,
"step": 833
},
{
"epoch": 0.9940405244338498,
"grad_norm": 0.18797717350418608,
"learning_rate": 8.868186605631712e-10,
"loss": 1.9507,
"step": 834
},
{
"epoch": 0.9952324195470799,
"grad_norm": 0.18924896153536938,
"learning_rate": 5.675699829160719e-10,
"loss": 1.9705,
"step": 835
},
{
"epoch": 0.9964243146603099,
"grad_norm": 0.19596674067827927,
"learning_rate": 3.1926075797827914e-10,
"loss": 1.9888,
"step": 836
},
{
"epoch": 0.9976162097735399,
"grad_norm": 0.1862037805845138,
"learning_rate": 1.4189450913415505e-10,
"loss": 1.9437,
"step": 837
},
{
"epoch": 0.99880810488677,
"grad_norm": 0.19347983202580893,
"learning_rate": 3.547375312218382e-11,
"loss": 1.9667,
"step": 838
},
{
"epoch": 1.0,
"grad_norm": 0.20375226613820588,
"learning_rate": 0.0,
"loss": 1.9468,
"step": 839
},
{
"epoch": 1.0,
"step": 839,
"total_flos": 1802102510714880.0,
"train_loss": 1.9892315510771414,
"train_runtime": 26830.5336,
"train_samples_per_second": 58.025,
"train_steps_per_second": 0.031
}
],
"logging_steps": 1,
"max_steps": 839,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1802102510714880.0,
"train_batch_size": 58,
"trial_name": null,
"trial_params": null
}