diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5915 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 50, + "global_step": 839, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011918951132300357, + "grad_norm": 4.142117453158603, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.2717, + "step": 1 + }, + { + "epoch": 0.0023837902264600714, + "grad_norm": 4.146443379436291, + "learning_rate": 4.000000000000001e-06, + "loss": 2.2673, + "step": 2 + }, + { + "epoch": 0.003575685339690107, + "grad_norm": 4.284702076234231, + "learning_rate": 6e-06, + "loss": 2.2178, + "step": 3 + }, + { + "epoch": 0.004767580452920143, + "grad_norm": 4.104853059576584, + "learning_rate": 8.000000000000001e-06, + "loss": 2.2543, + "step": 4 + }, + { + "epoch": 0.0059594755661501785, + "grad_norm": 3.213179411663954, + "learning_rate": 1e-05, + "loss": 2.2236, + "step": 5 + }, + { + "epoch": 0.007151370679380214, + "grad_norm": 1.9110723795461666, + "learning_rate": 9.99996452624688e-06, + "loss": 2.2198, + "step": 6 + }, + { + "epoch": 0.00834326579261025, + "grad_norm": 1.7059526754812628, + "learning_rate": 9.999858105490868e-06, + "loss": 2.1753, + "step": 7 + }, + { + "epoch": 0.009535160905840286, + "grad_norm": 3.5843950702527754, + "learning_rate": 9.999680739242022e-06, + "loss": 2.1457, + "step": 8 + }, + { + "epoch": 0.010727056019070322, + "grad_norm": 3.0684912624884877, + "learning_rate": 9.999432430017084e-06, + "loss": 2.1636, + "step": 9 + }, + { + "epoch": 0.011918951132300357, + "grad_norm": 2.7608335284827397, + "learning_rate": 9.999113181339437e-06, + "loss": 2.1402, + "step": 10 + }, + { + "epoch": 0.013110846245530394, + "grad_norm": 2.2876952453740844, + "learning_rate": 9.99872299773906e-06, + "loss": 2.1377, + "step": 11 + }, + { + "epoch": 0.014302741358760428, + "grad_norm": 2.189713354043771, + "learning_rate": 9.998261884752463e-06, + "loss": 2.1297, + "step": 12 + }, + { + "epoch": 0.015494636471990465, + "grad_norm": 1.8497797869361814, + "learning_rate": 9.99772984892261e-06, + "loss": 2.1145, + "step": 13 + }, + { + "epoch": 0.0166865315852205, + "grad_norm": 1.4328123407133309, + "learning_rate": 9.997126897798826e-06, + "loss": 2.1099, + "step": 14 + }, + { + "epoch": 0.017878426698450536, + "grad_norm": 1.110872364131768, + "learning_rate": 9.996453039936682e-06, + "loss": 2.1171, + "step": 15 + }, + { + "epoch": 0.01907032181168057, + "grad_norm": 0.8634715903240043, + "learning_rate": 9.995708284897889e-06, + "loss": 2.0905, + "step": 16 + }, + { + "epoch": 0.02026221692491061, + "grad_norm": 0.7193294200124174, + "learning_rate": 9.994892643250147e-06, + "loss": 2.1082, + "step": 17 + }, + { + "epoch": 0.021454112038140644, + "grad_norm": 0.6321366046233599, + "learning_rate": 9.994006126567006e-06, + "loss": 2.0998, + "step": 18 + }, + { + "epoch": 0.02264600715137068, + "grad_norm": 0.5966553161988273, + "learning_rate": 9.993048747427696e-06, + "loss": 2.1107, + "step": 19 + }, + { + "epoch": 0.023837902264600714, + "grad_norm": 0.5609275352376858, + "learning_rate": 9.99202051941695e-06, + "loss": 2.1192, + "step": 20 + }, + { + "epoch": 0.025029797377830752, + "grad_norm": 0.5426385429041977, + "learning_rate": 9.990921457124807e-06, + "loss": 2.0809, + "step": 21 + }, + { + "epoch": 0.026221692491060787, + "grad_norm": 0.5394723771045817, + 
"learning_rate": 9.989751576146413e-06, + "loss": 2.0889, + "step": 22 + }, + { + "epoch": 0.027413587604290822, + "grad_norm": 0.5125643780082518, + "learning_rate": 9.9885108930818e-06, + "loss": 2.1025, + "step": 23 + }, + { + "epoch": 0.028605482717520857, + "grad_norm": 0.49224477502997804, + "learning_rate": 9.98719942553564e-06, + "loss": 2.0945, + "step": 24 + }, + { + "epoch": 0.029797377830750895, + "grad_norm": 0.45404804144183336, + "learning_rate": 9.985817192117001e-06, + "loss": 2.0985, + "step": 25 + }, + { + "epoch": 0.03098927294398093, + "grad_norm": 0.4619837784227509, + "learning_rate": 9.984364212439089e-06, + "loss": 2.1044, + "step": 26 + }, + { + "epoch": 0.03218116805721097, + "grad_norm": 0.43728216047255375, + "learning_rate": 9.982840507118959e-06, + "loss": 2.1109, + "step": 27 + }, + { + "epoch": 0.033373063170441, + "grad_norm": 0.4106377846709857, + "learning_rate": 9.98124609777723e-06, + "loss": 2.1169, + "step": 28 + }, + { + "epoch": 0.03456495828367104, + "grad_norm": 0.3844482158912619, + "learning_rate": 9.979581007037776e-06, + "loss": 2.084, + "step": 29 + }, + { + "epoch": 0.03575685339690107, + "grad_norm": 0.3615382921612817, + "learning_rate": 9.977845258527403e-06, + "loss": 2.0578, + "step": 30 + }, + { + "epoch": 0.03694874851013111, + "grad_norm": 0.3265425379192771, + "learning_rate": 9.976038876875519e-06, + "loss": 2.095, + "step": 31 + }, + { + "epoch": 0.03814064362336114, + "grad_norm": 0.3249807533911439, + "learning_rate": 9.974161887713775e-06, + "loss": 2.1091, + "step": 32 + }, + { + "epoch": 0.03933253873659118, + "grad_norm": 0.3258627805281391, + "learning_rate": 9.972214317675713e-06, + "loss": 2.0844, + "step": 33 + }, + { + "epoch": 0.04052443384982122, + "grad_norm": 0.3115021940469844, + "learning_rate": 9.970196194396383e-06, + "loss": 2.1214, + "step": 34 + }, + { + "epoch": 0.041716328963051254, + "grad_norm": 0.29359663265683655, + "learning_rate": 9.968107546511942e-06, + "loss": 2.1068, + "step": 35 + }, + { + "epoch": 0.04290822407628129, + "grad_norm": 0.2914709352436855, + "learning_rate": 9.965948403659267e-06, + "loss": 2.1358, + "step": 36 + }, + { + "epoch": 0.04410011918951132, + "grad_norm": 0.2801925528264536, + "learning_rate": 9.963718796475516e-06, + "loss": 2.0968, + "step": 37 + }, + { + "epoch": 0.04529201430274136, + "grad_norm": 0.2777208098847368, + "learning_rate": 9.961418756597703e-06, + "loss": 2.1118, + "step": 38 + }, + { + "epoch": 0.04648390941597139, + "grad_norm": 0.2627526369156367, + "learning_rate": 9.959048316662246e-06, + "loss": 2.1084, + "step": 39 + }, + { + "epoch": 0.04767580452920143, + "grad_norm": 0.27582542868025006, + "learning_rate": 9.956607510304508e-06, + "loss": 2.0636, + "step": 40 + }, + { + "epoch": 0.04886769964243146, + "grad_norm": 0.252514387355515, + "learning_rate": 9.95409637215831e-06, + "loss": 2.0842, + "step": 41 + }, + { + "epoch": 0.050059594755661505, + "grad_norm": 0.270250047965773, + "learning_rate": 9.951514937855455e-06, + "loss": 2.0476, + "step": 42 + }, + { + "epoch": 0.05125148986889154, + "grad_norm": 0.2626133865266297, + "learning_rate": 9.948863244025202e-06, + "loss": 2.0777, + "step": 43 + }, + { + "epoch": 0.052443384982121574, + "grad_norm": 0.25699273367046915, + "learning_rate": 9.94614132829377e-06, + "loss": 2.0944, + "step": 44 + }, + { + "epoch": 0.05363528009535161, + "grad_norm": 0.27340926527011333, + "learning_rate": 9.943349229283781e-06, + "loss": 2.0887, + "step": 45 + }, + { + "epoch": 0.054827175208581644, + 
"grad_norm": 0.24251549192329058, + "learning_rate": 9.94048698661373e-06, + "loss": 2.1024, + "step": 46 + }, + { + "epoch": 0.05601907032181168, + "grad_norm": 0.2575396666859324, + "learning_rate": 9.937554640897414e-06, + "loss": 2.083, + "step": 47 + }, + { + "epoch": 0.057210965435041714, + "grad_norm": 0.23558811037647728, + "learning_rate": 9.934552233743353e-06, + "loss": 2.0819, + "step": 48 + }, + { + "epoch": 0.058402860548271755, + "grad_norm": 0.24407061789389187, + "learning_rate": 9.931479807754209e-06, + "loss": 2.0793, + "step": 49 + }, + { + "epoch": 0.05959475566150179, + "grad_norm": 0.23931691434033772, + "learning_rate": 9.928337406526172e-06, + "loss": 2.1159, + "step": 50 + }, + { + "epoch": 0.060786650774731825, + "grad_norm": 0.23796062784470082, + "learning_rate": 9.925125074648352e-06, + "loss": 2.0824, + "step": 51 + }, + { + "epoch": 0.06197854588796186, + "grad_norm": 0.23466916513360747, + "learning_rate": 9.921842857702132e-06, + "loss": 2.0734, + "step": 52 + }, + { + "epoch": 0.0631704410011919, + "grad_norm": 0.23455835316060827, + "learning_rate": 9.918490802260538e-06, + "loss": 2.073, + "step": 53 + }, + { + "epoch": 0.06436233611442194, + "grad_norm": 0.2383847191126797, + "learning_rate": 9.915068955887564e-06, + "loss": 2.0621, + "step": 54 + }, + { + "epoch": 0.06555423122765197, + "grad_norm": 0.23851986543002354, + "learning_rate": 9.911577367137499e-06, + "loss": 2.0672, + "step": 55 + }, + { + "epoch": 0.066746126340882, + "grad_norm": 0.24283914455886954, + "learning_rate": 9.90801608555425e-06, + "loss": 2.0869, + "step": 56 + }, + { + "epoch": 0.06793802145411204, + "grad_norm": 0.24281061248826022, + "learning_rate": 9.904385161670626e-06, + "loss": 2.0755, + "step": 57 + }, + { + "epoch": 0.06912991656734208, + "grad_norm": 0.25365576984515253, + "learning_rate": 9.900684647007624e-06, + "loss": 2.097, + "step": 58 + }, + { + "epoch": 0.07032181168057211, + "grad_norm": 0.23883096980549337, + "learning_rate": 9.896914594073703e-06, + "loss": 2.0865, + "step": 59 + }, + { + "epoch": 0.07151370679380215, + "grad_norm": 0.25353821590332437, + "learning_rate": 9.893075056364034e-06, + "loss": 2.0597, + "step": 60 + }, + { + "epoch": 0.07270560190703218, + "grad_norm": 0.25666970441346676, + "learning_rate": 9.889166088359742e-06, + "loss": 2.0685, + "step": 61 + }, + { + "epoch": 0.07389749702026222, + "grad_norm": 0.25019996159388774, + "learning_rate": 9.885187745527132e-06, + "loss": 2.047, + "step": 62 + }, + { + "epoch": 0.07508939213349225, + "grad_norm": 0.25291578137222365, + "learning_rate": 9.881140084316907e-06, + "loss": 2.0874, + "step": 63 + }, + { + "epoch": 0.07628128724672228, + "grad_norm": 0.2636139470370503, + "learning_rate": 9.87702316216336e-06, + "loss": 2.0761, + "step": 64 + }, + { + "epoch": 0.07747318235995232, + "grad_norm": 0.267225445079766, + "learning_rate": 9.87283703748356e-06, + "loss": 2.0632, + "step": 65 + }, + { + "epoch": 0.07866507747318235, + "grad_norm": 0.26637209411345025, + "learning_rate": 9.868581769676532e-06, + "loss": 2.0465, + "step": 66 + }, + { + "epoch": 0.07985697258641239, + "grad_norm": 0.28338796894764773, + "learning_rate": 9.864257419122404e-06, + "loss": 2.0543, + "step": 67 + }, + { + "epoch": 0.08104886769964244, + "grad_norm": 0.26975885676108347, + "learning_rate": 9.859864047181551e-06, + "loss": 2.0612, + "step": 68 + }, + { + "epoch": 0.08224076281287247, + "grad_norm": 0.2782261191514193, + "learning_rate": 9.855401716193733e-06, + "loss": 2.0466, + "step": 69 + }, + { + 
"epoch": 0.08343265792610251, + "grad_norm": 0.3833695594063663, + "learning_rate": 9.850870489477198e-06, + "loss": 2.0592, + "step": 70 + }, + { + "epoch": 0.08462455303933254, + "grad_norm": 0.32898931613326715, + "learning_rate": 9.846270431327793e-06, + "loss": 2.0498, + "step": 71 + }, + { + "epoch": 0.08581644815256258, + "grad_norm": 0.3256809126198457, + "learning_rate": 9.841601607018052e-06, + "loss": 2.071, + "step": 72 + }, + { + "epoch": 0.08700834326579261, + "grad_norm": 0.3451979168625637, + "learning_rate": 9.83686408279626e-06, + "loss": 2.0497, + "step": 73 + }, + { + "epoch": 0.08820023837902265, + "grad_norm": 0.3114795827025238, + "learning_rate": 9.832057925885526e-06, + "loss": 2.034, + "step": 74 + }, + { + "epoch": 0.08939213349225268, + "grad_norm": 0.33202022850009677, + "learning_rate": 9.827183204482818e-06, + "loss": 2.0324, + "step": 75 + }, + { + "epoch": 0.09058402860548272, + "grad_norm": 0.36121743903908576, + "learning_rate": 9.822239987757999e-06, + "loss": 2.0491, + "step": 76 + }, + { + "epoch": 0.09177592371871275, + "grad_norm": 0.33897099215895576, + "learning_rate": 9.817228345852853e-06, + "loss": 2.043, + "step": 77 + }, + { + "epoch": 0.09296781883194279, + "grad_norm": 0.37620997544394924, + "learning_rate": 9.812148349880076e-06, + "loss": 2.0364, + "step": 78 + }, + { + "epoch": 0.09415971394517282, + "grad_norm": 0.33013400624562517, + "learning_rate": 9.807000071922279e-06, + "loss": 2.0375, + "step": 79 + }, + { + "epoch": 0.09535160905840286, + "grad_norm": 0.3453355696988705, + "learning_rate": 9.801783585030959e-06, + "loss": 2.0357, + "step": 80 + }, + { + "epoch": 0.09654350417163289, + "grad_norm": 0.36374461914208417, + "learning_rate": 9.79649896322546e-06, + "loss": 2.0358, + "step": 81 + }, + { + "epoch": 0.09773539928486293, + "grad_norm": 0.3949186996890207, + "learning_rate": 9.791146281491935e-06, + "loss": 2.0576, + "step": 82 + }, + { + "epoch": 0.09892729439809297, + "grad_norm": 0.5347002177369765, + "learning_rate": 9.785725615782262e-06, + "loss": 2.0536, + "step": 83 + }, + { + "epoch": 0.10011918951132301, + "grad_norm": 0.7315743014064646, + "learning_rate": 9.780237043012988e-06, + "loss": 2.0382, + "step": 84 + }, + { + "epoch": 0.10131108462455304, + "grad_norm": 0.8662501044705759, + "learning_rate": 9.774680641064223e-06, + "loss": 2.0254, + "step": 85 + }, + { + "epoch": 0.10250297973778308, + "grad_norm": 1.0007184298369625, + "learning_rate": 9.769056488778538e-06, + "loss": 2.0507, + "step": 86 + }, + { + "epoch": 0.10369487485101311, + "grad_norm": 0.6169288025962048, + "learning_rate": 9.76336466595985e-06, + "loss": 2.0605, + "step": 87 + }, + { + "epoch": 0.10488676996424315, + "grad_norm": 0.4654550573627205, + "learning_rate": 9.757605253372283e-06, + "loss": 2.0064, + "step": 88 + }, + { + "epoch": 0.10607866507747318, + "grad_norm": 0.7568695915967251, + "learning_rate": 9.751778332739033e-06, + "loss": 2.0206, + "step": 89 + }, + { + "epoch": 0.10727056019070322, + "grad_norm": 0.7116290914175033, + "learning_rate": 9.745883986741196e-06, + "loss": 2.0276, + "step": 90 + }, + { + "epoch": 0.10846245530393325, + "grad_norm": 0.4534995774251596, + "learning_rate": 9.739922299016601e-06, + "loss": 2.0372, + "step": 91 + }, + { + "epoch": 0.10965435041716329, + "grad_norm": 0.5202368477526325, + "learning_rate": 9.733893354158628e-06, + "loss": 2.0281, + "step": 92 + }, + { + "epoch": 0.11084624553039332, + "grad_norm": 0.6041387580680142, + "learning_rate": 9.727797237714991e-06, + "loss": 2.0148, + 
"step": 93 + }, + { + "epoch": 0.11203814064362336, + "grad_norm": 0.45293705742449053, + "learning_rate": 9.721634036186545e-06, + "loss": 2.0175, + "step": 94 + }, + { + "epoch": 0.11323003575685339, + "grad_norm": 0.4793331693139531, + "learning_rate": 9.715403837026046e-06, + "loss": 2.0328, + "step": 95 + }, + { + "epoch": 0.11442193087008343, + "grad_norm": 0.5140021564481994, + "learning_rate": 9.709106728636913e-06, + "loss": 2.0143, + "step": 96 + }, + { + "epoch": 0.11561382598331346, + "grad_norm": 0.5507096777112976, + "learning_rate": 9.702742800371972e-06, + "loss": 2.0451, + "step": 97 + }, + { + "epoch": 0.11680572109654351, + "grad_norm": 0.48239551212919374, + "learning_rate": 9.69631214253219e-06, + "loss": 2.0241, + "step": 98 + }, + { + "epoch": 0.11799761620977355, + "grad_norm": 0.48328110314880524, + "learning_rate": 9.689814846365399e-06, + "loss": 2.003, + "step": 99 + }, + { + "epoch": 0.11918951132300358, + "grad_norm": 0.5285205749893114, + "learning_rate": 9.68325100406499e-06, + "loss": 2.0333, + "step": 100 + }, + { + "epoch": 0.12038140643623362, + "grad_norm": 0.6086649755855322, + "learning_rate": 9.676620708768608e-06, + "loss": 2.0468, + "step": 101 + }, + { + "epoch": 0.12157330154946365, + "grad_norm": 0.6688243861727331, + "learning_rate": 9.669924054556836e-06, + "loss": 2.0052, + "step": 102 + }, + { + "epoch": 0.12276519666269368, + "grad_norm": 0.7098729745438024, + "learning_rate": 9.663161136451862e-06, + "loss": 2.0201, + "step": 103 + }, + { + "epoch": 0.12395709177592372, + "grad_norm": 0.7561634464725003, + "learning_rate": 9.656332050416118e-06, + "loss": 2.005, + "step": 104 + }, + { + "epoch": 0.12514898688915377, + "grad_norm": 0.7900403950856617, + "learning_rate": 9.64943689335093e-06, + "loss": 2.0312, + "step": 105 + }, + { + "epoch": 0.1263408820023838, + "grad_norm": 0.7565057129935538, + "learning_rate": 9.642475763095134e-06, + "loss": 2.034, + "step": 106 + }, + { + "epoch": 0.12753277711561384, + "grad_norm": 0.6925499516277225, + "learning_rate": 9.635448758423703e-06, + "loss": 2.0172, + "step": 107 + }, + { + "epoch": 0.12872467222884387, + "grad_norm": 0.5469713913154514, + "learning_rate": 9.628355979046325e-06, + "loss": 2.0306, + "step": 108 + }, + { + "epoch": 0.1299165673420739, + "grad_norm": 0.532846103968638, + "learning_rate": 9.621197525606e-06, + "loss": 2.0313, + "step": 109 + }, + { + "epoch": 0.13110846245530394, + "grad_norm": 0.5107064338016527, + "learning_rate": 9.613973499677613e-06, + "loss": 2.0483, + "step": 110 + }, + { + "epoch": 0.13230035756853398, + "grad_norm": 0.6341330772425801, + "learning_rate": 9.606684003766493e-06, + "loss": 2.0222, + "step": 111 + }, + { + "epoch": 0.133492252681764, + "grad_norm": 0.5976219627881748, + "learning_rate": 9.599329141306946e-06, + "loss": 2.0074, + "step": 112 + }, + { + "epoch": 0.13468414779499405, + "grad_norm": 0.5847751917110514, + "learning_rate": 9.591909016660806e-06, + "loss": 2.0206, + "step": 113 + }, + { + "epoch": 0.13587604290822408, + "grad_norm": 0.6813522136748844, + "learning_rate": 9.584423735115938e-06, + "loss": 2.0178, + "step": 114 + }, + { + "epoch": 0.13706793802145412, + "grad_norm": 0.767208353338879, + "learning_rate": 9.576873402884756e-06, + "loss": 1.9957, + "step": 115 + }, + { + "epoch": 0.13825983313468415, + "grad_norm": 0.9969526484589852, + "learning_rate": 9.569258127102708e-06, + "loss": 2.0152, + "step": 116 + }, + { + "epoch": 0.1394517282479142, + "grad_norm": 1.14614371471204, + "learning_rate": 
9.561578015826758e-06, + "loss": 2.0156, + "step": 117 + }, + { + "epoch": 0.14064362336114422, + "grad_norm": 0.693890999588814, + "learning_rate": 9.553833178033856e-06, + "loss": 2.015, + "step": 118 + }, + { + "epoch": 0.14183551847437426, + "grad_norm": 0.6655521940110969, + "learning_rate": 9.546023723619387e-06, + "loss": 2.0357, + "step": 119 + }, + { + "epoch": 0.1430274135876043, + "grad_norm": 0.8792717595050646, + "learning_rate": 9.538149763395611e-06, + "loss": 2.0057, + "step": 120 + }, + { + "epoch": 0.14421930870083433, + "grad_norm": 0.9075837339408256, + "learning_rate": 9.530211409090104e-06, + "loss": 2.0324, + "step": 121 + }, + { + "epoch": 0.14541120381406436, + "grad_norm": 0.8920741190168875, + "learning_rate": 9.522208773344147e-06, + "loss": 1.9948, + "step": 122 + }, + { + "epoch": 0.1466030989272944, + "grad_norm": 0.8574927630149499, + "learning_rate": 9.514141969711155e-06, + "loss": 2.019, + "step": 123 + }, + { + "epoch": 0.14779499404052443, + "grad_norm": 0.6343463765213274, + "learning_rate": 9.506011112655045e-06, + "loss": 2.0193, + "step": 124 + }, + { + "epoch": 0.14898688915375446, + "grad_norm": 0.5630972285804464, + "learning_rate": 9.497816317548625e-06, + "loss": 2.0057, + "step": 125 + }, + { + "epoch": 0.1501787842669845, + "grad_norm": 0.7579610388968056, + "learning_rate": 9.489557700671948e-06, + "loss": 2.0315, + "step": 126 + }, + { + "epoch": 0.15137067938021453, + "grad_norm": 0.6850629250779653, + "learning_rate": 9.481235379210671e-06, + "loss": 2.001, + "step": 127 + }, + { + "epoch": 0.15256257449344457, + "grad_norm": 0.5362542526140824, + "learning_rate": 9.472849471254386e-06, + "loss": 2.0316, + "step": 128 + }, + { + "epoch": 0.1537544696066746, + "grad_norm": 0.608628527433765, + "learning_rate": 9.46440009579494e-06, + "loss": 2.035, + "step": 129 + }, + { + "epoch": 0.15494636471990464, + "grad_norm": 0.5093840827042088, + "learning_rate": 9.455887372724761e-06, + "loss": 2.0273, + "step": 130 + }, + { + "epoch": 0.15613825983313467, + "grad_norm": 0.646651425294055, + "learning_rate": 9.447311422835141e-06, + "loss": 2.0337, + "step": 131 + }, + { + "epoch": 0.1573301549463647, + "grad_norm": 0.6171589347028325, + "learning_rate": 9.438672367814532e-06, + "loss": 2.0111, + "step": 132 + }, + { + "epoch": 0.15852205005959474, + "grad_norm": 0.607124578385374, + "learning_rate": 9.429970330246817e-06, + "loss": 2.0207, + "step": 133 + }, + { + "epoch": 0.15971394517282478, + "grad_norm": 0.6668755869782658, + "learning_rate": 9.421205433609568e-06, + "loss": 2.0174, + "step": 134 + }, + { + "epoch": 0.16090584028605484, + "grad_norm": 0.7092639336616874, + "learning_rate": 9.412377802272296e-06, + "loss": 2.0061, + "step": 135 + }, + { + "epoch": 0.16209773539928488, + "grad_norm": 0.7386024648965732, + "learning_rate": 9.40348756149469e-06, + "loss": 2.0126, + "step": 136 + }, + { + "epoch": 0.1632896305125149, + "grad_norm": 0.6374704813920733, + "learning_rate": 9.39453483742483e-06, + "loss": 2.0176, + "step": 137 + }, + { + "epoch": 0.16448152562574495, + "grad_norm": 0.514905378407023, + "learning_rate": 9.385519757097405e-06, + "loss": 2.0055, + "step": 138 + }, + { + "epoch": 0.16567342073897498, + "grad_norm": 0.625583671688313, + "learning_rate": 9.376442448431911e-06, + "loss": 2.0109, + "step": 139 + }, + { + "epoch": 0.16686531585220502, + "grad_norm": 0.6190722916976653, + "learning_rate": 9.367303040230828e-06, + "loss": 1.9939, + "step": 140 + }, + { + "epoch": 0.16805721096543505, + "grad_norm": 
0.5659222906567583, + "learning_rate": 9.358101662177804e-06, + "loss": 2.0111, + "step": 141 + }, + { + "epoch": 0.16924910607866508, + "grad_norm": 0.6584496167747385, + "learning_rate": 9.348838444835798e-06, + "loss": 2.0185, + "step": 142 + }, + { + "epoch": 0.17044100119189512, + "grad_norm": 0.5257356541865075, + "learning_rate": 9.33951351964525e-06, + "loss": 2.0167, + "step": 143 + }, + { + "epoch": 0.17163289630512515, + "grad_norm": 0.5343239683640106, + "learning_rate": 9.330127018922195e-06, + "loss": 2.0058, + "step": 144 + }, + { + "epoch": 0.1728247914183552, + "grad_norm": 0.5602849015914332, + "learning_rate": 9.320679075856396e-06, + "loss": 1.9952, + "step": 145 + }, + { + "epoch": 0.17401668653158522, + "grad_norm": 0.509174624093658, + "learning_rate": 9.311169824509454e-06, + "loss": 2.0035, + "step": 146 + }, + { + "epoch": 0.17520858164481526, + "grad_norm": 0.6065116610936728, + "learning_rate": 9.301599399812904e-06, + "loss": 1.9989, + "step": 147 + }, + { + "epoch": 0.1764004767580453, + "grad_norm": 0.6025058237653309, + "learning_rate": 9.291967937566297e-06, + "loss": 2.015, + "step": 148 + }, + { + "epoch": 0.17759237187127533, + "grad_norm": 0.5966629218921442, + "learning_rate": 9.28227557443528e-06, + "loss": 1.9871, + "step": 149 + }, + { + "epoch": 0.17878426698450536, + "grad_norm": 0.6244177338742471, + "learning_rate": 9.272522447949652e-06, + "loss": 1.9916, + "step": 150 + }, + { + "epoch": 0.1799761620977354, + "grad_norm": 0.522440075076418, + "learning_rate": 9.262708696501412e-06, + "loss": 1.9997, + "step": 151 + }, + { + "epoch": 0.18116805721096543, + "grad_norm": 0.5640728239700662, + "learning_rate": 9.252834459342801e-06, + "loss": 2.003, + "step": 152 + }, + { + "epoch": 0.18235995232419547, + "grad_norm": 0.6822460944537364, + "learning_rate": 9.242899876584317e-06, + "loss": 2.0198, + "step": 153 + }, + { + "epoch": 0.1835518474374255, + "grad_norm": 0.6013920222643127, + "learning_rate": 9.232905089192733e-06, + "loss": 1.983, + "step": 154 + }, + { + "epoch": 0.18474374255065554, + "grad_norm": 0.6210431332187637, + "learning_rate": 9.222850238989104e-06, + "loss": 1.9815, + "step": 155 + }, + { + "epoch": 0.18593563766388557, + "grad_norm": 0.5536506251912162, + "learning_rate": 9.21273546864673e-06, + "loss": 1.9943, + "step": 156 + }, + { + "epoch": 0.1871275327771156, + "grad_norm": 0.5108824250251738, + "learning_rate": 9.202560921689165e-06, + "loss": 1.9875, + "step": 157 + }, + { + "epoch": 0.18831942789034564, + "grad_norm": 0.6703972638895684, + "learning_rate": 9.192326742488153e-06, + "loss": 2.0054, + "step": 158 + }, + { + "epoch": 0.18951132300357568, + "grad_norm": 0.6911385466049688, + "learning_rate": 9.182033076261591e-06, + "loss": 2.013, + "step": 159 + }, + { + "epoch": 0.1907032181168057, + "grad_norm": 0.8228547705270176, + "learning_rate": 9.171680069071472e-06, + "loss": 2.0079, + "step": 160 + }, + { + "epoch": 0.19189511323003575, + "grad_norm": 0.8318482910273874, + "learning_rate": 9.161267867821802e-06, + "loss": 2.0116, + "step": 161 + }, + { + "epoch": 0.19308700834326578, + "grad_norm": 0.6993770001635832, + "learning_rate": 9.150796620256526e-06, + "loss": 2.0104, + "step": 162 + }, + { + "epoch": 0.19427890345649582, + "grad_norm": 0.6963815969965594, + "learning_rate": 9.140266474957421e-06, + "loss": 1.9932, + "step": 163 + }, + { + "epoch": 0.19547079856972585, + "grad_norm": 0.687540193587627, + "learning_rate": 9.129677581342e-06, + "loss": 1.9844, + "step": 164 + }, + { + "epoch": 
0.1966626936829559, + "grad_norm": 0.6315324748513748, + "learning_rate": 9.11903008966138e-06, + "loss": 1.9964, + "step": 165 + }, + { + "epoch": 0.19785458879618595, + "grad_norm": 0.5152807583074759, + "learning_rate": 9.10832415099816e-06, + "loss": 2.0027, + "step": 166 + }, + { + "epoch": 0.19904648390941598, + "grad_norm": 0.4708357523523462, + "learning_rate": 9.097559917264268e-06, + "loss": 2.007, + "step": 167 + }, + { + "epoch": 0.20023837902264602, + "grad_norm": 0.5659309675022438, + "learning_rate": 9.086737541198812e-06, + "loss": 2.0065, + "step": 168 + }, + { + "epoch": 0.20143027413587605, + "grad_norm": 0.5973723979176943, + "learning_rate": 9.07585717636591e-06, + "loss": 1.9963, + "step": 169 + }, + { + "epoch": 0.2026221692491061, + "grad_norm": 0.612759197221063, + "learning_rate": 9.064918977152517e-06, + "loss": 2.0189, + "step": 170 + }, + { + "epoch": 0.20381406436233612, + "grad_norm": 0.6368297841192448, + "learning_rate": 9.053923098766218e-06, + "loss": 1.9996, + "step": 171 + }, + { + "epoch": 0.20500595947556616, + "grad_norm": 0.6267340913957593, + "learning_rate": 9.042869697233046e-06, + "loss": 2.0081, + "step": 172 + }, + { + "epoch": 0.2061978545887962, + "grad_norm": 0.5997679592985574, + "learning_rate": 9.031758929395259e-06, + "loss": 2.0087, + "step": 173 + }, + { + "epoch": 0.20738974970202623, + "grad_norm": 0.6540359851514235, + "learning_rate": 9.020590952909105e-06, + "loss": 1.9862, + "step": 174 + }, + { + "epoch": 0.20858164481525626, + "grad_norm": 0.6304008000188193, + "learning_rate": 9.009365926242603e-06, + "loss": 1.9845, + "step": 175 + }, + { + "epoch": 0.2097735399284863, + "grad_norm": 0.49409981260012525, + "learning_rate": 8.998084008673284e-06, + "loss": 1.9865, + "step": 176 + }, + { + "epoch": 0.21096543504171633, + "grad_norm": 0.428992104451379, + "learning_rate": 8.986745360285933e-06, + "loss": 1.9775, + "step": 177 + }, + { + "epoch": 0.21215733015494637, + "grad_norm": 0.4544484558085694, + "learning_rate": 8.975350141970312e-06, + "loss": 1.9974, + "step": 178 + }, + { + "epoch": 0.2133492252681764, + "grad_norm": 0.47713373163398903, + "learning_rate": 8.963898515418885e-06, + "loss": 1.9986, + "step": 179 + }, + { + "epoch": 0.21454112038140644, + "grad_norm": 0.5128102686619308, + "learning_rate": 8.952390643124524e-06, + "loss": 1.9926, + "step": 180 + }, + { + "epoch": 0.21573301549463647, + "grad_norm": 0.49123637812302784, + "learning_rate": 8.940826688378196e-06, + "loss": 2.0068, + "step": 181 + }, + { + "epoch": 0.2169249106078665, + "grad_norm": 0.4670667432350283, + "learning_rate": 8.929206815266653e-06, + "loss": 1.998, + "step": 182 + }, + { + "epoch": 0.21811680572109654, + "grad_norm": 0.5026402806403492, + "learning_rate": 8.917531188670096e-06, + "loss": 2.0023, + "step": 183 + }, + { + "epoch": 0.21930870083432658, + "grad_norm": 0.5146023032179888, + "learning_rate": 8.905799974259845e-06, + "loss": 1.9917, + "step": 184 + }, + { + "epoch": 0.2205005959475566, + "grad_norm": 0.500813938615368, + "learning_rate": 8.89401333849598e-06, + "loss": 2.0046, + "step": 185 + }, + { + "epoch": 0.22169249106078665, + "grad_norm": 0.5241153656092717, + "learning_rate": 8.882171448624988e-06, + "loss": 2.004, + "step": 186 + }, + { + "epoch": 0.22288438617401668, + "grad_norm": 0.5455210954026811, + "learning_rate": 8.870274472677376e-06, + "loss": 2.0136, + "step": 187 + }, + { + "epoch": 0.22407628128724671, + "grad_norm": 0.6182614320674238, + "learning_rate": 8.8583225794653e-06, + "loss": 1.9745, + 
"step": 188 + }, + { + "epoch": 0.22526817640047675, + "grad_norm": 0.7203972482184511, + "learning_rate": 8.846315938580163e-06, + "loss": 1.9876, + "step": 189 + }, + { + "epoch": 0.22646007151370678, + "grad_norm": 0.7651134846710912, + "learning_rate": 8.834254720390214e-06, + "loss": 2.0039, + "step": 190 + }, + { + "epoch": 0.22765196662693682, + "grad_norm": 0.717395085062428, + "learning_rate": 8.82213909603812e-06, + "loss": 1.9923, + "step": 191 + }, + { + "epoch": 0.22884386174016685, + "grad_norm": 0.6974046079010195, + "learning_rate": 8.80996923743855e-06, + "loss": 1.9902, + "step": 192 + }, + { + "epoch": 0.2300357568533969, + "grad_norm": 0.5749230359569363, + "learning_rate": 8.797745317275727e-06, + "loss": 2.0077, + "step": 193 + }, + { + "epoch": 0.23122765196662692, + "grad_norm": 0.47763377533604173, + "learning_rate": 8.78546750900098e-06, + "loss": 2.0175, + "step": 194 + }, + { + "epoch": 0.232419547079857, + "grad_norm": 0.4868384029481758, + "learning_rate": 8.773135986830289e-06, + "loss": 1.9817, + "step": 195 + }, + { + "epoch": 0.23361144219308702, + "grad_norm": 0.5411631589460403, + "learning_rate": 8.760750925741799e-06, + "loss": 2.0191, + "step": 196 + }, + { + "epoch": 0.23480333730631706, + "grad_norm": 0.5991085184799008, + "learning_rate": 8.748312501473351e-06, + "loss": 1.9872, + "step": 197 + }, + { + "epoch": 0.2359952324195471, + "grad_norm": 0.6561276515835338, + "learning_rate": 8.735820890519981e-06, + "loss": 1.9851, + "step": 198 + }, + { + "epoch": 0.23718712753277713, + "grad_norm": 0.7063577334823914, + "learning_rate": 8.723276270131422e-06, + "loss": 1.9897, + "step": 199 + }, + { + "epoch": 0.23837902264600716, + "grad_norm": 0.7581038228065401, + "learning_rate": 8.710678818309576e-06, + "loss": 2.0025, + "step": 200 + }, + { + "epoch": 0.2395709177592372, + "grad_norm": 0.7115966613137586, + "learning_rate": 8.698028713806005e-06, + "loss": 2.0004, + "step": 201 + }, + { + "epoch": 0.24076281287246723, + "grad_norm": 0.5976258958997295, + "learning_rate": 8.68532613611938e-06, + "loss": 2.018, + "step": 202 + }, + { + "epoch": 0.24195470798569726, + "grad_norm": 0.43540172054622217, + "learning_rate": 8.672571265492944e-06, + "loss": 1.9989, + "step": 203 + }, + { + "epoch": 0.2431466030989273, + "grad_norm": 0.5216426023045612, + "learning_rate": 8.659764282911948e-06, + "loss": 1.9866, + "step": 204 + }, + { + "epoch": 0.24433849821215733, + "grad_norm": 0.6613860116484914, + "learning_rate": 8.64690537010109e-06, + "loss": 2.0061, + "step": 205 + }, + { + "epoch": 0.24553039332538737, + "grad_norm": 0.7138301888755583, + "learning_rate": 8.63399470952193e-06, + "loss": 2.0107, + "step": 206 + }, + { + "epoch": 0.2467222884386174, + "grad_norm": 0.7998521068632918, + "learning_rate": 8.621032484370299e-06, + "loss": 1.9856, + "step": 207 + }, + { + "epoch": 0.24791418355184744, + "grad_norm": 0.6733799007638906, + "learning_rate": 8.60801887857371e-06, + "loss": 1.9789, + "step": 208 + }, + { + "epoch": 0.24910607866507747, + "grad_norm": 0.4890141413650463, + "learning_rate": 8.594954076788736e-06, + "loss": 1.9966, + "step": 209 + }, + { + "epoch": 0.25029797377830754, + "grad_norm": 0.510254285654425, + "learning_rate": 8.5818382643984e-06, + "loss": 2.0033, + "step": 210 + }, + { + "epoch": 0.25148986889153757, + "grad_norm": 0.6736096737562903, + "learning_rate": 8.56867162750954e-06, + "loss": 1.9882, + "step": 211 + }, + { + "epoch": 0.2526817640047676, + "grad_norm": 0.688224238343655, + "learning_rate": 
8.555454352950161e-06, + "loss": 1.9826, + "step": 212 + }, + { + "epoch": 0.25387365911799764, + "grad_norm": 0.5310568361772406, + "learning_rate": 8.542186628266801e-06, + "loss": 2.018, + "step": 213 + }, + { + "epoch": 0.2550655542312277, + "grad_norm": 0.4622700149348845, + "learning_rate": 8.528868641721857e-06, + "loss": 1.9873, + "step": 214 + }, + { + "epoch": 0.2562574493444577, + "grad_norm": 0.44850296625902714, + "learning_rate": 8.515500582290914e-06, + "loss": 1.9738, + "step": 215 + }, + { + "epoch": 0.25744934445768775, + "grad_norm": 0.5800104445256365, + "learning_rate": 8.502082639660068e-06, + "loss": 2.0033, + "step": 216 + }, + { + "epoch": 0.2586412395709178, + "grad_norm": 0.5571007121924001, + "learning_rate": 8.488615004223233e-06, + "loss": 2.0097, + "step": 217 + }, + { + "epoch": 0.2598331346841478, + "grad_norm": 0.5363110521997889, + "learning_rate": 8.475097867079437e-06, + "loss": 1.9826, + "step": 218 + }, + { + "epoch": 0.26102502979737785, + "grad_norm": 0.46575794642736956, + "learning_rate": 8.461531420030117e-06, + "loss": 2.0129, + "step": 219 + }, + { + "epoch": 0.2622169249106079, + "grad_norm": 0.40917886114681945, + "learning_rate": 8.44791585557639e-06, + "loss": 2.0047, + "step": 220 + }, + { + "epoch": 0.2634088200238379, + "grad_norm": 0.428624008942813, + "learning_rate": 8.434251366916323e-06, + "loss": 1.9781, + "step": 221 + }, + { + "epoch": 0.26460071513706795, + "grad_norm": 0.4571746297128128, + "learning_rate": 8.420538147942196e-06, + "loss": 1.9844, + "step": 222 + }, + { + "epoch": 0.265792610250298, + "grad_norm": 0.47157884654181986, + "learning_rate": 8.406776393237748e-06, + "loss": 1.9985, + "step": 223 + }, + { + "epoch": 0.266984505363528, + "grad_norm": 0.46012310079193414, + "learning_rate": 8.392966298075413e-06, + "loss": 1.9945, + "step": 224 + }, + { + "epoch": 0.26817640047675806, + "grad_norm": 0.4551526365374971, + "learning_rate": 8.379108058413553e-06, + "loss": 1.9778, + "step": 225 + }, + { + "epoch": 0.2693682955899881, + "grad_norm": 0.4810916725254239, + "learning_rate": 8.36520187089368e-06, + "loss": 1.9814, + "step": 226 + }, + { + "epoch": 0.27056019070321813, + "grad_norm": 0.46258784460873204, + "learning_rate": 8.351247932837655e-06, + "loss": 1.9719, + "step": 227 + }, + { + "epoch": 0.27175208581644816, + "grad_norm": 0.45411997594863557, + "learning_rate": 8.337246442244902e-06, + "loss": 1.9753, + "step": 228 + }, + { + "epoch": 0.2729439809296782, + "grad_norm": 0.43996967181045016, + "learning_rate": 8.32319759778959e-06, + "loss": 2.0033, + "step": 229 + }, + { + "epoch": 0.27413587604290823, + "grad_norm": 0.507769478588206, + "learning_rate": 8.309101598817812e-06, + "loss": 2.0024, + "step": 230 + }, + { + "epoch": 0.27532777115613827, + "grad_norm": 0.48069601950891877, + "learning_rate": 8.294958645344766e-06, + "loss": 1.9824, + "step": 231 + }, + { + "epoch": 0.2765196662693683, + "grad_norm": 0.5157028595077698, + "learning_rate": 8.280768938051909e-06, + "loss": 1.9699, + "step": 232 + }, + { + "epoch": 0.27771156138259834, + "grad_norm": 0.579814229455722, + "learning_rate": 8.266532678284103e-06, + "loss": 1.984, + "step": 233 + }, + { + "epoch": 0.2789034564958284, + "grad_norm": 0.627324817155187, + "learning_rate": 8.252250068046784e-06, + "loss": 1.9861, + "step": 234 + }, + { + "epoch": 0.2800953516090584, + "grad_norm": 0.593805814527224, + "learning_rate": 8.23792131000306e-06, + "loss": 1.9693, + "step": 235 + }, + { + "epoch": 0.28128724672228844, + "grad_norm": 
0.6552471095231857, + "learning_rate": 8.223546607470863e-06, + "loss": 1.9862, + "step": 236 + }, + { + "epoch": 0.2824791418355185, + "grad_norm": 0.6028562723069028, + "learning_rate": 8.209126164420056e-06, + "loss": 1.981, + "step": 237 + }, + { + "epoch": 0.2836710369487485, + "grad_norm": 0.5873677146224183, + "learning_rate": 8.19466018546953e-06, + "loss": 1.9967, + "step": 238 + }, + { + "epoch": 0.28486293206197855, + "grad_norm": 0.5279550914843492, + "learning_rate": 8.18014887588431e-06, + "loss": 1.9836, + "step": 239 + }, + { + "epoch": 0.2860548271752086, + "grad_norm": 0.5159083129491098, + "learning_rate": 8.165592441572648e-06, + "loss": 1.9906, + "step": 240 + }, + { + "epoch": 0.2872467222884386, + "grad_norm": 0.5540993574066266, + "learning_rate": 8.150991089083081e-06, + "loss": 1.9953, + "step": 241 + }, + { + "epoch": 0.28843861740166865, + "grad_norm": 0.6125101838648868, + "learning_rate": 8.13634502560152e-06, + "loss": 2.0038, + "step": 242 + }, + { + "epoch": 0.2896305125148987, + "grad_norm": 0.5519571584252633, + "learning_rate": 8.1216544589483e-06, + "loss": 1.9983, + "step": 243 + }, + { + "epoch": 0.2908224076281287, + "grad_norm": 0.544350413761365, + "learning_rate": 8.106919597575238e-06, + "loss": 1.9718, + "step": 244 + }, + { + "epoch": 0.29201430274135876, + "grad_norm": 0.5664660915352969, + "learning_rate": 8.092140650562665e-06, + "loss": 1.9671, + "step": 245 + }, + { + "epoch": 0.2932061978545888, + "grad_norm": 0.6296577119121265, + "learning_rate": 8.07731782761647e-06, + "loss": 1.9881, + "step": 246 + }, + { + "epoch": 0.2943980929678188, + "grad_norm": 0.4926647346394942, + "learning_rate": 8.062451339065116e-06, + "loss": 1.9609, + "step": 247 + }, + { + "epoch": 0.29558998808104886, + "grad_norm": 0.4624410592429987, + "learning_rate": 8.047541395856661e-06, + "loss": 1.9974, + "step": 248 + }, + { + "epoch": 0.2967818831942789, + "grad_norm": 0.559079602861405, + "learning_rate": 8.032588209555765e-06, + "loss": 1.999, + "step": 249 + }, + { + "epoch": 0.29797377830750893, + "grad_norm": 0.5257803282078808, + "learning_rate": 8.017591992340682e-06, + "loss": 1.99, + "step": 250 + }, + { + "epoch": 0.29916567342073896, + "grad_norm": 0.4532797658436555, + "learning_rate": 8.002552957000254e-06, + "loss": 1.9961, + "step": 251 + }, + { + "epoch": 0.300357568533969, + "grad_norm": 0.4967793482713224, + "learning_rate": 7.987471316930892e-06, + "loss": 1.9859, + "step": 252 + }, + { + "epoch": 0.30154946364719903, + "grad_norm": 0.5216037784287865, + "learning_rate": 7.972347286133549e-06, + "loss": 1.9775, + "step": 253 + }, + { + "epoch": 0.30274135876042907, + "grad_norm": 0.44165364383086597, + "learning_rate": 7.957181079210676e-06, + "loss": 1.9834, + "step": 254 + }, + { + "epoch": 0.3039332538736591, + "grad_norm": 0.4525734716636921, + "learning_rate": 7.941972911363187e-06, + "loss": 1.9834, + "step": 255 + }, + { + "epoch": 0.30512514898688914, + "grad_norm": 0.4399784793186879, + "learning_rate": 7.926722998387398e-06, + "loss": 1.9883, + "step": 256 + }, + { + "epoch": 0.3063170441001192, + "grad_norm": 0.4302293917353196, + "learning_rate": 7.911431556671967e-06, + "loss": 1.9888, + "step": 257 + }, + { + "epoch": 0.3075089392133492, + "grad_norm": 0.5077527400267277, + "learning_rate": 7.896098803194828e-06, + "loss": 1.9814, + "step": 258 + }, + { + "epoch": 0.30870083432657924, + "grad_norm": 0.5455522386411445, + "learning_rate": 7.880724955520105e-06, + "loss": 2.0022, + "step": 259 + }, + { + "epoch": 
0.3098927294398093, + "grad_norm": 0.4734204507402147, + "learning_rate": 7.865310231795026e-06, + "loss": 1.9883, + "step": 260 + }, + { + "epoch": 0.3110846245530393, + "grad_norm": 0.46463402034819734, + "learning_rate": 7.849854850746834e-06, + "loss": 1.9871, + "step": 261 + }, + { + "epoch": 0.31227651966626935, + "grad_norm": 0.48102107314994796, + "learning_rate": 7.83435903167968e-06, + "loss": 1.9817, + "step": 262 + }, + { + "epoch": 0.3134684147794994, + "grad_norm": 0.49443270213282037, + "learning_rate": 7.818822994471504e-06, + "loss": 1.9726, + "step": 263 + }, + { + "epoch": 0.3146603098927294, + "grad_norm": 0.5141146391688594, + "learning_rate": 7.80324695957093e-06, + "loss": 1.9843, + "step": 264 + }, + { + "epoch": 0.31585220500595945, + "grad_norm": 0.48124603321709436, + "learning_rate": 7.78763114799412e-06, + "loss": 1.9713, + "step": 265 + }, + { + "epoch": 0.3170441001191895, + "grad_norm": 0.4573264323307654, + "learning_rate": 7.771975781321655e-06, + "loss": 1.9855, + "step": 266 + }, + { + "epoch": 0.3182359952324195, + "grad_norm": 0.497648183015366, + "learning_rate": 7.75628108169538e-06, + "loss": 1.9857, + "step": 267 + }, + { + "epoch": 0.31942789034564956, + "grad_norm": 0.5260277669621191, + "learning_rate": 7.740547271815253e-06, + "loss": 1.9867, + "step": 268 + }, + { + "epoch": 0.3206197854588796, + "grad_norm": 0.5443051292540823, + "learning_rate": 7.72477457493619e-06, + "loss": 1.9742, + "step": 269 + }, + { + "epoch": 0.3218116805721097, + "grad_norm": 0.4269306335257882, + "learning_rate": 7.70896321486489e-06, + "loss": 1.9768, + "step": 270 + }, + { + "epoch": 0.3230035756853397, + "grad_norm": 0.42010336549578936, + "learning_rate": 7.693113415956674e-06, + "loss": 1.9799, + "step": 271 + }, + { + "epoch": 0.32419547079856975, + "grad_norm": 0.46762767407360706, + "learning_rate": 7.677225403112277e-06, + "loss": 1.9843, + "step": 272 + }, + { + "epoch": 0.3253873659117998, + "grad_norm": 0.5057942342132519, + "learning_rate": 7.661299401774677e-06, + "loss": 1.9828, + "step": 273 + }, + { + "epoch": 0.3265792610250298, + "grad_norm": 0.5952967303729245, + "learning_rate": 7.645335637925897e-06, + "loss": 1.9796, + "step": 274 + }, + { + "epoch": 0.32777115613825986, + "grad_norm": 0.5273900870276448, + "learning_rate": 7.629334338083774e-06, + "loss": 1.9766, + "step": 275 + }, + { + "epoch": 0.3289630512514899, + "grad_norm": 0.45567977499071444, + "learning_rate": 7.6132957292987795e-06, + "loss": 1.9617, + "step": 276 + }, + { + "epoch": 0.3301549463647199, + "grad_norm": 0.5179821998771547, + "learning_rate": 7.597220039150768e-06, + "loss": 1.9863, + "step": 277 + }, + { + "epoch": 0.33134684147794996, + "grad_norm": 0.5651280024042905, + "learning_rate": 7.58110749574577e-06, + "loss": 1.9821, + "step": 278 + }, + { + "epoch": 0.33253873659118, + "grad_norm": 0.46901304611627237, + "learning_rate": 7.564958327712735e-06, + "loss": 1.9798, + "step": 279 + }, + { + "epoch": 0.33373063170441003, + "grad_norm": 0.4359604972801817, + "learning_rate": 7.5487727642003075e-06, + "loss": 1.9789, + "step": 280 + }, + { + "epoch": 0.33492252681764006, + "grad_norm": 0.4691874050085417, + "learning_rate": 7.532551034873558e-06, + "loss": 1.9858, + "step": 281 + }, + { + "epoch": 0.3361144219308701, + "grad_norm": 0.49078578351565005, + "learning_rate": 7.516293369910737e-06, + "loss": 1.9905, + "step": 282 + }, + { + "epoch": 0.33730631704410013, + "grad_norm": 0.4810414634759214, + "learning_rate": 7.500000000000001e-06, + "loss": 1.9757, + 
"step": 283 + }, + { + "epoch": 0.33849821215733017, + "grad_norm": 0.4004089110467056, + "learning_rate": 7.483671156336142e-06, + "loss": 1.9743, + "step": 284 + }, + { + "epoch": 0.3396901072705602, + "grad_norm": 0.48370804553795343, + "learning_rate": 7.467307070617309e-06, + "loss": 1.9882, + "step": 285 + }, + { + "epoch": 0.34088200238379024, + "grad_norm": 0.3916208994505171, + "learning_rate": 7.4509079750417154e-06, + "loss": 1.9906, + "step": 286 + }, + { + "epoch": 0.3420738974970203, + "grad_norm": 0.4440622088562717, + "learning_rate": 7.43447410230435e-06, + "loss": 1.9756, + "step": 287 + }, + { + "epoch": 0.3432657926102503, + "grad_norm": 0.4151369125535769, + "learning_rate": 7.418005685593669e-06, + "loss": 1.98, + "step": 288 + }, + { + "epoch": 0.34445768772348034, + "grad_norm": 0.42888099521221656, + "learning_rate": 7.4015029585882925e-06, + "loss": 1.9597, + "step": 289 + }, + { + "epoch": 0.3456495828367104, + "grad_norm": 0.4031068379998817, + "learning_rate": 7.384966155453686e-06, + "loss": 1.9909, + "step": 290 + }, + { + "epoch": 0.3468414779499404, + "grad_norm": 0.4288403976952624, + "learning_rate": 7.368395510838838e-06, + "loss": 1.9715, + "step": 291 + }, + { + "epoch": 0.34803337306317045, + "grad_norm": 0.4047372419449946, + "learning_rate": 7.351791259872929e-06, + "loss": 1.9933, + "step": 292 + }, + { + "epoch": 0.3492252681764005, + "grad_norm": 0.42040782221308876, + "learning_rate": 7.335153638162005e-06, + "loss": 1.9875, + "step": 293 + }, + { + "epoch": 0.3504171632896305, + "grad_norm": 0.40151800416240474, + "learning_rate": 7.318482881785612e-06, + "loss": 1.9827, + "step": 294 + }, + { + "epoch": 0.35160905840286055, + "grad_norm": 0.40534989415691614, + "learning_rate": 7.301779227293475e-06, + "loss": 1.9899, + "step": 295 + }, + { + "epoch": 0.3528009535160906, + "grad_norm": 0.41437334261849135, + "learning_rate": 7.285042911702116e-06, + "loss": 1.9761, + "step": 296 + }, + { + "epoch": 0.3539928486293206, + "grad_norm": 0.43461149682609845, + "learning_rate": 7.268274172491508e-06, + "loss": 2.0009, + "step": 297 + }, + { + "epoch": 0.35518474374255066, + "grad_norm": 0.42255392024397564, + "learning_rate": 7.251473247601698e-06, + "loss": 1.9805, + "step": 298 + }, + { + "epoch": 0.3563766388557807, + "grad_norm": 0.44303489088588954, + "learning_rate": 7.234640375429427e-06, + "loss": 1.9824, + "step": 299 + }, + { + "epoch": 0.3575685339690107, + "grad_norm": 0.43499397642762283, + "learning_rate": 7.217775794824759e-06, + "loss": 1.9785, + "step": 300 + }, + { + "epoch": 0.35876042908224076, + "grad_norm": 0.4208326930599362, + "learning_rate": 7.200879745087681e-06, + "loss": 1.994, + "step": 301 + }, + { + "epoch": 0.3599523241954708, + "grad_norm": 0.4452902733869807, + "learning_rate": 7.183952465964711e-06, + "loss": 1.9741, + "step": 302 + }, + { + "epoch": 0.36114421930870083, + "grad_norm": 0.4764827599963297, + "learning_rate": 7.166994197645497e-06, + "loss": 1.9826, + "step": 303 + }, + { + "epoch": 0.36233611442193087, + "grad_norm": 0.4460964876445021, + "learning_rate": 7.150005180759411e-06, + "loss": 1.9808, + "step": 304 + }, + { + "epoch": 0.3635280095351609, + "grad_norm": 0.42052492138452646, + "learning_rate": 7.132985656372126e-06, + "loss": 1.9652, + "step": 305 + }, + { + "epoch": 0.36471990464839094, + "grad_norm": 0.3578650107792017, + "learning_rate": 7.115935865982205e-06, + "loss": 2.0037, + "step": 306 + }, + { + "epoch": 0.36591179976162097, + "grad_norm": 0.4213839735073625, + "learning_rate": 
7.098856051517673e-06, + "loss": 1.9983, + "step": 307 + }, + { + "epoch": 0.367103694874851, + "grad_norm": 0.41798689890135715, + "learning_rate": 7.0817464553325764e-06, + "loss": 1.9833, + "step": 308 + }, + { + "epoch": 0.36829558998808104, + "grad_norm": 0.46301273631831313, + "learning_rate": 7.064607320203552e-06, + "loss": 1.9785, + "step": 309 + }, + { + "epoch": 0.3694874851013111, + "grad_norm": 0.3853768039281196, + "learning_rate": 7.047438889326377e-06, + "loss": 1.9953, + "step": 310 + }, + { + "epoch": 0.3706793802145411, + "grad_norm": 0.39106836774943315, + "learning_rate": 7.030241406312528e-06, + "loss": 1.9908, + "step": 311 + }, + { + "epoch": 0.37187127532777114, + "grad_norm": 0.3557595574168793, + "learning_rate": 7.013015115185706e-06, + "loss": 1.9711, + "step": 312 + }, + { + "epoch": 0.3730631704410012, + "grad_norm": 0.462884994313804, + "learning_rate": 6.9957602603783944e-06, + "loss": 2.0036, + "step": 313 + }, + { + "epoch": 0.3742550655542312, + "grad_norm": 0.42933967393666006, + "learning_rate": 6.978477086728375e-06, + "loss": 1.9843, + "step": 314 + }, + { + "epoch": 0.37544696066746125, + "grad_norm": 0.43775594546905017, + "learning_rate": 6.961165839475262e-06, + "loss": 1.9799, + "step": 315 + }, + { + "epoch": 0.3766388557806913, + "grad_norm": 0.40786517623408314, + "learning_rate": 6.9438267642570216e-06, + "loss": 1.9674, + "step": 316 + }, + { + "epoch": 0.3778307508939213, + "grad_norm": 0.3812009351969576, + "learning_rate": 6.926460107106483e-06, + "loss": 1.9835, + "step": 317 + }, + { + "epoch": 0.37902264600715135, + "grad_norm": 0.43023083569572035, + "learning_rate": 6.909066114447847e-06, + "loss": 1.9843, + "step": 318 + }, + { + "epoch": 0.3802145411203814, + "grad_norm": 0.4055444095073271, + "learning_rate": 6.891645033093196e-06, + "loss": 1.9802, + "step": 319 + }, + { + "epoch": 0.3814064362336114, + "grad_norm": 0.43023837992568775, + "learning_rate": 6.874197110238986e-06, + "loss": 1.9756, + "step": 320 + }, + { + "epoch": 0.38259833134684146, + "grad_norm": 0.4061991284550457, + "learning_rate": 6.8567225934625385e-06, + "loss": 1.9793, + "step": 321 + }, + { + "epoch": 0.3837902264600715, + "grad_norm": 0.46263343121001, + "learning_rate": 6.8392217307185325e-06, + "loss": 1.9888, + "step": 322 + }, + { + "epoch": 0.38498212157330153, + "grad_norm": 0.5183393565092786, + "learning_rate": 6.8216947703354815e-06, + "loss": 1.9678, + "step": 323 + }, + { + "epoch": 0.38617401668653156, + "grad_norm": 0.4914054711777072, + "learning_rate": 6.804141961012213e-06, + "loss": 1.9774, + "step": 324 + }, + { + "epoch": 0.3873659117997616, + "grad_norm": 0.38775497500354755, + "learning_rate": 6.786563551814333e-06, + "loss": 1.9843, + "step": 325 + }, + { + "epoch": 0.38855780691299163, + "grad_norm": 0.4175239392741797, + "learning_rate": 6.7689597921707065e-06, + "loss": 1.9812, + "step": 326 + }, + { + "epoch": 0.38974970202622167, + "grad_norm": 0.5074081729621598, + "learning_rate": 6.7513309318698975e-06, + "loss": 1.9673, + "step": 327 + }, + { + "epoch": 0.3909415971394517, + "grad_norm": 0.5759724338089542, + "learning_rate": 6.733677221056645e-06, + "loss": 1.9595, + "step": 328 + }, + { + "epoch": 0.39213349225268174, + "grad_norm": 0.45858283981603526, + "learning_rate": 6.715998910228296e-06, + "loss": 1.979, + "step": 329 + }, + { + "epoch": 0.3933253873659118, + "grad_norm": 0.39590782238976335, + "learning_rate": 6.698296250231271e-06, + "loss": 1.981, + "step": 330 + }, + { + "epoch": 0.39451728247914186, + 
"grad_norm": 0.5514883543457016, + "learning_rate": 6.68056949225748e-06, + "loss": 1.9754, + "step": 331 + }, + { + "epoch": 0.3957091775923719, + "grad_norm": 0.5367006385906758, + "learning_rate": 6.6628188878407806e-06, + "loss": 1.9688, + "step": 332 + }, + { + "epoch": 0.39690107270560193, + "grad_norm": 0.4563028045170266, + "learning_rate": 6.645044688853396e-06, + "loss": 1.9792, + "step": 333 + }, + { + "epoch": 0.39809296781883197, + "grad_norm": 0.4705275885547744, + "learning_rate": 6.627247147502343e-06, + "loss": 1.9751, + "step": 334 + }, + { + "epoch": 0.399284862932062, + "grad_norm": 0.39053085326929393, + "learning_rate": 6.609426516325859e-06, + "loss": 1.9809, + "step": 335 + }, + { + "epoch": 0.40047675804529204, + "grad_norm": 0.46336889396641767, + "learning_rate": 6.591583048189812e-06, + "loss": 1.9819, + "step": 336 + }, + { + "epoch": 0.40166865315852207, + "grad_norm": 0.41312116285494427, + "learning_rate": 6.573716996284114e-06, + "loss": 1.9956, + "step": 337 + }, + { + "epoch": 0.4028605482717521, + "grad_norm": 0.4261033537644772, + "learning_rate": 6.555828614119132e-06, + "loss": 1.9864, + "step": 338 + }, + { + "epoch": 0.40405244338498214, + "grad_norm": 0.5571802621996744, + "learning_rate": 6.537918155522089e-06, + "loss": 1.9881, + "step": 339 + }, + { + "epoch": 0.4052443384982122, + "grad_norm": 0.42763390364122206, + "learning_rate": 6.519985874633454e-06, + "loss": 1.981, + "step": 340 + }, + { + "epoch": 0.4064362336114422, + "grad_norm": 0.41484190699219026, + "learning_rate": 6.502032025903356e-06, + "loss": 1.9641, + "step": 341 + }, + { + "epoch": 0.40762812872467225, + "grad_norm": 0.3838791164718351, + "learning_rate": 6.484056864087948e-06, + "loss": 1.9709, + "step": 342 + }, + { + "epoch": 0.4088200238379023, + "grad_norm": 0.4023689175266171, + "learning_rate": 6.4660606442458155e-06, + "loss": 1.9713, + "step": 343 + }, + { + "epoch": 0.4100119189511323, + "grad_norm": 0.4336427044211903, + "learning_rate": 6.4480436217343366e-06, + "loss": 1.9534, + "step": 344 + }, + { + "epoch": 0.41120381406436235, + "grad_norm": 0.37598773624858467, + "learning_rate": 6.430006052206083e-06, + "loss": 1.9603, + "step": 345 + }, + { + "epoch": 0.4123957091775924, + "grad_norm": 0.43416807891817494, + "learning_rate": 6.411948191605164e-06, + "loss": 1.9787, + "step": 346 + }, + { + "epoch": 0.4135876042908224, + "grad_norm": 0.3977800151758, + "learning_rate": 6.393870296163616e-06, + "loss": 1.9916, + "step": 347 + }, + { + "epoch": 0.41477949940405245, + "grad_norm": 0.4008696555982334, + "learning_rate": 6.375772622397762e-06, + "loss": 1.9804, + "step": 348 + }, + { + "epoch": 0.4159713945172825, + "grad_norm": 0.3451532285909086, + "learning_rate": 6.357655427104562e-06, + "loss": 1.9663, + "step": 349 + }, + { + "epoch": 0.4171632896305125, + "grad_norm": 0.4341428658767691, + "learning_rate": 6.339518967357985e-06, + "loss": 1.9744, + "step": 350 + }, + { + "epoch": 0.41835518474374256, + "grad_norm": 0.37680689737786904, + "learning_rate": 6.321363500505348e-06, + "loss": 1.994, + "step": 351 + }, + { + "epoch": 0.4195470798569726, + "grad_norm": 0.36788506489233713, + "learning_rate": 6.3031892841636685e-06, + "loss": 1.9847, + "step": 352 + }, + { + "epoch": 0.42073897497020263, + "grad_norm": 0.38396929856917666, + "learning_rate": 6.284996576216014e-06, + "loss": 1.9722, + "step": 353 + }, + { + "epoch": 0.42193087008343266, + "grad_norm": 0.3512841210948969, + "learning_rate": 6.266785634807838e-06, + "loss": 1.9504, + "step": 354 + 
}, + { + "epoch": 0.4231227651966627, + "grad_norm": 0.3841371233710849, + "learning_rate": 6.248556718343314e-06, + "loss": 1.9997, + "step": 355 + }, + { + "epoch": 0.42431466030989273, + "grad_norm": 0.41345223603319187, + "learning_rate": 6.230310085481677e-06, + "loss": 1.9754, + "step": 356 + }, + { + "epoch": 0.42550655542312277, + "grad_norm": 0.36115831056461284, + "learning_rate": 6.212045995133543e-06, + "loss": 1.9735, + "step": 357 + }, + { + "epoch": 0.4266984505363528, + "grad_norm": 0.37667258015583416, + "learning_rate": 6.193764706457249e-06, + "loss": 1.9669, + "step": 358 + }, + { + "epoch": 0.42789034564958284, + "grad_norm": 0.34439222602136627, + "learning_rate": 6.175466478855161e-06, + "loss": 1.9788, + "step": 359 + }, + { + "epoch": 0.42908224076281287, + "grad_norm": 0.406649190145765, + "learning_rate": 6.157151571970005e-06, + "loss": 1.9868, + "step": 360 + }, + { + "epoch": 0.4302741358760429, + "grad_norm": 0.37410746997126837, + "learning_rate": 6.13882024568117e-06, + "loss": 1.9588, + "step": 361 + }, + { + "epoch": 0.43146603098927294, + "grad_norm": 0.38935925565712926, + "learning_rate": 6.1204727601010396e-06, + "loss": 1.978, + "step": 362 + }, + { + "epoch": 0.432657926102503, + "grad_norm": 0.3934047570972324, + "learning_rate": 6.10210937557128e-06, + "loss": 1.9728, + "step": 363 + }, + { + "epoch": 0.433849821215733, + "grad_norm": 0.3740037082900391, + "learning_rate": 6.083730352659158e-06, + "loss": 1.9777, + "step": 364 + }, + { + "epoch": 0.43504171632896305, + "grad_norm": 0.3962866525803316, + "learning_rate": 6.065335952153846e-06, + "loss": 1.9753, + "step": 365 + }, + { + "epoch": 0.4362336114421931, + "grad_norm": 0.3703123980920405, + "learning_rate": 6.0469264350627075e-06, + "loss": 1.9685, + "step": 366 + }, + { + "epoch": 0.4374255065554231, + "grad_norm": 0.3772080775482272, + "learning_rate": 6.0285020626076115e-06, + "loss": 1.9918, + "step": 367 + }, + { + "epoch": 0.43861740166865315, + "grad_norm": 0.36096021522163296, + "learning_rate": 6.010063096221215e-06, + "loss": 1.9857, + "step": 368 + }, + { + "epoch": 0.4398092967818832, + "grad_norm": 0.4027363280332516, + "learning_rate": 5.991609797543253e-06, + "loss": 1.9772, + "step": 369 + }, + { + "epoch": 0.4410011918951132, + "grad_norm": 0.36449407433194586, + "learning_rate": 5.973142428416829e-06, + "loss": 1.9926, + "step": 370 + }, + { + "epoch": 0.44219308700834326, + "grad_norm": 0.41922022657177943, + "learning_rate": 5.954661250884704e-06, + "loss": 1.9851, + "step": 371 + }, + { + "epoch": 0.4433849821215733, + "grad_norm": 0.3957989777206615, + "learning_rate": 5.936166527185565e-06, + "loss": 1.9627, + "step": 372 + }, + { + "epoch": 0.4445768772348033, + "grad_norm": 0.39452398707557135, + "learning_rate": 5.91765851975032e-06, + "loss": 1.9876, + "step": 373 + }, + { + "epoch": 0.44576877234803336, + "grad_norm": 0.39493419711592515, + "learning_rate": 5.899137491198364e-06, + "loss": 1.9686, + "step": 374 + }, + { + "epoch": 0.4469606674612634, + "grad_norm": 0.4099934527801523, + "learning_rate": 5.880603704333851e-06, + "loss": 1.9534, + "step": 375 + }, + { + "epoch": 0.44815256257449343, + "grad_norm": 0.36964455654061956, + "learning_rate": 5.862057422141979e-06, + "loss": 1.9523, + "step": 376 + }, + { + "epoch": 0.44934445768772346, + "grad_norm": 0.3902869598970143, + "learning_rate": 5.843498907785236e-06, + "loss": 1.9554, + "step": 377 + }, + { + "epoch": 0.4505363528009535, + "grad_norm": 0.3969483119716555, + "learning_rate": 
5.8249284245996905e-06, + "loss": 1.9907, + "step": 378 + }, + { + "epoch": 0.45172824791418353, + "grad_norm": 0.3960234150743317, + "learning_rate": 5.806346236091232e-06, + "loss": 1.9906, + "step": 379 + }, + { + "epoch": 0.45292014302741357, + "grad_norm": 0.3810498242078963, + "learning_rate": 5.78775260593185e-06, + "loss": 1.9612, + "step": 380 + }, + { + "epoch": 0.4541120381406436, + "grad_norm": 0.385855393557767, + "learning_rate": 5.769147797955882e-06, + "loss": 1.9736, + "step": 381 + }, + { + "epoch": 0.45530393325387364, + "grad_norm": 0.34406815893035153, + "learning_rate": 5.7505320761562735e-06, + "loss": 1.9864, + "step": 382 + }, + { + "epoch": 0.4564958283671037, + "grad_norm": 0.37884788683749326, + "learning_rate": 5.731905704680834e-06, + "loss": 1.9878, + "step": 383 + }, + { + "epoch": 0.4576877234803337, + "grad_norm": 0.38229911057814764, + "learning_rate": 5.713268947828484e-06, + "loss": 1.9677, + "step": 384 + }, + { + "epoch": 0.45887961859356374, + "grad_norm": 0.3930195565597414, + "learning_rate": 5.694622070045507e-06, + "loss": 1.9831, + "step": 385 + }, + { + "epoch": 0.4600715137067938, + "grad_norm": 0.35771899505040233, + "learning_rate": 5.6759653359218e-06, + "loss": 1.938, + "step": 386 + }, + { + "epoch": 0.4612634088200238, + "grad_norm": 0.3844248408562967, + "learning_rate": 5.657299010187116e-06, + "loss": 1.983, + "step": 387 + }, + { + "epoch": 0.46245530393325385, + "grad_norm": 0.374339760496431, + "learning_rate": 5.638623357707304e-06, + "loss": 1.9696, + "step": 388 + }, + { + "epoch": 0.4636471990464839, + "grad_norm": 0.4187861158867821, + "learning_rate": 5.6199386434805615e-06, + "loss": 1.9678, + "step": 389 + }, + { + "epoch": 0.464839094159714, + "grad_norm": 0.37470925657624427, + "learning_rate": 5.601245132633662e-06, + "loss": 1.9708, + "step": 390 + }, + { + "epoch": 0.466030989272944, + "grad_norm": 0.43682382668647773, + "learning_rate": 5.582543090418203e-06, + "loss": 1.9742, + "step": 391 + }, + { + "epoch": 0.46722288438617404, + "grad_norm": 0.38062716223853055, + "learning_rate": 5.563832782206835e-06, + "loss": 1.956, + "step": 392 + }, + { + "epoch": 0.4684147794994041, + "grad_norm": 0.39166492023793653, + "learning_rate": 5.5451144734895e-06, + "loss": 1.9479, + "step": 393 + }, + { + "epoch": 0.4696066746126341, + "grad_norm": 0.45740493772589974, + "learning_rate": 5.526388429869663e-06, + "loss": 1.9757, + "step": 394 + }, + { + "epoch": 0.47079856972586415, + "grad_norm": 0.3532441760302746, + "learning_rate": 5.507654917060541e-06, + "loss": 1.9774, + "step": 395 + }, + { + "epoch": 0.4719904648390942, + "grad_norm": 0.4162677343329253, + "learning_rate": 5.48891420088134e-06, + "loss": 1.9837, + "step": 396 + }, + { + "epoch": 0.4731823599523242, + "grad_norm": 0.4634604848492295, + "learning_rate": 5.470166547253476e-06, + "loss": 1.9923, + "step": 397 + }, + { + "epoch": 0.47437425506555425, + "grad_norm": 0.4001952153469404, + "learning_rate": 5.451412222196801e-06, + "loss": 1.969, + "step": 398 + }, + { + "epoch": 0.4755661501787843, + "grad_norm": 0.4117431494583168, + "learning_rate": 5.432651491825837e-06, + "loss": 1.9609, + "step": 399 + }, + { + "epoch": 0.4767580452920143, + "grad_norm": 0.4367947660920832, + "learning_rate": 5.4138846223459895e-06, + "loss": 1.9621, + "step": 400 + }, + { + "epoch": 0.47794994040524436, + "grad_norm": 0.34963770890851276, + "learning_rate": 5.395111880049775e-06, + "loss": 1.9564, + "step": 401 + }, + { + "epoch": 0.4791418355184744, + "grad_norm": 
0.4080401962751008, + "learning_rate": 5.376333531313046e-06, + "loss": 1.9689, + "step": 402 + }, + { + "epoch": 0.4803337306317044, + "grad_norm": 0.39779512665663647, + "learning_rate": 5.3575498425912046e-06, + "loss": 1.9752, + "step": 403 + }, + { + "epoch": 0.48152562574493446, + "grad_norm": 0.3494078316294088, + "learning_rate": 5.338761080415425e-06, + "loss": 1.988, + "step": 404 + }, + { + "epoch": 0.4827175208581645, + "grad_norm": 0.38403810675465305, + "learning_rate": 5.319967511388871e-06, + "loss": 1.9849, + "step": 405 + }, + { + "epoch": 0.48390941597139453, + "grad_norm": 0.41925050485912146, + "learning_rate": 5.301169402182915e-06, + "loss": 1.9744, + "step": 406 + }, + { + "epoch": 0.48510131108462456, + "grad_norm": 0.3659050285550682, + "learning_rate": 5.28236701953335e-06, + "loss": 1.9594, + "step": 407 + }, + { + "epoch": 0.4862932061978546, + "grad_norm": 0.3779979519911562, + "learning_rate": 5.263560630236611e-06, + "loss": 1.969, + "step": 408 + }, + { + "epoch": 0.48748510131108463, + "grad_norm": 0.4051001024185403, + "learning_rate": 5.244750501145977e-06, + "loss": 1.9758, + "step": 409 + }, + { + "epoch": 0.48867699642431467, + "grad_norm": 0.3580954206397942, + "learning_rate": 5.225936899167803e-06, + "loss": 1.9712, + "step": 410 + }, + { + "epoch": 0.4898688915375447, + "grad_norm": 0.37492205319973293, + "learning_rate": 5.207120091257715e-06, + "loss": 1.9924, + "step": 411 + }, + { + "epoch": 0.49106078665077474, + "grad_norm": 0.3787755420296742, + "learning_rate": 5.188300344416834e-06, + "loss": 1.9607, + "step": 412 + }, + { + "epoch": 0.4922526817640048, + "grad_norm": 0.3594245434434773, + "learning_rate": 5.169477925687981e-06, + "loss": 1.9596, + "step": 413 + }, + { + "epoch": 0.4934445768772348, + "grad_norm": 0.4048509843155868, + "learning_rate": 5.15065310215189e-06, + "loss": 1.9811, + "step": 414 + }, + { + "epoch": 0.49463647199046484, + "grad_norm": 0.33930841548544644, + "learning_rate": 5.1318261409234185e-06, + "loss": 1.9785, + "step": 415 + }, + { + "epoch": 0.4958283671036949, + "grad_norm": 0.3971904008450457, + "learning_rate": 5.112997309147753e-06, + "loss": 1.9538, + "step": 416 + }, + { + "epoch": 0.4970202622169249, + "grad_norm": 0.4109703239083303, + "learning_rate": 5.094166873996632e-06, + "loss": 1.9442, + "step": 417 + }, + { + "epoch": 0.49821215733015495, + "grad_norm": 0.35849090963357355, + "learning_rate": 5.075335102664533e-06, + "loss": 1.9611, + "step": 418 + }, + { + "epoch": 0.499404052443385, + "grad_norm": 0.3315925723712266, + "learning_rate": 5.0565022623649e-06, + "loss": 1.9507, + "step": 419 + }, + { + "epoch": 0.5005959475566151, + "grad_norm": 0.40128345634186274, + "learning_rate": 5.037668620326343e-06, + "loss": 1.9965, + "step": 420 + }, + { + "epoch": 0.5017878426698451, + "grad_norm": 0.34631267401835186, + "learning_rate": 5.018834443788855e-06, + "loss": 1.9739, + "step": 421 + }, + { + "epoch": 0.5029797377830751, + "grad_norm": 0.37750605356600553, + "learning_rate": 5e-06, + "loss": 1.9577, + "step": 422 + }, + { + "epoch": 0.5041716328963052, + "grad_norm": 0.325413886379343, + "learning_rate": 4.9811655562111465e-06, + "loss": 1.964, + "step": 423 + }, + { + "epoch": 0.5053635280095352, + "grad_norm": 0.37792660484449137, + "learning_rate": 4.9623313796736575e-06, + "loss": 1.9834, + "step": 424 + }, + { + "epoch": 0.5065554231227652, + "grad_norm": 0.3212926587032829, + "learning_rate": 4.943497737635103e-06, + "loss": 1.9652, + "step": 425 + }, + { + "epoch": 
0.5077473182359953, + "grad_norm": 0.3666539973322088, + "learning_rate": 4.9246648973354704e-06, + "loss": 1.9898, + "step": 426 + }, + { + "epoch": 0.5089392133492253, + "grad_norm": 0.3470498382172804, + "learning_rate": 4.905833126003371e-06, + "loss": 1.986, + "step": 427 + }, + { + "epoch": 0.5101311084624554, + "grad_norm": 0.3509551861996659, + "learning_rate": 4.887002690852249e-06, + "loss": 1.9765, + "step": 428 + }, + { + "epoch": 0.5113230035756854, + "grad_norm": 0.33773403719361406, + "learning_rate": 4.868173859076585e-06, + "loss": 1.9514, + "step": 429 + }, + { + "epoch": 0.5125148986889154, + "grad_norm": 0.33839162767720193, + "learning_rate": 4.849346897848111e-06, + "loss": 1.9671, + "step": 430 + }, + { + "epoch": 0.5137067938021455, + "grad_norm": 0.34429335199030947, + "learning_rate": 4.830522074312019e-06, + "loss": 1.9739, + "step": 431 + }, + { + "epoch": 0.5148986889153755, + "grad_norm": 0.35288845889112397, + "learning_rate": 4.811699655583167e-06, + "loss": 1.9912, + "step": 432 + }, + { + "epoch": 0.5160905840286055, + "grad_norm": 0.3461629113067177, + "learning_rate": 4.792879908742285e-06, + "loss": 1.9484, + "step": 433 + }, + { + "epoch": 0.5172824791418356, + "grad_norm": 0.3196675261690019, + "learning_rate": 4.774063100832199e-06, + "loss": 1.9688, + "step": 434 + }, + { + "epoch": 0.5184743742550656, + "grad_norm": 0.3392521279527564, + "learning_rate": 4.755249498854024e-06, + "loss": 1.9506, + "step": 435 + }, + { + "epoch": 0.5196662693682956, + "grad_norm": 0.3457448380814436, + "learning_rate": 4.736439369763391e-06, + "loss": 1.9743, + "step": 436 + }, + { + "epoch": 0.5208581644815257, + "grad_norm": 0.33206346746993015, + "learning_rate": 4.717632980466652e-06, + "loss": 1.9593, + "step": 437 + }, + { + "epoch": 0.5220500595947557, + "grad_norm": 0.3528235654912419, + "learning_rate": 4.698830597817087e-06, + "loss": 1.9665, + "step": 438 + }, + { + "epoch": 0.5232419547079857, + "grad_norm": 0.3556856155018991, + "learning_rate": 4.680032488611131e-06, + "loss": 1.9799, + "step": 439 + }, + { + "epoch": 0.5244338498212158, + "grad_norm": 0.32848860913310046, + "learning_rate": 4.661238919584578e-06, + "loss": 1.9803, + "step": 440 + }, + { + "epoch": 0.5256257449344458, + "grad_norm": 0.32687019162828723, + "learning_rate": 4.642450157408798e-06, + "loss": 1.9428, + "step": 441 + }, + { + "epoch": 0.5268176400476758, + "grad_norm": 0.3453824430384208, + "learning_rate": 4.623666468686956e-06, + "loss": 1.9822, + "step": 442 + }, + { + "epoch": 0.5280095351609059, + "grad_norm": 0.37003751019358744, + "learning_rate": 4.6048881199502265e-06, + "loss": 1.9483, + "step": 443 + }, + { + "epoch": 0.5292014302741359, + "grad_norm": 0.4147946881041239, + "learning_rate": 4.586115377654014e-06, + "loss": 1.9617, + "step": 444 + }, + { + "epoch": 0.5303933253873659, + "grad_norm": 0.3574077732974426, + "learning_rate": 4.567348508174164e-06, + "loss": 1.9583, + "step": 445 + }, + { + "epoch": 0.531585220500596, + "grad_norm": 0.40825647248866936, + "learning_rate": 4.548587777803198e-06, + "loss": 1.9804, + "step": 446 + }, + { + "epoch": 0.532777115613826, + "grad_norm": 0.37613255907704796, + "learning_rate": 4.529833452746526e-06, + "loss": 1.9927, + "step": 447 + }, + { + "epoch": 0.533969010727056, + "grad_norm": 0.594095776694764, + "learning_rate": 4.5110857991186606e-06, + "loss": 1.9719, + "step": 448 + }, + { + "epoch": 0.5351609058402861, + "grad_norm": 0.3717370719647907, + "learning_rate": 4.49234508293946e-06, + "loss": 1.9593, + 
"step": 449 + }, + { + "epoch": 0.5363528009535161, + "grad_norm": 0.3648799549586229, + "learning_rate": 4.47361157013034e-06, + "loss": 1.967, + "step": 450 + }, + { + "epoch": 0.5375446960667462, + "grad_norm": 0.33995726389121855, + "learning_rate": 4.454885526510501e-06, + "loss": 1.9753, + "step": 451 + }, + { + "epoch": 0.5387365911799762, + "grad_norm": 0.39251871849389397, + "learning_rate": 4.436167217793167e-06, + "loss": 1.9818, + "step": 452 + }, + { + "epoch": 0.5399284862932062, + "grad_norm": 0.3199928030279707, + "learning_rate": 4.417456909581798e-06, + "loss": 1.9552, + "step": 453 + }, + { + "epoch": 0.5411203814064363, + "grad_norm": 0.3669027651321596, + "learning_rate": 4.398754867366339e-06, + "loss": 1.9775, + "step": 454 + }, + { + "epoch": 0.5423122765196663, + "grad_norm": 0.3436432164393003, + "learning_rate": 4.38006135651944e-06, + "loss": 1.9772, + "step": 455 + }, + { + "epoch": 0.5435041716328963, + "grad_norm": 0.361749523201955, + "learning_rate": 4.361376642292698e-06, + "loss": 1.9683, + "step": 456 + }, + { + "epoch": 0.5446960667461264, + "grad_norm": 0.39560797233498957, + "learning_rate": 4.3427009898128865e-06, + "loss": 1.9671, + "step": 457 + }, + { + "epoch": 0.5458879618593564, + "grad_norm": 0.3602620583029035, + "learning_rate": 4.3240346640782014e-06, + "loss": 1.9944, + "step": 458 + }, + { + "epoch": 0.5470798569725864, + "grad_norm": 0.36119141344790967, + "learning_rate": 4.305377929954495e-06, + "loss": 1.9761, + "step": 459 + }, + { + "epoch": 0.5482717520858165, + "grad_norm": 0.3678120966781157, + "learning_rate": 4.286731052171518e-06, + "loss": 1.958, + "step": 460 + }, + { + "epoch": 0.5494636471990465, + "grad_norm": 0.3617639507211402, + "learning_rate": 4.268094295319167e-06, + "loss": 1.9813, + "step": 461 + }, + { + "epoch": 0.5506555423122765, + "grad_norm": 0.3608243381659533, + "learning_rate": 4.249467923843728e-06, + "loss": 1.9641, + "step": 462 + }, + { + "epoch": 0.5518474374255066, + "grad_norm": 0.36278415417064125, + "learning_rate": 4.23085220204412e-06, + "loss": 1.9709, + "step": 463 + }, + { + "epoch": 0.5530393325387366, + "grad_norm": 0.3588218797888413, + "learning_rate": 4.212247394068151e-06, + "loss": 1.9626, + "step": 464 + }, + { + "epoch": 0.5542312276519666, + "grad_norm": 0.33036367069937955, + "learning_rate": 4.19365376390877e-06, + "loss": 1.9832, + "step": 465 + }, + { + "epoch": 0.5554231227651967, + "grad_norm": 0.4074815848531431, + "learning_rate": 4.175071575400311e-06, + "loss": 1.9776, + "step": 466 + }, + { + "epoch": 0.5566150178784267, + "grad_norm": 0.34881797295660344, + "learning_rate": 4.1565010922147644e-06, + "loss": 1.957, + "step": 467 + }, + { + "epoch": 0.5578069129916567, + "grad_norm": 0.37520529930498075, + "learning_rate": 4.137942577858023e-06, + "loss": 1.975, + "step": 468 + }, + { + "epoch": 0.5589988081048868, + "grad_norm": 0.3630248536367911, + "learning_rate": 4.11939629566615e-06, + "loss": 1.9608, + "step": 469 + }, + { + "epoch": 0.5601907032181168, + "grad_norm": 0.33352062979381114, + "learning_rate": 4.100862508801639e-06, + "loss": 1.968, + "step": 470 + }, + { + "epoch": 0.5613825983313468, + "grad_norm": 0.3586501055080032, + "learning_rate": 4.082341480249681e-06, + "loss": 1.951, + "step": 471 + }, + { + "epoch": 0.5625744934445769, + "grad_norm": 0.32010968502296533, + "learning_rate": 4.063833472814437e-06, + "loss": 1.9712, + "step": 472 + }, + { + "epoch": 0.5637663885578069, + "grad_norm": 0.35914935623294864, + "learning_rate": 
4.045338749115299e-06, + "loss": 1.9451, + "step": 473 + }, + { + "epoch": 0.564958283671037, + "grad_norm": 0.3215562828423304, + "learning_rate": 4.026857571583173e-06, + "loss": 1.9914, + "step": 474 + }, + { + "epoch": 0.566150178784267, + "grad_norm": 0.34368077323068136, + "learning_rate": 4.008390202456748e-06, + "loss": 1.9602, + "step": 475 + }, + { + "epoch": 0.567342073897497, + "grad_norm": 0.33832066930772653, + "learning_rate": 3.989936903778785e-06, + "loss": 1.9604, + "step": 476 + }, + { + "epoch": 0.5685339690107271, + "grad_norm": 0.3169050597259224, + "learning_rate": 3.971497937392388e-06, + "loss": 2.0011, + "step": 477 + }, + { + "epoch": 0.5697258641239571, + "grad_norm": 0.33977687443341886, + "learning_rate": 3.953073564937293e-06, + "loss": 1.9483, + "step": 478 + }, + { + "epoch": 0.5709177592371871, + "grad_norm": 0.31238016120669476, + "learning_rate": 3.934664047846157e-06, + "loss": 1.967, + "step": 479 + }, + { + "epoch": 0.5721096543504172, + "grad_norm": 0.3251899985092587, + "learning_rate": 3.916269647340843e-06, + "loss": 1.958, + "step": 480 + }, + { + "epoch": 0.5733015494636472, + "grad_norm": 0.34188581574139687, + "learning_rate": 3.897890624428721e-06, + "loss": 1.97, + "step": 481 + }, + { + "epoch": 0.5744934445768772, + "grad_norm": 0.3179020258722567, + "learning_rate": 3.879527239898962e-06, + "loss": 1.9713, + "step": 482 + }, + { + "epoch": 0.5756853396901073, + "grad_norm": 0.35414300160209977, + "learning_rate": 3.86117975431883e-06, + "loss": 1.9387, + "step": 483 + }, + { + "epoch": 0.5768772348033373, + "grad_norm": 0.296876137502102, + "learning_rate": 3.8428484280299975e-06, + "loss": 1.9918, + "step": 484 + }, + { + "epoch": 0.5780691299165673, + "grad_norm": 0.35059849213295274, + "learning_rate": 3.8245335211448404e-06, + "loss": 1.9622, + "step": 485 + }, + { + "epoch": 0.5792610250297974, + "grad_norm": 0.2899536086006706, + "learning_rate": 3.8062352935427526e-06, + "loss": 1.9727, + "step": 486 + }, + { + "epoch": 0.5804529201430274, + "grad_norm": 0.3244862339368592, + "learning_rate": 3.787954004866459e-06, + "loss": 1.9829, + "step": 487 + }, + { + "epoch": 0.5816448152562574, + "grad_norm": 0.31060144753736796, + "learning_rate": 3.769689914518326e-06, + "loss": 1.9743, + "step": 488 + }, + { + "epoch": 0.5828367103694875, + "grad_norm": 0.3081671121318371, + "learning_rate": 3.751443281656688e-06, + "loss": 1.9716, + "step": 489 + }, + { + "epoch": 0.5840286054827175, + "grad_norm": 0.28679657845355666, + "learning_rate": 3.733214365192162e-06, + "loss": 1.9836, + "step": 490 + }, + { + "epoch": 0.5852205005959475, + "grad_norm": 0.31077612486695794, + "learning_rate": 3.715003423783986e-06, + "loss": 1.9894, + "step": 491 + }, + { + "epoch": 0.5864123957091776, + "grad_norm": 0.2980657403471547, + "learning_rate": 3.696810715836332e-06, + "loss": 1.9712, + "step": 492 + }, + { + "epoch": 0.5876042908224076, + "grad_norm": 0.28507782391437864, + "learning_rate": 3.6786364994946543e-06, + "loss": 1.9652, + "step": 493 + }, + { + "epoch": 0.5887961859356377, + "grad_norm": 0.3076841882401857, + "learning_rate": 3.660481032642016e-06, + "loss": 1.9756, + "step": 494 + }, + { + "epoch": 0.5899880810488677, + "grad_norm": 0.2873213364073368, + "learning_rate": 3.6423445728954393e-06, + "loss": 1.9702, + "step": 495 + }, + { + "epoch": 0.5911799761620977, + "grad_norm": 0.30064962474416257, + "learning_rate": 3.6242273776022396e-06, + "loss": 1.9798, + "step": 496 + }, + { + "epoch": 0.5923718712753278, + "grad_norm": 
0.30016520129470653, + "learning_rate": 3.6061297038363853e-06, + "loss": 1.9708, + "step": 497 + }, + { + "epoch": 0.5935637663885578, + "grad_norm": 0.3186216715211957, + "learning_rate": 3.5880518083948377e-06, + "loss": 1.9786, + "step": 498 + }, + { + "epoch": 0.5947556615017878, + "grad_norm": 0.3093775837624005, + "learning_rate": 3.5699939477939183e-06, + "loss": 1.9585, + "step": 499 + }, + { + "epoch": 0.5959475566150179, + "grad_norm": 0.28193348662211454, + "learning_rate": 3.5519563782656642e-06, + "loss": 1.9738, + "step": 500 + }, + { + "epoch": 0.5971394517282479, + "grad_norm": 0.32328773490671, + "learning_rate": 3.533939355754188e-06, + "loss": 1.9619, + "step": 501 + }, + { + "epoch": 0.5983313468414779, + "grad_norm": 0.30291671495352485, + "learning_rate": 3.5159431359120545e-06, + "loss": 1.9651, + "step": 502 + }, + { + "epoch": 0.599523241954708, + "grad_norm": 0.3080909269221942, + "learning_rate": 3.497967974096647e-06, + "loss": 1.9783, + "step": 503 + }, + { + "epoch": 0.600715137067938, + "grad_norm": 0.32314557640507674, + "learning_rate": 3.4800141253665463e-06, + "loss": 1.9657, + "step": 504 + }, + { + "epoch": 0.601907032181168, + "grad_norm": 0.29346056048517033, + "learning_rate": 3.4620818444779126e-06, + "loss": 1.9787, + "step": 505 + }, + { + "epoch": 0.6030989272943981, + "grad_norm": 0.3110390571856809, + "learning_rate": 3.4441713858808684e-06, + "loss": 1.9414, + "step": 506 + }, + { + "epoch": 0.6042908224076281, + "grad_norm": 0.31467381689979457, + "learning_rate": 3.426283003715886e-06, + "loss": 1.9619, + "step": 507 + }, + { + "epoch": 0.6054827175208581, + "grad_norm": 0.2969133354888754, + "learning_rate": 3.4084169518101896e-06, + "loss": 1.9604, + "step": 508 + }, + { + "epoch": 0.6066746126340882, + "grad_norm": 0.3184238842438653, + "learning_rate": 3.3905734836741415e-06, + "loss": 1.953, + "step": 509 + }, + { + "epoch": 0.6078665077473182, + "grad_norm": 0.2969150683168432, + "learning_rate": 3.3727528524976583e-06, + "loss": 1.9664, + "step": 510 + }, + { + "epoch": 0.6090584028605482, + "grad_norm": 0.33154057267330567, + "learning_rate": 3.354955311146606e-06, + "loss": 1.9776, + "step": 511 + }, + { + "epoch": 0.6102502979737783, + "grad_norm": 0.30901718720421373, + "learning_rate": 3.3371811121592203e-06, + "loss": 1.9917, + "step": 512 + }, + { + "epoch": 0.6114421930870083, + "grad_norm": 0.3212832298222802, + "learning_rate": 3.3194305077425215e-06, + "loss": 1.9928, + "step": 513 + }, + { + "epoch": 0.6126340882002383, + "grad_norm": 0.34130767861666084, + "learning_rate": 3.3017037497687303e-06, + "loss": 1.9501, + "step": 514 + }, + { + "epoch": 0.6138259833134684, + "grad_norm": 0.2919077388333617, + "learning_rate": 3.2840010897717045e-06, + "loss": 1.9657, + "step": 515 + }, + { + "epoch": 0.6150178784266984, + "grad_norm": 0.3277066008449366, + "learning_rate": 3.2663227789433573e-06, + "loss": 1.9602, + "step": 516 + }, + { + "epoch": 0.6162097735399285, + "grad_norm": 0.2903404769911658, + "learning_rate": 3.2486690681301046e-06, + "loss": 1.959, + "step": 517 + }, + { + "epoch": 0.6174016686531585, + "grad_norm": 0.284277433828357, + "learning_rate": 3.2310402078292956e-06, + "loss": 1.9718, + "step": 518 + }, + { + "epoch": 0.6185935637663885, + "grad_norm": 0.3258141085919218, + "learning_rate": 3.2134364481856663e-06, + "loss": 1.9612, + "step": 519 + }, + { + "epoch": 0.6197854588796186, + "grad_norm": 0.285408156114209, + "learning_rate": 3.1958580389877876e-06, + "loss": 1.9747, + "step": 520 + }, + { + 
"epoch": 0.6209773539928486, + "grad_norm": 0.3071499624906975, + "learning_rate": 3.178305229664519e-06, + "loss": 1.9781, + "step": 521 + }, + { + "epoch": 0.6221692491060786, + "grad_norm": 0.29430716274498264, + "learning_rate": 3.1607782692814683e-06, + "loss": 1.9785, + "step": 522 + }, + { + "epoch": 0.6233611442193087, + "grad_norm": 0.29446694445491767, + "learning_rate": 3.1432774065374628e-06, + "loss": 1.9651, + "step": 523 + }, + { + "epoch": 0.6245530393325387, + "grad_norm": 0.2868927792141283, + "learning_rate": 3.125802889761016e-06, + "loss": 1.9604, + "step": 524 + }, + { + "epoch": 0.6257449344457687, + "grad_norm": 0.3075894856023552, + "learning_rate": 3.1083549669068048e-06, + "loss": 1.981, + "step": 525 + }, + { + "epoch": 0.6269368295589988, + "grad_norm": 0.30553317063832414, + "learning_rate": 3.090933885552155e-06, + "loss": 1.968, + "step": 526 + }, + { + "epoch": 0.6281287246722288, + "grad_norm": 0.2883247866247332, + "learning_rate": 3.073539892893519e-06, + "loss": 1.9647, + "step": 527 + }, + { + "epoch": 0.6293206197854588, + "grad_norm": 0.3093327452992941, + "learning_rate": 3.0561732357429797e-06, + "loss": 1.9691, + "step": 528 + }, + { + "epoch": 0.6305125148986889, + "grad_norm": 0.2944434342418357, + "learning_rate": 3.0388341605247385e-06, + "loss": 1.9756, + "step": 529 + }, + { + "epoch": 0.6317044100119189, + "grad_norm": 0.3231077122645434, + "learning_rate": 3.021522913271627e-06, + "loss": 1.9774, + "step": 530 + }, + { + "epoch": 0.6328963051251489, + "grad_norm": 0.2937937539093132, + "learning_rate": 3.0042397396216076e-06, + "loss": 1.9813, + "step": 531 + }, + { + "epoch": 0.634088200238379, + "grad_norm": 0.33747028062165074, + "learning_rate": 2.9869848848142957e-06, + "loss": 1.9817, + "step": 532 + }, + { + "epoch": 0.635280095351609, + "grad_norm": 0.27860436170886715, + "learning_rate": 2.969758593687475e-06, + "loss": 1.995, + "step": 533 + }, + { + "epoch": 0.636471990464839, + "grad_norm": 0.2686660592261799, + "learning_rate": 2.952561110673623e-06, + "loss": 2.004, + "step": 534 + }, + { + "epoch": 0.6376638855780691, + "grad_norm": 0.3171126513844146, + "learning_rate": 2.9353926797964495e-06, + "loss": 1.9675, + "step": 535 + }, + { + "epoch": 0.6388557806912991, + "grad_norm": 0.26076405849359174, + "learning_rate": 2.9182535446674244e-06, + "loss": 1.9606, + "step": 536 + }, + { + "epoch": 0.6400476758045291, + "grad_norm": 0.311798441596794, + "learning_rate": 2.9011439484823287e-06, + "loss": 1.9566, + "step": 537 + }, + { + "epoch": 0.6412395709177592, + "grad_norm": 0.2667721525695941, + "learning_rate": 2.8840641340177955e-06, + "loss": 1.9571, + "step": 538 + }, + { + "epoch": 0.6424314660309892, + "grad_norm": 0.29165327528369395, + "learning_rate": 2.8670143436278757e-06, + "loss": 1.9648, + "step": 539 + }, + { + "epoch": 0.6436233611442194, + "grad_norm": 0.29487930858334793, + "learning_rate": 2.84999481924059e-06, + "loss": 1.9499, + "step": 540 + }, + { + "epoch": 0.6448152562574494, + "grad_norm": 0.31540084878211927, + "learning_rate": 2.8330058023545027e-06, + "loss": 1.9658, + "step": 541 + }, + { + "epoch": 0.6460071513706794, + "grad_norm": 0.2789685559518471, + "learning_rate": 2.8160475340352913e-06, + "loss": 1.9638, + "step": 542 + }, + { + "epoch": 0.6471990464839095, + "grad_norm": 0.28954283549505694, + "learning_rate": 2.799120254912321e-06, + "loss": 1.964, + "step": 543 + }, + { + "epoch": 0.6483909415971395, + "grad_norm": 0.29043220060176517, + "learning_rate": 2.7822242051752425e-06, + 
"loss": 1.9457, + "step": 544 + }, + { + "epoch": 0.6495828367103695, + "grad_norm": 0.268629176168656, + "learning_rate": 2.765359624570574e-06, + "loss": 1.9753, + "step": 545 + }, + { + "epoch": 0.6507747318235996, + "grad_norm": 0.29396871373699995, + "learning_rate": 2.7485267523983038e-06, + "loss": 1.9803, + "step": 546 + }, + { + "epoch": 0.6519666269368296, + "grad_norm": 0.2938578682137881, + "learning_rate": 2.731725827508494e-06, + "loss": 1.9559, + "step": 547 + }, + { + "epoch": 0.6531585220500596, + "grad_norm": 0.26444066496746194, + "learning_rate": 2.714957088297886e-06, + "loss": 1.9621, + "step": 548 + }, + { + "epoch": 0.6543504171632897, + "grad_norm": 0.2898176558803259, + "learning_rate": 2.6982207727065252e-06, + "loss": 1.9551, + "step": 549 + }, + { + "epoch": 0.6555423122765197, + "grad_norm": 0.3003676611598843, + "learning_rate": 2.681517118214389e-06, + "loss": 1.9841, + "step": 550 + }, + { + "epoch": 0.6567342073897497, + "grad_norm": 0.2592919375869367, + "learning_rate": 2.664846361837997e-06, + "loss": 1.976, + "step": 551 + }, + { + "epoch": 0.6579261025029798, + "grad_norm": 0.3266565084733632, + "learning_rate": 2.6482087401270705e-06, + "loss": 1.9564, + "step": 552 + }, + { + "epoch": 0.6591179976162098, + "grad_norm": 0.2995845038649281, + "learning_rate": 2.6316044891611633e-06, + "loss": 1.969, + "step": 553 + }, + { + "epoch": 0.6603098927294399, + "grad_norm": 0.2804027081600714, + "learning_rate": 2.6150338445463146e-06, + "loss": 1.9693, + "step": 554 + }, + { + "epoch": 0.6615017878426699, + "grad_norm": 0.27698419373196886, + "learning_rate": 2.5984970414117096e-06, + "loss": 1.9788, + "step": 555 + }, + { + "epoch": 0.6626936829558999, + "grad_norm": 0.31032114395815213, + "learning_rate": 2.5819943144063326e-06, + "loss": 1.9741, + "step": 556 + }, + { + "epoch": 0.66388557806913, + "grad_norm": 0.28800726045711933, + "learning_rate": 2.565525897695651e-06, + "loss": 1.9507, + "step": 557 + }, + { + "epoch": 0.66507747318236, + "grad_norm": 0.29802393651993614, + "learning_rate": 2.549092024958285e-06, + "loss": 1.9664, + "step": 558 + }, + { + "epoch": 0.66626936829559, + "grad_norm": 0.2982356345030979, + "learning_rate": 2.532692929382692e-06, + "loss": 1.9789, + "step": 559 + }, + { + "epoch": 0.6674612634088201, + "grad_norm": 0.2803035272437382, + "learning_rate": 2.51632884366386e-06, + "loss": 1.9609, + "step": 560 + }, + { + "epoch": 0.6686531585220501, + "grad_norm": 0.29369752020144174, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.9665, + "step": 561 + }, + { + "epoch": 0.6698450536352801, + "grad_norm": 0.2692763488935535, + "learning_rate": 2.4837066300892647e-06, + "loss": 1.9775, + "step": 562 + }, + { + "epoch": 0.6710369487485102, + "grad_norm": 0.2640671578025783, + "learning_rate": 2.4674489651264433e-06, + "loss": 1.9621, + "step": 563 + }, + { + "epoch": 0.6722288438617402, + "grad_norm": 0.2968222691817008, + "learning_rate": 2.4512272357996937e-06, + "loss": 1.956, + "step": 564 + }, + { + "epoch": 0.6734207389749702, + "grad_norm": 0.3011250889616646, + "learning_rate": 2.4350416722872657e-06, + "loss": 1.9775, + "step": 565 + }, + { + "epoch": 0.6746126340882003, + "grad_norm": 0.27706203721849776, + "learning_rate": 2.418892504254231e-06, + "loss": 1.9858, + "step": 566 + }, + { + "epoch": 0.6758045292014303, + "grad_norm": 0.2886529947325675, + "learning_rate": 2.402779960849232e-06, + "loss": 1.9778, + "step": 567 + }, + { + "epoch": 0.6769964243146603, + "grad_norm": 0.32555422289644703, + 
"learning_rate": 2.3867042707012234e-06, + "loss": 1.9652, + "step": 568 + }, + { + "epoch": 0.6781883194278904, + "grad_norm": 0.2728774574387877, + "learning_rate": 2.3706656619162278e-06, + "loss": 1.9556, + "step": 569 + }, + { + "epoch": 0.6793802145411204, + "grad_norm": 0.29791540079606743, + "learning_rate": 2.3546643620741054e-06, + "loss": 1.9665, + "step": 570 + }, + { + "epoch": 0.6805721096543504, + "grad_norm": 0.28429090975445814, + "learning_rate": 2.3387005982253218e-06, + "loss": 1.9947, + "step": 571 + }, + { + "epoch": 0.6817640047675805, + "grad_norm": 0.2933689275167632, + "learning_rate": 2.322774596887726e-06, + "loss": 1.9811, + "step": 572 + }, + { + "epoch": 0.6829558998808105, + "grad_norm": 0.27022852014602733, + "learning_rate": 2.3068865840433286e-06, + "loss": 1.9643, + "step": 573 + }, + { + "epoch": 0.6841477949940405, + "grad_norm": 0.27566403732559813, + "learning_rate": 2.29103678513511e-06, + "loss": 1.9494, + "step": 574 + }, + { + "epoch": 0.6853396901072706, + "grad_norm": 0.2813649305654506, + "learning_rate": 2.275225425063813e-06, + "loss": 1.9596, + "step": 575 + }, + { + "epoch": 0.6865315852205006, + "grad_norm": 0.28703149945139833, + "learning_rate": 2.259452728184749e-06, + "loss": 1.9674, + "step": 576 + }, + { + "epoch": 0.6877234803337307, + "grad_norm": 0.278836772705952, + "learning_rate": 2.2437189183046236e-06, + "loss": 1.9683, + "step": 577 + }, + { + "epoch": 0.6889153754469607, + "grad_norm": 0.3180141045052597, + "learning_rate": 2.2280242186783473e-06, + "loss": 1.9588, + "step": 578 + }, + { + "epoch": 0.6901072705601907, + "grad_norm": 0.2622104807864232, + "learning_rate": 2.21236885200588e-06, + "loss": 1.9587, + "step": 579 + }, + { + "epoch": 0.6912991656734208, + "grad_norm": 0.28789973870105057, + "learning_rate": 2.1967530404290702e-06, + "loss": 1.9827, + "step": 580 + }, + { + "epoch": 0.6924910607866508, + "grad_norm": 0.2821097592933177, + "learning_rate": 2.1811770055284968e-06, + "loss": 2.0036, + "step": 581 + }, + { + "epoch": 0.6936829558998808, + "grad_norm": 0.24946555260466954, + "learning_rate": 2.1656409683203216e-06, + "loss": 1.9897, + "step": 582 + }, + { + "epoch": 0.6948748510131109, + "grad_norm": 0.29219706494149983, + "learning_rate": 2.1501451492531664e-06, + "loss": 1.9703, + "step": 583 + }, + { + "epoch": 0.6960667461263409, + "grad_norm": 0.26827509295364377, + "learning_rate": 2.134689768204975e-06, + "loss": 1.9539, + "step": 584 + }, + { + "epoch": 0.6972586412395709, + "grad_norm": 0.30266646603465935, + "learning_rate": 2.1192750444798982e-06, + "loss": 1.986, + "step": 585 + }, + { + "epoch": 0.698450536352801, + "grad_norm": 0.2718431001798245, + "learning_rate": 2.103901196805173e-06, + "loss": 1.9738, + "step": 586 + }, + { + "epoch": 0.699642431466031, + "grad_norm": 0.2687110838757682, + "learning_rate": 2.0885684433280336e-06, + "loss": 1.9494, + "step": 587 + }, + { + "epoch": 0.700834326579261, + "grad_norm": 0.2776314528817648, + "learning_rate": 2.073277001612603e-06, + "loss": 1.9529, + "step": 588 + }, + { + "epoch": 0.7020262216924911, + "grad_norm": 0.25980607862615657, + "learning_rate": 2.058027088636814e-06, + "loss": 1.9529, + "step": 589 + }, + { + "epoch": 0.7032181168057211, + "grad_norm": 0.2801681412198667, + "learning_rate": 2.042818920789326e-06, + "loss": 1.9688, + "step": 590 + }, + { + "epoch": 0.7044100119189511, + "grad_norm": 0.2631013529820137, + "learning_rate": 2.0276527138664537e-06, + "loss": 1.9363, + "step": 591 + }, + { + "epoch": 
0.7056019070321812, + "grad_norm": 0.2528230435660016, + "learning_rate": 2.012528683069109e-06, + "loss": 1.9542, + "step": 592 + }, + { + "epoch": 0.7067938021454112, + "grad_norm": 0.2473972746312196, + "learning_rate": 1.9974470429997482e-06, + "loss": 1.9962, + "step": 593 + }, + { + "epoch": 0.7079856972586412, + "grad_norm": 0.284941379850682, + "learning_rate": 1.98240800765932e-06, + "loss": 1.9447, + "step": 594 + }, + { + "epoch": 0.7091775923718713, + "grad_norm": 0.2621960635197473, + "learning_rate": 1.9674117904442364e-06, + "loss": 1.9812, + "step": 595 + }, + { + "epoch": 0.7103694874851013, + "grad_norm": 0.24858361697066161, + "learning_rate": 1.9524586041433393e-06, + "loss": 1.9562, + "step": 596 + }, + { + "epoch": 0.7115613825983313, + "grad_norm": 0.2669834824927238, + "learning_rate": 1.9375486609348842e-06, + "loss": 1.987, + "step": 597 + }, + { + "epoch": 0.7127532777115614, + "grad_norm": 0.26234172310570103, + "learning_rate": 1.9226821723835322e-06, + "loss": 1.9735, + "step": 598 + }, + { + "epoch": 0.7139451728247914, + "grad_norm": 0.25384961760334385, + "learning_rate": 1.907859349437336e-06, + "loss": 1.9831, + "step": 599 + }, + { + "epoch": 0.7151370679380215, + "grad_norm": 0.3104750369664491, + "learning_rate": 1.8930804024247635e-06, + "loss": 1.9714, + "step": 600 + }, + { + "epoch": 0.7163289630512515, + "grad_norm": 0.2458078645357097, + "learning_rate": 1.8783455410517004e-06, + "loss": 1.9468, + "step": 601 + }, + { + "epoch": 0.7175208581644815, + "grad_norm": 0.26529680805920836, + "learning_rate": 1.8636549743984815e-06, + "loss": 1.9593, + "step": 602 + }, + { + "epoch": 0.7187127532777116, + "grad_norm": 0.25080419801242315, + "learning_rate": 1.8490089109169218e-06, + "loss": 1.9808, + "step": 603 + }, + { + "epoch": 0.7199046483909416, + "grad_norm": 0.26413238202627376, + "learning_rate": 1.8344075584273547e-06, + "loss": 1.9487, + "step": 604 + }, + { + "epoch": 0.7210965435041716, + "grad_norm": 0.2674448281901473, + "learning_rate": 1.8198511241156902e-06, + "loss": 1.9598, + "step": 605 + }, + { + "epoch": 0.7222884386174017, + "grad_norm": 0.24083245686353985, + "learning_rate": 1.8053398145304723e-06, + "loss": 1.9662, + "step": 606 + }, + { + "epoch": 0.7234803337306317, + "grad_norm": 0.25961756440068884, + "learning_rate": 1.7908738355799454e-06, + "loss": 1.9868, + "step": 607 + }, + { + "epoch": 0.7246722288438617, + "grad_norm": 0.2784591415570306, + "learning_rate": 1.776453392529139e-06, + "loss": 1.9473, + "step": 608 + }, + { + "epoch": 0.7258641239570918, + "grad_norm": 0.23968494857480035, + "learning_rate": 1.7620786899969412e-06, + "loss": 1.9716, + "step": 609 + }, + { + "epoch": 0.7270560190703218, + "grad_norm": 0.23937998852690856, + "learning_rate": 1.747749931953217e-06, + "loss": 1.9635, + "step": 610 + }, + { + "epoch": 0.7282479141835518, + "grad_norm": 0.259732006086446, + "learning_rate": 1.7334673217158976e-06, + "loss": 1.9616, + "step": 611 + }, + { + "epoch": 0.7294398092967819, + "grad_norm": 0.25239102464142604, + "learning_rate": 1.719231061948094e-06, + "loss": 1.9656, + "step": 612 + }, + { + "epoch": 0.7306317044100119, + "grad_norm": 0.2550463812437055, + "learning_rate": 1.7050413546552347e-06, + "loss": 1.9784, + "step": 613 + }, + { + "epoch": 0.7318235995232419, + "grad_norm": 0.2535210200301375, + "learning_rate": 1.6908984011821883e-06, + "loss": 1.9847, + "step": 614 + }, + { + "epoch": 0.733015494636472, + "grad_norm": 0.24932432687921058, + "learning_rate": 1.6768024022104106e-06, + 
"loss": 1.972, + "step": 615 + }, + { + "epoch": 0.734207389749702, + "grad_norm": 0.2644613269238538, + "learning_rate": 1.6627535577550996e-06, + "loss": 1.9716, + "step": 616 + }, + { + "epoch": 0.735399284862932, + "grad_norm": 0.3944302146845491, + "learning_rate": 1.6487520671623469e-06, + "loss": 1.9595, + "step": 617 + }, + { + "epoch": 0.7365911799761621, + "grad_norm": 0.244722231687242, + "learning_rate": 1.6347981291063224e-06, + "loss": 1.9688, + "step": 618 + }, + { + "epoch": 0.7377830750893921, + "grad_norm": 0.2504826371525299, + "learning_rate": 1.6208919415864476e-06, + "loss": 1.9721, + "step": 619 + }, + { + "epoch": 0.7389749702026222, + "grad_norm": 0.2523790844757924, + "learning_rate": 1.6070337019245896e-06, + "loss": 1.9456, + "step": 620 + }, + { + "epoch": 0.7401668653158522, + "grad_norm": 0.26338811471433093, + "learning_rate": 1.5932236067622542e-06, + "loss": 1.9613, + "step": 621 + }, + { + "epoch": 0.7413587604290822, + "grad_norm": 0.25146034966929337, + "learning_rate": 1.5794618520578053e-06, + "loss": 1.981, + "step": 622 + }, + { + "epoch": 0.7425506555423123, + "grad_norm": 0.2478403982473681, + "learning_rate": 1.5657486330836786e-06, + "loss": 1.9263, + "step": 623 + }, + { + "epoch": 0.7437425506555423, + "grad_norm": 0.2536474779363047, + "learning_rate": 1.5520841444236118e-06, + "loss": 1.9789, + "step": 624 + }, + { + "epoch": 0.7449344457687723, + "grad_norm": 0.2615274746690614, + "learning_rate": 1.5384685799698839e-06, + "loss": 1.9783, + "step": 625 + }, + { + "epoch": 0.7461263408820024, + "grad_norm": 0.2679161856145564, + "learning_rate": 1.5249021329205638e-06, + "loss": 1.9513, + "step": 626 + }, + { + "epoch": 0.7473182359952324, + "grad_norm": 0.24553342227151687, + "learning_rate": 1.5113849957767685e-06, + "loss": 1.9711, + "step": 627 + }, + { + "epoch": 0.7485101311084624, + "grad_norm": 0.246019311870797, + "learning_rate": 1.4979173603399323e-06, + "loss": 1.9734, + "step": 628 + }, + { + "epoch": 0.7497020262216925, + "grad_norm": 0.25764970394173725, + "learning_rate": 1.4844994177090871e-06, + "loss": 1.9575, + "step": 629 + }, + { + "epoch": 0.7508939213349225, + "grad_norm": 0.2419520407437769, + "learning_rate": 1.4711313582781434e-06, + "loss": 1.9444, + "step": 630 + }, + { + "epoch": 0.7520858164481525, + "grad_norm": 0.2386706941133275, + "learning_rate": 1.4578133717331982e-06, + "loss": 1.9675, + "step": 631 + }, + { + "epoch": 0.7532777115613826, + "grad_norm": 0.251990632652635, + "learning_rate": 1.4445456470498392e-06, + "loss": 1.9571, + "step": 632 + }, + { + "epoch": 0.7544696066746126, + "grad_norm": 0.24481833940935246, + "learning_rate": 1.4313283724904632e-06, + "loss": 1.9538, + "step": 633 + }, + { + "epoch": 0.7556615017878426, + "grad_norm": 0.24576950539499237, + "learning_rate": 1.418161735601601e-06, + "loss": 1.9676, + "step": 634 + }, + { + "epoch": 0.7568533969010727, + "grad_norm": 0.24675237000023065, + "learning_rate": 1.4050459232112652e-06, + "loss": 1.9672, + "step": 635 + }, + { + "epoch": 0.7580452920143027, + "grad_norm": 0.2407161568341905, + "learning_rate": 1.3919811214262913e-06, + "loss": 1.9726, + "step": 636 + }, + { + "epoch": 0.7592371871275327, + "grad_norm": 0.23031407014507166, + "learning_rate": 1.378967515629701e-06, + "loss": 1.9768, + "step": 637 + }, + { + "epoch": 0.7604290822407628, + "grad_norm": 0.2345707206990765, + "learning_rate": 1.3660052904780707e-06, + "loss": 1.9517, + "step": 638 + }, + { + "epoch": 0.7616209773539928, + "grad_norm": 0.23677366971206826, 
+ "learning_rate": 1.353094629898909e-06, + "loss": 1.9654, + "step": 639 + }, + { + "epoch": 0.7628128724672228, + "grad_norm": 0.24749335727794808, + "learning_rate": 1.3402357170880514e-06, + "loss": 1.9752, + "step": 640 + }, + { + "epoch": 0.7640047675804529, + "grad_norm": 0.23017419897906063, + "learning_rate": 1.3274287345070564e-06, + "loss": 1.9538, + "step": 641 + }, + { + "epoch": 0.7651966626936829, + "grad_norm": 0.24400711432750527, + "learning_rate": 1.3146738638806217e-06, + "loss": 1.9571, + "step": 642 + }, + { + "epoch": 0.766388557806913, + "grad_norm": 0.2322768595933808, + "learning_rate": 1.3019712861939964e-06, + "loss": 1.967, + "step": 643 + }, + { + "epoch": 0.767580452920143, + "grad_norm": 0.2448647193354467, + "learning_rate": 1.2893211816904243e-06, + "loss": 1.9702, + "step": 644 + }, + { + "epoch": 0.768772348033373, + "grad_norm": 0.2264734125461794, + "learning_rate": 1.2767237298685787e-06, + "loss": 1.9708, + "step": 645 + }, + { + "epoch": 0.7699642431466031, + "grad_norm": 0.24280998510060245, + "learning_rate": 1.26417910948002e-06, + "loss": 2.0062, + "step": 646 + }, + { + "epoch": 0.7711561382598331, + "grad_norm": 0.2599649290379438, + "learning_rate": 1.2516874985266508e-06, + "loss": 1.9641, + "step": 647 + }, + { + "epoch": 0.7723480333730631, + "grad_norm": 0.23209096205716762, + "learning_rate": 1.239249074258203e-06, + "loss": 1.9844, + "step": 648 + }, + { + "epoch": 0.7735399284862932, + "grad_norm": 0.2366200983286952, + "learning_rate": 1.2268640131697129e-06, + "loss": 1.9591, + "step": 649 + }, + { + "epoch": 0.7747318235995232, + "grad_norm": 0.22549692632142865, + "learning_rate": 1.2145324909990202e-06, + "loss": 1.9638, + "step": 650 + }, + { + "epoch": 0.7759237187127532, + "grad_norm": 0.2201422471843865, + "learning_rate": 1.202254682724276e-06, + "loss": 1.96, + "step": 651 + }, + { + "epoch": 0.7771156138259833, + "grad_norm": 0.23804071076564637, + "learning_rate": 1.190030762561452e-06, + "loss": 1.9429, + "step": 652 + }, + { + "epoch": 0.7783075089392133, + "grad_norm": 0.23445786497651513, + "learning_rate": 1.1778609039618804e-06, + "loss": 1.9441, + "step": 653 + }, + { + "epoch": 0.7794994040524433, + "grad_norm": 0.23319783177552136, + "learning_rate": 1.1657452796097879e-06, + "loss": 1.9561, + "step": 654 + }, + { + "epoch": 0.7806912991656734, + "grad_norm": 0.21246102421189209, + "learning_rate": 1.1536840614198376e-06, + "loss": 1.9552, + "step": 655 + }, + { + "epoch": 0.7818831942789034, + "grad_norm": 0.21558582464035986, + "learning_rate": 1.1416774205347015e-06, + "loss": 1.9535, + "step": 656 + }, + { + "epoch": 0.7830750893921334, + "grad_norm": 0.2478855415089653, + "learning_rate": 1.1297255273226254e-06, + "loss": 1.9648, + "step": 657 + }, + { + "epoch": 0.7842669845053635, + "grad_norm": 0.24079598014625692, + "learning_rate": 1.117828551375013e-06, + "loss": 1.9517, + "step": 658 + }, + { + "epoch": 0.7854588796185935, + "grad_norm": 0.22483152992478453, + "learning_rate": 1.1059866615040205e-06, + "loss": 1.9615, + "step": 659 + }, + { + "epoch": 0.7866507747318237, + "grad_norm": 0.21611761849037114, + "learning_rate": 1.094200025740157e-06, + "loss": 1.9544, + "step": 660 + }, + { + "epoch": 0.7878426698450537, + "grad_norm": 0.22680299546251373, + "learning_rate": 1.0824688113299054e-06, + "loss": 1.9656, + "step": 661 + }, + { + "epoch": 0.7890345649582837, + "grad_norm": 0.22651384710874864, + "learning_rate": 1.0707931847333487e-06, + "loss": 1.952, + "step": 662 + }, + { + "epoch": 
0.7902264600715138, + "grad_norm": 0.22804104499330677, + "learning_rate": 1.0591733116218046e-06, + "loss": 1.9469, + "step": 663 + }, + { + "epoch": 0.7914183551847438, + "grad_norm": 0.23170987494579412, + "learning_rate": 1.0476093568754776e-06, + "loss": 1.9743, + "step": 664 + }, + { + "epoch": 0.7926102502979738, + "grad_norm": 0.22978004850491673, + "learning_rate": 1.036101484581117e-06, + "loss": 1.9595, + "step": 665 + }, + { + "epoch": 0.7938021454112039, + "grad_norm": 0.21260865957457795, + "learning_rate": 1.0246498580296903e-06, + "loss": 1.9656, + "step": 666 + }, + { + "epoch": 0.7949940405244339, + "grad_norm": 0.22425557844267943, + "learning_rate": 1.0132546397140687e-06, + "loss": 1.9755, + "step": 667 + }, + { + "epoch": 0.7961859356376639, + "grad_norm": 0.2266231438335908, + "learning_rate": 1.0019159913267156e-06, + "loss": 1.9871, + "step": 668 + }, + { + "epoch": 0.797377830750894, + "grad_norm": 0.21739761610592676, + "learning_rate": 9.90634073757397e-07, + "loss": 1.9599, + "step": 669 + }, + { + "epoch": 0.798569725864124, + "grad_norm": 0.22507089101888264, + "learning_rate": 9.794090470908962e-07, + "loss": 1.9703, + "step": 670 + }, + { + "epoch": 0.799761620977354, + "grad_norm": 0.2076814121868233, + "learning_rate": 9.68241070604743e-07, + "loss": 1.964, + "step": 671 + }, + { + "epoch": 0.8009535160905841, + "grad_norm": 0.23327916717788147, + "learning_rate": 9.571303027669548e-07, + "loss": 1.9825, + "step": 672 + }, + { + "epoch": 0.8021454112038141, + "grad_norm": 0.21841469332058575, + "learning_rate": 9.460769012337839e-07, + "loss": 1.9897, + "step": 673 + }, + { + "epoch": 0.8033373063170441, + "grad_norm": 0.22795437088618667, + "learning_rate": 9.350810228474855e-07, + "loss": 1.9548, + "step": 674 + }, + { + "epoch": 0.8045292014302742, + "grad_norm": 0.24461982798572574, + "learning_rate": 9.241428236340904e-07, + "loss": 1.971, + "step": 675 + }, + { + "epoch": 0.8057210965435042, + "grad_norm": 0.22693929127887172, + "learning_rate": 9.132624588011896e-07, + "loss": 1.9697, + "step": 676 + }, + { + "epoch": 0.8069129916567342, + "grad_norm": 0.22481042822198152, + "learning_rate": 9.024400827357344e-07, + "loss": 1.9729, + "step": 677 + }, + { + "epoch": 0.8081048867699643, + "grad_norm": 0.21859877558397856, + "learning_rate": 8.916758490018418e-07, + "loss": 1.9666, + "step": 678 + }, + { + "epoch": 0.8092967818831943, + "grad_norm": 0.2260921434511296, + "learning_rate": 8.809699103386204e-07, + "loss": 1.964, + "step": 679 + }, + { + "epoch": 0.8104886769964244, + "grad_norm": 0.20963128480459883, + "learning_rate": 8.703224186580012e-07, + "loss": 1.9969, + "step": 680 + }, + { + "epoch": 0.8116805721096544, + "grad_norm": 0.2204158197482051, + "learning_rate": 8.597335250425809e-07, + "loss": 1.9494, + "step": 681 + }, + { + "epoch": 0.8128724672228844, + "grad_norm": 0.22459531839550131, + "learning_rate": 8.492033797434762e-07, + "loss": 1.9473, + "step": 682 + }, + { + "epoch": 0.8140643623361145, + "grad_norm": 0.22674947382748567, + "learning_rate": 8.387321321781977e-07, + "loss": 1.9591, + "step": 683 + }, + { + "epoch": 0.8152562574493445, + "grad_norm": 0.23228573604473446, + "learning_rate": 8.283199309285284e-07, + "loss": 1.9622, + "step": 684 + }, + { + "epoch": 0.8164481525625745, + "grad_norm": 0.2287116546758065, + "learning_rate": 8.179669237384097e-07, + "loss": 1.971, + "step": 685 + }, + { + "epoch": 0.8176400476758046, + "grad_norm": 0.20888102799928682, + "learning_rate": 8.07673257511849e-07, + "loss": 
1.9647, + "step": 686 + }, + { + "epoch": 0.8188319427890346, + "grad_norm": 0.23285143313735843, + "learning_rate": 7.97439078310836e-07, + "loss": 1.9475, + "step": 687 + }, + { + "epoch": 0.8200238379022646, + "grad_norm": 0.2306317998265742, + "learning_rate": 7.872645313532701e-07, + "loss": 1.9843, + "step": 688 + }, + { + "epoch": 0.8212157330154947, + "grad_norm": 0.22262551838654496, + "learning_rate": 7.771497610108981e-07, + "loss": 1.9715, + "step": 689 + }, + { + "epoch": 0.8224076281287247, + "grad_norm": 0.23849053028826073, + "learning_rate": 7.670949108072673e-07, + "loss": 1.944, + "step": 690 + }, + { + "epoch": 0.8235995232419547, + "grad_norm": 0.21447305264782468, + "learning_rate": 7.57100123415685e-07, + "loss": 1.9642, + "step": 691 + }, + { + "epoch": 0.8247914183551848, + "grad_norm": 0.2531000448062807, + "learning_rate": 7.471655406572003e-07, + "loss": 1.9447, + "step": 692 + }, + { + "epoch": 0.8259833134684148, + "grad_norm": 0.23002093940013935, + "learning_rate": 7.372913034985879e-07, + "loss": 1.9441, + "step": 693 + }, + { + "epoch": 0.8271752085816448, + "grad_norm": 0.21804664562427592, + "learning_rate": 7.274775520503491e-07, + "loss": 1.9494, + "step": 694 + }, + { + "epoch": 0.8283671036948749, + "grad_norm": 0.2180719845670406, + "learning_rate": 7.177244255647209e-07, + "loss": 1.9612, + "step": 695 + }, + { + "epoch": 0.8295589988081049, + "grad_norm": 0.2292893667744583, + "learning_rate": 7.080320624337039e-07, + "loss": 1.9631, + "step": 696 + }, + { + "epoch": 0.8307508939213349, + "grad_norm": 0.22411915203502086, + "learning_rate": 6.984006001870974e-07, + "loss": 1.9558, + "step": 697 + }, + { + "epoch": 0.831942789034565, + "grad_norm": 0.22749753478121626, + "learning_rate": 6.888301754905469e-07, + "loss": 1.9498, + "step": 698 + }, + { + "epoch": 0.833134684147795, + "grad_norm": 0.22227813746152136, + "learning_rate": 6.79320924143605e-07, + "loss": 1.9746, + "step": 699 + }, + { + "epoch": 0.834326579261025, + "grad_norm": 0.21559253597914696, + "learning_rate": 6.698729810778065e-07, + "loss": 1.9528, + "step": 700 + }, + { + "epoch": 0.8355184743742551, + "grad_norm": 0.22780943536330842, + "learning_rate": 6.604864803547511e-07, + "loss": 1.9803, + "step": 701 + }, + { + "epoch": 0.8367103694874851, + "grad_norm": 0.21085095301925635, + "learning_rate": 6.51161555164203e-07, + "loss": 1.973, + "step": 702 + }, + { + "epoch": 0.8379022646007152, + "grad_norm": 0.212543861677965, + "learning_rate": 6.418983378221988e-07, + "loss": 1.9623, + "step": 703 + }, + { + "epoch": 0.8390941597139452, + "grad_norm": 0.2115457183313653, + "learning_rate": 6.326969597691724e-07, + "loss": 1.9817, + "step": 704 + }, + { + "epoch": 0.8402860548271752, + "grad_norm": 0.21617807812407117, + "learning_rate": 6.235575515680898e-07, + "loss": 1.968, + "step": 705 + }, + { + "epoch": 0.8414779499404053, + "grad_norm": 0.21587790930172882, + "learning_rate": 6.144802429025948e-07, + "loss": 1.9549, + "step": 706 + }, + { + "epoch": 0.8426698450536353, + "grad_norm": 0.21797830405681992, + "learning_rate": 6.054651625751717e-07, + "loss": 1.9833, + "step": 707 + }, + { + "epoch": 0.8438617401668653, + "grad_norm": 0.22284253238842683, + "learning_rate": 5.965124385053112e-07, + "loss": 1.9498, + "step": 708 + }, + { + "epoch": 0.8450536352800954, + "grad_norm": 0.20628741944346807, + "learning_rate": 5.876221977277042e-07, + "loss": 1.9382, + "step": 709 + }, + { + "epoch": 0.8462455303933254, + "grad_norm": 0.22293588358500385, + "learning_rate": 
5.787945663904332e-07, + "loss": 1.9773, + "step": 710 + }, + { + "epoch": 0.8474374255065554, + "grad_norm": 0.22508597630366683, + "learning_rate": 5.700296697531843e-07, + "loss": 1.9659, + "step": 711 + }, + { + "epoch": 0.8486293206197855, + "grad_norm": 0.22145576581778206, + "learning_rate": 5.613276321854699e-07, + "loss": 1.9536, + "step": 712 + }, + { + "epoch": 0.8498212157330155, + "grad_norm": 0.21697947016074837, + "learning_rate": 5.526885771648599e-07, + "loss": 1.9686, + "step": 713 + }, + { + "epoch": 0.8510131108462455, + "grad_norm": 0.2238437978274647, + "learning_rate": 5.441126272752395e-07, + "loss": 1.9654, + "step": 714 + }, + { + "epoch": 0.8522050059594756, + "grad_norm": 0.20552250633575228, + "learning_rate": 5.355999042050603e-07, + "loss": 1.9679, + "step": 715 + }, + { + "epoch": 0.8533969010727056, + "grad_norm": 0.20678965975151994, + "learning_rate": 5.271505287456153e-07, + "loss": 1.9695, + "step": 716 + }, + { + "epoch": 0.8545887961859356, + "grad_norm": 0.22026378225643617, + "learning_rate": 5.187646207893287e-07, + "loss": 1.9459, + "step": 717 + }, + { + "epoch": 0.8557806912991657, + "grad_norm": 0.21952615459392946, + "learning_rate": 5.104422993280522e-07, + "loss": 1.9583, + "step": 718 + }, + { + "epoch": 0.8569725864123957, + "grad_norm": 0.2103248912718566, + "learning_rate": 5.021836824513759e-07, + "loss": 1.9653, + "step": 719 + }, + { + "epoch": 0.8581644815256257, + "grad_norm": 0.21006195755848364, + "learning_rate": 4.939888873449567e-07, + "loss": 1.9688, + "step": 720 + }, + { + "epoch": 0.8593563766388558, + "grad_norm": 0.20402501881530985, + "learning_rate": 4.858580302888466e-07, + "loss": 1.9765, + "step": 721 + }, + { + "epoch": 0.8605482717520858, + "grad_norm": 0.20084862547322885, + "learning_rate": 4.777912266558532e-07, + "loss": 1.9761, + "step": 722 + }, + { + "epoch": 0.8617401668653158, + "grad_norm": 0.1988469838008945, + "learning_rate": 4.6978859090989703e-07, + "loss": 1.9694, + "step": 723 + }, + { + "epoch": 0.8629320619785459, + "grad_norm": 0.203864028540796, + "learning_rate": 4.618502366043881e-07, + "loss": 1.9775, + "step": 724 + }, + { + "epoch": 0.8641239570917759, + "grad_norm": 0.2145910427428764, + "learning_rate": 4.5397627638061604e-07, + "loss": 1.96, + "step": 725 + }, + { + "epoch": 0.865315852205006, + "grad_norm": 0.2050136901691872, + "learning_rate": 4.4616682196614636e-07, + "loss": 1.9623, + "step": 726 + }, + { + "epoch": 0.866507747318236, + "grad_norm": 0.20872576917324093, + "learning_rate": 4.3842198417324346e-07, + "loss": 1.9554, + "step": 727 + }, + { + "epoch": 0.867699642431466, + "grad_norm": 0.20606498295864895, + "learning_rate": 4.307418728972934e-07, + "loss": 1.9572, + "step": 728 + }, + { + "epoch": 0.8688915375446961, + "grad_norm": 0.20387463599521696, + "learning_rate": 4.2312659711524486e-07, + "loss": 1.9873, + "step": 729 + }, + { + "epoch": 0.8700834326579261, + "grad_norm": 0.20613519314048598, + "learning_rate": 4.1557626488406223e-07, + "loss": 1.9745, + "step": 730 + }, + { + "epoch": 0.8712753277711561, + "grad_norm": 0.2119361724107287, + "learning_rate": 4.080909833391944e-07, + "loss": 1.956, + "step": 731 + }, + { + "epoch": 0.8724672228843862, + "grad_norm": 0.21723399868985763, + "learning_rate": 4.0067085869305357e-07, + "loss": 1.9787, + "step": 732 + }, + { + "epoch": 0.8736591179976162, + "grad_norm": 0.206994355395634, + "learning_rate": 3.9331599623350815e-07, + "loss": 1.9593, + "step": 733 + }, + { + "epoch": 0.8748510131108462, + "grad_norm": 
0.20592726537984876, + "learning_rate": 3.8602650032238675e-07, + "loss": 1.9687, + "step": 734 + }, + { + "epoch": 0.8760429082240763, + "grad_norm": 0.19758730236891384, + "learning_rate": 3.788024743940016e-07, + "loss": 1.9957, + "step": 735 + }, + { + "epoch": 0.8772348033373063, + "grad_norm": 0.20119012937681818, + "learning_rate": 3.71644020953677e-07, + "loss": 1.9908, + "step": 736 + }, + { + "epoch": 0.8784266984505363, + "grad_norm": 0.1987555097318407, + "learning_rate": 3.6455124157629805e-07, + "loss": 1.963, + "step": 737 + }, + { + "epoch": 0.8796185935637664, + "grad_norm": 0.20693505027292836, + "learning_rate": 3.575242369048665e-07, + "loss": 1.956, + "step": 738 + }, + { + "epoch": 0.8808104886769964, + "grad_norm": 0.20983712357706624, + "learning_rate": 3.505631066490728e-07, + "loss": 1.9719, + "step": 739 + }, + { + "epoch": 0.8820023837902264, + "grad_norm": 0.20291800945532407, + "learning_rate": 3.436679495838835e-07, + "loss": 1.9658, + "step": 740 + }, + { + "epoch": 0.8831942789034565, + "grad_norm": 0.20400470569172324, + "learning_rate": 3.3683886354813953e-07, + "loss": 1.9785, + "step": 741 + }, + { + "epoch": 0.8843861740166865, + "grad_norm": 0.20085471728798332, + "learning_rate": 3.300759454431657e-07, + "loss": 1.9534, + "step": 742 + }, + { + "epoch": 0.8855780691299165, + "grad_norm": 0.20101578952892968, + "learning_rate": 3.233792912313943e-07, + "loss": 1.9637, + "step": 743 + }, + { + "epoch": 0.8867699642431466, + "grad_norm": 0.194386663867366, + "learning_rate": 3.1674899593501175e-07, + "loss": 1.9718, + "step": 744 + }, + { + "epoch": 0.8879618593563766, + "grad_norm": 0.2022754658332612, + "learning_rate": 3.101851536346007e-07, + "loss": 1.9493, + "step": 745 + }, + { + "epoch": 0.8891537544696066, + "grad_norm": 0.19855072055520911, + "learning_rate": 3.0368785746780925e-07, + "loss": 1.9845, + "step": 746 + }, + { + "epoch": 0.8903456495828367, + "grad_norm": 0.20555522738375503, + "learning_rate": 2.9725719962802936e-07, + "loss": 1.9562, + "step": 747 + }, + { + "epoch": 0.8915375446960667, + "grad_norm": 0.20020380210201041, + "learning_rate": 2.9089327136308855e-07, + "loss": 1.9423, + "step": 748 + }, + { + "epoch": 0.8927294398092968, + "grad_norm": 0.21841772751668617, + "learning_rate": 2.8459616297395464e-07, + "loss": 1.9513, + "step": 749 + }, + { + "epoch": 0.8939213349225268, + "grad_norm": 0.20236116487467856, + "learning_rate": 2.7836596381345613e-07, + "loss": 1.9567, + "step": 750 + }, + { + "epoch": 0.8951132300357568, + "grad_norm": 0.20135449349261023, + "learning_rate": 2.722027622850104e-07, + "loss": 1.9645, + "step": 751 + }, + { + "epoch": 0.8963051251489869, + "grad_norm": 0.18378339190654983, + "learning_rate": 2.6610664584137413e-07, + "loss": 1.9556, + "step": 752 + }, + { + "epoch": 0.8974970202622169, + "grad_norm": 0.19531080180392058, + "learning_rate": 2.600777009833982e-07, + "loss": 1.9651, + "step": 753 + }, + { + "epoch": 0.8986889153754469, + "grad_norm": 0.19913934223741267, + "learning_rate": 2.541160132588044e-07, + "loss": 1.9903, + "step": 754 + }, + { + "epoch": 0.899880810488677, + "grad_norm": 0.19578044282345453, + "learning_rate": 2.482216672609677e-07, + "loss": 1.9826, + "step": 755 + }, + { + "epoch": 0.901072705601907, + "grad_norm": 0.19669019170528293, + "learning_rate": 2.423947466277177e-07, + "loss": 1.9608, + "step": 756 + }, + { + "epoch": 0.902264600715137, + "grad_norm": 0.20106014848036682, + "learning_rate": 2.3663533404015227e-07, + "loss": 1.9479, + "step": 757 + }, + { 
+ "epoch": 0.9034564958283671, + "grad_norm": 0.19075058248139964, + "learning_rate": 2.3094351122146307e-07, + "loss": 1.9461, + "step": 758 + }, + { + "epoch": 0.9046483909415971, + "grad_norm": 0.1936347340988058, + "learning_rate": 2.2531935893577827e-07, + "loss": 1.9786, + "step": 759 + }, + { + "epoch": 0.9058402860548271, + "grad_norm": 0.1953475101067191, + "learning_rate": 2.1976295698701245e-07, + "loss": 1.9602, + "step": 760 + }, + { + "epoch": 0.9070321811680572, + "grad_norm": 0.2018953126999259, + "learning_rate": 2.142743842177386e-07, + "loss": 1.9589, + "step": 761 + }, + { + "epoch": 0.9082240762812872, + "grad_norm": 0.2413615126875786, + "learning_rate": 2.0885371850806691e-07, + "loss": 1.9761, + "step": 762 + }, + { + "epoch": 0.9094159713945172, + "grad_norm": 0.20022969875884347, + "learning_rate": 2.0350103677454047e-07, + "loss": 1.9589, + "step": 763 + }, + { + "epoch": 0.9106078665077473, + "grad_norm": 0.19437922719148687, + "learning_rate": 1.98216414969043e-07, + "loss": 1.9522, + "step": 764 + }, + { + "epoch": 0.9117997616209773, + "grad_norm": 0.19907577309780014, + "learning_rate": 1.9299992807772173e-07, + "loss": 1.9416, + "step": 765 + }, + { + "epoch": 0.9129916567342073, + "grad_norm": 0.20237562054567065, + "learning_rate": 1.8785165011992513e-07, + "loss": 1.9472, + "step": 766 + }, + { + "epoch": 0.9141835518474374, + "grad_norm": 0.19811233452806024, + "learning_rate": 1.8277165414714858e-07, + "loss": 1.9539, + "step": 767 + }, + { + "epoch": 0.9153754469606674, + "grad_norm": 0.19216150911802557, + "learning_rate": 1.7776001224200257e-07, + "loss": 1.9735, + "step": 768 + }, + { + "epoch": 0.9165673420738975, + "grad_norm": 0.20840514563527793, + "learning_rate": 1.7281679551718445e-07, + "loss": 1.9809, + "step": 769 + }, + { + "epoch": 0.9177592371871275, + "grad_norm": 0.20320307604353186, + "learning_rate": 1.6794207411447548e-07, + "loss": 1.9701, + "step": 770 + }, + { + "epoch": 0.9189511323003575, + "grad_norm": 0.1931262061968698, + "learning_rate": 1.6313591720374057e-07, + "loss": 1.9379, + "step": 771 + }, + { + "epoch": 0.9201430274135876, + "grad_norm": 0.1960925515484337, + "learning_rate": 1.583983929819488e-07, + "loss": 1.9537, + "step": 772 + }, + { + "epoch": 0.9213349225268176, + "grad_norm": 0.20033984414838282, + "learning_rate": 1.5372956867220678e-07, + "loss": 1.9524, + "step": 773 + }, + { + "epoch": 0.9225268176400476, + "grad_norm": 0.1969245380458675, + "learning_rate": 1.49129510522803e-07, + "loss": 1.9909, + "step": 774 + }, + { + "epoch": 0.9237187127532777, + "grad_norm": 0.19242733300556847, + "learning_rate": 1.445982838062676e-07, + "loss": 1.9672, + "step": 775 + }, + { + "epoch": 0.9249106078665077, + "grad_norm": 0.19349859868942168, + "learning_rate": 1.4013595281844872e-07, + "loss": 1.9694, + "step": 776 + }, + { + "epoch": 0.9261025029797377, + "grad_norm": 0.19361152253228486, + "learning_rate": 1.357425808775964e-07, + "loss": 1.982, + "step": 777 + }, + { + "epoch": 0.9272943980929678, + "grad_norm": 0.20619875373877572, + "learning_rate": 1.3141823032346736e-07, + "loss": 1.9625, + "step": 778 + }, + { + "epoch": 0.9284862932061978, + "grad_norm": 0.19945778231248415, + "learning_rate": 1.2716296251644e-07, + "loss": 1.9819, + "step": 779 + }, + { + "epoch": 0.929678188319428, + "grad_norm": 0.19580818822309443, + "learning_rate": 1.2297683783664138e-07, + "loss": 1.971, + "step": 780 + }, + { + "epoch": 0.930870083432658, + "grad_norm": 0.1969258751508119, + "learning_rate": 
1.1885991568309385e-07, + "loss": 1.9684, + "step": 781 + }, + { + "epoch": 0.932061978545888, + "grad_norm": 0.19668726045406146, + "learning_rate": 1.1481225447286803e-07, + "loss": 1.9336, + "step": 782 + }, + { + "epoch": 0.933253873659118, + "grad_norm": 0.19002790593711985, + "learning_rate": 1.1083391164025903e-07, + "loss": 1.9776, + "step": 783 + }, + { + "epoch": 0.9344457687723481, + "grad_norm": 0.19965702113266218, + "learning_rate": 1.069249436359665e-07, + "loss": 1.982, + "step": 784 + }, + { + "epoch": 0.9356376638855781, + "grad_norm": 0.1916717569664899, + "learning_rate": 1.0308540592629756e-07, + "loss": 1.9611, + "step": 785 + }, + { + "epoch": 0.9368295589988082, + "grad_norm": 0.1958220628621678, + "learning_rate": 9.931535299237737e-08, + "loss": 1.9439, + "step": 786 + }, + { + "epoch": 0.9380214541120382, + "grad_norm": 0.19546126458914154, + "learning_rate": 9.561483832937535e-08, + "loss": 1.9596, + "step": 787 + }, + { + "epoch": 0.9392133492252682, + "grad_norm": 0.18627727476440192, + "learning_rate": 9.198391444575072e-08, + "loss": 1.977, + "step": 788 + }, + { + "epoch": 0.9404052443384983, + "grad_norm": 0.19214479226727124, + "learning_rate": 8.842263286250208e-08, + "loss": 1.9714, + "step": 789 + }, + { + "epoch": 0.9415971394517283, + "grad_norm": 0.19627451760011, + "learning_rate": 8.493104411243791e-08, + "loss": 1.9846, + "step": 790 + }, + { + "epoch": 0.9427890345649583, + "grad_norm": 0.189201378107075, + "learning_rate": 8.150919773946165e-08, + "loss": 1.9438, + "step": 791 + }, + { + "epoch": 0.9439809296781884, + "grad_norm": 0.1881449814121689, + "learning_rate": 7.81571422978672e-08, + "loss": 1.9758, + "step": 792 + }, + { + "epoch": 0.9451728247914184, + "grad_norm": 0.19461953430827816, + "learning_rate": 7.487492535164842e-08, + "loss": 1.9538, + "step": 793 + }, + { + "epoch": 0.9463647199046484, + "grad_norm": 0.19961269699233244, + "learning_rate": 7.166259347382854e-08, + "loss": 1.9861, + "step": 794 + }, + { + "epoch": 0.9475566150178785, + "grad_norm": 0.19603572773830497, + "learning_rate": 6.852019224579287e-08, + "loss": 1.954, + "step": 795 + }, + { + "epoch": 0.9487485101311085, + "grad_norm": 0.18552652214530319, + "learning_rate": 6.544776625664829e-08, + "loss": 1.9701, + "step": 796 + }, + { + "epoch": 0.9499404052443385, + "grad_norm": 0.18737364550182184, + "learning_rate": 6.244535910258697e-08, + "loss": 1.9507, + "step": 797 + }, + { + "epoch": 0.9511323003575686, + "grad_norm": 0.19128218729370722, + "learning_rate": 5.95130133862698e-08, + "loss": 1.963, + "step": 798 + }, + { + "epoch": 0.9523241954707986, + "grad_norm": 0.1857403939801107, + "learning_rate": 5.665077071621894e-08, + "loss": 1.9782, + "step": 799 + }, + { + "epoch": 0.9535160905840286, + "grad_norm": 0.18845875837578616, + "learning_rate": 5.3858671706230605e-08, + "loss": 1.9714, + "step": 800 + }, + { + "epoch": 0.9547079856972587, + "grad_norm": 0.191894557195297, + "learning_rate": 5.1136755974797724e-08, + "loss": 1.9802, + "step": 801 + }, + { + "epoch": 0.9558998808104887, + "grad_norm": 0.18884552490025192, + "learning_rate": 4.848506214454651e-08, + "loss": 1.9635, + "step": 802 + }, + { + "epoch": 0.9570917759237187, + "grad_norm": 0.1921418572611478, + "learning_rate": 4.590362784169022e-08, + "loss": 1.9863, + "step": 803 + }, + { + "epoch": 0.9582836710369488, + "grad_norm": 0.19545913000509813, + "learning_rate": 4.3392489695493475e-08, + "loss": 1.9582, + "step": 804 + }, + { + "epoch": 0.9594755661501788, + "grad_norm": 
0.18809267333949384, + "learning_rate": 4.0951683337754345e-08, + "loss": 1.9486, + "step": 805 + }, + { + "epoch": 0.9606674612634089, + "grad_norm": 0.1993552148845141, + "learning_rate": 3.858124340229863e-08, + "loss": 1.9596, + "step": 806 + }, + { + "epoch": 0.9618593563766389, + "grad_norm": 0.19546359586133671, + "learning_rate": 3.628120352448583e-08, + "loss": 1.9635, + "step": 807 + }, + { + "epoch": 0.9630512514898689, + "grad_norm": 0.1965379164207019, + "learning_rate": 3.405159634073452e-08, + "loss": 1.9586, + "step": 808 + }, + { + "epoch": 0.964243146603099, + "grad_norm": 0.18935857235084785, + "learning_rate": 3.1892453488058803e-08, + "loss": 1.9854, + "step": 809 + }, + { + "epoch": 0.965435041716329, + "grad_norm": 0.19406372777092382, + "learning_rate": 2.9803805603619283e-08, + "loss": 1.9588, + "step": 810 + }, + { + "epoch": 0.966626936829559, + "grad_norm": 0.19087575145791982, + "learning_rate": 2.77856823242878e-08, + "loss": 1.9681, + "step": 811 + }, + { + "epoch": 0.9678188319427891, + "grad_norm": 0.20093522828177285, + "learning_rate": 2.5838112286226123e-08, + "loss": 1.9667, + "step": 812 + }, + { + "epoch": 0.9690107270560191, + "grad_norm": 0.18798772341602374, + "learning_rate": 2.39611231244824e-08, + "loss": 1.9722, + "step": 813 + }, + { + "epoch": 0.9702026221692491, + "grad_norm": 0.1848757446131922, + "learning_rate": 2.2154741472596996e-08, + "loss": 1.9578, + "step": 814 + }, + { + "epoch": 0.9713945172824792, + "grad_norm": 0.18755577360898026, + "learning_rate": 2.0418992962224495e-08, + "loss": 1.963, + "step": 815 + }, + { + "epoch": 0.9725864123957092, + "grad_norm": 0.18908507808905262, + "learning_rate": 1.8753902222770627e-08, + "loss": 1.9986, + "step": 816 + }, + { + "epoch": 0.9737783075089392, + "grad_norm": 0.1919401118801061, + "learning_rate": 1.7159492881041462e-08, + "loss": 1.9351, + "step": 817 + }, + { + "epoch": 0.9749702026221693, + "grad_norm": 0.1877371294012426, + "learning_rate": 1.563578756091144e-08, + "loss": 1.9486, + "step": 818 + }, + { + "epoch": 0.9761620977353993, + "grad_norm": 0.1915342862821692, + "learning_rate": 1.4182807882999194e-08, + "loss": 1.9647, + "step": 819 + }, + { + "epoch": 0.9773539928486293, + "grad_norm": 0.18698005671466014, + "learning_rate": 1.2800574464361115e-08, + "loss": 1.9578, + "step": 820 + }, + { + "epoch": 0.9785458879618594, + "grad_norm": 0.1894762036895136, + "learning_rate": 1.1489106918200487e-08, + "loss": 1.9497, + "step": 821 + }, + { + "epoch": 0.9797377830750894, + "grad_norm": 0.19184360419844976, + "learning_rate": 1.0248423853587154e-08, + "loss": 1.9767, + "step": 822 + }, + { + "epoch": 0.9809296781883194, + "grad_norm": 0.19532092856528233, + "learning_rate": 9.07854287519494e-09, + "loss": 1.9623, + "step": 823 + }, + { + "epoch": 0.9821215733015495, + "grad_norm": 0.19624970994803084, + "learning_rate": 7.979480583052423e-09, + "loss": 1.961, + "step": 824 + }, + { + "epoch": 0.9833134684147795, + "grad_norm": 0.1845931499070557, + "learning_rate": 6.951252572304224e-09, + "loss": 1.983, + "step": 825 + }, + { + "epoch": 0.9845053635280095, + "grad_norm": 0.18794485352405654, + "learning_rate": 5.993873432993957e-09, + "loss": 1.9616, + "step": 826 + }, + { + "epoch": 0.9856972586412396, + "grad_norm": 0.19891751745555894, + "learning_rate": 5.107356749853298e-09, + "loss": 1.9535, + "step": 827 + }, + { + "epoch": 0.9868891537544696, + "grad_norm": 0.19040177607162215, + "learning_rate": 4.291715102112126e-09, + "loss": 1.9726, + "step": 828 + }, + { + 
"epoch": 0.9880810488676997, + "grad_norm": 0.22336004861620634, + "learning_rate": 3.546960063319227e-09, + "loss": 1.966, + "step": 829 + }, + { + "epoch": 0.9892729439809297, + "grad_norm": 0.18822951819678269, + "learning_rate": 2.8731022011757593e-09, + "loss": 1.9966, + "step": 830 + }, + { + "epoch": 0.9904648390941597, + "grad_norm": 0.18927421286397889, + "learning_rate": 2.27015107739037e-09, + "loss": 1.9726, + "step": 831 + }, + { + "epoch": 0.9916567342073898, + "grad_norm": 0.19221873853139876, + "learning_rate": 1.7381152475376416e-09, + "loss": 1.9832, + "step": 832 + }, + { + "epoch": 0.9928486293206198, + "grad_norm": 0.1904281270223511, + "learning_rate": 1.2770022609409628e-09, + "loss": 1.9563, + "step": 833 + }, + { + "epoch": 0.9940405244338498, + "grad_norm": 0.18797717350418608, + "learning_rate": 8.868186605631712e-10, + "loss": 1.9507, + "step": 834 + }, + { + "epoch": 0.9952324195470799, + "grad_norm": 0.18924896153536938, + "learning_rate": 5.675699829160719e-10, + "loss": 1.9705, + "step": 835 + }, + { + "epoch": 0.9964243146603099, + "grad_norm": 0.19596674067827927, + "learning_rate": 3.1926075797827914e-10, + "loss": 1.9888, + "step": 836 + }, + { + "epoch": 0.9976162097735399, + "grad_norm": 0.1862037805845138, + "learning_rate": 1.4189450913415505e-10, + "loss": 1.9437, + "step": 837 + }, + { + "epoch": 0.99880810488677, + "grad_norm": 0.19347983202580893, + "learning_rate": 3.547375312218382e-11, + "loss": 1.9667, + "step": 838 + }, + { + "epoch": 1.0, + "grad_norm": 0.20375226613820588, + "learning_rate": 0.0, + "loss": 1.9468, + "step": 839 + }, + { + "epoch": 1.0, + "step": 839, + "total_flos": 1802102510714880.0, + "train_loss": 1.9892315510771414, + "train_runtime": 26830.5336, + "train_samples_per_second": 58.025, + "train_steps_per_second": 0.031 + } + ], + "logging_steps": 1, + "max_steps": 839, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1802102510714880.0, + "train_batch_size": 58, + "trial_name": null, + "trial_params": null +}