{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 839, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011918951132300357, "grad_norm": 4.142117453158603, "learning_rate": 2.0000000000000003e-06, "loss": 2.2717, "step": 1 }, { "epoch": 0.0023837902264600714, "grad_norm": 4.146443379436291, "learning_rate": 4.000000000000001e-06, "loss": 2.2673, "step": 2 }, { "epoch": 0.003575685339690107, "grad_norm": 4.284702076234231, "learning_rate": 6e-06, "loss": 2.2178, "step": 3 }, { "epoch": 0.004767580452920143, "grad_norm": 4.104853059576584, "learning_rate": 8.000000000000001e-06, "loss": 2.2543, "step": 4 }, { "epoch": 0.0059594755661501785, "grad_norm": 3.213179411663954, "learning_rate": 1e-05, "loss": 2.2236, "step": 5 }, { "epoch": 0.007151370679380214, "grad_norm": 1.9110723795461666, "learning_rate": 9.99996452624688e-06, "loss": 2.2198, "step": 6 }, { "epoch": 0.00834326579261025, "grad_norm": 1.7059526754812628, "learning_rate": 9.999858105490868e-06, "loss": 2.1753, "step": 7 }, { "epoch": 0.009535160905840286, "grad_norm": 3.5843950702527754, "learning_rate": 9.999680739242022e-06, "loss": 2.1457, "step": 8 }, { "epoch": 0.010727056019070322, "grad_norm": 3.0684912624884877, "learning_rate": 9.999432430017084e-06, "loss": 2.1636, "step": 9 }, { "epoch": 0.011918951132300357, "grad_norm": 2.7608335284827397, "learning_rate": 9.999113181339437e-06, "loss": 2.1402, "step": 10 }, { "epoch": 0.013110846245530394, "grad_norm": 2.2876952453740844, "learning_rate": 9.99872299773906e-06, "loss": 2.1377, "step": 11 }, { "epoch": 0.014302741358760428, "grad_norm": 2.189713354043771, "learning_rate": 9.998261884752463e-06, "loss": 2.1297, "step": 12 }, { "epoch": 0.015494636471990465, "grad_norm": 1.8497797869361814, "learning_rate": 9.99772984892261e-06, "loss": 2.1145, "step": 13 }, { "epoch": 0.0166865315852205, "grad_norm": 1.4328123407133309, "learning_rate": 9.997126897798826e-06, "loss": 2.1099, "step": 14 }, { "epoch": 0.017878426698450536, "grad_norm": 1.110872364131768, "learning_rate": 9.996453039936682e-06, "loss": 2.1171, "step": 15 }, { "epoch": 0.01907032181168057, "grad_norm": 0.8634715903240043, "learning_rate": 9.995708284897889e-06, "loss": 2.0905, "step": 16 }, { "epoch": 0.02026221692491061, "grad_norm": 0.7193294200124174, "learning_rate": 9.994892643250147e-06, "loss": 2.1082, "step": 17 }, { "epoch": 0.021454112038140644, "grad_norm": 0.6321366046233599, "learning_rate": 9.994006126567006e-06, "loss": 2.0998, "step": 18 }, { "epoch": 0.02264600715137068, "grad_norm": 0.5966553161988273, "learning_rate": 9.993048747427696e-06, "loss": 2.1107, "step": 19 }, { "epoch": 0.023837902264600714, "grad_norm": 0.5609275352376858, "learning_rate": 9.99202051941695e-06, "loss": 2.1192, "step": 20 }, { "epoch": 0.025029797377830752, "grad_norm": 0.5426385429041977, "learning_rate": 9.990921457124807e-06, "loss": 2.0809, "step": 21 }, { "epoch": 0.026221692491060787, "grad_norm": 0.5394723771045817, "learning_rate": 9.989751576146413e-06, "loss": 2.0889, "step": 22 }, { "epoch": 0.027413587604290822, "grad_norm": 0.5125643780082518, "learning_rate": 9.9885108930818e-06, "loss": 2.1025, "step": 23 }, { "epoch": 0.028605482717520857, "grad_norm": 0.49224477502997804, "learning_rate": 9.98719942553564e-06, "loss": 2.0945, "step": 24 }, { "epoch": 0.029797377830750895, "grad_norm": 0.45404804144183336, "learning_rate": 9.985817192117001e-06, "loss": 2.0985, 
"step": 25 }, { "epoch": 0.03098927294398093, "grad_norm": 0.4619837784227509, "learning_rate": 9.984364212439089e-06, "loss": 2.1044, "step": 26 }, { "epoch": 0.03218116805721097, "grad_norm": 0.43728216047255375, "learning_rate": 9.982840507118959e-06, "loss": 2.1109, "step": 27 }, { "epoch": 0.033373063170441, "grad_norm": 0.4106377846709857, "learning_rate": 9.98124609777723e-06, "loss": 2.1169, "step": 28 }, { "epoch": 0.03456495828367104, "grad_norm": 0.3844482158912619, "learning_rate": 9.979581007037776e-06, "loss": 2.084, "step": 29 }, { "epoch": 0.03575685339690107, "grad_norm": 0.3615382921612817, "learning_rate": 9.977845258527403e-06, "loss": 2.0578, "step": 30 }, { "epoch": 0.03694874851013111, "grad_norm": 0.3265425379192771, "learning_rate": 9.976038876875519e-06, "loss": 2.095, "step": 31 }, { "epoch": 0.03814064362336114, "grad_norm": 0.3249807533911439, "learning_rate": 9.974161887713775e-06, "loss": 2.1091, "step": 32 }, { "epoch": 0.03933253873659118, "grad_norm": 0.3258627805281391, "learning_rate": 9.972214317675713e-06, "loss": 2.0844, "step": 33 }, { "epoch": 0.04052443384982122, "grad_norm": 0.3115021940469844, "learning_rate": 9.970196194396383e-06, "loss": 2.1214, "step": 34 }, { "epoch": 0.041716328963051254, "grad_norm": 0.29359663265683655, "learning_rate": 9.968107546511942e-06, "loss": 2.1068, "step": 35 }, { "epoch": 0.04290822407628129, "grad_norm": 0.2914709352436855, "learning_rate": 9.965948403659267e-06, "loss": 2.1358, "step": 36 }, { "epoch": 0.04410011918951132, "grad_norm": 0.2801925528264536, "learning_rate": 9.963718796475516e-06, "loss": 2.0968, "step": 37 }, { "epoch": 0.04529201430274136, "grad_norm": 0.2777208098847368, "learning_rate": 9.961418756597703e-06, "loss": 2.1118, "step": 38 }, { "epoch": 0.04648390941597139, "grad_norm": 0.2627526369156367, "learning_rate": 9.959048316662246e-06, "loss": 2.1084, "step": 39 }, { "epoch": 0.04767580452920143, "grad_norm": 0.27582542868025006, "learning_rate": 9.956607510304508e-06, "loss": 2.0636, "step": 40 }, { "epoch": 0.04886769964243146, "grad_norm": 0.252514387355515, "learning_rate": 9.95409637215831e-06, "loss": 2.0842, "step": 41 }, { "epoch": 0.050059594755661505, "grad_norm": 0.270250047965773, "learning_rate": 9.951514937855455e-06, "loss": 2.0476, "step": 42 }, { "epoch": 0.05125148986889154, "grad_norm": 0.2626133865266297, "learning_rate": 9.948863244025202e-06, "loss": 2.0777, "step": 43 }, { "epoch": 0.052443384982121574, "grad_norm": 0.25699273367046915, "learning_rate": 9.94614132829377e-06, "loss": 2.0944, "step": 44 }, { "epoch": 0.05363528009535161, "grad_norm": 0.27340926527011333, "learning_rate": 9.943349229283781e-06, "loss": 2.0887, "step": 45 }, { "epoch": 0.054827175208581644, "grad_norm": 0.24251549192329058, "learning_rate": 9.94048698661373e-06, "loss": 2.1024, "step": 46 }, { "epoch": 0.05601907032181168, "grad_norm": 0.2575396666859324, "learning_rate": 9.937554640897414e-06, "loss": 2.083, "step": 47 }, { "epoch": 0.057210965435041714, "grad_norm": 0.23558811037647728, "learning_rate": 9.934552233743353e-06, "loss": 2.0819, "step": 48 }, { "epoch": 0.058402860548271755, "grad_norm": 0.24407061789389187, "learning_rate": 9.931479807754209e-06, "loss": 2.0793, "step": 49 }, { "epoch": 0.05959475566150179, "grad_norm": 0.23931691434033772, "learning_rate": 9.928337406526172e-06, "loss": 2.1159, "step": 50 }, { "epoch": 0.060786650774731825, "grad_norm": 0.23796062784470082, "learning_rate": 9.925125074648352e-06, "loss": 2.0824, "step": 51 }, { "epoch": 
0.06197854588796186, "grad_norm": 0.23466916513360747, "learning_rate": 9.921842857702132e-06, "loss": 2.0734, "step": 52 }, { "epoch": 0.0631704410011919, "grad_norm": 0.23455835316060827, "learning_rate": 9.918490802260538e-06, "loss": 2.073, "step": 53 }, { "epoch": 0.06436233611442194, "grad_norm": 0.2383847191126797, "learning_rate": 9.915068955887564e-06, "loss": 2.0621, "step": 54 }, { "epoch": 0.06555423122765197, "grad_norm": 0.23851986543002354, "learning_rate": 9.911577367137499e-06, "loss": 2.0672, "step": 55 }, { "epoch": 0.066746126340882, "grad_norm": 0.24283914455886954, "learning_rate": 9.90801608555425e-06, "loss": 2.0869, "step": 56 }, { "epoch": 0.06793802145411204, "grad_norm": 0.24281061248826022, "learning_rate": 9.904385161670626e-06, "loss": 2.0755, "step": 57 }, { "epoch": 0.06912991656734208, "grad_norm": 0.25365576984515253, "learning_rate": 9.900684647007624e-06, "loss": 2.097, "step": 58 }, { "epoch": 0.07032181168057211, "grad_norm": 0.23883096980549337, "learning_rate": 9.896914594073703e-06, "loss": 2.0865, "step": 59 }, { "epoch": 0.07151370679380215, "grad_norm": 0.25353821590332437, "learning_rate": 9.893075056364034e-06, "loss": 2.0597, "step": 60 }, { "epoch": 0.07270560190703218, "grad_norm": 0.25666970441346676, "learning_rate": 9.889166088359742e-06, "loss": 2.0685, "step": 61 }, { "epoch": 0.07389749702026222, "grad_norm": 0.25019996159388774, "learning_rate": 9.885187745527132e-06, "loss": 2.047, "step": 62 }, { "epoch": 0.07508939213349225, "grad_norm": 0.25291578137222365, "learning_rate": 9.881140084316907e-06, "loss": 2.0874, "step": 63 }, { "epoch": 0.07628128724672228, "grad_norm": 0.2636139470370503, "learning_rate": 9.87702316216336e-06, "loss": 2.0761, "step": 64 }, { "epoch": 0.07747318235995232, "grad_norm": 0.267225445079766, "learning_rate": 9.87283703748356e-06, "loss": 2.0632, "step": 65 }, { "epoch": 0.07866507747318235, "grad_norm": 0.26637209411345025, "learning_rate": 9.868581769676532e-06, "loss": 2.0465, "step": 66 }, { "epoch": 0.07985697258641239, "grad_norm": 0.28338796894764773, "learning_rate": 9.864257419122404e-06, "loss": 2.0543, "step": 67 }, { "epoch": 0.08104886769964244, "grad_norm": 0.26975885676108347, "learning_rate": 9.859864047181551e-06, "loss": 2.0612, "step": 68 }, { "epoch": 0.08224076281287247, "grad_norm": 0.2782261191514193, "learning_rate": 9.855401716193733e-06, "loss": 2.0466, "step": 69 }, { "epoch": 0.08343265792610251, "grad_norm": 0.3833695594063663, "learning_rate": 9.850870489477198e-06, "loss": 2.0592, "step": 70 }, { "epoch": 0.08462455303933254, "grad_norm": 0.32898931613326715, "learning_rate": 9.846270431327793e-06, "loss": 2.0498, "step": 71 }, { "epoch": 0.08581644815256258, "grad_norm": 0.3256809126198457, "learning_rate": 9.841601607018052e-06, "loss": 2.071, "step": 72 }, { "epoch": 0.08700834326579261, "grad_norm": 0.3451979168625637, "learning_rate": 9.83686408279626e-06, "loss": 2.0497, "step": 73 }, { "epoch": 0.08820023837902265, "grad_norm": 0.3114795827025238, "learning_rate": 9.832057925885526e-06, "loss": 2.034, "step": 74 }, { "epoch": 0.08939213349225268, "grad_norm": 0.33202022850009677, "learning_rate": 9.827183204482818e-06, "loss": 2.0324, "step": 75 }, { "epoch": 0.09058402860548272, "grad_norm": 0.36121743903908576, "learning_rate": 9.822239987757999e-06, "loss": 2.0491, "step": 76 }, { "epoch": 0.09177592371871275, "grad_norm": 0.33897099215895576, "learning_rate": 9.817228345852853e-06, "loss": 2.043, "step": 77 }, { "epoch": 0.09296781883194279, "grad_norm": 
0.37620997544394924, "learning_rate": 9.812148349880076e-06, "loss": 2.0364, "step": 78 }, { "epoch": 0.09415971394517282, "grad_norm": 0.33013400624562517, "learning_rate": 9.807000071922279e-06, "loss": 2.0375, "step": 79 }, { "epoch": 0.09535160905840286, "grad_norm": 0.3453355696988705, "learning_rate": 9.801783585030959e-06, "loss": 2.0357, "step": 80 }, { "epoch": 0.09654350417163289, "grad_norm": 0.36374461914208417, "learning_rate": 9.79649896322546e-06, "loss": 2.0358, "step": 81 }, { "epoch": 0.09773539928486293, "grad_norm": 0.3949186996890207, "learning_rate": 9.791146281491935e-06, "loss": 2.0576, "step": 82 }, { "epoch": 0.09892729439809297, "grad_norm": 0.5347002177369765, "learning_rate": 9.785725615782262e-06, "loss": 2.0536, "step": 83 }, { "epoch": 0.10011918951132301, "grad_norm": 0.7315743014064646, "learning_rate": 9.780237043012988e-06, "loss": 2.0382, "step": 84 }, { "epoch": 0.10131108462455304, "grad_norm": 0.8662501044705759, "learning_rate": 9.774680641064223e-06, "loss": 2.0254, "step": 85 }, { "epoch": 0.10250297973778308, "grad_norm": 1.0007184298369625, "learning_rate": 9.769056488778538e-06, "loss": 2.0507, "step": 86 }, { "epoch": 0.10369487485101311, "grad_norm": 0.6169288025962048, "learning_rate": 9.76336466595985e-06, "loss": 2.0605, "step": 87 }, { "epoch": 0.10488676996424315, "grad_norm": 0.4654550573627205, "learning_rate": 9.757605253372283e-06, "loss": 2.0064, "step": 88 }, { "epoch": 0.10607866507747318, "grad_norm": 0.7568695915967251, "learning_rate": 9.751778332739033e-06, "loss": 2.0206, "step": 89 }, { "epoch": 0.10727056019070322, "grad_norm": 0.7116290914175033, "learning_rate": 9.745883986741196e-06, "loss": 2.0276, "step": 90 }, { "epoch": 0.10846245530393325, "grad_norm": 0.4534995774251596, "learning_rate": 9.739922299016601e-06, "loss": 2.0372, "step": 91 }, { "epoch": 0.10965435041716329, "grad_norm": 0.5202368477526325, "learning_rate": 9.733893354158628e-06, "loss": 2.0281, "step": 92 }, { "epoch": 0.11084624553039332, "grad_norm": 0.6041387580680142, "learning_rate": 9.727797237714991e-06, "loss": 2.0148, "step": 93 }, { "epoch": 0.11203814064362336, "grad_norm": 0.45293705742449053, "learning_rate": 9.721634036186545e-06, "loss": 2.0175, "step": 94 }, { "epoch": 0.11323003575685339, "grad_norm": 0.4793331693139531, "learning_rate": 9.715403837026046e-06, "loss": 2.0328, "step": 95 }, { "epoch": 0.11442193087008343, "grad_norm": 0.5140021564481994, "learning_rate": 9.709106728636913e-06, "loss": 2.0143, "step": 96 }, { "epoch": 0.11561382598331346, "grad_norm": 0.5507096777112976, "learning_rate": 9.702742800371972e-06, "loss": 2.0451, "step": 97 }, { "epoch": 0.11680572109654351, "grad_norm": 0.48239551212919374, "learning_rate": 9.69631214253219e-06, "loss": 2.0241, "step": 98 }, { "epoch": 0.11799761620977355, "grad_norm": 0.48328110314880524, "learning_rate": 9.689814846365399e-06, "loss": 2.003, "step": 99 }, { "epoch": 0.11918951132300358, "grad_norm": 0.5285205749893114, "learning_rate": 9.68325100406499e-06, "loss": 2.0333, "step": 100 }, { "epoch": 0.12038140643623362, "grad_norm": 0.6086649755855322, "learning_rate": 9.676620708768608e-06, "loss": 2.0468, "step": 101 }, { "epoch": 0.12157330154946365, "grad_norm": 0.6688243861727331, "learning_rate": 9.669924054556836e-06, "loss": 2.0052, "step": 102 }, { "epoch": 0.12276519666269368, "grad_norm": 0.7098729745438024, "learning_rate": 9.663161136451862e-06, "loss": 2.0201, "step": 103 }, { "epoch": 0.12395709177592372, "grad_norm": 0.7561634464725003, "learning_rate": 
9.656332050416118e-06, "loss": 2.005, "step": 104 }, { "epoch": 0.12514898688915377, "grad_norm": 0.7900403950856617, "learning_rate": 9.64943689335093e-06, "loss": 2.0312, "step": 105 }, { "epoch": 0.1263408820023838, "grad_norm": 0.7565057129935538, "learning_rate": 9.642475763095134e-06, "loss": 2.034, "step": 106 }, { "epoch": 0.12753277711561384, "grad_norm": 0.6925499516277225, "learning_rate": 9.635448758423703e-06, "loss": 2.0172, "step": 107 }, { "epoch": 0.12872467222884387, "grad_norm": 0.5469713913154514, "learning_rate": 9.628355979046325e-06, "loss": 2.0306, "step": 108 }, { "epoch": 0.1299165673420739, "grad_norm": 0.532846103968638, "learning_rate": 9.621197525606e-06, "loss": 2.0313, "step": 109 }, { "epoch": 0.13110846245530394, "grad_norm": 0.5107064338016527, "learning_rate": 9.613973499677613e-06, "loss": 2.0483, "step": 110 }, { "epoch": 0.13230035756853398, "grad_norm": 0.6341330772425801, "learning_rate": 9.606684003766493e-06, "loss": 2.0222, "step": 111 }, { "epoch": 0.133492252681764, "grad_norm": 0.5976219627881748, "learning_rate": 9.599329141306946e-06, "loss": 2.0074, "step": 112 }, { "epoch": 0.13468414779499405, "grad_norm": 0.5847751917110514, "learning_rate": 9.591909016660806e-06, "loss": 2.0206, "step": 113 }, { "epoch": 0.13587604290822408, "grad_norm": 0.6813522136748844, "learning_rate": 9.584423735115938e-06, "loss": 2.0178, "step": 114 }, { "epoch": 0.13706793802145412, "grad_norm": 0.767208353338879, "learning_rate": 9.576873402884756e-06, "loss": 1.9957, "step": 115 }, { "epoch": 0.13825983313468415, "grad_norm": 0.9969526484589852, "learning_rate": 9.569258127102708e-06, "loss": 2.0152, "step": 116 }, { "epoch": 0.1394517282479142, "grad_norm": 1.14614371471204, "learning_rate": 9.561578015826758e-06, "loss": 2.0156, "step": 117 }, { "epoch": 0.14064362336114422, "grad_norm": 0.693890999588814, "learning_rate": 9.553833178033856e-06, "loss": 2.015, "step": 118 }, { "epoch": 0.14183551847437426, "grad_norm": 0.6655521940110969, "learning_rate": 9.546023723619387e-06, "loss": 2.0357, "step": 119 }, { "epoch": 0.1430274135876043, "grad_norm": 0.8792717595050646, "learning_rate": 9.538149763395611e-06, "loss": 2.0057, "step": 120 }, { "epoch": 0.14421930870083433, "grad_norm": 0.9075837339408256, "learning_rate": 9.530211409090104e-06, "loss": 2.0324, "step": 121 }, { "epoch": 0.14541120381406436, "grad_norm": 0.8920741190168875, "learning_rate": 9.522208773344147e-06, "loss": 1.9948, "step": 122 }, { "epoch": 0.1466030989272944, "grad_norm": 0.8574927630149499, "learning_rate": 9.514141969711155e-06, "loss": 2.019, "step": 123 }, { "epoch": 0.14779499404052443, "grad_norm": 0.6343463765213274, "learning_rate": 9.506011112655045e-06, "loss": 2.0193, "step": 124 }, { "epoch": 0.14898688915375446, "grad_norm": 0.5630972285804464, "learning_rate": 9.497816317548625e-06, "loss": 2.0057, "step": 125 }, { "epoch": 0.1501787842669845, "grad_norm": 0.7579610388968056, "learning_rate": 9.489557700671948e-06, "loss": 2.0315, "step": 126 }, { "epoch": 0.15137067938021453, "grad_norm": 0.6850629250779653, "learning_rate": 9.481235379210671e-06, "loss": 2.001, "step": 127 }, { "epoch": 0.15256257449344457, "grad_norm": 0.5362542526140824, "learning_rate": 9.472849471254386e-06, "loss": 2.0316, "step": 128 }, { "epoch": 0.1537544696066746, "grad_norm": 0.608628527433765, "learning_rate": 9.46440009579494e-06, "loss": 2.035, "step": 129 }, { "epoch": 0.15494636471990464, "grad_norm": 0.5093840827042088, "learning_rate": 9.455887372724761e-06, "loss": 2.0273, 
"step": 130 }, { "epoch": 0.15613825983313467, "grad_norm": 0.646651425294055, "learning_rate": 9.447311422835141e-06, "loss": 2.0337, "step": 131 }, { "epoch": 0.1573301549463647, "grad_norm": 0.6171589347028325, "learning_rate": 9.438672367814532e-06, "loss": 2.0111, "step": 132 }, { "epoch": 0.15852205005959474, "grad_norm": 0.607124578385374, "learning_rate": 9.429970330246817e-06, "loss": 2.0207, "step": 133 }, { "epoch": 0.15971394517282478, "grad_norm": 0.6668755869782658, "learning_rate": 9.421205433609568e-06, "loss": 2.0174, "step": 134 }, { "epoch": 0.16090584028605484, "grad_norm": 0.7092639336616874, "learning_rate": 9.412377802272296e-06, "loss": 2.0061, "step": 135 }, { "epoch": 0.16209773539928488, "grad_norm": 0.7386024648965732, "learning_rate": 9.40348756149469e-06, "loss": 2.0126, "step": 136 }, { "epoch": 0.1632896305125149, "grad_norm": 0.6374704813920733, "learning_rate": 9.39453483742483e-06, "loss": 2.0176, "step": 137 }, { "epoch": 0.16448152562574495, "grad_norm": 0.514905378407023, "learning_rate": 9.385519757097405e-06, "loss": 2.0055, "step": 138 }, { "epoch": 0.16567342073897498, "grad_norm": 0.625583671688313, "learning_rate": 9.376442448431911e-06, "loss": 2.0109, "step": 139 }, { "epoch": 0.16686531585220502, "grad_norm": 0.6190722916976653, "learning_rate": 9.367303040230828e-06, "loss": 1.9939, "step": 140 }, { "epoch": 0.16805721096543505, "grad_norm": 0.5659222906567583, "learning_rate": 9.358101662177804e-06, "loss": 2.0111, "step": 141 }, { "epoch": 0.16924910607866508, "grad_norm": 0.6584496167747385, "learning_rate": 9.348838444835798e-06, "loss": 2.0185, "step": 142 }, { "epoch": 0.17044100119189512, "grad_norm": 0.5257356541865075, "learning_rate": 9.33951351964525e-06, "loss": 2.0167, "step": 143 }, { "epoch": 0.17163289630512515, "grad_norm": 0.5343239683640106, "learning_rate": 9.330127018922195e-06, "loss": 2.0058, "step": 144 }, { "epoch": 0.1728247914183552, "grad_norm": 0.5602849015914332, "learning_rate": 9.320679075856396e-06, "loss": 1.9952, "step": 145 }, { "epoch": 0.17401668653158522, "grad_norm": 0.509174624093658, "learning_rate": 9.311169824509454e-06, "loss": 2.0035, "step": 146 }, { "epoch": 0.17520858164481526, "grad_norm": 0.6065116610936728, "learning_rate": 9.301599399812904e-06, "loss": 1.9989, "step": 147 }, { "epoch": 0.1764004767580453, "grad_norm": 0.6025058237653309, "learning_rate": 9.291967937566297e-06, "loss": 2.015, "step": 148 }, { "epoch": 0.17759237187127533, "grad_norm": 0.5966629218921442, "learning_rate": 9.28227557443528e-06, "loss": 1.9871, "step": 149 }, { "epoch": 0.17878426698450536, "grad_norm": 0.6244177338742471, "learning_rate": 9.272522447949652e-06, "loss": 1.9916, "step": 150 }, { "epoch": 0.1799761620977354, "grad_norm": 0.522440075076418, "learning_rate": 9.262708696501412e-06, "loss": 1.9997, "step": 151 }, { "epoch": 0.18116805721096543, "grad_norm": 0.5640728239700662, "learning_rate": 9.252834459342801e-06, "loss": 2.003, "step": 152 }, { "epoch": 0.18235995232419547, "grad_norm": 0.6822460944537364, "learning_rate": 9.242899876584317e-06, "loss": 2.0198, "step": 153 }, { "epoch": 0.1835518474374255, "grad_norm": 0.6013920222643127, "learning_rate": 9.232905089192733e-06, "loss": 1.983, "step": 154 }, { "epoch": 0.18474374255065554, "grad_norm": 0.6210431332187637, "learning_rate": 9.222850238989104e-06, "loss": 1.9815, "step": 155 }, { "epoch": 0.18593563766388557, "grad_norm": 0.5536506251912162, "learning_rate": 9.21273546864673e-06, "loss": 1.9943, "step": 156 }, { "epoch": 
0.1871275327771156, "grad_norm": 0.5108824250251738, "learning_rate": 9.202560921689165e-06, "loss": 1.9875, "step": 157 }, { "epoch": 0.18831942789034564, "grad_norm": 0.6703972638895684, "learning_rate": 9.192326742488153e-06, "loss": 2.0054, "step": 158 }, { "epoch": 0.18951132300357568, "grad_norm": 0.6911385466049688, "learning_rate": 9.182033076261591e-06, "loss": 2.013, "step": 159 }, { "epoch": 0.1907032181168057, "grad_norm": 0.8228547705270176, "learning_rate": 9.171680069071472e-06, "loss": 2.0079, "step": 160 }, { "epoch": 0.19189511323003575, "grad_norm": 0.8318482910273874, "learning_rate": 9.161267867821802e-06, "loss": 2.0116, "step": 161 }, { "epoch": 0.19308700834326578, "grad_norm": 0.6993770001635832, "learning_rate": 9.150796620256526e-06, "loss": 2.0104, "step": 162 }, { "epoch": 0.19427890345649582, "grad_norm": 0.6963815969965594, "learning_rate": 9.140266474957421e-06, "loss": 1.9932, "step": 163 }, { "epoch": 0.19547079856972585, "grad_norm": 0.687540193587627, "learning_rate": 9.129677581342e-06, "loss": 1.9844, "step": 164 }, { "epoch": 0.1966626936829559, "grad_norm": 0.6315324748513748, "learning_rate": 9.11903008966138e-06, "loss": 1.9964, "step": 165 }, { "epoch": 0.19785458879618595, "grad_norm": 0.5152807583074759, "learning_rate": 9.10832415099816e-06, "loss": 2.0027, "step": 166 }, { "epoch": 0.19904648390941598, "grad_norm": 0.4708357523523462, "learning_rate": 9.097559917264268e-06, "loss": 2.007, "step": 167 }, { "epoch": 0.20023837902264602, "grad_norm": 0.5659309675022438, "learning_rate": 9.086737541198812e-06, "loss": 2.0065, "step": 168 }, { "epoch": 0.20143027413587605, "grad_norm": 0.5973723979176943, "learning_rate": 9.07585717636591e-06, "loss": 1.9963, "step": 169 }, { "epoch": 0.2026221692491061, "grad_norm": 0.612759197221063, "learning_rate": 9.064918977152517e-06, "loss": 2.0189, "step": 170 }, { "epoch": 0.20381406436233612, "grad_norm": 0.6368297841192448, "learning_rate": 9.053923098766218e-06, "loss": 1.9996, "step": 171 }, { "epoch": 0.20500595947556616, "grad_norm": 0.6267340913957593, "learning_rate": 9.042869697233046e-06, "loss": 2.0081, "step": 172 }, { "epoch": 0.2061978545887962, "grad_norm": 0.5997679592985574, "learning_rate": 9.031758929395259e-06, "loss": 2.0087, "step": 173 }, { "epoch": 0.20738974970202623, "grad_norm": 0.6540359851514235, "learning_rate": 9.020590952909105e-06, "loss": 1.9862, "step": 174 }, { "epoch": 0.20858164481525626, "grad_norm": 0.6304008000188193, "learning_rate": 9.009365926242603e-06, "loss": 1.9845, "step": 175 }, { "epoch": 0.2097735399284863, "grad_norm": 0.49409981260012525, "learning_rate": 8.998084008673284e-06, "loss": 1.9865, "step": 176 }, { "epoch": 0.21096543504171633, "grad_norm": 0.428992104451379, "learning_rate": 8.986745360285933e-06, "loss": 1.9775, "step": 177 }, { "epoch": 0.21215733015494637, "grad_norm": 0.4544484558085694, "learning_rate": 8.975350141970312e-06, "loss": 1.9974, "step": 178 }, { "epoch": 0.2133492252681764, "grad_norm": 0.47713373163398903, "learning_rate": 8.963898515418885e-06, "loss": 1.9986, "step": 179 }, { "epoch": 0.21454112038140644, "grad_norm": 0.5128102686619308, "learning_rate": 8.952390643124524e-06, "loss": 1.9926, "step": 180 }, { "epoch": 0.21573301549463647, "grad_norm": 0.49123637812302784, "learning_rate": 8.940826688378196e-06, "loss": 2.0068, "step": 181 }, { "epoch": 0.2169249106078665, "grad_norm": 0.4670667432350283, "learning_rate": 8.929206815266653e-06, "loss": 1.998, "step": 182 }, { "epoch": 0.21811680572109654, "grad_norm": 
0.5026402806403492, "learning_rate": 8.917531188670096e-06, "loss": 2.0023, "step": 183 }, { "epoch": 0.21930870083432658, "grad_norm": 0.5146023032179888, "learning_rate": 8.905799974259845e-06, "loss": 1.9917, "step": 184 }, { "epoch": 0.2205005959475566, "grad_norm": 0.500813938615368, "learning_rate": 8.89401333849598e-06, "loss": 2.0046, "step": 185 }, { "epoch": 0.22169249106078665, "grad_norm": 0.5241153656092717, "learning_rate": 8.882171448624988e-06, "loss": 2.004, "step": 186 }, { "epoch": 0.22288438617401668, "grad_norm": 0.5455210954026811, "learning_rate": 8.870274472677376e-06, "loss": 2.0136, "step": 187 }, { "epoch": 0.22407628128724671, "grad_norm": 0.6182614320674238, "learning_rate": 8.8583225794653e-06, "loss": 1.9745, "step": 188 }, { "epoch": 0.22526817640047675, "grad_norm": 0.7203972482184511, "learning_rate": 8.846315938580163e-06, "loss": 1.9876, "step": 189 }, { "epoch": 0.22646007151370678, "grad_norm": 0.7651134846710912, "learning_rate": 8.834254720390214e-06, "loss": 2.0039, "step": 190 }, { "epoch": 0.22765196662693682, "grad_norm": 0.717395085062428, "learning_rate": 8.82213909603812e-06, "loss": 1.9923, "step": 191 }, { "epoch": 0.22884386174016685, "grad_norm": 0.6974046079010195, "learning_rate": 8.80996923743855e-06, "loss": 1.9902, "step": 192 }, { "epoch": 0.2300357568533969, "grad_norm": 0.5749230359569363, "learning_rate": 8.797745317275727e-06, "loss": 2.0077, "step": 193 }, { "epoch": 0.23122765196662692, "grad_norm": 0.47763377533604173, "learning_rate": 8.78546750900098e-06, "loss": 2.0175, "step": 194 }, { "epoch": 0.232419547079857, "grad_norm": 0.4868384029481758, "learning_rate": 8.773135986830289e-06, "loss": 1.9817, "step": 195 }, { "epoch": 0.23361144219308702, "grad_norm": 0.5411631589460403, "learning_rate": 8.760750925741799e-06, "loss": 2.0191, "step": 196 }, { "epoch": 0.23480333730631706, "grad_norm": 0.5991085184799008, "learning_rate": 8.748312501473351e-06, "loss": 1.9872, "step": 197 }, { "epoch": 0.2359952324195471, "grad_norm": 0.6561276515835338, "learning_rate": 8.735820890519981e-06, "loss": 1.9851, "step": 198 }, { "epoch": 0.23718712753277713, "grad_norm": 0.7063577334823914, "learning_rate": 8.723276270131422e-06, "loss": 1.9897, "step": 199 }, { "epoch": 0.23837902264600716, "grad_norm": 0.7581038228065401, "learning_rate": 8.710678818309576e-06, "loss": 2.0025, "step": 200 }, { "epoch": 0.2395709177592372, "grad_norm": 0.7115966613137586, "learning_rate": 8.698028713806005e-06, "loss": 2.0004, "step": 201 }, { "epoch": 0.24076281287246723, "grad_norm": 0.5976258958997295, "learning_rate": 8.68532613611938e-06, "loss": 2.018, "step": 202 }, { "epoch": 0.24195470798569726, "grad_norm": 0.43540172054622217, "learning_rate": 8.672571265492944e-06, "loss": 1.9989, "step": 203 }, { "epoch": 0.2431466030989273, "grad_norm": 0.5216426023045612, "learning_rate": 8.659764282911948e-06, "loss": 1.9866, "step": 204 }, { "epoch": 0.24433849821215733, "grad_norm": 0.6613860116484914, "learning_rate": 8.64690537010109e-06, "loss": 2.0061, "step": 205 }, { "epoch": 0.24553039332538737, "grad_norm": 0.7138301888755583, "learning_rate": 8.63399470952193e-06, "loss": 2.0107, "step": 206 }, { "epoch": 0.2467222884386174, "grad_norm": 0.7998521068632918, "learning_rate": 8.621032484370299e-06, "loss": 1.9856, "step": 207 }, { "epoch": 0.24791418355184744, "grad_norm": 0.6733799007638906, "learning_rate": 8.60801887857371e-06, "loss": 1.9789, "step": 208 }, { "epoch": 0.24910607866507747, "grad_norm": 0.4890141413650463, "learning_rate": 
8.594954076788736e-06, "loss": 1.9966, "step": 209 }, { "epoch": 0.25029797377830754, "grad_norm": 0.510254285654425, "learning_rate": 8.5818382643984e-06, "loss": 2.0033, "step": 210 }, { "epoch": 0.25148986889153757, "grad_norm": 0.6736096737562903, "learning_rate": 8.56867162750954e-06, "loss": 1.9882, "step": 211 }, { "epoch": 0.2526817640047676, "grad_norm": 0.688224238343655, "learning_rate": 8.555454352950161e-06, "loss": 1.9826, "step": 212 }, { "epoch": 0.25387365911799764, "grad_norm": 0.5310568361772406, "learning_rate": 8.542186628266801e-06, "loss": 2.018, "step": 213 }, { "epoch": 0.2550655542312277, "grad_norm": 0.4622700149348845, "learning_rate": 8.528868641721857e-06, "loss": 1.9873, "step": 214 }, { "epoch": 0.2562574493444577, "grad_norm": 0.44850296625902714, "learning_rate": 8.515500582290914e-06, "loss": 1.9738, "step": 215 }, { "epoch": 0.25744934445768775, "grad_norm": 0.5800104445256365, "learning_rate": 8.502082639660068e-06, "loss": 2.0033, "step": 216 }, { "epoch": 0.2586412395709178, "grad_norm": 0.5571007121924001, "learning_rate": 8.488615004223233e-06, "loss": 2.0097, "step": 217 }, { "epoch": 0.2598331346841478, "grad_norm": 0.5363110521997889, "learning_rate": 8.475097867079437e-06, "loss": 1.9826, "step": 218 }, { "epoch": 0.26102502979737785, "grad_norm": 0.46575794642736956, "learning_rate": 8.461531420030117e-06, "loss": 2.0129, "step": 219 }, { "epoch": 0.2622169249106079, "grad_norm": 0.40917886114681945, "learning_rate": 8.44791585557639e-06, "loss": 2.0047, "step": 220 }, { "epoch": 0.2634088200238379, "grad_norm": 0.428624008942813, "learning_rate": 8.434251366916323e-06, "loss": 1.9781, "step": 221 }, { "epoch": 0.26460071513706795, "grad_norm": 0.4571746297128128, "learning_rate": 8.420538147942196e-06, "loss": 1.9844, "step": 222 }, { "epoch": 0.265792610250298, "grad_norm": 0.47157884654181986, "learning_rate": 8.406776393237748e-06, "loss": 1.9985, "step": 223 }, { "epoch": 0.266984505363528, "grad_norm": 0.46012310079193414, "learning_rate": 8.392966298075413e-06, "loss": 1.9945, "step": 224 }, { "epoch": 0.26817640047675806, "grad_norm": 0.4551526365374971, "learning_rate": 8.379108058413553e-06, "loss": 1.9778, "step": 225 }, { "epoch": 0.2693682955899881, "grad_norm": 0.4810916725254239, "learning_rate": 8.36520187089368e-06, "loss": 1.9814, "step": 226 }, { "epoch": 0.27056019070321813, "grad_norm": 0.46258784460873204, "learning_rate": 8.351247932837655e-06, "loss": 1.9719, "step": 227 }, { "epoch": 0.27175208581644816, "grad_norm": 0.45411997594863557, "learning_rate": 8.337246442244902e-06, "loss": 1.9753, "step": 228 }, { "epoch": 0.2729439809296782, "grad_norm": 0.43996967181045016, "learning_rate": 8.32319759778959e-06, "loss": 2.0033, "step": 229 }, { "epoch": 0.27413587604290823, "grad_norm": 0.507769478588206, "learning_rate": 8.309101598817812e-06, "loss": 2.0024, "step": 230 }, { "epoch": 0.27532777115613827, "grad_norm": 0.48069601950891877, "learning_rate": 8.294958645344766e-06, "loss": 1.9824, "step": 231 }, { "epoch": 0.2765196662693683, "grad_norm": 0.5157028595077698, "learning_rate": 8.280768938051909e-06, "loss": 1.9699, "step": 232 }, { "epoch": 0.27771156138259834, "grad_norm": 0.579814229455722, "learning_rate": 8.266532678284103e-06, "loss": 1.984, "step": 233 }, { "epoch": 0.2789034564958284, "grad_norm": 0.627324817155187, "learning_rate": 8.252250068046784e-06, "loss": 1.9861, "step": 234 }, { "epoch": 0.2800953516090584, "grad_norm": 0.593805814527224, "learning_rate": 8.23792131000306e-06, "loss": 1.9693, 
"step": 235 }, { "epoch": 0.28128724672228844, "grad_norm": 0.6552471095231857, "learning_rate": 8.223546607470863e-06, "loss": 1.9862, "step": 236 }, { "epoch": 0.2824791418355185, "grad_norm": 0.6028562723069028, "learning_rate": 8.209126164420056e-06, "loss": 1.981, "step": 237 }, { "epoch": 0.2836710369487485, "grad_norm": 0.5873677146224183, "learning_rate": 8.19466018546953e-06, "loss": 1.9967, "step": 238 }, { "epoch": 0.28486293206197855, "grad_norm": 0.5279550914843492, "learning_rate": 8.18014887588431e-06, "loss": 1.9836, "step": 239 }, { "epoch": 0.2860548271752086, "grad_norm": 0.5159083129491098, "learning_rate": 8.165592441572648e-06, "loss": 1.9906, "step": 240 }, { "epoch": 0.2872467222884386, "grad_norm": 0.5540993574066266, "learning_rate": 8.150991089083081e-06, "loss": 1.9953, "step": 241 }, { "epoch": 0.28843861740166865, "grad_norm": 0.6125101838648868, "learning_rate": 8.13634502560152e-06, "loss": 2.0038, "step": 242 }, { "epoch": 0.2896305125148987, "grad_norm": 0.5519571584252633, "learning_rate": 8.1216544589483e-06, "loss": 1.9983, "step": 243 }, { "epoch": 0.2908224076281287, "grad_norm": 0.544350413761365, "learning_rate": 8.106919597575238e-06, "loss": 1.9718, "step": 244 }, { "epoch": 0.29201430274135876, "grad_norm": 0.5664660915352969, "learning_rate": 8.092140650562665e-06, "loss": 1.9671, "step": 245 }, { "epoch": 0.2932061978545888, "grad_norm": 0.6296577119121265, "learning_rate": 8.07731782761647e-06, "loss": 1.9881, "step": 246 }, { "epoch": 0.2943980929678188, "grad_norm": 0.4926647346394942, "learning_rate": 8.062451339065116e-06, "loss": 1.9609, "step": 247 }, { "epoch": 0.29558998808104886, "grad_norm": 0.4624410592429987, "learning_rate": 8.047541395856661e-06, "loss": 1.9974, "step": 248 }, { "epoch": 0.2967818831942789, "grad_norm": 0.559079602861405, "learning_rate": 8.032588209555765e-06, "loss": 1.999, "step": 249 }, { "epoch": 0.29797377830750893, "grad_norm": 0.5257803282078808, "learning_rate": 8.017591992340682e-06, "loss": 1.99, "step": 250 }, { "epoch": 0.29916567342073896, "grad_norm": 0.4532797658436555, "learning_rate": 8.002552957000254e-06, "loss": 1.9961, "step": 251 }, { "epoch": 0.300357568533969, "grad_norm": 0.4967793482713224, "learning_rate": 7.987471316930892e-06, "loss": 1.9859, "step": 252 }, { "epoch": 0.30154946364719903, "grad_norm": 0.5216037784287865, "learning_rate": 7.972347286133549e-06, "loss": 1.9775, "step": 253 }, { "epoch": 0.30274135876042907, "grad_norm": 0.44165364383086597, "learning_rate": 7.957181079210676e-06, "loss": 1.9834, "step": 254 }, { "epoch": 0.3039332538736591, "grad_norm": 0.4525734716636921, "learning_rate": 7.941972911363187e-06, "loss": 1.9834, "step": 255 }, { "epoch": 0.30512514898688914, "grad_norm": 0.4399784793186879, "learning_rate": 7.926722998387398e-06, "loss": 1.9883, "step": 256 }, { "epoch": 0.3063170441001192, "grad_norm": 0.4302293917353196, "learning_rate": 7.911431556671967e-06, "loss": 1.9888, "step": 257 }, { "epoch": 0.3075089392133492, "grad_norm": 0.5077527400267277, "learning_rate": 7.896098803194828e-06, "loss": 1.9814, "step": 258 }, { "epoch": 0.30870083432657924, "grad_norm": 0.5455522386411445, "learning_rate": 7.880724955520105e-06, "loss": 2.0022, "step": 259 }, { "epoch": 0.3098927294398093, "grad_norm": 0.4734204507402147, "learning_rate": 7.865310231795026e-06, "loss": 1.9883, "step": 260 }, { "epoch": 0.3110846245530393, "grad_norm": 0.46463402034819734, "learning_rate": 7.849854850746834e-06, "loss": 1.9871, "step": 261 }, { "epoch": 
0.31227651966626935, "grad_norm": 0.48102107314994796, "learning_rate": 7.83435903167968e-06, "loss": 1.9817, "step": 262 }, { "epoch": 0.3134684147794994, "grad_norm": 0.49443270213282037, "learning_rate": 7.818822994471504e-06, "loss": 1.9726, "step": 263 }, { "epoch": 0.3146603098927294, "grad_norm": 0.5141146391688594, "learning_rate": 7.80324695957093e-06, "loss": 1.9843, "step": 264 }, { "epoch": 0.31585220500595945, "grad_norm": 0.48124603321709436, "learning_rate": 7.78763114799412e-06, "loss": 1.9713, "step": 265 }, { "epoch": 0.3170441001191895, "grad_norm": 0.4573264323307654, "learning_rate": 7.771975781321655e-06, "loss": 1.9855, "step": 266 }, { "epoch": 0.3182359952324195, "grad_norm": 0.497648183015366, "learning_rate": 7.75628108169538e-06, "loss": 1.9857, "step": 267 }, { "epoch": 0.31942789034564956, "grad_norm": 0.5260277669621191, "learning_rate": 7.740547271815253e-06, "loss": 1.9867, "step": 268 }, { "epoch": 0.3206197854588796, "grad_norm": 0.5443051292540823, "learning_rate": 7.72477457493619e-06, "loss": 1.9742, "step": 269 }, { "epoch": 0.3218116805721097, "grad_norm": 0.4269306335257882, "learning_rate": 7.70896321486489e-06, "loss": 1.9768, "step": 270 }, { "epoch": 0.3230035756853397, "grad_norm": 0.42010336549578936, "learning_rate": 7.693113415956674e-06, "loss": 1.9799, "step": 271 }, { "epoch": 0.32419547079856975, "grad_norm": 0.46762767407360706, "learning_rate": 7.677225403112277e-06, "loss": 1.9843, "step": 272 }, { "epoch": 0.3253873659117998, "grad_norm": 0.5057942342132519, "learning_rate": 7.661299401774677e-06, "loss": 1.9828, "step": 273 }, { "epoch": 0.3265792610250298, "grad_norm": 0.5952967303729245, "learning_rate": 7.645335637925897e-06, "loss": 1.9796, "step": 274 }, { "epoch": 0.32777115613825986, "grad_norm": 0.5273900870276448, "learning_rate": 7.629334338083774e-06, "loss": 1.9766, "step": 275 }, { "epoch": 0.3289630512514899, "grad_norm": 0.45567977499071444, "learning_rate": 7.6132957292987795e-06, "loss": 1.9617, "step": 276 }, { "epoch": 0.3301549463647199, "grad_norm": 0.5179821998771547, "learning_rate": 7.597220039150768e-06, "loss": 1.9863, "step": 277 }, { "epoch": 0.33134684147794996, "grad_norm": 0.5651280024042905, "learning_rate": 7.58110749574577e-06, "loss": 1.9821, "step": 278 }, { "epoch": 0.33253873659118, "grad_norm": 0.46901304611627237, "learning_rate": 7.564958327712735e-06, "loss": 1.9798, "step": 279 }, { "epoch": 0.33373063170441003, "grad_norm": 0.4359604972801817, "learning_rate": 7.5487727642003075e-06, "loss": 1.9789, "step": 280 }, { "epoch": 0.33492252681764006, "grad_norm": 0.4691874050085417, "learning_rate": 7.532551034873558e-06, "loss": 1.9858, "step": 281 }, { "epoch": 0.3361144219308701, "grad_norm": 0.49078578351565005, "learning_rate": 7.516293369910737e-06, "loss": 1.9905, "step": 282 }, { "epoch": 0.33730631704410013, "grad_norm": 0.4810414634759214, "learning_rate": 7.500000000000001e-06, "loss": 1.9757, "step": 283 }, { "epoch": 0.33849821215733017, "grad_norm": 0.4004089110467056, "learning_rate": 7.483671156336142e-06, "loss": 1.9743, "step": 284 }, { "epoch": 0.3396901072705602, "grad_norm": 0.48370804553795343, "learning_rate": 7.467307070617309e-06, "loss": 1.9882, "step": 285 }, { "epoch": 0.34088200238379024, "grad_norm": 0.3916208994505171, "learning_rate": 7.4509079750417154e-06, "loss": 1.9906, "step": 286 }, { "epoch": 0.3420738974970203, "grad_norm": 0.4440622088562717, "learning_rate": 7.43447410230435e-06, "loss": 1.9756, "step": 287 }, { "epoch": 0.3432657926102503, "grad_norm": 
0.4151369125535769, "learning_rate": 7.418005685593669e-06, "loss": 1.98, "step": 288 }, { "epoch": 0.34445768772348034, "grad_norm": 0.42888099521221656, "learning_rate": 7.4015029585882925e-06, "loss": 1.9597, "step": 289 }, { "epoch": 0.3456495828367104, "grad_norm": 0.4031068379998817, "learning_rate": 7.384966155453686e-06, "loss": 1.9909, "step": 290 }, { "epoch": 0.3468414779499404, "grad_norm": 0.4288403976952624, "learning_rate": 7.368395510838838e-06, "loss": 1.9715, "step": 291 }, { "epoch": 0.34803337306317045, "grad_norm": 0.4047372419449946, "learning_rate": 7.351791259872929e-06, "loss": 1.9933, "step": 292 }, { "epoch": 0.3492252681764005, "grad_norm": 0.42040782221308876, "learning_rate": 7.335153638162005e-06, "loss": 1.9875, "step": 293 }, { "epoch": 0.3504171632896305, "grad_norm": 0.40151800416240474, "learning_rate": 7.318482881785612e-06, "loss": 1.9827, "step": 294 }, { "epoch": 0.35160905840286055, "grad_norm": 0.40534989415691614, "learning_rate": 7.301779227293475e-06, "loss": 1.9899, "step": 295 }, { "epoch": 0.3528009535160906, "grad_norm": 0.41437334261849135, "learning_rate": 7.285042911702116e-06, "loss": 1.9761, "step": 296 }, { "epoch": 0.3539928486293206, "grad_norm": 0.43461149682609845, "learning_rate": 7.268274172491508e-06, "loss": 2.0009, "step": 297 }, { "epoch": 0.35518474374255066, "grad_norm": 0.42255392024397564, "learning_rate": 7.251473247601698e-06, "loss": 1.9805, "step": 298 }, { "epoch": 0.3563766388557807, "grad_norm": 0.44303489088588954, "learning_rate": 7.234640375429427e-06, "loss": 1.9824, "step": 299 }, { "epoch": 0.3575685339690107, "grad_norm": 0.43499397642762283, "learning_rate": 7.217775794824759e-06, "loss": 1.9785, "step": 300 }, { "epoch": 0.35876042908224076, "grad_norm": 0.4208326930599362, "learning_rate": 7.200879745087681e-06, "loss": 1.994, "step": 301 }, { "epoch": 0.3599523241954708, "grad_norm": 0.4452902733869807, "learning_rate": 7.183952465964711e-06, "loss": 1.9741, "step": 302 }, { "epoch": 0.36114421930870083, "grad_norm": 0.4764827599963297, "learning_rate": 7.166994197645497e-06, "loss": 1.9826, "step": 303 }, { "epoch": 0.36233611442193087, "grad_norm": 0.4460964876445021, "learning_rate": 7.150005180759411e-06, "loss": 1.9808, "step": 304 }, { "epoch": 0.3635280095351609, "grad_norm": 0.42052492138452646, "learning_rate": 7.132985656372126e-06, "loss": 1.9652, "step": 305 }, { "epoch": 0.36471990464839094, "grad_norm": 0.3578650107792017, "learning_rate": 7.115935865982205e-06, "loss": 2.0037, "step": 306 }, { "epoch": 0.36591179976162097, "grad_norm": 0.4213839735073625, "learning_rate": 7.098856051517673e-06, "loss": 1.9983, "step": 307 }, { "epoch": 0.367103694874851, "grad_norm": 0.41798689890135715, "learning_rate": 7.0817464553325764e-06, "loss": 1.9833, "step": 308 }, { "epoch": 0.36829558998808104, "grad_norm": 0.46301273631831313, "learning_rate": 7.064607320203552e-06, "loss": 1.9785, "step": 309 }, { "epoch": 0.3694874851013111, "grad_norm": 0.3853768039281196, "learning_rate": 7.047438889326377e-06, "loss": 1.9953, "step": 310 }, { "epoch": 0.3706793802145411, "grad_norm": 0.39106836774943315, "learning_rate": 7.030241406312528e-06, "loss": 1.9908, "step": 311 }, { "epoch": 0.37187127532777114, "grad_norm": 0.3557595574168793, "learning_rate": 7.013015115185706e-06, "loss": 1.9711, "step": 312 }, { "epoch": 0.3730631704410012, "grad_norm": 0.462884994313804, "learning_rate": 6.9957602603783944e-06, "loss": 2.0036, "step": 313 }, { "epoch": 0.3742550655542312, "grad_norm": 0.42933967393666006, 
"learning_rate": 6.978477086728375e-06, "loss": 1.9843, "step": 314 }, { "epoch": 0.37544696066746125, "grad_norm": 0.43775594546905017, "learning_rate": 6.961165839475262e-06, "loss": 1.9799, "step": 315 }, { "epoch": 0.3766388557806913, "grad_norm": 0.40786517623408314, "learning_rate": 6.9438267642570216e-06, "loss": 1.9674, "step": 316 }, { "epoch": 0.3778307508939213, "grad_norm": 0.3812009351969576, "learning_rate": 6.926460107106483e-06, "loss": 1.9835, "step": 317 }, { "epoch": 0.37902264600715135, "grad_norm": 0.43023083569572035, "learning_rate": 6.909066114447847e-06, "loss": 1.9843, "step": 318 }, { "epoch": 0.3802145411203814, "grad_norm": 0.4055444095073271, "learning_rate": 6.891645033093196e-06, "loss": 1.9802, "step": 319 }, { "epoch": 0.3814064362336114, "grad_norm": 0.43023837992568775, "learning_rate": 6.874197110238986e-06, "loss": 1.9756, "step": 320 }, { "epoch": 0.38259833134684146, "grad_norm": 0.4061991284550457, "learning_rate": 6.8567225934625385e-06, "loss": 1.9793, "step": 321 }, { "epoch": 0.3837902264600715, "grad_norm": 0.46263343121001, "learning_rate": 6.8392217307185325e-06, "loss": 1.9888, "step": 322 }, { "epoch": 0.38498212157330153, "grad_norm": 0.5183393565092786, "learning_rate": 6.8216947703354815e-06, "loss": 1.9678, "step": 323 }, { "epoch": 0.38617401668653156, "grad_norm": 0.4914054711777072, "learning_rate": 6.804141961012213e-06, "loss": 1.9774, "step": 324 }, { "epoch": 0.3873659117997616, "grad_norm": 0.38775497500354755, "learning_rate": 6.786563551814333e-06, "loss": 1.9843, "step": 325 }, { "epoch": 0.38855780691299163, "grad_norm": 0.4175239392741797, "learning_rate": 6.7689597921707065e-06, "loss": 1.9812, "step": 326 }, { "epoch": 0.38974970202622167, "grad_norm": 0.5074081729621598, "learning_rate": 6.7513309318698975e-06, "loss": 1.9673, "step": 327 }, { "epoch": 0.3909415971394517, "grad_norm": 0.5759724338089542, "learning_rate": 6.733677221056645e-06, "loss": 1.9595, "step": 328 }, { "epoch": 0.39213349225268174, "grad_norm": 0.45858283981603526, "learning_rate": 6.715998910228296e-06, "loss": 1.979, "step": 329 }, { "epoch": 0.3933253873659118, "grad_norm": 0.39590782238976335, "learning_rate": 6.698296250231271e-06, "loss": 1.981, "step": 330 }, { "epoch": 0.39451728247914186, "grad_norm": 0.5514883543457016, "learning_rate": 6.68056949225748e-06, "loss": 1.9754, "step": 331 }, { "epoch": 0.3957091775923719, "grad_norm": 0.5367006385906758, "learning_rate": 6.6628188878407806e-06, "loss": 1.9688, "step": 332 }, { "epoch": 0.39690107270560193, "grad_norm": 0.4563028045170266, "learning_rate": 6.645044688853396e-06, "loss": 1.9792, "step": 333 }, { "epoch": 0.39809296781883197, "grad_norm": 0.4705275885547744, "learning_rate": 6.627247147502343e-06, "loss": 1.9751, "step": 334 }, { "epoch": 0.399284862932062, "grad_norm": 0.39053085326929393, "learning_rate": 6.609426516325859e-06, "loss": 1.9809, "step": 335 }, { "epoch": 0.40047675804529204, "grad_norm": 0.46336889396641767, "learning_rate": 6.591583048189812e-06, "loss": 1.9819, "step": 336 }, { "epoch": 0.40166865315852207, "grad_norm": 0.41312116285494427, "learning_rate": 6.573716996284114e-06, "loss": 1.9956, "step": 337 }, { "epoch": 0.4028605482717521, "grad_norm": 0.4261033537644772, "learning_rate": 6.555828614119132e-06, "loss": 1.9864, "step": 338 }, { "epoch": 0.40405244338498214, "grad_norm": 0.5571802621996744, "learning_rate": 6.537918155522089e-06, "loss": 1.9881, "step": 339 }, { "epoch": 0.4052443384982122, "grad_norm": 0.42763390364122206, "learning_rate": 
6.519985874633454e-06, "loss": 1.981, "step": 340 }, { "epoch": 0.4064362336114422, "grad_norm": 0.41484190699219026, "learning_rate": 6.502032025903356e-06, "loss": 1.9641, "step": 341 }, { "epoch": 0.40762812872467225, "grad_norm": 0.3838791164718351, "learning_rate": 6.484056864087948e-06, "loss": 1.9709, "step": 342 }, { "epoch": 0.4088200238379023, "grad_norm": 0.4023689175266171, "learning_rate": 6.4660606442458155e-06, "loss": 1.9713, "step": 343 }, { "epoch": 0.4100119189511323, "grad_norm": 0.4336427044211903, "learning_rate": 6.4480436217343366e-06, "loss": 1.9534, "step": 344 }, { "epoch": 0.41120381406436235, "grad_norm": 0.37598773624858467, "learning_rate": 6.430006052206083e-06, "loss": 1.9603, "step": 345 }, { "epoch": 0.4123957091775924, "grad_norm": 0.43416807891817494, "learning_rate": 6.411948191605164e-06, "loss": 1.9787, "step": 346 }, { "epoch": 0.4135876042908224, "grad_norm": 0.3977800151758, "learning_rate": 6.393870296163616e-06, "loss": 1.9916, "step": 347 }, { "epoch": 0.41477949940405245, "grad_norm": 0.4008696555982334, "learning_rate": 6.375772622397762e-06, "loss": 1.9804, "step": 348 }, { "epoch": 0.4159713945172825, "grad_norm": 0.3451532285909086, "learning_rate": 6.357655427104562e-06, "loss": 1.9663, "step": 349 }, { "epoch": 0.4171632896305125, "grad_norm": 0.4341428658767691, "learning_rate": 6.339518967357985e-06, "loss": 1.9744, "step": 350 }, { "epoch": 0.41835518474374256, "grad_norm": 0.37680689737786904, "learning_rate": 6.321363500505348e-06, "loss": 1.994, "step": 351 }, { "epoch": 0.4195470798569726, "grad_norm": 0.36788506489233713, "learning_rate": 6.3031892841636685e-06, "loss": 1.9847, "step": 352 }, { "epoch": 0.42073897497020263, "grad_norm": 0.38396929856917666, "learning_rate": 6.284996576216014e-06, "loss": 1.9722, "step": 353 }, { "epoch": 0.42193087008343266, "grad_norm": 0.3512841210948969, "learning_rate": 6.266785634807838e-06, "loss": 1.9504, "step": 354 }, { "epoch": 0.4231227651966627, "grad_norm": 0.3841371233710849, "learning_rate": 6.248556718343314e-06, "loss": 1.9997, "step": 355 }, { "epoch": 0.42431466030989273, "grad_norm": 0.41345223603319187, "learning_rate": 6.230310085481677e-06, "loss": 1.9754, "step": 356 }, { "epoch": 0.42550655542312277, "grad_norm": 0.36115831056461284, "learning_rate": 6.212045995133543e-06, "loss": 1.9735, "step": 357 }, { "epoch": 0.4266984505363528, "grad_norm": 0.37667258015583416, "learning_rate": 6.193764706457249e-06, "loss": 1.9669, "step": 358 }, { "epoch": 0.42789034564958284, "grad_norm": 0.34439222602136627, "learning_rate": 6.175466478855161e-06, "loss": 1.9788, "step": 359 }, { "epoch": 0.42908224076281287, "grad_norm": 0.406649190145765, "learning_rate": 6.157151571970005e-06, "loss": 1.9868, "step": 360 }, { "epoch": 0.4302741358760429, "grad_norm": 0.37410746997126837, "learning_rate": 6.13882024568117e-06, "loss": 1.9588, "step": 361 }, { "epoch": 0.43146603098927294, "grad_norm": 0.38935925565712926, "learning_rate": 6.1204727601010396e-06, "loss": 1.978, "step": 362 }, { "epoch": 0.432657926102503, "grad_norm": 0.3934047570972324, "learning_rate": 6.10210937557128e-06, "loss": 1.9728, "step": 363 }, { "epoch": 0.433849821215733, "grad_norm": 0.3740037082900391, "learning_rate": 6.083730352659158e-06, "loss": 1.9777, "step": 364 }, { "epoch": 0.43504171632896305, "grad_norm": 0.3962866525803316, "learning_rate": 6.065335952153846e-06, "loss": 1.9753, "step": 365 }, { "epoch": 0.4362336114421931, "grad_norm": 0.3703123980920405, "learning_rate": 6.0469264350627075e-06, 
"loss": 1.9685, "step": 366 }, { "epoch": 0.4374255065554231, "grad_norm": 0.3772080775482272, "learning_rate": 6.0285020626076115e-06, "loss": 1.9918, "step": 367 }, { "epoch": 0.43861740166865315, "grad_norm": 0.36096021522163296, "learning_rate": 6.010063096221215e-06, "loss": 1.9857, "step": 368 }, { "epoch": 0.4398092967818832, "grad_norm": 0.4027363280332516, "learning_rate": 5.991609797543253e-06, "loss": 1.9772, "step": 369 }, { "epoch": 0.4410011918951132, "grad_norm": 0.36449407433194586, "learning_rate": 5.973142428416829e-06, "loss": 1.9926, "step": 370 }, { "epoch": 0.44219308700834326, "grad_norm": 0.41922022657177943, "learning_rate": 5.954661250884704e-06, "loss": 1.9851, "step": 371 }, { "epoch": 0.4433849821215733, "grad_norm": 0.3957989777206615, "learning_rate": 5.936166527185565e-06, "loss": 1.9627, "step": 372 }, { "epoch": 0.4445768772348033, "grad_norm": 0.39452398707557135, "learning_rate": 5.91765851975032e-06, "loss": 1.9876, "step": 373 }, { "epoch": 0.44576877234803336, "grad_norm": 0.39493419711592515, "learning_rate": 5.899137491198364e-06, "loss": 1.9686, "step": 374 }, { "epoch": 0.4469606674612634, "grad_norm": 0.4099934527801523, "learning_rate": 5.880603704333851e-06, "loss": 1.9534, "step": 375 }, { "epoch": 0.44815256257449343, "grad_norm": 0.36964455654061956, "learning_rate": 5.862057422141979e-06, "loss": 1.9523, "step": 376 }, { "epoch": 0.44934445768772346, "grad_norm": 0.3902869598970143, "learning_rate": 5.843498907785236e-06, "loss": 1.9554, "step": 377 }, { "epoch": 0.4505363528009535, "grad_norm": 0.3969483119716555, "learning_rate": 5.8249284245996905e-06, "loss": 1.9907, "step": 378 }, { "epoch": 0.45172824791418353, "grad_norm": 0.3960234150743317, "learning_rate": 5.806346236091232e-06, "loss": 1.9906, "step": 379 }, { "epoch": 0.45292014302741357, "grad_norm": 0.3810498242078963, "learning_rate": 5.78775260593185e-06, "loss": 1.9612, "step": 380 }, { "epoch": 0.4541120381406436, "grad_norm": 0.385855393557767, "learning_rate": 5.769147797955882e-06, "loss": 1.9736, "step": 381 }, { "epoch": 0.45530393325387364, "grad_norm": 0.34406815893035153, "learning_rate": 5.7505320761562735e-06, "loss": 1.9864, "step": 382 }, { "epoch": 0.4564958283671037, "grad_norm": 0.37884788683749326, "learning_rate": 5.731905704680834e-06, "loss": 1.9878, "step": 383 }, { "epoch": 0.4576877234803337, "grad_norm": 0.38229911057814764, "learning_rate": 5.713268947828484e-06, "loss": 1.9677, "step": 384 }, { "epoch": 0.45887961859356374, "grad_norm": 0.3930195565597414, "learning_rate": 5.694622070045507e-06, "loss": 1.9831, "step": 385 }, { "epoch": 0.4600715137067938, "grad_norm": 0.35771899505040233, "learning_rate": 5.6759653359218e-06, "loss": 1.938, "step": 386 }, { "epoch": 0.4612634088200238, "grad_norm": 0.3844248408562967, "learning_rate": 5.657299010187116e-06, "loss": 1.983, "step": 387 }, { "epoch": 0.46245530393325385, "grad_norm": 0.374339760496431, "learning_rate": 5.638623357707304e-06, "loss": 1.9696, "step": 388 }, { "epoch": 0.4636471990464839, "grad_norm": 0.4187861158867821, "learning_rate": 5.6199386434805615e-06, "loss": 1.9678, "step": 389 }, { "epoch": 0.464839094159714, "grad_norm": 0.37470925657624427, "learning_rate": 5.601245132633662e-06, "loss": 1.9708, "step": 390 }, { "epoch": 0.466030989272944, "grad_norm": 0.43682382668647773, "learning_rate": 5.582543090418203e-06, "loss": 1.9742, "step": 391 }, { "epoch": 0.46722288438617404, "grad_norm": 0.38062716223853055, "learning_rate": 5.563832782206835e-06, "loss": 1.956, "step": 392 
}, { "epoch": 0.4684147794994041, "grad_norm": 0.39166492023793653, "learning_rate": 5.5451144734895e-06, "loss": 1.9479, "step": 393 }, { "epoch": 0.4696066746126341, "grad_norm": 0.45740493772589974, "learning_rate": 5.526388429869663e-06, "loss": 1.9757, "step": 394 }, { "epoch": 0.47079856972586415, "grad_norm": 0.3532441760302746, "learning_rate": 5.507654917060541e-06, "loss": 1.9774, "step": 395 }, { "epoch": 0.4719904648390942, "grad_norm": 0.4162677343329253, "learning_rate": 5.48891420088134e-06, "loss": 1.9837, "step": 396 }, { "epoch": 0.4731823599523242, "grad_norm": 0.4634604848492295, "learning_rate": 5.470166547253476e-06, "loss": 1.9923, "step": 397 }, { "epoch": 0.47437425506555425, "grad_norm": 0.4001952153469404, "learning_rate": 5.451412222196801e-06, "loss": 1.969, "step": 398 }, { "epoch": 0.4755661501787843, "grad_norm": 0.4117431494583168, "learning_rate": 5.432651491825837e-06, "loss": 1.9609, "step": 399 }, { "epoch": 0.4767580452920143, "grad_norm": 0.4367947660920832, "learning_rate": 5.4138846223459895e-06, "loss": 1.9621, "step": 400 }, { "epoch": 0.47794994040524436, "grad_norm": 0.34963770890851276, "learning_rate": 5.395111880049775e-06, "loss": 1.9564, "step": 401 }, { "epoch": 0.4791418355184744, "grad_norm": 0.4080401962751008, "learning_rate": 5.376333531313046e-06, "loss": 1.9689, "step": 402 }, { "epoch": 0.4803337306317044, "grad_norm": 0.39779512665663647, "learning_rate": 5.3575498425912046e-06, "loss": 1.9752, "step": 403 }, { "epoch": 0.48152562574493446, "grad_norm": 0.3494078316294088, "learning_rate": 5.338761080415425e-06, "loss": 1.988, "step": 404 }, { "epoch": 0.4827175208581645, "grad_norm": 0.38403810675465305, "learning_rate": 5.319967511388871e-06, "loss": 1.9849, "step": 405 }, { "epoch": 0.48390941597139453, "grad_norm": 0.41925050485912146, "learning_rate": 5.301169402182915e-06, "loss": 1.9744, "step": 406 }, { "epoch": 0.48510131108462456, "grad_norm": 0.3659050285550682, "learning_rate": 5.28236701953335e-06, "loss": 1.9594, "step": 407 }, { "epoch": 0.4862932061978546, "grad_norm": 0.3779979519911562, "learning_rate": 5.263560630236611e-06, "loss": 1.969, "step": 408 }, { "epoch": 0.48748510131108463, "grad_norm": 0.4051001024185403, "learning_rate": 5.244750501145977e-06, "loss": 1.9758, "step": 409 }, { "epoch": 0.48867699642431467, "grad_norm": 0.3580954206397942, "learning_rate": 5.225936899167803e-06, "loss": 1.9712, "step": 410 }, { "epoch": 0.4898688915375447, "grad_norm": 0.37492205319973293, "learning_rate": 5.207120091257715e-06, "loss": 1.9924, "step": 411 }, { "epoch": 0.49106078665077474, "grad_norm": 0.3787755420296742, "learning_rate": 5.188300344416834e-06, "loss": 1.9607, "step": 412 }, { "epoch": 0.4922526817640048, "grad_norm": 0.3594245434434773, "learning_rate": 5.169477925687981e-06, "loss": 1.9596, "step": 413 }, { "epoch": 0.4934445768772348, "grad_norm": 0.4048509843155868, "learning_rate": 5.15065310215189e-06, "loss": 1.9811, "step": 414 }, { "epoch": 0.49463647199046484, "grad_norm": 0.33930841548544644, "learning_rate": 5.1318261409234185e-06, "loss": 1.9785, "step": 415 }, { "epoch": 0.4958283671036949, "grad_norm": 0.3971904008450457, "learning_rate": 5.112997309147753e-06, "loss": 1.9538, "step": 416 }, { "epoch": 0.4970202622169249, "grad_norm": 0.4109703239083303, "learning_rate": 5.094166873996632e-06, "loss": 1.9442, "step": 417 }, { "epoch": 0.49821215733015495, "grad_norm": 0.35849090963357355, "learning_rate": 5.075335102664533e-06, "loss": 1.9611, "step": 418 }, { "epoch": 
0.499404052443385, "grad_norm": 0.3315925723712266, "learning_rate": 5.0565022623649e-06, "loss": 1.9507, "step": 419 }, { "epoch": 0.5005959475566151, "grad_norm": 0.40128345634186274, "learning_rate": 5.037668620326343e-06, "loss": 1.9965, "step": 420 }, { "epoch": 0.5017878426698451, "grad_norm": 0.34631267401835186, "learning_rate": 5.018834443788855e-06, "loss": 1.9739, "step": 421 }, { "epoch": 0.5029797377830751, "grad_norm": 0.37750605356600553, "learning_rate": 5e-06, "loss": 1.9577, "step": 422 }, { "epoch": 0.5041716328963052, "grad_norm": 0.325413886379343, "learning_rate": 4.9811655562111465e-06, "loss": 1.964, "step": 423 }, { "epoch": 0.5053635280095352, "grad_norm": 0.37792660484449137, "learning_rate": 4.9623313796736575e-06, "loss": 1.9834, "step": 424 }, { "epoch": 0.5065554231227652, "grad_norm": 0.3212926587032829, "learning_rate": 4.943497737635103e-06, "loss": 1.9652, "step": 425 }, { "epoch": 0.5077473182359953, "grad_norm": 0.3666539973322088, "learning_rate": 4.9246648973354704e-06, "loss": 1.9898, "step": 426 }, { "epoch": 0.5089392133492253, "grad_norm": 0.3470498382172804, "learning_rate": 4.905833126003371e-06, "loss": 1.986, "step": 427 }, { "epoch": 0.5101311084624554, "grad_norm": 0.3509551861996659, "learning_rate": 4.887002690852249e-06, "loss": 1.9765, "step": 428 }, { "epoch": 0.5113230035756854, "grad_norm": 0.33773403719361406, "learning_rate": 4.868173859076585e-06, "loss": 1.9514, "step": 429 }, { "epoch": 0.5125148986889154, "grad_norm": 0.33839162767720193, "learning_rate": 4.849346897848111e-06, "loss": 1.9671, "step": 430 }, { "epoch": 0.5137067938021455, "grad_norm": 0.34429335199030947, "learning_rate": 4.830522074312019e-06, "loss": 1.9739, "step": 431 }, { "epoch": 0.5148986889153755, "grad_norm": 0.35288845889112397, "learning_rate": 4.811699655583167e-06, "loss": 1.9912, "step": 432 }, { "epoch": 0.5160905840286055, "grad_norm": 0.3461629113067177, "learning_rate": 4.792879908742285e-06, "loss": 1.9484, "step": 433 }, { "epoch": 0.5172824791418356, "grad_norm": 0.3196675261690019, "learning_rate": 4.774063100832199e-06, "loss": 1.9688, "step": 434 }, { "epoch": 0.5184743742550656, "grad_norm": 0.3392521279527564, "learning_rate": 4.755249498854024e-06, "loss": 1.9506, "step": 435 }, { "epoch": 0.5196662693682956, "grad_norm": 0.3457448380814436, "learning_rate": 4.736439369763391e-06, "loss": 1.9743, "step": 436 }, { "epoch": 0.5208581644815257, "grad_norm": 0.33206346746993015, "learning_rate": 4.717632980466652e-06, "loss": 1.9593, "step": 437 }, { "epoch": 0.5220500595947557, "grad_norm": 0.3528235654912419, "learning_rate": 4.698830597817087e-06, "loss": 1.9665, "step": 438 }, { "epoch": 0.5232419547079857, "grad_norm": 0.3556856155018991, "learning_rate": 4.680032488611131e-06, "loss": 1.9799, "step": 439 }, { "epoch": 0.5244338498212158, "grad_norm": 0.32848860913310046, "learning_rate": 4.661238919584578e-06, "loss": 1.9803, "step": 440 }, { "epoch": 0.5256257449344458, "grad_norm": 0.32687019162828723, "learning_rate": 4.642450157408798e-06, "loss": 1.9428, "step": 441 }, { "epoch": 0.5268176400476758, "grad_norm": 0.3453824430384208, "learning_rate": 4.623666468686956e-06, "loss": 1.9822, "step": 442 }, { "epoch": 0.5280095351609059, "grad_norm": 0.37003751019358744, "learning_rate": 4.6048881199502265e-06, "loss": 1.9483, "step": 443 }, { "epoch": 0.5292014302741359, "grad_norm": 0.4147946881041239, "learning_rate": 4.586115377654014e-06, "loss": 1.9617, "step": 444 }, { "epoch": 0.5303933253873659, "grad_norm": 
0.3574077732974426, "learning_rate": 4.567348508174164e-06, "loss": 1.9583, "step": 445 }, { "epoch": 0.531585220500596, "grad_norm": 0.40825647248866936, "learning_rate": 4.548587777803198e-06, "loss": 1.9804, "step": 446 }, { "epoch": 0.532777115613826, "grad_norm": 0.37613255907704796, "learning_rate": 4.529833452746526e-06, "loss": 1.9927, "step": 447 }, { "epoch": 0.533969010727056, "grad_norm": 0.594095776694764, "learning_rate": 4.5110857991186606e-06, "loss": 1.9719, "step": 448 }, { "epoch": 0.5351609058402861, "grad_norm": 0.3717370719647907, "learning_rate": 4.49234508293946e-06, "loss": 1.9593, "step": 449 }, { "epoch": 0.5363528009535161, "grad_norm": 0.3648799549586229, "learning_rate": 4.47361157013034e-06, "loss": 1.967, "step": 450 }, { "epoch": 0.5375446960667462, "grad_norm": 0.33995726389121855, "learning_rate": 4.454885526510501e-06, "loss": 1.9753, "step": 451 }, { "epoch": 0.5387365911799762, "grad_norm": 0.39251871849389397, "learning_rate": 4.436167217793167e-06, "loss": 1.9818, "step": 452 }, { "epoch": 0.5399284862932062, "grad_norm": 0.3199928030279707, "learning_rate": 4.417456909581798e-06, "loss": 1.9552, "step": 453 }, { "epoch": 0.5411203814064363, "grad_norm": 0.3669027651321596, "learning_rate": 4.398754867366339e-06, "loss": 1.9775, "step": 454 }, { "epoch": 0.5423122765196663, "grad_norm": 0.3436432164393003, "learning_rate": 4.38006135651944e-06, "loss": 1.9772, "step": 455 }, { "epoch": 0.5435041716328963, "grad_norm": 0.361749523201955, "learning_rate": 4.361376642292698e-06, "loss": 1.9683, "step": 456 }, { "epoch": 0.5446960667461264, "grad_norm": 0.39560797233498957, "learning_rate": 4.3427009898128865e-06, "loss": 1.9671, "step": 457 }, { "epoch": 0.5458879618593564, "grad_norm": 0.3602620583029035, "learning_rate": 4.3240346640782014e-06, "loss": 1.9944, "step": 458 }, { "epoch": 0.5470798569725864, "grad_norm": 0.36119141344790967, "learning_rate": 4.305377929954495e-06, "loss": 1.9761, "step": 459 }, { "epoch": 0.5482717520858165, "grad_norm": 0.3678120966781157, "learning_rate": 4.286731052171518e-06, "loss": 1.958, "step": 460 }, { "epoch": 0.5494636471990465, "grad_norm": 0.3617639507211402, "learning_rate": 4.268094295319167e-06, "loss": 1.9813, "step": 461 }, { "epoch": 0.5506555423122765, "grad_norm": 0.3608243381659533, "learning_rate": 4.249467923843728e-06, "loss": 1.9641, "step": 462 }, { "epoch": 0.5518474374255066, "grad_norm": 0.36278415417064125, "learning_rate": 4.23085220204412e-06, "loss": 1.9709, "step": 463 }, { "epoch": 0.5530393325387366, "grad_norm": 0.3588218797888413, "learning_rate": 4.212247394068151e-06, "loss": 1.9626, "step": 464 }, { "epoch": 0.5542312276519666, "grad_norm": 0.33036367069937955, "learning_rate": 4.19365376390877e-06, "loss": 1.9832, "step": 465 }, { "epoch": 0.5554231227651967, "grad_norm": 0.4074815848531431, "learning_rate": 4.175071575400311e-06, "loss": 1.9776, "step": 466 }, { "epoch": 0.5566150178784267, "grad_norm": 0.34881797295660344, "learning_rate": 4.1565010922147644e-06, "loss": 1.957, "step": 467 }, { "epoch": 0.5578069129916567, "grad_norm": 0.37520529930498075, "learning_rate": 4.137942577858023e-06, "loss": 1.975, "step": 468 }, { "epoch": 0.5589988081048868, "grad_norm": 0.3630248536367911, "learning_rate": 4.11939629566615e-06, "loss": 1.9608, "step": 469 }, { "epoch": 0.5601907032181168, "grad_norm": 0.33352062979381114, "learning_rate": 4.100862508801639e-06, "loss": 1.968, "step": 470 }, { "epoch": 0.5613825983313468, "grad_norm": 0.3586501055080032, "learning_rate": 
4.082341480249681e-06, "loss": 1.951, "step": 471 }, { "epoch": 0.5625744934445769, "grad_norm": 0.32010968502296533, "learning_rate": 4.063833472814437e-06, "loss": 1.9712, "step": 472 }, { "epoch": 0.5637663885578069, "grad_norm": 0.35914935623294864, "learning_rate": 4.045338749115299e-06, "loss": 1.9451, "step": 473 }, { "epoch": 0.564958283671037, "grad_norm": 0.3215562828423304, "learning_rate": 4.026857571583173e-06, "loss": 1.9914, "step": 474 }, { "epoch": 0.566150178784267, "grad_norm": 0.34368077323068136, "learning_rate": 4.008390202456748e-06, "loss": 1.9602, "step": 475 }, { "epoch": 0.567342073897497, "grad_norm": 0.33832066930772653, "learning_rate": 3.989936903778785e-06, "loss": 1.9604, "step": 476 }, { "epoch": 0.5685339690107271, "grad_norm": 0.3169050597259224, "learning_rate": 3.971497937392388e-06, "loss": 2.0011, "step": 477 }, { "epoch": 0.5697258641239571, "grad_norm": 0.33977687443341886, "learning_rate": 3.953073564937293e-06, "loss": 1.9483, "step": 478 }, { "epoch": 0.5709177592371871, "grad_norm": 0.31238016120669476, "learning_rate": 3.934664047846157e-06, "loss": 1.967, "step": 479 }, { "epoch": 0.5721096543504172, "grad_norm": 0.3251899985092587, "learning_rate": 3.916269647340843e-06, "loss": 1.958, "step": 480 }, { "epoch": 0.5733015494636472, "grad_norm": 0.34188581574139687, "learning_rate": 3.897890624428721e-06, "loss": 1.97, "step": 481 }, { "epoch": 0.5744934445768772, "grad_norm": 0.3179020258722567, "learning_rate": 3.879527239898962e-06, "loss": 1.9713, "step": 482 }, { "epoch": 0.5756853396901073, "grad_norm": 0.35414300160209977, "learning_rate": 3.86117975431883e-06, "loss": 1.9387, "step": 483 }, { "epoch": 0.5768772348033373, "grad_norm": 0.296876137502102, "learning_rate": 3.8428484280299975e-06, "loss": 1.9918, "step": 484 }, { "epoch": 0.5780691299165673, "grad_norm": 0.35059849213295274, "learning_rate": 3.8245335211448404e-06, "loss": 1.9622, "step": 485 }, { "epoch": 0.5792610250297974, "grad_norm": 0.2899536086006706, "learning_rate": 3.8062352935427526e-06, "loss": 1.9727, "step": 486 }, { "epoch": 0.5804529201430274, "grad_norm": 0.3244862339368592, "learning_rate": 3.787954004866459e-06, "loss": 1.9829, "step": 487 }, { "epoch": 0.5816448152562574, "grad_norm": 0.31060144753736796, "learning_rate": 3.769689914518326e-06, "loss": 1.9743, "step": 488 }, { "epoch": 0.5828367103694875, "grad_norm": 0.3081671121318371, "learning_rate": 3.751443281656688e-06, "loss": 1.9716, "step": 489 }, { "epoch": 0.5840286054827175, "grad_norm": 0.28679657845355666, "learning_rate": 3.733214365192162e-06, "loss": 1.9836, "step": 490 }, { "epoch": 0.5852205005959475, "grad_norm": 0.31077612486695794, "learning_rate": 3.715003423783986e-06, "loss": 1.9894, "step": 491 }, { "epoch": 0.5864123957091776, "grad_norm": 0.2980657403471547, "learning_rate": 3.696810715836332e-06, "loss": 1.9712, "step": 492 }, { "epoch": 0.5876042908224076, "grad_norm": 0.28507782391437864, "learning_rate": 3.6786364994946543e-06, "loss": 1.9652, "step": 493 }, { "epoch": 0.5887961859356377, "grad_norm": 0.3076841882401857, "learning_rate": 3.660481032642016e-06, "loss": 1.9756, "step": 494 }, { "epoch": 0.5899880810488677, "grad_norm": 0.2873213364073368, "learning_rate": 3.6423445728954393e-06, "loss": 1.9702, "step": 495 }, { "epoch": 0.5911799761620977, "grad_norm": 0.30064962474416257, "learning_rate": 3.6242273776022396e-06, "loss": 1.9798, "step": 496 }, { "epoch": 0.5923718712753278, "grad_norm": 0.30016520129470653, "learning_rate": 3.6061297038363853e-06, "loss": 
1.9708, "step": 497 }, { "epoch": 0.5935637663885578, "grad_norm": 0.3186216715211957, "learning_rate": 3.5880518083948377e-06, "loss": 1.9786, "step": 498 }, { "epoch": 0.5947556615017878, "grad_norm": 0.3093775837624005, "learning_rate": 3.5699939477939183e-06, "loss": 1.9585, "step": 499 }, { "epoch": 0.5959475566150179, "grad_norm": 0.28193348662211454, "learning_rate": 3.5519563782656642e-06, "loss": 1.9738, "step": 500 }, { "epoch": 0.5971394517282479, "grad_norm": 0.32328773490671, "learning_rate": 3.533939355754188e-06, "loss": 1.9619, "step": 501 }, { "epoch": 0.5983313468414779, "grad_norm": 0.30291671495352485, "learning_rate": 3.5159431359120545e-06, "loss": 1.9651, "step": 502 }, { "epoch": 0.599523241954708, "grad_norm": 0.3080909269221942, "learning_rate": 3.497967974096647e-06, "loss": 1.9783, "step": 503 }, { "epoch": 0.600715137067938, "grad_norm": 0.32314557640507674, "learning_rate": 3.4800141253665463e-06, "loss": 1.9657, "step": 504 }, { "epoch": 0.601907032181168, "grad_norm": 0.29346056048517033, "learning_rate": 3.4620818444779126e-06, "loss": 1.9787, "step": 505 }, { "epoch": 0.6030989272943981, "grad_norm": 0.3110390571856809, "learning_rate": 3.4441713858808684e-06, "loss": 1.9414, "step": 506 }, { "epoch": 0.6042908224076281, "grad_norm": 0.31467381689979457, "learning_rate": 3.426283003715886e-06, "loss": 1.9619, "step": 507 }, { "epoch": 0.6054827175208581, "grad_norm": 0.2969133354888754, "learning_rate": 3.4084169518101896e-06, "loss": 1.9604, "step": 508 }, { "epoch": 0.6066746126340882, "grad_norm": 0.3184238842438653, "learning_rate": 3.3905734836741415e-06, "loss": 1.953, "step": 509 }, { "epoch": 0.6078665077473182, "grad_norm": 0.2969150683168432, "learning_rate": 3.3727528524976583e-06, "loss": 1.9664, "step": 510 }, { "epoch": 0.6090584028605482, "grad_norm": 0.33154057267330567, "learning_rate": 3.354955311146606e-06, "loss": 1.9776, "step": 511 }, { "epoch": 0.6102502979737783, "grad_norm": 0.30901718720421373, "learning_rate": 3.3371811121592203e-06, "loss": 1.9917, "step": 512 }, { "epoch": 0.6114421930870083, "grad_norm": 0.3212832298222802, "learning_rate": 3.3194305077425215e-06, "loss": 1.9928, "step": 513 }, { "epoch": 0.6126340882002383, "grad_norm": 0.34130767861666084, "learning_rate": 3.3017037497687303e-06, "loss": 1.9501, "step": 514 }, { "epoch": 0.6138259833134684, "grad_norm": 0.2919077388333617, "learning_rate": 3.2840010897717045e-06, "loss": 1.9657, "step": 515 }, { "epoch": 0.6150178784266984, "grad_norm": 0.3277066008449366, "learning_rate": 3.2663227789433573e-06, "loss": 1.9602, "step": 516 }, { "epoch": 0.6162097735399285, "grad_norm": 0.2903404769911658, "learning_rate": 3.2486690681301046e-06, "loss": 1.959, "step": 517 }, { "epoch": 0.6174016686531585, "grad_norm": 0.284277433828357, "learning_rate": 3.2310402078292956e-06, "loss": 1.9718, "step": 518 }, { "epoch": 0.6185935637663885, "grad_norm": 0.3258141085919218, "learning_rate": 3.2134364481856663e-06, "loss": 1.9612, "step": 519 }, { "epoch": 0.6197854588796186, "grad_norm": 0.285408156114209, "learning_rate": 3.1958580389877876e-06, "loss": 1.9747, "step": 520 }, { "epoch": 0.6209773539928486, "grad_norm": 0.3071499624906975, "learning_rate": 3.178305229664519e-06, "loss": 1.9781, "step": 521 }, { "epoch": 0.6221692491060786, "grad_norm": 0.29430716274498264, "learning_rate": 3.1607782692814683e-06, "loss": 1.9785, "step": 522 }, { "epoch": 0.6233611442193087, "grad_norm": 0.29446694445491767, "learning_rate": 3.1432774065374628e-06, "loss": 1.9651, "step": 523 }, 
{ "epoch": 0.6245530393325387, "grad_norm": 0.2868927792141283, "learning_rate": 3.125802889761016e-06, "loss": 1.9604, "step": 524 }, { "epoch": 0.6257449344457687, "grad_norm": 0.3075894856023552, "learning_rate": 3.1083549669068048e-06, "loss": 1.981, "step": 525 }, { "epoch": 0.6269368295589988, "grad_norm": 0.30553317063832414, "learning_rate": 3.090933885552155e-06, "loss": 1.968, "step": 526 }, { "epoch": 0.6281287246722288, "grad_norm": 0.2883247866247332, "learning_rate": 3.073539892893519e-06, "loss": 1.9647, "step": 527 }, { "epoch": 0.6293206197854588, "grad_norm": 0.3093327452992941, "learning_rate": 3.0561732357429797e-06, "loss": 1.9691, "step": 528 }, { "epoch": 0.6305125148986889, "grad_norm": 0.2944434342418357, "learning_rate": 3.0388341605247385e-06, "loss": 1.9756, "step": 529 }, { "epoch": 0.6317044100119189, "grad_norm": 0.3231077122645434, "learning_rate": 3.021522913271627e-06, "loss": 1.9774, "step": 530 }, { "epoch": 0.6328963051251489, "grad_norm": 0.2937937539093132, "learning_rate": 3.0042397396216076e-06, "loss": 1.9813, "step": 531 }, { "epoch": 0.634088200238379, "grad_norm": 0.33747028062165074, "learning_rate": 2.9869848848142957e-06, "loss": 1.9817, "step": 532 }, { "epoch": 0.635280095351609, "grad_norm": 0.27860436170886715, "learning_rate": 2.969758593687475e-06, "loss": 1.995, "step": 533 }, { "epoch": 0.636471990464839, "grad_norm": 0.2686660592261799, "learning_rate": 2.952561110673623e-06, "loss": 2.004, "step": 534 }, { "epoch": 0.6376638855780691, "grad_norm": 0.3171126513844146, "learning_rate": 2.9353926797964495e-06, "loss": 1.9675, "step": 535 }, { "epoch": 0.6388557806912991, "grad_norm": 0.26076405849359174, "learning_rate": 2.9182535446674244e-06, "loss": 1.9606, "step": 536 }, { "epoch": 0.6400476758045291, "grad_norm": 0.311798441596794, "learning_rate": 2.9011439484823287e-06, "loss": 1.9566, "step": 537 }, { "epoch": 0.6412395709177592, "grad_norm": 0.2667721525695941, "learning_rate": 2.8840641340177955e-06, "loss": 1.9571, "step": 538 }, { "epoch": 0.6424314660309892, "grad_norm": 0.29165327528369395, "learning_rate": 2.8670143436278757e-06, "loss": 1.9648, "step": 539 }, { "epoch": 0.6436233611442194, "grad_norm": 0.29487930858334793, "learning_rate": 2.84999481924059e-06, "loss": 1.9499, "step": 540 }, { "epoch": 0.6448152562574494, "grad_norm": 0.31540084878211927, "learning_rate": 2.8330058023545027e-06, "loss": 1.9658, "step": 541 }, { "epoch": 0.6460071513706794, "grad_norm": 0.2789685559518471, "learning_rate": 2.8160475340352913e-06, "loss": 1.9638, "step": 542 }, { "epoch": 0.6471990464839095, "grad_norm": 0.28954283549505694, "learning_rate": 2.799120254912321e-06, "loss": 1.964, "step": 543 }, { "epoch": 0.6483909415971395, "grad_norm": 0.29043220060176517, "learning_rate": 2.7822242051752425e-06, "loss": 1.9457, "step": 544 }, { "epoch": 0.6495828367103695, "grad_norm": 0.268629176168656, "learning_rate": 2.765359624570574e-06, "loss": 1.9753, "step": 545 }, { "epoch": 0.6507747318235996, "grad_norm": 0.29396871373699995, "learning_rate": 2.7485267523983038e-06, "loss": 1.9803, "step": 546 }, { "epoch": 0.6519666269368296, "grad_norm": 0.2938578682137881, "learning_rate": 2.731725827508494e-06, "loss": 1.9559, "step": 547 }, { "epoch": 0.6531585220500596, "grad_norm": 0.26444066496746194, "learning_rate": 2.714957088297886e-06, "loss": 1.9621, "step": 548 }, { "epoch": 0.6543504171632897, "grad_norm": 0.2898176558803259, "learning_rate": 2.6982207727065252e-06, "loss": 1.9551, "step": 549 }, { "epoch": 
0.6555423122765197, "grad_norm": 0.3003676611598843, "learning_rate": 2.681517118214389e-06, "loss": 1.9841, "step": 550 }, { "epoch": 0.6567342073897497, "grad_norm": 0.2592919375869367, "learning_rate": 2.664846361837997e-06, "loss": 1.976, "step": 551 }, { "epoch": 0.6579261025029798, "grad_norm": 0.3266565084733632, "learning_rate": 2.6482087401270705e-06, "loss": 1.9564, "step": 552 }, { "epoch": 0.6591179976162098, "grad_norm": 0.2995845038649281, "learning_rate": 2.6316044891611633e-06, "loss": 1.969, "step": 553 }, { "epoch": 0.6603098927294399, "grad_norm": 0.2804027081600714, "learning_rate": 2.6150338445463146e-06, "loss": 1.9693, "step": 554 }, { "epoch": 0.6615017878426699, "grad_norm": 0.27698419373196886, "learning_rate": 2.5984970414117096e-06, "loss": 1.9788, "step": 555 }, { "epoch": 0.6626936829558999, "grad_norm": 0.31032114395815213, "learning_rate": 2.5819943144063326e-06, "loss": 1.9741, "step": 556 }, { "epoch": 0.66388557806913, "grad_norm": 0.28800726045711933, "learning_rate": 2.565525897695651e-06, "loss": 1.9507, "step": 557 }, { "epoch": 0.66507747318236, "grad_norm": 0.29802393651993614, "learning_rate": 2.549092024958285e-06, "loss": 1.9664, "step": 558 }, { "epoch": 0.66626936829559, "grad_norm": 0.2982356345030979, "learning_rate": 2.532692929382692e-06, "loss": 1.9789, "step": 559 }, { "epoch": 0.6674612634088201, "grad_norm": 0.2803035272437382, "learning_rate": 2.51632884366386e-06, "loss": 1.9609, "step": 560 }, { "epoch": 0.6686531585220501, "grad_norm": 0.29369752020144174, "learning_rate": 2.5000000000000015e-06, "loss": 1.9665, "step": 561 }, { "epoch": 0.6698450536352801, "grad_norm": 0.2692763488935535, "learning_rate": 2.4837066300892647e-06, "loss": 1.9775, "step": 562 }, { "epoch": 0.6710369487485102, "grad_norm": 0.2640671578025783, "learning_rate": 2.4674489651264433e-06, "loss": 1.9621, "step": 563 }, { "epoch": 0.6722288438617402, "grad_norm": 0.2968222691817008, "learning_rate": 2.4512272357996937e-06, "loss": 1.956, "step": 564 }, { "epoch": 0.6734207389749702, "grad_norm": 0.3011250889616646, "learning_rate": 2.4350416722872657e-06, "loss": 1.9775, "step": 565 }, { "epoch": 0.6746126340882003, "grad_norm": 0.27706203721849776, "learning_rate": 2.418892504254231e-06, "loss": 1.9858, "step": 566 }, { "epoch": 0.6758045292014303, "grad_norm": 0.2886529947325675, "learning_rate": 2.402779960849232e-06, "loss": 1.9778, "step": 567 }, { "epoch": 0.6769964243146603, "grad_norm": 0.32555422289644703, "learning_rate": 2.3867042707012234e-06, "loss": 1.9652, "step": 568 }, { "epoch": 0.6781883194278904, "grad_norm": 0.2728774574387877, "learning_rate": 2.3706656619162278e-06, "loss": 1.9556, "step": 569 }, { "epoch": 0.6793802145411204, "grad_norm": 0.29791540079606743, "learning_rate": 2.3546643620741054e-06, "loss": 1.9665, "step": 570 }, { "epoch": 0.6805721096543504, "grad_norm": 0.28429090975445814, "learning_rate": 2.3387005982253218e-06, "loss": 1.9947, "step": 571 }, { "epoch": 0.6817640047675805, "grad_norm": 0.2933689275167632, "learning_rate": 2.322774596887726e-06, "loss": 1.9811, "step": 572 }, { "epoch": 0.6829558998808105, "grad_norm": 0.27022852014602733, "learning_rate": 2.3068865840433286e-06, "loss": 1.9643, "step": 573 }, { "epoch": 0.6841477949940405, "grad_norm": 0.27566403732559813, "learning_rate": 2.29103678513511e-06, "loss": 1.9494, "step": 574 }, { "epoch": 0.6853396901072706, "grad_norm": 0.2813649305654506, "learning_rate": 2.275225425063813e-06, "loss": 1.9596, "step": 575 }, { "epoch": 0.6865315852205006, 
"grad_norm": 0.28703149945139833, "learning_rate": 2.259452728184749e-06, "loss": 1.9674, "step": 576 }, { "epoch": 0.6877234803337307, "grad_norm": 0.278836772705952, "learning_rate": 2.2437189183046236e-06, "loss": 1.9683, "step": 577 }, { "epoch": 0.6889153754469607, "grad_norm": 0.3180141045052597, "learning_rate": 2.2280242186783473e-06, "loss": 1.9588, "step": 578 }, { "epoch": 0.6901072705601907, "grad_norm": 0.2622104807864232, "learning_rate": 2.21236885200588e-06, "loss": 1.9587, "step": 579 }, { "epoch": 0.6912991656734208, "grad_norm": 0.28789973870105057, "learning_rate": 2.1967530404290702e-06, "loss": 1.9827, "step": 580 }, { "epoch": 0.6924910607866508, "grad_norm": 0.2821097592933177, "learning_rate": 2.1811770055284968e-06, "loss": 2.0036, "step": 581 }, { "epoch": 0.6936829558998808, "grad_norm": 0.24946555260466954, "learning_rate": 2.1656409683203216e-06, "loss": 1.9897, "step": 582 }, { "epoch": 0.6948748510131109, "grad_norm": 0.29219706494149983, "learning_rate": 2.1501451492531664e-06, "loss": 1.9703, "step": 583 }, { "epoch": 0.6960667461263409, "grad_norm": 0.26827509295364377, "learning_rate": 2.134689768204975e-06, "loss": 1.9539, "step": 584 }, { "epoch": 0.6972586412395709, "grad_norm": 0.30266646603465935, "learning_rate": 2.1192750444798982e-06, "loss": 1.986, "step": 585 }, { "epoch": 0.698450536352801, "grad_norm": 0.2718431001798245, "learning_rate": 2.103901196805173e-06, "loss": 1.9738, "step": 586 }, { "epoch": 0.699642431466031, "grad_norm": 0.2687110838757682, "learning_rate": 2.0885684433280336e-06, "loss": 1.9494, "step": 587 }, { "epoch": 0.700834326579261, "grad_norm": 0.2776314528817648, "learning_rate": 2.073277001612603e-06, "loss": 1.9529, "step": 588 }, { "epoch": 0.7020262216924911, "grad_norm": 0.25980607862615657, "learning_rate": 2.058027088636814e-06, "loss": 1.9529, "step": 589 }, { "epoch": 0.7032181168057211, "grad_norm": 0.2801681412198667, "learning_rate": 2.042818920789326e-06, "loss": 1.9688, "step": 590 }, { "epoch": 0.7044100119189511, "grad_norm": 0.2631013529820137, "learning_rate": 2.0276527138664537e-06, "loss": 1.9363, "step": 591 }, { "epoch": 0.7056019070321812, "grad_norm": 0.2528230435660016, "learning_rate": 2.012528683069109e-06, "loss": 1.9542, "step": 592 }, { "epoch": 0.7067938021454112, "grad_norm": 0.2473972746312196, "learning_rate": 1.9974470429997482e-06, "loss": 1.9962, "step": 593 }, { "epoch": 0.7079856972586412, "grad_norm": 0.284941379850682, "learning_rate": 1.98240800765932e-06, "loss": 1.9447, "step": 594 }, { "epoch": 0.7091775923718713, "grad_norm": 0.2621960635197473, "learning_rate": 1.9674117904442364e-06, "loss": 1.9812, "step": 595 }, { "epoch": 0.7103694874851013, "grad_norm": 0.24858361697066161, "learning_rate": 1.9524586041433393e-06, "loss": 1.9562, "step": 596 }, { "epoch": 0.7115613825983313, "grad_norm": 0.2669834824927238, "learning_rate": 1.9375486609348842e-06, "loss": 1.987, "step": 597 }, { "epoch": 0.7127532777115614, "grad_norm": 0.26234172310570103, "learning_rate": 1.9226821723835322e-06, "loss": 1.9735, "step": 598 }, { "epoch": 0.7139451728247914, "grad_norm": 0.25384961760334385, "learning_rate": 1.907859349437336e-06, "loss": 1.9831, "step": 599 }, { "epoch": 0.7151370679380215, "grad_norm": 0.3104750369664491, "learning_rate": 1.8930804024247635e-06, "loss": 1.9714, "step": 600 }, { "epoch": 0.7163289630512515, "grad_norm": 0.2458078645357097, "learning_rate": 1.8783455410517004e-06, "loss": 1.9468, "step": 601 }, { "epoch": 0.7175208581644815, "grad_norm": 
0.26529680805920836, "learning_rate": 1.8636549743984815e-06, "loss": 1.9593, "step": 602 }, { "epoch": 0.7187127532777116, "grad_norm": 0.25080419801242315, "learning_rate": 1.8490089109169218e-06, "loss": 1.9808, "step": 603 }, { "epoch": 0.7199046483909416, "grad_norm": 0.26413238202627376, "learning_rate": 1.8344075584273547e-06, "loss": 1.9487, "step": 604 }, { "epoch": 0.7210965435041716, "grad_norm": 0.2674448281901473, "learning_rate": 1.8198511241156902e-06, "loss": 1.9598, "step": 605 }, { "epoch": 0.7222884386174017, "grad_norm": 0.24083245686353985, "learning_rate": 1.8053398145304723e-06, "loss": 1.9662, "step": 606 }, { "epoch": 0.7234803337306317, "grad_norm": 0.25961756440068884, "learning_rate": 1.7908738355799454e-06, "loss": 1.9868, "step": 607 }, { "epoch": 0.7246722288438617, "grad_norm": 0.2784591415570306, "learning_rate": 1.776453392529139e-06, "loss": 1.9473, "step": 608 }, { "epoch": 0.7258641239570918, "grad_norm": 0.23968494857480035, "learning_rate": 1.7620786899969412e-06, "loss": 1.9716, "step": 609 }, { "epoch": 0.7270560190703218, "grad_norm": 0.23937998852690856, "learning_rate": 1.747749931953217e-06, "loss": 1.9635, "step": 610 }, { "epoch": 0.7282479141835518, "grad_norm": 0.259732006086446, "learning_rate": 1.7334673217158976e-06, "loss": 1.9616, "step": 611 }, { "epoch": 0.7294398092967819, "grad_norm": 0.25239102464142604, "learning_rate": 1.719231061948094e-06, "loss": 1.9656, "step": 612 }, { "epoch": 0.7306317044100119, "grad_norm": 0.2550463812437055, "learning_rate": 1.7050413546552347e-06, "loss": 1.9784, "step": 613 }, { "epoch": 0.7318235995232419, "grad_norm": 0.2535210200301375, "learning_rate": 1.6908984011821883e-06, "loss": 1.9847, "step": 614 }, { "epoch": 0.733015494636472, "grad_norm": 0.24932432687921058, "learning_rate": 1.6768024022104106e-06, "loss": 1.972, "step": 615 }, { "epoch": 0.734207389749702, "grad_norm": 0.2644613269238538, "learning_rate": 1.6627535577550996e-06, "loss": 1.9716, "step": 616 }, { "epoch": 0.735399284862932, "grad_norm": 0.3944302146845491, "learning_rate": 1.6487520671623469e-06, "loss": 1.9595, "step": 617 }, { "epoch": 0.7365911799761621, "grad_norm": 0.244722231687242, "learning_rate": 1.6347981291063224e-06, "loss": 1.9688, "step": 618 }, { "epoch": 0.7377830750893921, "grad_norm": 0.2504826371525299, "learning_rate": 1.6208919415864476e-06, "loss": 1.9721, "step": 619 }, { "epoch": 0.7389749702026222, "grad_norm": 0.2523790844757924, "learning_rate": 1.6070337019245896e-06, "loss": 1.9456, "step": 620 }, { "epoch": 0.7401668653158522, "grad_norm": 0.26338811471433093, "learning_rate": 1.5932236067622542e-06, "loss": 1.9613, "step": 621 }, { "epoch": 0.7413587604290822, "grad_norm": 0.25146034966929337, "learning_rate": 1.5794618520578053e-06, "loss": 1.981, "step": 622 }, { "epoch": 0.7425506555423123, "grad_norm": 0.2478403982473681, "learning_rate": 1.5657486330836786e-06, "loss": 1.9263, "step": 623 }, { "epoch": 0.7437425506555423, "grad_norm": 0.2536474779363047, "learning_rate": 1.5520841444236118e-06, "loss": 1.9789, "step": 624 }, { "epoch": 0.7449344457687723, "grad_norm": 0.2615274746690614, "learning_rate": 1.5384685799698839e-06, "loss": 1.9783, "step": 625 }, { "epoch": 0.7461263408820024, "grad_norm": 0.2679161856145564, "learning_rate": 1.5249021329205638e-06, "loss": 1.9513, "step": 626 }, { "epoch": 0.7473182359952324, "grad_norm": 0.24553342227151687, "learning_rate": 1.5113849957767685e-06, "loss": 1.9711, "step": 627 }, { "epoch": 0.7485101311084624, "grad_norm": 
0.246019311870797, "learning_rate": 1.4979173603399323e-06, "loss": 1.9734, "step": 628 }, { "epoch": 0.7497020262216925, "grad_norm": 0.25764970394173725, "learning_rate": 1.4844994177090871e-06, "loss": 1.9575, "step": 629 }, { "epoch": 0.7508939213349225, "grad_norm": 0.2419520407437769, "learning_rate": 1.4711313582781434e-06, "loss": 1.9444, "step": 630 }, { "epoch": 0.7520858164481525, "grad_norm": 0.2386706941133275, "learning_rate": 1.4578133717331982e-06, "loss": 1.9675, "step": 631 }, { "epoch": 0.7532777115613826, "grad_norm": 0.251990632652635, "learning_rate": 1.4445456470498392e-06, "loss": 1.9571, "step": 632 }, { "epoch": 0.7544696066746126, "grad_norm": 0.24481833940935246, "learning_rate": 1.4313283724904632e-06, "loss": 1.9538, "step": 633 }, { "epoch": 0.7556615017878426, "grad_norm": 0.24576950539499237, "learning_rate": 1.418161735601601e-06, "loss": 1.9676, "step": 634 }, { "epoch": 0.7568533969010727, "grad_norm": 0.24675237000023065, "learning_rate": 1.4050459232112652e-06, "loss": 1.9672, "step": 635 }, { "epoch": 0.7580452920143027, "grad_norm": 0.2407161568341905, "learning_rate": 1.3919811214262913e-06, "loss": 1.9726, "step": 636 }, { "epoch": 0.7592371871275327, "grad_norm": 0.23031407014507166, "learning_rate": 1.378967515629701e-06, "loss": 1.9768, "step": 637 }, { "epoch": 0.7604290822407628, "grad_norm": 0.2345707206990765, "learning_rate": 1.3660052904780707e-06, "loss": 1.9517, "step": 638 }, { "epoch": 0.7616209773539928, "grad_norm": 0.23677366971206826, "learning_rate": 1.353094629898909e-06, "loss": 1.9654, "step": 639 }, { "epoch": 0.7628128724672228, "grad_norm": 0.24749335727794808, "learning_rate": 1.3402357170880514e-06, "loss": 1.9752, "step": 640 }, { "epoch": 0.7640047675804529, "grad_norm": 0.23017419897906063, "learning_rate": 1.3274287345070564e-06, "loss": 1.9538, "step": 641 }, { "epoch": 0.7651966626936829, "grad_norm": 0.24400711432750527, "learning_rate": 1.3146738638806217e-06, "loss": 1.9571, "step": 642 }, { "epoch": 0.766388557806913, "grad_norm": 0.2322768595933808, "learning_rate": 1.3019712861939964e-06, "loss": 1.967, "step": 643 }, { "epoch": 0.767580452920143, "grad_norm": 0.2448647193354467, "learning_rate": 1.2893211816904243e-06, "loss": 1.9702, "step": 644 }, { "epoch": 0.768772348033373, "grad_norm": 0.2264734125461794, "learning_rate": 1.2767237298685787e-06, "loss": 1.9708, "step": 645 }, { "epoch": 0.7699642431466031, "grad_norm": 0.24280998510060245, "learning_rate": 1.26417910948002e-06, "loss": 2.0062, "step": 646 }, { "epoch": 0.7711561382598331, "grad_norm": 0.2599649290379438, "learning_rate": 1.2516874985266508e-06, "loss": 1.9641, "step": 647 }, { "epoch": 0.7723480333730631, "grad_norm": 0.23209096205716762, "learning_rate": 1.239249074258203e-06, "loss": 1.9844, "step": 648 }, { "epoch": 0.7735399284862932, "grad_norm": 0.2366200983286952, "learning_rate": 1.2268640131697129e-06, "loss": 1.9591, "step": 649 }, { "epoch": 0.7747318235995232, "grad_norm": 0.22549692632142865, "learning_rate": 1.2145324909990202e-06, "loss": 1.9638, "step": 650 }, { "epoch": 0.7759237187127532, "grad_norm": 0.2201422471843865, "learning_rate": 1.202254682724276e-06, "loss": 1.96, "step": 651 }, { "epoch": 0.7771156138259833, "grad_norm": 0.23804071076564637, "learning_rate": 1.190030762561452e-06, "loss": 1.9429, "step": 652 }, { "epoch": 0.7783075089392133, "grad_norm": 0.23445786497651513, "learning_rate": 1.1778609039618804e-06, "loss": 1.9441, "step": 653 }, { "epoch": 0.7794994040524433, "grad_norm": 0.23319783177552136, 
"learning_rate": 1.1657452796097879e-06, "loss": 1.9561, "step": 654 }, { "epoch": 0.7806912991656734, "grad_norm": 0.21246102421189209, "learning_rate": 1.1536840614198376e-06, "loss": 1.9552, "step": 655 }, { "epoch": 0.7818831942789034, "grad_norm": 0.21558582464035986, "learning_rate": 1.1416774205347015e-06, "loss": 1.9535, "step": 656 }, { "epoch": 0.7830750893921334, "grad_norm": 0.2478855415089653, "learning_rate": 1.1297255273226254e-06, "loss": 1.9648, "step": 657 }, { "epoch": 0.7842669845053635, "grad_norm": 0.24079598014625692, "learning_rate": 1.117828551375013e-06, "loss": 1.9517, "step": 658 }, { "epoch": 0.7854588796185935, "grad_norm": 0.22483152992478453, "learning_rate": 1.1059866615040205e-06, "loss": 1.9615, "step": 659 }, { "epoch": 0.7866507747318237, "grad_norm": 0.21611761849037114, "learning_rate": 1.094200025740157e-06, "loss": 1.9544, "step": 660 }, { "epoch": 0.7878426698450537, "grad_norm": 0.22680299546251373, "learning_rate": 1.0824688113299054e-06, "loss": 1.9656, "step": 661 }, { "epoch": 0.7890345649582837, "grad_norm": 0.22651384710874864, "learning_rate": 1.0707931847333487e-06, "loss": 1.952, "step": 662 }, { "epoch": 0.7902264600715138, "grad_norm": 0.22804104499330677, "learning_rate": 1.0591733116218046e-06, "loss": 1.9469, "step": 663 }, { "epoch": 0.7914183551847438, "grad_norm": 0.23170987494579412, "learning_rate": 1.0476093568754776e-06, "loss": 1.9743, "step": 664 }, { "epoch": 0.7926102502979738, "grad_norm": 0.22978004850491673, "learning_rate": 1.036101484581117e-06, "loss": 1.9595, "step": 665 }, { "epoch": 0.7938021454112039, "grad_norm": 0.21260865957457795, "learning_rate": 1.0246498580296903e-06, "loss": 1.9656, "step": 666 }, { "epoch": 0.7949940405244339, "grad_norm": 0.22425557844267943, "learning_rate": 1.0132546397140687e-06, "loss": 1.9755, "step": 667 }, { "epoch": 0.7961859356376639, "grad_norm": 0.2266231438335908, "learning_rate": 1.0019159913267156e-06, "loss": 1.9871, "step": 668 }, { "epoch": 0.797377830750894, "grad_norm": 0.21739761610592676, "learning_rate": 9.90634073757397e-07, "loss": 1.9599, "step": 669 }, { "epoch": 0.798569725864124, "grad_norm": 0.22507089101888264, "learning_rate": 9.794090470908962e-07, "loss": 1.9703, "step": 670 }, { "epoch": 0.799761620977354, "grad_norm": 0.2076814121868233, "learning_rate": 9.68241070604743e-07, "loss": 1.964, "step": 671 }, { "epoch": 0.8009535160905841, "grad_norm": 0.23327916717788147, "learning_rate": 9.571303027669548e-07, "loss": 1.9825, "step": 672 }, { "epoch": 0.8021454112038141, "grad_norm": 0.21841469332058575, "learning_rate": 9.460769012337839e-07, "loss": 1.9897, "step": 673 }, { "epoch": 0.8033373063170441, "grad_norm": 0.22795437088618667, "learning_rate": 9.350810228474855e-07, "loss": 1.9548, "step": 674 }, { "epoch": 0.8045292014302742, "grad_norm": 0.24461982798572574, "learning_rate": 9.241428236340904e-07, "loss": 1.971, "step": 675 }, { "epoch": 0.8057210965435042, "grad_norm": 0.22693929127887172, "learning_rate": 9.132624588011896e-07, "loss": 1.9697, "step": 676 }, { "epoch": 0.8069129916567342, "grad_norm": 0.22481042822198152, "learning_rate": 9.024400827357344e-07, "loss": 1.9729, "step": 677 }, { "epoch": 0.8081048867699643, "grad_norm": 0.21859877558397856, "learning_rate": 8.916758490018418e-07, "loss": 1.9666, "step": 678 }, { "epoch": 0.8092967818831943, "grad_norm": 0.2260921434511296, "learning_rate": 8.809699103386204e-07, "loss": 1.964, "step": 679 }, { "epoch": 0.8104886769964244, "grad_norm": 0.20963128480459883, "learning_rate": 
8.703224186580012e-07, "loss": 1.9969, "step": 680 }, { "epoch": 0.8116805721096544, "grad_norm": 0.2204158197482051, "learning_rate": 8.597335250425809e-07, "loss": 1.9494, "step": 681 }, { "epoch": 0.8128724672228844, "grad_norm": 0.22459531839550131, "learning_rate": 8.492033797434762e-07, "loss": 1.9473, "step": 682 }, { "epoch": 0.8140643623361145, "grad_norm": 0.22674947382748567, "learning_rate": 8.387321321781977e-07, "loss": 1.9591, "step": 683 }, { "epoch": 0.8152562574493445, "grad_norm": 0.23228573604473446, "learning_rate": 8.283199309285284e-07, "loss": 1.9622, "step": 684 }, { "epoch": 0.8164481525625745, "grad_norm": 0.2287116546758065, "learning_rate": 8.179669237384097e-07, "loss": 1.971, "step": 685 }, { "epoch": 0.8176400476758046, "grad_norm": 0.20888102799928682, "learning_rate": 8.07673257511849e-07, "loss": 1.9647, "step": 686 }, { "epoch": 0.8188319427890346, "grad_norm": 0.23285143313735843, "learning_rate": 7.97439078310836e-07, "loss": 1.9475, "step": 687 }, { "epoch": 0.8200238379022646, "grad_norm": 0.2306317998265742, "learning_rate": 7.872645313532701e-07, "loss": 1.9843, "step": 688 }, { "epoch": 0.8212157330154947, "grad_norm": 0.22262551838654496, "learning_rate": 7.771497610108981e-07, "loss": 1.9715, "step": 689 }, { "epoch": 0.8224076281287247, "grad_norm": 0.23849053028826073, "learning_rate": 7.670949108072673e-07, "loss": 1.944, "step": 690 }, { "epoch": 0.8235995232419547, "grad_norm": 0.21447305264782468, "learning_rate": 7.57100123415685e-07, "loss": 1.9642, "step": 691 }, { "epoch": 0.8247914183551848, "grad_norm": 0.2531000448062807, "learning_rate": 7.471655406572003e-07, "loss": 1.9447, "step": 692 }, { "epoch": 0.8259833134684148, "grad_norm": 0.23002093940013935, "learning_rate": 7.372913034985879e-07, "loss": 1.9441, "step": 693 }, { "epoch": 0.8271752085816448, "grad_norm": 0.21804664562427592, "learning_rate": 7.274775520503491e-07, "loss": 1.9494, "step": 694 }, { "epoch": 0.8283671036948749, "grad_norm": 0.2180719845670406, "learning_rate": 7.177244255647209e-07, "loss": 1.9612, "step": 695 }, { "epoch": 0.8295589988081049, "grad_norm": 0.2292893667744583, "learning_rate": 7.080320624337039e-07, "loss": 1.9631, "step": 696 }, { "epoch": 0.8307508939213349, "grad_norm": 0.22411915203502086, "learning_rate": 6.984006001870974e-07, "loss": 1.9558, "step": 697 }, { "epoch": 0.831942789034565, "grad_norm": 0.22749753478121626, "learning_rate": 6.888301754905469e-07, "loss": 1.9498, "step": 698 }, { "epoch": 0.833134684147795, "grad_norm": 0.22227813746152136, "learning_rate": 6.79320924143605e-07, "loss": 1.9746, "step": 699 }, { "epoch": 0.834326579261025, "grad_norm": 0.21559253597914696, "learning_rate": 6.698729810778065e-07, "loss": 1.9528, "step": 700 }, { "epoch": 0.8355184743742551, "grad_norm": 0.22780943536330842, "learning_rate": 6.604864803547511e-07, "loss": 1.9803, "step": 701 }, { "epoch": 0.8367103694874851, "grad_norm": 0.21085095301925635, "learning_rate": 6.51161555164203e-07, "loss": 1.973, "step": 702 }, { "epoch": 0.8379022646007152, "grad_norm": 0.212543861677965, "learning_rate": 6.418983378221988e-07, "loss": 1.9623, "step": 703 }, { "epoch": 0.8390941597139452, "grad_norm": 0.2115457183313653, "learning_rate": 6.326969597691724e-07, "loss": 1.9817, "step": 704 }, { "epoch": 0.8402860548271752, "grad_norm": 0.21617807812407117, "learning_rate": 6.235575515680898e-07, "loss": 1.968, "step": 705 }, { "epoch": 0.8414779499404053, "grad_norm": 0.21587790930172882, "learning_rate": 6.144802429025948e-07, "loss": 1.9549, 
"step": 706 }, { "epoch": 0.8426698450536353, "grad_norm": 0.21797830405681992, "learning_rate": 6.054651625751717e-07, "loss": 1.9833, "step": 707 }, { "epoch": 0.8438617401668653, "grad_norm": 0.22284253238842683, "learning_rate": 5.965124385053112e-07, "loss": 1.9498, "step": 708 }, { "epoch": 0.8450536352800954, "grad_norm": 0.20628741944346807, "learning_rate": 5.876221977277042e-07, "loss": 1.9382, "step": 709 }, { "epoch": 0.8462455303933254, "grad_norm": 0.22293588358500385, "learning_rate": 5.787945663904332e-07, "loss": 1.9773, "step": 710 }, { "epoch": 0.8474374255065554, "grad_norm": 0.22508597630366683, "learning_rate": 5.700296697531843e-07, "loss": 1.9659, "step": 711 }, { "epoch": 0.8486293206197855, "grad_norm": 0.22145576581778206, "learning_rate": 5.613276321854699e-07, "loss": 1.9536, "step": 712 }, { "epoch": 0.8498212157330155, "grad_norm": 0.21697947016074837, "learning_rate": 5.526885771648599e-07, "loss": 1.9686, "step": 713 }, { "epoch": 0.8510131108462455, "grad_norm": 0.2238437978274647, "learning_rate": 5.441126272752395e-07, "loss": 1.9654, "step": 714 }, { "epoch": 0.8522050059594756, "grad_norm": 0.20552250633575228, "learning_rate": 5.355999042050603e-07, "loss": 1.9679, "step": 715 }, { "epoch": 0.8533969010727056, "grad_norm": 0.20678965975151994, "learning_rate": 5.271505287456153e-07, "loss": 1.9695, "step": 716 }, { "epoch": 0.8545887961859356, "grad_norm": 0.22026378225643617, "learning_rate": 5.187646207893287e-07, "loss": 1.9459, "step": 717 }, { "epoch": 0.8557806912991657, "grad_norm": 0.21952615459392946, "learning_rate": 5.104422993280522e-07, "loss": 1.9583, "step": 718 }, { "epoch": 0.8569725864123957, "grad_norm": 0.2103248912718566, "learning_rate": 5.021836824513759e-07, "loss": 1.9653, "step": 719 }, { "epoch": 0.8581644815256257, "grad_norm": 0.21006195755848364, "learning_rate": 4.939888873449567e-07, "loss": 1.9688, "step": 720 }, { "epoch": 0.8593563766388558, "grad_norm": 0.20402501881530985, "learning_rate": 4.858580302888466e-07, "loss": 1.9765, "step": 721 }, { "epoch": 0.8605482717520858, "grad_norm": 0.20084862547322885, "learning_rate": 4.777912266558532e-07, "loss": 1.9761, "step": 722 }, { "epoch": 0.8617401668653158, "grad_norm": 0.1988469838008945, "learning_rate": 4.6978859090989703e-07, "loss": 1.9694, "step": 723 }, { "epoch": 0.8629320619785459, "grad_norm": 0.203864028540796, "learning_rate": 4.618502366043881e-07, "loss": 1.9775, "step": 724 }, { "epoch": 0.8641239570917759, "grad_norm": 0.2145910427428764, "learning_rate": 4.5397627638061604e-07, "loss": 1.96, "step": 725 }, { "epoch": 0.865315852205006, "grad_norm": 0.2050136901691872, "learning_rate": 4.4616682196614636e-07, "loss": 1.9623, "step": 726 }, { "epoch": 0.866507747318236, "grad_norm": 0.20872576917324093, "learning_rate": 4.3842198417324346e-07, "loss": 1.9554, "step": 727 }, { "epoch": 0.867699642431466, "grad_norm": 0.20606498295864895, "learning_rate": 4.307418728972934e-07, "loss": 1.9572, "step": 728 }, { "epoch": 0.8688915375446961, "grad_norm": 0.20387463599521696, "learning_rate": 4.2312659711524486e-07, "loss": 1.9873, "step": 729 }, { "epoch": 0.8700834326579261, "grad_norm": 0.20613519314048598, "learning_rate": 4.1557626488406223e-07, "loss": 1.9745, "step": 730 }, { "epoch": 0.8712753277711561, "grad_norm": 0.2119361724107287, "learning_rate": 4.080909833391944e-07, "loss": 1.956, "step": 731 }, { "epoch": 0.8724672228843862, "grad_norm": 0.21723399868985763, "learning_rate": 4.0067085869305357e-07, "loss": 1.9787, "step": 732 }, { "epoch": 
0.8736591179976162, "grad_norm": 0.206994355395634, "learning_rate": 3.9331599623350815e-07, "loss": 1.9593, "step": 733 }, { "epoch": 0.8748510131108462, "grad_norm": 0.20592726537984876, "learning_rate": 3.8602650032238675e-07, "loss": 1.9687, "step": 734 }, { "epoch": 0.8760429082240763, "grad_norm": 0.19758730236891384, "learning_rate": 3.788024743940016e-07, "loss": 1.9957, "step": 735 }, { "epoch": 0.8772348033373063, "grad_norm": 0.20119012937681818, "learning_rate": 3.71644020953677e-07, "loss": 1.9908, "step": 736 }, { "epoch": 0.8784266984505363, "grad_norm": 0.1987555097318407, "learning_rate": 3.6455124157629805e-07, "loss": 1.963, "step": 737 }, { "epoch": 0.8796185935637664, "grad_norm": 0.20693505027292836, "learning_rate": 3.575242369048665e-07, "loss": 1.956, "step": 738 }, { "epoch": 0.8808104886769964, "grad_norm": 0.20983712357706624, "learning_rate": 3.505631066490728e-07, "loss": 1.9719, "step": 739 }, { "epoch": 0.8820023837902264, "grad_norm": 0.20291800945532407, "learning_rate": 3.436679495838835e-07, "loss": 1.9658, "step": 740 }, { "epoch": 0.8831942789034565, "grad_norm": 0.20400470569172324, "learning_rate": 3.3683886354813953e-07, "loss": 1.9785, "step": 741 }, { "epoch": 0.8843861740166865, "grad_norm": 0.20085471728798332, "learning_rate": 3.300759454431657e-07, "loss": 1.9534, "step": 742 }, { "epoch": 0.8855780691299165, "grad_norm": 0.20101578952892968, "learning_rate": 3.233792912313943e-07, "loss": 1.9637, "step": 743 }, { "epoch": 0.8867699642431466, "grad_norm": 0.194386663867366, "learning_rate": 3.1674899593501175e-07, "loss": 1.9718, "step": 744 }, { "epoch": 0.8879618593563766, "grad_norm": 0.2022754658332612, "learning_rate": 3.101851536346007e-07, "loss": 1.9493, "step": 745 }, { "epoch": 0.8891537544696066, "grad_norm": 0.19855072055520911, "learning_rate": 3.0368785746780925e-07, "loss": 1.9845, "step": 746 }, { "epoch": 0.8903456495828367, "grad_norm": 0.20555522738375503, "learning_rate": 2.9725719962802936e-07, "loss": 1.9562, "step": 747 }, { "epoch": 0.8915375446960667, "grad_norm": 0.20020380210201041, "learning_rate": 2.9089327136308855e-07, "loss": 1.9423, "step": 748 }, { "epoch": 0.8927294398092968, "grad_norm": 0.21841772751668617, "learning_rate": 2.8459616297395464e-07, "loss": 1.9513, "step": 749 }, { "epoch": 0.8939213349225268, "grad_norm": 0.20236116487467856, "learning_rate": 2.7836596381345613e-07, "loss": 1.9567, "step": 750 }, { "epoch": 0.8951132300357568, "grad_norm": 0.20135449349261023, "learning_rate": 2.722027622850104e-07, "loss": 1.9645, "step": 751 }, { "epoch": 0.8963051251489869, "grad_norm": 0.18378339190654983, "learning_rate": 2.6610664584137413e-07, "loss": 1.9556, "step": 752 }, { "epoch": 0.8974970202622169, "grad_norm": 0.19531080180392058, "learning_rate": 2.600777009833982e-07, "loss": 1.9651, "step": 753 }, { "epoch": 0.8986889153754469, "grad_norm": 0.19913934223741267, "learning_rate": 2.541160132588044e-07, "loss": 1.9903, "step": 754 }, { "epoch": 0.899880810488677, "grad_norm": 0.19578044282345453, "learning_rate": 2.482216672609677e-07, "loss": 1.9826, "step": 755 }, { "epoch": 0.901072705601907, "grad_norm": 0.19669019170528293, "learning_rate": 2.423947466277177e-07, "loss": 1.9608, "step": 756 }, { "epoch": 0.902264600715137, "grad_norm": 0.20106014848036682, "learning_rate": 2.3663533404015227e-07, "loss": 1.9479, "step": 757 }, { "epoch": 0.9034564958283671, "grad_norm": 0.19075058248139964, "learning_rate": 2.3094351122146307e-07, "loss": 1.9461, "step": 758 }, { "epoch": 
0.9046483909415971, "grad_norm": 0.1936347340988058, "learning_rate": 2.2531935893577827e-07, "loss": 1.9786, "step": 759 }, { "epoch": 0.9058402860548271, "grad_norm": 0.1953475101067191, "learning_rate": 2.1976295698701245e-07, "loss": 1.9602, "step": 760 }, { "epoch": 0.9070321811680572, "grad_norm": 0.2018953126999259, "learning_rate": 2.142743842177386e-07, "loss": 1.9589, "step": 761 }, { "epoch": 0.9082240762812872, "grad_norm": 0.2413615126875786, "learning_rate": 2.0885371850806691e-07, "loss": 1.9761, "step": 762 }, { "epoch": 0.9094159713945172, "grad_norm": 0.20022969875884347, "learning_rate": 2.0350103677454047e-07, "loss": 1.9589, "step": 763 }, { "epoch": 0.9106078665077473, "grad_norm": 0.19437922719148687, "learning_rate": 1.98216414969043e-07, "loss": 1.9522, "step": 764 }, { "epoch": 0.9117997616209773, "grad_norm": 0.19907577309780014, "learning_rate": 1.9299992807772173e-07, "loss": 1.9416, "step": 765 }, { "epoch": 0.9129916567342073, "grad_norm": 0.20237562054567065, "learning_rate": 1.8785165011992513e-07, "loss": 1.9472, "step": 766 }, { "epoch": 0.9141835518474374, "grad_norm": 0.19811233452806024, "learning_rate": 1.8277165414714858e-07, "loss": 1.9539, "step": 767 }, { "epoch": 0.9153754469606674, "grad_norm": 0.19216150911802557, "learning_rate": 1.7776001224200257e-07, "loss": 1.9735, "step": 768 }, { "epoch": 0.9165673420738975, "grad_norm": 0.20840514563527793, "learning_rate": 1.7281679551718445e-07, "loss": 1.9809, "step": 769 }, { "epoch": 0.9177592371871275, "grad_norm": 0.20320307604353186, "learning_rate": 1.6794207411447548e-07, "loss": 1.9701, "step": 770 }, { "epoch": 0.9189511323003575, "grad_norm": 0.1931262061968698, "learning_rate": 1.6313591720374057e-07, "loss": 1.9379, "step": 771 }, { "epoch": 0.9201430274135876, "grad_norm": 0.1960925515484337, "learning_rate": 1.583983929819488e-07, "loss": 1.9537, "step": 772 }, { "epoch": 0.9213349225268176, "grad_norm": 0.20033984414838282, "learning_rate": 1.5372956867220678e-07, "loss": 1.9524, "step": 773 }, { "epoch": 0.9225268176400476, "grad_norm": 0.1969245380458675, "learning_rate": 1.49129510522803e-07, "loss": 1.9909, "step": 774 }, { "epoch": 0.9237187127532777, "grad_norm": 0.19242733300556847, "learning_rate": 1.445982838062676e-07, "loss": 1.9672, "step": 775 }, { "epoch": 0.9249106078665077, "grad_norm": 0.19349859868942168, "learning_rate": 1.4013595281844872e-07, "loss": 1.9694, "step": 776 }, { "epoch": 0.9261025029797377, "grad_norm": 0.19361152253228486, "learning_rate": 1.357425808775964e-07, "loss": 1.982, "step": 777 }, { "epoch": 0.9272943980929678, "grad_norm": 0.20619875373877572, "learning_rate": 1.3141823032346736e-07, "loss": 1.9625, "step": 778 }, { "epoch": 0.9284862932061978, "grad_norm": 0.19945778231248415, "learning_rate": 1.2716296251644e-07, "loss": 1.9819, "step": 779 }, { "epoch": 0.929678188319428, "grad_norm": 0.19580818822309443, "learning_rate": 1.2297683783664138e-07, "loss": 1.971, "step": 780 }, { "epoch": 0.930870083432658, "grad_norm": 0.1969258751508119, "learning_rate": 1.1885991568309385e-07, "loss": 1.9684, "step": 781 }, { "epoch": 0.932061978545888, "grad_norm": 0.19668726045406146, "learning_rate": 1.1481225447286803e-07, "loss": 1.9336, "step": 782 }, { "epoch": 0.933253873659118, "grad_norm": 0.19002790593711985, "learning_rate": 1.1083391164025903e-07, "loss": 1.9776, "step": 783 }, { "epoch": 0.9344457687723481, "grad_norm": 0.19965702113266218, "learning_rate": 1.069249436359665e-07, "loss": 1.982, "step": 784 }, { "epoch": 0.9356376638855781, 
"grad_norm": 0.1916717569664899, "learning_rate": 1.0308540592629756e-07, "loss": 1.9611, "step": 785 }, { "epoch": 0.9368295589988082, "grad_norm": 0.1958220628621678, "learning_rate": 9.931535299237737e-08, "loss": 1.9439, "step": 786 }, { "epoch": 0.9380214541120382, "grad_norm": 0.19546126458914154, "learning_rate": 9.561483832937535e-08, "loss": 1.9596, "step": 787 }, { "epoch": 0.9392133492252682, "grad_norm": 0.18627727476440192, "learning_rate": 9.198391444575072e-08, "loss": 1.977, "step": 788 }, { "epoch": 0.9404052443384983, "grad_norm": 0.19214479226727124, "learning_rate": 8.842263286250208e-08, "loss": 1.9714, "step": 789 }, { "epoch": 0.9415971394517283, "grad_norm": 0.19627451760011, "learning_rate": 8.493104411243791e-08, "loss": 1.9846, "step": 790 }, { "epoch": 0.9427890345649583, "grad_norm": 0.189201378107075, "learning_rate": 8.150919773946165e-08, "loss": 1.9438, "step": 791 }, { "epoch": 0.9439809296781884, "grad_norm": 0.1881449814121689, "learning_rate": 7.81571422978672e-08, "loss": 1.9758, "step": 792 }, { "epoch": 0.9451728247914184, "grad_norm": 0.19461953430827816, "learning_rate": 7.487492535164842e-08, "loss": 1.9538, "step": 793 }, { "epoch": 0.9463647199046484, "grad_norm": 0.19961269699233244, "learning_rate": 7.166259347382854e-08, "loss": 1.9861, "step": 794 }, { "epoch": 0.9475566150178785, "grad_norm": 0.19603572773830497, "learning_rate": 6.852019224579287e-08, "loss": 1.954, "step": 795 }, { "epoch": 0.9487485101311085, "grad_norm": 0.18552652214530319, "learning_rate": 6.544776625664829e-08, "loss": 1.9701, "step": 796 }, { "epoch": 0.9499404052443385, "grad_norm": 0.18737364550182184, "learning_rate": 6.244535910258697e-08, "loss": 1.9507, "step": 797 }, { "epoch": 0.9511323003575686, "grad_norm": 0.19128218729370722, "learning_rate": 5.95130133862698e-08, "loss": 1.963, "step": 798 }, { "epoch": 0.9523241954707986, "grad_norm": 0.1857403939801107, "learning_rate": 5.665077071621894e-08, "loss": 1.9782, "step": 799 }, { "epoch": 0.9535160905840286, "grad_norm": 0.18845875837578616, "learning_rate": 5.3858671706230605e-08, "loss": 1.9714, "step": 800 }, { "epoch": 0.9547079856972587, "grad_norm": 0.191894557195297, "learning_rate": 5.1136755974797724e-08, "loss": 1.9802, "step": 801 }, { "epoch": 0.9558998808104887, "grad_norm": 0.18884552490025192, "learning_rate": 4.848506214454651e-08, "loss": 1.9635, "step": 802 }, { "epoch": 0.9570917759237187, "grad_norm": 0.1921418572611478, "learning_rate": 4.590362784169022e-08, "loss": 1.9863, "step": 803 }, { "epoch": 0.9582836710369488, "grad_norm": 0.19545913000509813, "learning_rate": 4.3392489695493475e-08, "loss": 1.9582, "step": 804 }, { "epoch": 0.9594755661501788, "grad_norm": 0.18809267333949384, "learning_rate": 4.0951683337754345e-08, "loss": 1.9486, "step": 805 }, { "epoch": 0.9606674612634089, "grad_norm": 0.1993552148845141, "learning_rate": 3.858124340229863e-08, "loss": 1.9596, "step": 806 }, { "epoch": 0.9618593563766389, "grad_norm": 0.19546359586133671, "learning_rate": 3.628120352448583e-08, "loss": 1.9635, "step": 807 }, { "epoch": 0.9630512514898689, "grad_norm": 0.1965379164207019, "learning_rate": 3.405159634073452e-08, "loss": 1.9586, "step": 808 }, { "epoch": 0.964243146603099, "grad_norm": 0.18935857235084785, "learning_rate": 3.1892453488058803e-08, "loss": 1.9854, "step": 809 }, { "epoch": 0.965435041716329, "grad_norm": 0.19406372777092382, "learning_rate": 2.9803805603619283e-08, "loss": 1.9588, "step": 810 }, { "epoch": 0.966626936829559, "grad_norm": 0.19087575145791982, 
"learning_rate": 2.77856823242878e-08, "loss": 1.9681, "step": 811 }, { "epoch": 0.9678188319427891, "grad_norm": 0.20093522828177285, "learning_rate": 2.5838112286226123e-08, "loss": 1.9667, "step": 812 }, { "epoch": 0.9690107270560191, "grad_norm": 0.18798772341602374, "learning_rate": 2.39611231244824e-08, "loss": 1.9722, "step": 813 }, { "epoch": 0.9702026221692491, "grad_norm": 0.1848757446131922, "learning_rate": 2.2154741472596996e-08, "loss": 1.9578, "step": 814 }, { "epoch": 0.9713945172824792, "grad_norm": 0.18755577360898026, "learning_rate": 2.0418992962224495e-08, "loss": 1.963, "step": 815 }, { "epoch": 0.9725864123957092, "grad_norm": 0.18908507808905262, "learning_rate": 1.8753902222770627e-08, "loss": 1.9986, "step": 816 }, { "epoch": 0.9737783075089392, "grad_norm": 0.1919401118801061, "learning_rate": 1.7159492881041462e-08, "loss": 1.9351, "step": 817 }, { "epoch": 0.9749702026221693, "grad_norm": 0.1877371294012426, "learning_rate": 1.563578756091144e-08, "loss": 1.9486, "step": 818 }, { "epoch": 0.9761620977353993, "grad_norm": 0.1915342862821692, "learning_rate": 1.4182807882999194e-08, "loss": 1.9647, "step": 819 }, { "epoch": 0.9773539928486293, "grad_norm": 0.18698005671466014, "learning_rate": 1.2800574464361115e-08, "loss": 1.9578, "step": 820 }, { "epoch": 0.9785458879618594, "grad_norm": 0.1894762036895136, "learning_rate": 1.1489106918200487e-08, "loss": 1.9497, "step": 821 }, { "epoch": 0.9797377830750894, "grad_norm": 0.19184360419844976, "learning_rate": 1.0248423853587154e-08, "loss": 1.9767, "step": 822 }, { "epoch": 0.9809296781883194, "grad_norm": 0.19532092856528233, "learning_rate": 9.07854287519494e-09, "loss": 1.9623, "step": 823 }, { "epoch": 0.9821215733015495, "grad_norm": 0.19624970994803084, "learning_rate": 7.979480583052423e-09, "loss": 1.961, "step": 824 }, { "epoch": 0.9833134684147795, "grad_norm": 0.1845931499070557, "learning_rate": 6.951252572304224e-09, "loss": 1.983, "step": 825 }, { "epoch": 0.9845053635280095, "grad_norm": 0.18794485352405654, "learning_rate": 5.993873432993957e-09, "loss": 1.9616, "step": 826 }, { "epoch": 0.9856972586412396, "grad_norm": 0.19891751745555894, "learning_rate": 5.107356749853298e-09, "loss": 1.9535, "step": 827 }, { "epoch": 0.9868891537544696, "grad_norm": 0.19040177607162215, "learning_rate": 4.291715102112126e-09, "loss": 1.9726, "step": 828 }, { "epoch": 0.9880810488676997, "grad_norm": 0.22336004861620634, "learning_rate": 3.546960063319227e-09, "loss": 1.966, "step": 829 }, { "epoch": 0.9892729439809297, "grad_norm": 0.18822951819678269, "learning_rate": 2.8731022011757593e-09, "loss": 1.9966, "step": 830 }, { "epoch": 0.9904648390941597, "grad_norm": 0.18927421286397889, "learning_rate": 2.27015107739037e-09, "loss": 1.9726, "step": 831 }, { "epoch": 0.9916567342073898, "grad_norm": 0.19221873853139876, "learning_rate": 1.7381152475376416e-09, "loss": 1.9832, "step": 832 }, { "epoch": 0.9928486293206198, "grad_norm": 0.1904281270223511, "learning_rate": 1.2770022609409628e-09, "loss": 1.9563, "step": 833 }, { "epoch": 0.9940405244338498, "grad_norm": 0.18797717350418608, "learning_rate": 8.868186605631712e-10, "loss": 1.9507, "step": 834 }, { "epoch": 0.9952324195470799, "grad_norm": 0.18924896153536938, "learning_rate": 5.675699829160719e-10, "loss": 1.9705, "step": 835 }, { "epoch": 0.9964243146603099, "grad_norm": 0.19596674067827927, "learning_rate": 3.1926075797827914e-10, "loss": 1.9888, "step": 836 }, { "epoch": 0.9976162097735399, "grad_norm": 0.1862037805845138, "learning_rate": 
1.4189450913415505e-10, "loss": 1.9437, "step": 837 }, { "epoch": 0.99880810488677, "grad_norm": 0.19347983202580893, "learning_rate": 3.547375312218382e-11, "loss": 1.9667, "step": 838 }, { "epoch": 1.0, "grad_norm": 0.20375226613820588, "learning_rate": 0.0, "loss": 1.9468, "step": 839 }, { "epoch": 1.0, "step": 839, "total_flos": 1802102510714880.0, "train_loss": 1.9892315510771414, "train_runtime": 26830.5336, "train_samples_per_second": 58.025, "train_steps_per_second": 0.031 } ], "logging_steps": 1, "max_steps": 839, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1802102510714880.0, "train_batch_size": 58, "trial_name": null, "trial_params": null }
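
A minimal sketch of how this state file could be inspected after training, assuming it has been saved under the usual name "trainer_state.json" (the path is an assumption, not taken from the log above). It only reads the keys that actually appear in the record: the per-step entries in "log_history" (each with "loss", "learning_rate", "step") and the aggregate fields written at the end ("train_loss", "train_runtime", "train_samples_per_second").

```python
import json

# Assumed location of the saved trainer state; adjust to the real output dir.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step records carry "loss" and "learning_rate"; the final record holds
# only aggregate statistics, so filter it out here.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

first, last = logs[0], logs[-1]
print(f"steps logged: {len(logs)}")
print(f"loss: {first['loss']:.4f} -> {last['loss']:.4f}")
print(f"learning rate: {first['learning_rate']:.2e} -> {last['learning_rate']:.2e}")

# Aggregate stats appended by the Trainer at the end of training, if present.
summary = state["log_history"][-1]
for key in ("train_loss", "train_runtime", "train_samples_per_second"):
    if key in summary:
        print(f"{key}: {summary[key]}")
```

The same "log_history" list can be fed directly into a plotting library to visualize the loss curve and the cosine-style learning-rate decay recorded here; the snippet above deliberately stops at printing so it stays dependency-free.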