{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996300406955235, "eval_steps": 100, "global_step": 1351, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003699593044765076, "grad_norm": 2.757993258887922, "learning_rate": 7.352941176470589e-07, "loss": 1.1067, "mean_token_accuracy": 0.708824381857028, "step": 5 }, { "epoch": 0.007399186089530152, "grad_norm": 2.5907330458452984, "learning_rate": 1.4705882352941177e-06, "loss": 1.0679, "mean_token_accuracy": 0.7169299986512495, "step": 10 }, { "epoch": 0.011098779134295227, "grad_norm": 2.241392532504099, "learning_rate": 2.2058823529411767e-06, "loss": 1.1238, "mean_token_accuracy": 0.6992935634786113, "step": 15 }, { "epoch": 0.014798372179060304, "grad_norm": 1.5956862014648674, "learning_rate": 2.9411764705882355e-06, "loss": 1.0062, "mean_token_accuracy": 0.7288372261973354, "step": 20 }, { "epoch": 0.01849796522382538, "grad_norm": 1.6119131199982493, "learning_rate": 3.6764705882352946e-06, "loss": 1.0517, "mean_token_accuracy": 0.7142909651691335, "step": 25 }, { "epoch": 0.022197558268590455, "grad_norm": 1.139657389593639, "learning_rate": 4.411764705882353e-06, "loss": 0.9593, "mean_token_accuracy": 0.7346081897149975, "step": 30 }, { "epoch": 0.02589715131335553, "grad_norm": 1.2790776332632598, "learning_rate": 5.147058823529411e-06, "loss": 0.9805, "mean_token_accuracy": 0.72720137806358, "step": 35 }, { "epoch": 0.029596744358120607, "grad_norm": 1.104655854611052, "learning_rate": 5.882352941176471e-06, "loss": 0.9114, "mean_token_accuracy": 0.7425328465386071, "step": 40 }, { "epoch": 0.033296337402885685, "grad_norm": 0.9408582472471563, "learning_rate": 6.61764705882353e-06, "loss": 0.8885, "mean_token_accuracy": 0.7472382565404383, "step": 45 }, { "epoch": 0.03699593044765076, "grad_norm": 1.0580888719684283, "learning_rate": 7.352941176470589e-06, "loss": 0.8658, "mean_token_accuracy": 0.7493579770562304, "step": 50 }, { "epoch": 0.040695523492415835, "grad_norm": 0.8832977290583615, "learning_rate": 8.088235294117648e-06, "loss": 0.841, "mean_token_accuracy": 0.7568992237585881, "step": 55 }, { "epoch": 0.04439511653718091, "grad_norm": 0.9301447232199826, "learning_rate": 8.823529411764707e-06, "loss": 0.8677, "mean_token_accuracy": 0.7502923381848673, "step": 60 }, { "epoch": 0.048094709581945984, "grad_norm": 0.856501238425618, "learning_rate": 9.558823529411766e-06, "loss": 0.878, "mean_token_accuracy": 0.7445654933582921, "step": 65 }, { "epoch": 0.05179430262671106, "grad_norm": 1.0586539300870963, "learning_rate": 1.0294117647058823e-05, "loss": 0.8825, "mean_token_accuracy": 0.7438754593389819, "step": 70 }, { "epoch": 0.05549389567147614, "grad_norm": 1.1122367826346538, "learning_rate": 1.1029411764705885e-05, "loss": 0.8775, "mean_token_accuracy": 0.7447408372141248, "step": 75 }, { "epoch": 0.059193488716241215, "grad_norm": 1.0632613858567348, "learning_rate": 1.1764705882352942e-05, "loss": 0.8347, "mean_token_accuracy": 0.7567175732049909, "step": 80 }, { "epoch": 0.06289308176100629, "grad_norm": 0.9476656757993303, "learning_rate": 1.25e-05, "loss": 0.8305, "mean_token_accuracy": 0.7566193958141736, "step": 85 }, { "epoch": 0.06659267480577137, "grad_norm": 1.1083079276477652, "learning_rate": 1.323529411764706e-05, "loss": 0.8466, "mean_token_accuracy": 0.7519149864344391, "step": 90 }, { "epoch": 0.07029226785053644, "grad_norm": 0.9768809343076362, "learning_rate": 1.3970588235294118e-05, "loss": 0.7894, "mean_token_accuracy": 0.768089470723049, "step": 95 }, { "epoch": 0.07399186089530152, "grad_norm": 1.0897420744707864, "learning_rate": 1.4705882352941179e-05, "loss": 0.842, "mean_token_accuracy": 0.7519371481531969, "step": 100 }, { "epoch": 0.07399186089530152, "eval_loss": 0.8507319688796997, "eval_mean_token_accuracy": 0.7484970684902305, "eval_runtime": 13.1139, "eval_samples_per_second": 9.837, "eval_steps_per_second": 2.516, "step": 100 }, { "epoch": 0.07769145394006659, "grad_norm": 1.0530600966285744, "learning_rate": 1.5441176470588237e-05, "loss": 0.8609, "mean_token_accuracy": 0.7470814372661423, "step": 105 }, { "epoch": 0.08139104698483167, "grad_norm": 0.9420676582335842, "learning_rate": 1.6176470588235296e-05, "loss": 0.8056, "mean_token_accuracy": 0.7604963244668818, "step": 110 }, { "epoch": 0.08509064002959675, "grad_norm": 0.868613480082479, "learning_rate": 1.6911764705882355e-05, "loss": 0.8432, "mean_token_accuracy": 0.7519378614781397, "step": 115 }, { "epoch": 0.08879023307436182, "grad_norm": 1.0518122686708429, "learning_rate": 1.7647058823529414e-05, "loss": 0.8237, "mean_token_accuracy": 0.7588873826848553, "step": 120 }, { "epoch": 0.0924898261191269, "grad_norm": 1.135357117838434, "learning_rate": 1.8382352941176472e-05, "loss": 0.8417, "mean_token_accuracy": 0.7510501482017762, "step": 125 }, { "epoch": 0.09618941916389197, "grad_norm": 1.0511646764690201, "learning_rate": 1.911764705882353e-05, "loss": 0.8024, "mean_token_accuracy": 0.7630132146126162, "step": 130 }, { "epoch": 0.09988901220865705, "grad_norm": 1.003659422839662, "learning_rate": 1.985294117647059e-05, "loss": 0.7846, "mean_token_accuracy": 0.7684428405908383, "step": 135 }, { "epoch": 0.10358860525342212, "grad_norm": 0.9783718356092802, "learning_rate": 1.9999465148392906e-05, "loss": 0.7978, "mean_token_accuracy": 0.7638017765978411, "step": 140 }, { "epoch": 0.1072881982981872, "grad_norm": 0.9325940574831129, "learning_rate": 1.999729241179462e-05, "loss": 0.7899, "mean_token_accuracy": 0.7657915220582932, "step": 145 }, { "epoch": 0.11098779134295228, "grad_norm": 0.9587245678809391, "learning_rate": 1.999344872485215e-05, "loss": 0.7935, "mean_token_accuracy": 0.7643045615369954, "step": 150 }, { "epoch": 0.11468738438771735, "grad_norm": 1.2026644648319857, "learning_rate": 1.9987934730000457e-05, "loss": 0.7922, "mean_token_accuracy": 0.7667913563744123, "step": 155 }, { "epoch": 0.11838697743248243, "grad_norm": 1.0527655093884445, "learning_rate": 1.998075134885022e-05, "loss": 0.8036, "mean_token_accuracy": 0.7613009885033224, "step": 160 }, { "epoch": 0.1220865704772475, "grad_norm": 1.2386325448592015, "learning_rate": 1.9971899782033853e-05, "loss": 0.8013, "mean_token_accuracy": 0.7620870662678945, "step": 165 }, { "epoch": 0.12578616352201258, "grad_norm": 1.127408462312219, "learning_rate": 1.9961381509004785e-05, "loss": 0.808, "mean_token_accuracy": 0.7593721598276708, "step": 170 }, { "epoch": 0.12948575656677766, "grad_norm": 0.9438483145827001, "learning_rate": 1.9949198287790215e-05, "loss": 0.7861, "mean_token_accuracy": 0.7660641312412623, "step": 175 }, { "epoch": 0.13318534961154274, "grad_norm": 0.9318614898436913, "learning_rate": 1.9935352154697257e-05, "loss": 0.8038, "mean_token_accuracy": 0.7602947954939976, "step": 180 }, { "epoch": 0.1368849426563078, "grad_norm": 0.8876177342352581, "learning_rate": 1.9919845423972603e-05, "loss": 0.757, "mean_token_accuracy": 0.7742074952233244, "step": 185 }, { "epoch": 0.14058453570107288, "grad_norm": 1.005618517605514, "learning_rate": 1.9902680687415704e-05, "loss": 0.7985, "mean_token_accuracy": 0.7622990063822126, "step": 190 }, { "epoch": 0.14428412874583796, "grad_norm": 1.0909612602188234, "learning_rate": 1.9883860813945596e-05, "loss": 0.7997, "mean_token_accuracy": 0.7611327796064032, "step": 195 }, { "epoch": 0.14798372179060304, "grad_norm": 0.9223546224550383, "learning_rate": 1.986338894912137e-05, "loss": 0.7888, "mean_token_accuracy": 0.7650975947505124, "step": 200 }, { "epoch": 0.14798372179060304, "eval_loss": 0.8212586641311646, "eval_mean_token_accuracy": 0.753984452665533, "eval_runtime": 12.914, "eval_samples_per_second": 9.989, "eval_steps_per_second": 2.555, "step": 200 }, { "epoch": 0.15168331483536812, "grad_norm": 0.9461631972455284, "learning_rate": 1.9841268514616434e-05, "loss": 0.8206, "mean_token_accuracy": 0.7560038239175771, "step": 205 }, { "epoch": 0.15538290788013318, "grad_norm": 1.0128847648211983, "learning_rate": 1.9817503207646606e-05, "loss": 0.7827, "mean_token_accuracy": 0.7663386260842018, "step": 210 }, { "epoch": 0.15908250092489826, "grad_norm": 1.0613216235343779, "learning_rate": 1.979209700035216e-05, "loss": 0.8097, "mean_token_accuracy": 0.7595550837031688, "step": 215 }, { "epoch": 0.16278209396966334, "grad_norm": 1.0811772085526155, "learning_rate": 1.976505413913393e-05, "loss": 0.7771, "mean_token_accuracy": 0.7685113508318921, "step": 220 }, { "epoch": 0.16648168701442842, "grad_norm": 1.0032217781891688, "learning_rate": 1.9736379143943565e-05, "loss": 0.7788, "mean_token_accuracy": 0.767374709683548, "step": 225 }, { "epoch": 0.1701812800591935, "grad_norm": 1.0033846900829124, "learning_rate": 1.9706076807528044e-05, "loss": 0.7792, "mean_token_accuracy": 0.7671742181015657, "step": 230 }, { "epoch": 0.17388087310395856, "grad_norm": 1.016761717770083, "learning_rate": 1.967415219462864e-05, "loss": 0.776, "mean_token_accuracy": 0.7692263473301846, "step": 235 }, { "epoch": 0.17758046614872364, "grad_norm": 1.0390090395504632, "learning_rate": 1.9640610641134383e-05, "loss": 0.8125, "mean_token_accuracy": 0.7578594416116643, "step": 240 }, { "epoch": 0.18128005919348872, "grad_norm": 0.9506480558904421, "learning_rate": 1.9605457753190224e-05, "loss": 0.774, "mean_token_accuracy": 0.7669808330485, "step": 245 }, { "epoch": 0.1849796522382538, "grad_norm": 0.9074565067919744, "learning_rate": 1.9568699406260016e-05, "loss": 0.7631, "mean_token_accuracy": 0.7712889443688947, "step": 250 }, { "epoch": 0.18867924528301888, "grad_norm": 0.870192572015487, "learning_rate": 1.953034174414449e-05, "loss": 0.8041, "mean_token_accuracy": 0.7584920492345467, "step": 255 }, { "epoch": 0.19237883832778394, "grad_norm": 0.9436163053750114, "learning_rate": 1.9490391177954383e-05, "loss": 0.7697, "mean_token_accuracy": 0.7687670857457188, "step": 260 }, { "epoch": 0.19607843137254902, "grad_norm": 0.8454337167883038, "learning_rate": 1.944885438503888e-05, "loss": 0.7854, "mean_token_accuracy": 0.762450394993266, "step": 265 }, { "epoch": 0.1997780244173141, "grad_norm": 0.8388911143997999, "learning_rate": 1.9405738307869565e-05, "loss": 0.7568, "mean_token_accuracy": 0.7739168937821217, "step": 270 }, { "epoch": 0.20347761746207918, "grad_norm": 0.8905388426225519, "learning_rate": 1.936105015288003e-05, "loss": 0.7874, "mean_token_accuracy": 0.7657060442327474, "step": 275 }, { "epoch": 0.20717721050684423, "grad_norm": 0.9449437240241089, "learning_rate": 1.9314797389261426e-05, "loss": 0.7964, "mean_token_accuracy": 0.7608504929094035, "step": 280 }, { "epoch": 0.21087680355160932, "grad_norm": 0.9586909770652294, "learning_rate": 1.9266987747714036e-05, "loss": 0.7539, "mean_token_accuracy": 0.7740995999688999, "step": 285 }, { "epoch": 0.2145763965963744, "grad_norm": 0.8457904946858513, "learning_rate": 1.9217629219155172e-05, "loss": 0.7596, "mean_token_accuracy": 0.7711086747415601, "step": 290 }, { "epoch": 0.21827598964113948, "grad_norm": 0.9562284538878311, "learning_rate": 1.916673005338357e-05, "loss": 0.7636, "mean_token_accuracy": 0.7705879155908494, "step": 295 }, { "epoch": 0.22197558268590456, "grad_norm": 0.9311027437584551, "learning_rate": 1.9114298757700508e-05, "loss": 0.7833, "mean_token_accuracy": 0.7640983829119388, "step": 300 }, { "epoch": 0.22197558268590456, "eval_loss": 0.8046144247055054, "eval_mean_token_accuracy": 0.7572653228255071, "eval_runtime": 12.9112, "eval_samples_per_second": 9.991, "eval_steps_per_second": 2.556, "step": 300 }, { "epoch": 0.2256751757306696, "grad_norm": 0.981751974465348, "learning_rate": 1.9060344095487916e-05, "loss": 0.7517, "mean_token_accuracy": 0.7743910330942259, "step": 305 }, { "epoch": 0.2293747687754347, "grad_norm": 0.9758899701140303, "learning_rate": 1.9004875084743624e-05, "loss": 0.7744, "mean_token_accuracy": 0.7681139265197442, "step": 310 }, { "epoch": 0.23307436182019978, "grad_norm": 1.0996867282039497, "learning_rate": 1.8947900996574133e-05, "loss": 0.784, "mean_token_accuracy": 0.7653104608265264, "step": 315 }, { "epoch": 0.23677395486496486, "grad_norm": 0.8876749775333296, "learning_rate": 1.8889431353645004e-05, "loss": 0.7856, "mean_token_accuracy": 0.7637695086387486, "step": 320 }, { "epoch": 0.24047354790972994, "grad_norm": 0.9254315942079439, "learning_rate": 1.8829475928589272e-05, "loss": 0.7478, "mean_token_accuracy": 0.7747918700572913, "step": 325 }, { "epoch": 0.244173140954495, "grad_norm": 0.8975251081081479, "learning_rate": 1.8768044742374008e-05, "loss": 0.7838, "mean_token_accuracy": 0.7662794783326515, "step": 330 }, { "epoch": 0.24787273399926008, "grad_norm": 0.9659229415705355, "learning_rate": 1.870514806262544e-05, "loss": 0.8064, "mean_token_accuracy": 0.7575206306307912, "step": 335 }, { "epoch": 0.25157232704402516, "grad_norm": 0.8730001205655563, "learning_rate": 1.8640796401912805e-05, "loss": 0.7786, "mean_token_accuracy": 0.7647639047537955, "step": 340 }, { "epoch": 0.25527192008879024, "grad_norm": 0.8503523554051183, "learning_rate": 1.8575000515991283e-05, "loss": 0.7303, "mean_token_accuracy": 0.7790401063745367, "step": 345 }, { "epoch": 0.2589715131335553, "grad_norm": 0.952888955967417, "learning_rate": 1.850777140200427e-05, "loss": 0.7481, "mean_token_accuracy": 0.7754539961681624, "step": 350 }, { "epoch": 0.2626711061783204, "grad_norm": 0.8607187749209038, "learning_rate": 1.843912029664531e-05, "loss": 0.7628, "mean_token_accuracy": 0.7705045859827028, "step": 355 }, { "epoch": 0.2663706992230855, "grad_norm": 0.9612681025779204, "learning_rate": 1.8369058674280004e-05, "loss": 0.7751, "mean_token_accuracy": 0.7662138710016833, "step": 360 }, { "epoch": 0.2700702922678505, "grad_norm": 0.9382382938099896, "learning_rate": 1.8297598245028173e-05, "loss": 0.7723, "mean_token_accuracy": 0.767681726807316, "step": 365 }, { "epoch": 0.2737698853126156, "grad_norm": 0.9997500486677628, "learning_rate": 1.8224750952806626e-05, "loss": 0.7611, "mean_token_accuracy": 0.7707074275165302, "step": 370 }, { "epoch": 0.27746947835738067, "grad_norm": 1.0014266441284037, "learning_rate": 1.815052897333284e-05, "loss": 0.7505, "mean_token_accuracy": 0.7740457102243052, "step": 375 }, { "epoch": 0.28116907140214575, "grad_norm": 1.1075366477297495, "learning_rate": 1.8074944712089925e-05, "loss": 0.7641, "mean_token_accuracy": 0.7693695886355318, "step": 380 }, { "epoch": 0.28486866444691084, "grad_norm": 0.880418326713614, "learning_rate": 1.799801080225316e-05, "loss": 0.7539, "mean_token_accuracy": 0.7710243423063697, "step": 385 }, { "epoch": 0.2885682574916759, "grad_norm": 0.9012750826548256, "learning_rate": 1.7919740102578482e-05, "loss": 0.7781, "mean_token_accuracy": 0.7661210900072783, "step": 390 }, { "epoch": 0.292267850536441, "grad_norm": 0.9230339938349867, "learning_rate": 1.7840145695253258e-05, "loss": 0.7708, "mean_token_accuracy": 0.7679753783644399, "step": 395 }, { "epoch": 0.2959674435812061, "grad_norm": 0.8616748960380592, "learning_rate": 1.7759240883709745e-05, "loss": 0.7643, "mean_token_accuracy": 0.7698643926023239, "step": 400 }, { "epoch": 0.2959674435812061, "eval_loss": 0.7939132452011108, "eval_mean_token_accuracy": 0.7597238566071922, "eval_runtime": 12.8935, "eval_samples_per_second": 10.005, "eval_steps_per_second": 2.559, "step": 400 }, { "epoch": 0.29966703662597116, "grad_norm": 0.8626025158912716, "learning_rate": 1.7677039190401538e-05, "loss": 0.7945, "mean_token_accuracy": 0.7610350956243586, "step": 405 }, { "epoch": 0.30336662967073624, "grad_norm": 0.8809551406016749, "learning_rate": 1.759355435454342e-05, "loss": 0.7554, "mean_token_accuracy": 0.7720821829110623, "step": 410 }, { "epoch": 0.30706622271550127, "grad_norm": 0.8398343511073675, "learning_rate": 1.7508800329814993e-05, "loss": 0.758, "mean_token_accuracy": 0.7709962708725867, "step": 415 }, { "epoch": 0.31076581576026635, "grad_norm": 0.8717214162340853, "learning_rate": 1.7422791282028457e-05, "loss": 0.7456, "mean_token_accuracy": 0.7747603119006603, "step": 420 }, { "epoch": 0.31446540880503143, "grad_norm": 0.9182650879024069, "learning_rate": 1.7335541586760928e-05, "loss": 0.7678, "mean_token_accuracy": 0.768033024075789, "step": 425 }, { "epoch": 0.3181650018497965, "grad_norm": 0.9270374928264972, "learning_rate": 1.7247065826951694e-05, "loss": 0.7687, "mean_token_accuracy": 0.7670009758238231, "step": 430 }, { "epoch": 0.3218645948945616, "grad_norm": 0.9239947000531302, "learning_rate": 1.715737879046483e-05, "loss": 0.7807, "mean_token_accuracy": 0.7646944309939067, "step": 435 }, { "epoch": 0.3255641879393267, "grad_norm": 0.9234310473796695, "learning_rate": 1.7066495467617552e-05, "loss": 0.722, "mean_token_accuracy": 0.7810521648421277, "step": 440 }, { "epoch": 0.32926378098409176, "grad_norm": 0.9873873391290533, "learning_rate": 1.6974431048674714e-05, "loss": 0.7773, "mean_token_accuracy": 0.7676746345471892, "step": 445 }, { "epoch": 0.33296337402885684, "grad_norm": 0.8848520725498183, "learning_rate": 1.6881200921309914e-05, "loss": 0.7799, "mean_token_accuracy": 0.7653884936966252, "step": 450 }, { "epoch": 0.3366629670736219, "grad_norm": 0.9045509398039985, "learning_rate": 1.6786820668033596e-05, "loss": 0.7575, "mean_token_accuracy": 0.7713310238062097, "step": 455 }, { "epoch": 0.340362560118387, "grad_norm": 0.890193659220096, "learning_rate": 1.6691306063588583e-05, "loss": 0.7643, "mean_token_accuracy": 0.7680450171610729, "step": 460 }, { "epoch": 0.34406215316315203, "grad_norm": 0.9121601629876329, "learning_rate": 1.6594673072313478e-05, "loss": 0.7565, "mean_token_accuracy": 0.7709845040717809, "step": 465 }, { "epoch": 0.3477617462079171, "grad_norm": 0.8596218378746548, "learning_rate": 1.6496937845474375e-05, "loss": 0.7899, "mean_token_accuracy": 0.762253394394126, "step": 470 }, { "epoch": 0.3514613392526822, "grad_norm": 0.8881090780569305, "learning_rate": 1.639811671856535e-05, "loss": 0.7716, "mean_token_accuracy": 0.766323517253549, "step": 475 }, { "epoch": 0.3551609322974473, "grad_norm": 0.9129717947745375, "learning_rate": 1.6298226208578127e-05, "loss": 0.7773, "mean_token_accuracy": 0.7662860024728196, "step": 480 }, { "epoch": 0.35886052534221236, "grad_norm": 0.8773141616181648, "learning_rate": 1.6197283011241423e-05, "loss": 0.7823, "mean_token_accuracy": 0.7632768240832929, "step": 485 }, { "epoch": 0.36256011838697744, "grad_norm": 0.8872263111805669, "learning_rate": 1.6095303998230432e-05, "loss": 0.7683, "mean_token_accuracy": 0.7674727430979753, "step": 490 }, { "epoch": 0.3662597114317425, "grad_norm": 0.98466468959405, "learning_rate": 1.599230621434687e-05, "loss": 0.7591, "mean_token_accuracy": 0.7693043297460778, "step": 495 }, { "epoch": 0.3699593044765076, "grad_norm": 0.8908360555035981, "learning_rate": 1.5888306874670112e-05, "loss": 0.7362, "mean_token_accuracy": 0.7770877269382522, "step": 500 }, { "epoch": 0.3699593044765076, "eval_loss": 0.784197211265564, "eval_mean_token_accuracy": 0.7620830191395729, "eval_runtime": 12.9033, "eval_samples_per_second": 9.997, "eval_steps_per_second": 2.557, "step": 500 }, { "epoch": 0.3736588975212727, "grad_norm": 0.9048447583159639, "learning_rate": 1.5783323361679865e-05, "loss": 0.7787, "mean_token_accuracy": 0.7648656868298831, "step": 505 }, { "epoch": 0.37735849056603776, "grad_norm": 0.9120327051425232, "learning_rate": 1.567737322235084e-05, "loss": 0.7593, "mean_token_accuracy": 0.7714510339240377, "step": 510 }, { "epoch": 0.3810580836108028, "grad_norm": 0.8195601280151146, "learning_rate": 1.557047416521996e-05, "loss": 0.7215, "mean_token_accuracy": 0.7789905700934947, "step": 515 }, { "epoch": 0.38475767665556787, "grad_norm": 0.9650761679887094, "learning_rate": 1.546264405742654e-05, "loss": 0.7782, "mean_token_accuracy": 0.7636861487365787, "step": 520 }, { "epoch": 0.38845726970033295, "grad_norm": 0.8237906709737827, "learning_rate": 1.535390092172597e-05, "loss": 0.7699, "mean_token_accuracy": 0.7671755622376449, "step": 525 }, { "epoch": 0.39215686274509803, "grad_norm": 0.8299457105893924, "learning_rate": 1.5244262933477401e-05, "loss": 0.7804, "mean_token_accuracy": 0.7637937394911022, "step": 530 }, { "epoch": 0.3958564557898631, "grad_norm": 1.0313966357425606, "learning_rate": 1.5133748417605878e-05, "loss": 0.7752, "mean_token_accuracy": 0.7650544976847276, "step": 535 }, { "epoch": 0.3995560488346282, "grad_norm": 0.8476613666163046, "learning_rate": 1.5022375845539537e-05, "loss": 0.7459, "mean_token_accuracy": 0.7739755173371613, "step": 540 }, { "epoch": 0.4032556418793933, "grad_norm": 0.8488122984336212, "learning_rate": 1.4910163832122278e-05, "loss": 0.7282, "mean_token_accuracy": 0.7793907307750887, "step": 545 }, { "epoch": 0.40695523492415836, "grad_norm": 0.9246732865437337, "learning_rate": 1.4797131132502464e-05, "loss": 0.7387, "mean_token_accuracy": 0.7771462284436527, "step": 550 }, { "epoch": 0.41065482796892344, "grad_norm": 0.8693436586761074, "learning_rate": 1.4683296638998192e-05, "loss": 0.7449, "mean_token_accuracy": 0.7733528571354785, "step": 555 }, { "epoch": 0.41435442101368847, "grad_norm": 1.0132116370018853, "learning_rate": 1.4568679377939619e-05, "loss": 0.7338, "mean_token_accuracy": 0.7772718301836594, "step": 560 }, { "epoch": 0.41805401405845355, "grad_norm": 0.8274852436061462, "learning_rate": 1.4453298506488896e-05, "loss": 0.7468, "mean_token_accuracy": 0.7725176751538921, "step": 565 }, { "epoch": 0.42175360710321863, "grad_norm": 0.8143005464066339, "learning_rate": 1.4337173309438236e-05, "loss": 0.7423, "mean_token_accuracy": 0.7735196406247817, "step": 570 }, { "epoch": 0.4254532001479837, "grad_norm": 0.8624822737135253, "learning_rate": 1.4220323195986649e-05, "loss": 0.7448, "mean_token_accuracy": 0.7735122705759122, "step": 575 }, { "epoch": 0.4291527931927488, "grad_norm": 0.8889965581940739, "learning_rate": 1.4102767696495885e-05, "loss": 0.7674, "mean_token_accuracy": 0.7686958478722382, "step": 580 }, { "epoch": 0.4328523862375139, "grad_norm": 0.7782401935752693, "learning_rate": 1.398452645922611e-05, "loss": 0.765, "mean_token_accuracy": 0.7691531672999341, "step": 585 }, { "epoch": 0.43655197928227896, "grad_norm": 0.8590974972802418, "learning_rate": 1.3865619247051916e-05, "loss": 0.7507, "mean_token_accuracy": 0.7729800254264771, "step": 590 }, { "epoch": 0.44025157232704404, "grad_norm": 0.7988473909688566, "learning_rate": 1.3746065934159123e-05, "loss": 0.7242, "mean_token_accuracy": 0.780331890399065, "step": 595 }, { "epoch": 0.4439511653718091, "grad_norm": 0.9282202265668916, "learning_rate": 1.3625886502723008e-05, "loss": 0.7587, "mean_token_accuracy": 0.7696747774936822, "step": 600 }, { "epoch": 0.4439511653718091, "eval_loss": 0.7762653827667236, "eval_mean_token_accuracy": 0.7637766286530999, "eval_runtime": 12.9012, "eval_samples_per_second": 9.999, "eval_steps_per_second": 2.558, "step": 600 }, { "epoch": 0.4476507584165742, "grad_norm": 0.8375068814255916, "learning_rate": 1.3505101039568494e-05, "loss": 0.7114, "mean_token_accuracy": 0.78174785615947, "step": 605 }, { "epoch": 0.4513503514613392, "grad_norm": 0.8406018488041918, "learning_rate": 1.3383729732812814e-05, "loss": 0.7309, "mean_token_accuracy": 0.7761469410168554, "step": 610 }, { "epoch": 0.4550499445061043, "grad_norm": 0.8214051664479577, "learning_rate": 1.3261792868491267e-05, "loss": 0.754, "mean_token_accuracy": 0.7719396256522534, "step": 615 }, { "epoch": 0.4587495375508694, "grad_norm": 0.8854429907993514, "learning_rate": 1.3139310827166613e-05, "loss": 0.7529, "mean_token_accuracy": 0.7723088441600371, "step": 620 }, { "epoch": 0.46244913059563447, "grad_norm": 0.8135757240558104, "learning_rate": 1.3016304080522657e-05, "loss": 0.7756, "mean_token_accuracy": 0.7642865483170395, "step": 625 }, { "epoch": 0.46614872364039955, "grad_norm": 0.8597128387115325, "learning_rate": 1.2892793187942588e-05, "loss": 0.7856, "mean_token_accuracy": 0.7615404441461953, "step": 630 }, { "epoch": 0.46984831668516464, "grad_norm": 0.8740917081939998, "learning_rate": 1.2768798793072708e-05, "loss": 0.7599, "mean_token_accuracy": 0.7694673674218602, "step": 635 }, { "epoch": 0.4735479097299297, "grad_norm": 0.8455035652800976, "learning_rate": 1.2644341620372025e-05, "loss": 0.7374, "mean_token_accuracy": 0.7753507196148239, "step": 640 }, { "epoch": 0.4772475027746948, "grad_norm": 0.8453775134661862, "learning_rate": 1.2519442471648364e-05, "loss": 0.7539, "mean_token_accuracy": 0.7711800403046617, "step": 645 }, { "epoch": 0.4809470958194599, "grad_norm": 0.8131753165282993, "learning_rate": 1.2394122222581557e-05, "loss": 0.7476, "mean_token_accuracy": 0.7740160766147529, "step": 650 }, { "epoch": 0.48464668886422496, "grad_norm": 0.8752309330635153, "learning_rate": 1.226840181923427e-05, "loss": 0.7234, "mean_token_accuracy": 0.7799259045764162, "step": 655 }, { "epoch": 0.48834628190899, "grad_norm": 0.9394868684222661, "learning_rate": 1.214230227455106e-05, "loss": 0.761, "mean_token_accuracy": 0.76914448788722, "step": 660 }, { "epoch": 0.49204587495375507, "grad_norm": 0.9190997939430626, "learning_rate": 1.201584466484629e-05, "loss": 0.7429, "mean_token_accuracy": 0.7749135561574454, "step": 665 }, { "epoch": 0.49574546799852015, "grad_norm": 0.991650882333127, "learning_rate": 1.1889050126281405e-05, "loss": 0.7624, "mean_token_accuracy": 0.7674541114303453, "step": 670 }, { "epoch": 0.49944506104328523, "grad_norm": 0.8727170236064898, "learning_rate": 1.1761939851332241e-05, "loss": 0.7686, "mean_token_accuracy": 0.7668983696182747, "step": 675 }, { "epoch": 0.5031446540880503, "grad_norm": 0.8423456719916217, "learning_rate": 1.1634535085246903e-05, "loss": 0.7502, "mean_token_accuracy": 0.7719282566060406, "step": 680 }, { "epoch": 0.5068442471328154, "grad_norm": 0.8041730146404278, "learning_rate": 1.1506857122494832e-05, "loss": 0.7557, "mean_token_accuracy": 0.7698715177090566, "step": 685 }, { "epoch": 0.5105438401775805, "grad_norm": 0.8008949159036595, "learning_rate": 1.1378927303207637e-05, "loss": 0.7555, "mean_token_accuracy": 0.7708746251231458, "step": 690 }, { "epoch": 0.5142434332223456, "grad_norm": 0.8535115805575514, "learning_rate": 1.12507670096123e-05, "loss": 0.7427, "mean_token_accuracy": 0.7726444687618006, "step": 695 }, { "epoch": 0.5179430262671106, "grad_norm": 0.9312344890724055, "learning_rate": 1.1122397662457352e-05, "loss": 0.7502, "mean_token_accuracy": 0.7726102122304674, "step": 700 }, { "epoch": 0.5179430262671106, "eval_loss": 0.7686853408813477, "eval_mean_token_accuracy": 0.7649829158123188, "eval_runtime": 12.9077, "eval_samples_per_second": 9.994, "eval_steps_per_second": 2.557, "step": 700 }, { "epoch": 0.5216426193118757, "grad_norm": 0.868812002235657, "learning_rate": 1.0993840717432582e-05, "loss": 0.739, "mean_token_accuracy": 0.7753342555426331, "step": 705 }, { "epoch": 0.5253422123566408, "grad_norm": 0.873440662985193, "learning_rate": 1.0865117661582958e-05, "loss": 0.7439, "mean_token_accuracy": 0.7739453320579491, "step": 710 }, { "epoch": 0.5290418054014059, "grad_norm": 0.862857533425593, "learning_rate": 1.0736250009717249e-05, "loss": 0.7569, "mean_token_accuracy": 0.770897334069314, "step": 715 }, { "epoch": 0.532741398446171, "grad_norm": 0.8639562204638955, "learning_rate": 1.0607259300812047e-05, "loss": 0.7074, "mean_token_accuracy": 0.7842015210192824, "step": 720 }, { "epoch": 0.536440991490936, "grad_norm": 0.899875919647952, "learning_rate": 1.0478167094411733e-05, "loss": 0.7363, "mean_token_accuracy": 0.7761369932840716, "step": 725 }, { "epoch": 0.540140584535701, "grad_norm": 0.8342218548841652, "learning_rate": 1.0348994967025012e-05, "loss": 0.7417, "mean_token_accuracy": 0.7746170812078568, "step": 730 }, { "epoch": 0.5438401775804661, "grad_norm": 0.8013624939670422, "learning_rate": 1.0219764508518595e-05, "loss": 0.7384, "mean_token_accuracy": 0.7767422075955539, "step": 735 }, { "epoch": 0.5475397706252312, "grad_norm": 0.8120468940513571, "learning_rate": 1.0090497318508687e-05, "loss": 0.7566, "mean_token_accuracy": 0.7691283622927295, "step": 740 }, { "epoch": 0.5512393636699963, "grad_norm": 0.8150917404467021, "learning_rate": 9.961215002750799e-06, "loss": 0.7624, "mean_token_accuracy": 0.7672601324412168, "step": 745 }, { "epoch": 0.5549389567147613, "grad_norm": 0.8358908477578574, "learning_rate": 9.831939169528565e-06, "loss": 0.7198, "mean_token_accuracy": 0.7803871901829526, "step": 750 }, { "epoch": 0.5586385497595264, "grad_norm": 0.8360497025012443, "learning_rate": 9.702691426042124e-06, "loss": 0.704, "mean_token_accuracy": 0.784279945314917, "step": 755 }, { "epoch": 0.5623381428042915, "grad_norm": 0.8484683109694328, "learning_rate": 9.573493374796694e-06, "loss": 0.742, "mean_token_accuracy": 0.773911456375648, "step": 760 }, { "epoch": 0.5660377358490566, "grad_norm": 0.801814751253701, "learning_rate": 9.444366609991916e-06, "loss": 0.7475, "mean_token_accuracy": 0.7704960987519774, "step": 765 }, { "epoch": 0.5697373288938217, "grad_norm": 0.7903451176947093, "learning_rate": 9.315332713912593e-06, "loss": 0.6802, "mean_token_accuracy": 0.7910297225776487, "step": 770 }, { "epoch": 0.5734369219385868, "grad_norm": 0.7640452473763807, "learning_rate": 9.18641325332142e-06, "loss": 0.7254, "mean_token_accuracy": 0.7779958468370307, "step": 775 }, { "epoch": 0.5771365149833518, "grad_norm": 0.7764311897247947, "learning_rate": 9.057629775854314e-06, "loss": 0.7317, "mean_token_accuracy": 0.7763394937851484, "step": 780 }, { "epoch": 0.5808361080281169, "grad_norm": 0.8245708624875354, "learning_rate": 8.929003806418934e-06, "loss": 0.7376, "mean_token_accuracy": 0.7750152545715012, "step": 785 }, { "epoch": 0.584535701072882, "grad_norm": 0.8103882551202022, "learning_rate": 8.800556843597002e-06, "loss": 0.7259, "mean_token_accuracy": 0.7782374392513656, "step": 790 }, { "epoch": 0.5882352941176471, "grad_norm": 0.7928991222424273, "learning_rate": 8.672310356051023e-06, "loss": 0.7431, "mean_token_accuracy": 0.7736308218387102, "step": 795 }, { "epoch": 0.5919348871624122, "grad_norm": 0.8289989830065249, "learning_rate": 8.544285778936004e-06, "loss": 0.7352, "mean_token_accuracy": 0.7764898230184529, "step": 800 }, { "epoch": 0.5919348871624122, "eval_loss": 0.7624932527542114, "eval_mean_token_accuracy": 0.7669661829206922, "eval_runtime": 12.9119, "eval_samples_per_second": 9.991, "eval_steps_per_second": 2.556, "step": 800 }, { "epoch": 0.5956344802071772, "grad_norm": 0.8482327309493489, "learning_rate": 8.416504510316774e-06, "loss": 0.7194, "mean_token_accuracy": 0.7805804982214577, "step": 805 }, { "epoch": 0.5993340732519423, "grad_norm": 0.8542094892921783, "learning_rate": 8.28898790759152e-06, "loss": 0.7332, "mean_token_accuracy": 0.7760156726467715, "step": 810 }, { "epoch": 0.6030336662967074, "grad_norm": 0.8315942810202602, "learning_rate": 8.161757283922084e-06, "loss": 0.7186, "mean_token_accuracy": 0.7800488930789287, "step": 815 }, { "epoch": 0.6067332593414725, "grad_norm": 0.8561047694162326, "learning_rate": 8.034833904671698e-06, "loss": 0.7286, "mean_token_accuracy": 0.7782032355305302, "step": 820 }, { "epoch": 0.6104328523862376, "grad_norm": 0.8322803740232512, "learning_rate": 7.908238983850666e-06, "loss": 0.7475, "mean_token_accuracy": 0.7716142551709843, "step": 825 }, { "epoch": 0.6141324454310025, "grad_norm": 0.8093783949232168, "learning_rate": 7.781993680570656e-06, "loss": 0.7419, "mean_token_accuracy": 0.7753669766613711, "step": 830 }, { "epoch": 0.6178320384757676, "grad_norm": 0.8251647916013486, "learning_rate": 7.656119095508155e-06, "loss": 0.7163, "mean_token_accuracy": 0.7810057582729616, "step": 835 }, { "epoch": 0.6215316315205327, "grad_norm": 0.8936545458969576, "learning_rate": 7.530636267377706e-06, "loss": 0.7212, "mean_token_accuracy": 0.7782000869575987, "step": 840 }, { "epoch": 0.6252312245652978, "grad_norm": 0.8327340922107708, "learning_rate": 7.405566169415481e-06, "loss": 0.7417, "mean_token_accuracy": 0.7713689918387485, "step": 845 }, { "epoch": 0.6289308176100629, "grad_norm": 0.9240689034622575, "learning_rate": 7.280929705873818e-06, "loss": 0.7671, "mean_token_accuracy": 0.7674969695134786, "step": 850 }, { "epoch": 0.632630410654828, "grad_norm": 0.8604187816835673, "learning_rate": 7.15674770852727e-06, "loss": 0.7714, "mean_token_accuracy": 0.765074378427229, "step": 855 }, { "epoch": 0.636330003699593, "grad_norm": 0.8575462475833365, "learning_rate": 7.033040933190776e-06, "loss": 0.7485, "mean_token_accuracy": 0.7725655493857382, "step": 860 }, { "epoch": 0.6400295967443581, "grad_norm": 0.7868065852245846, "learning_rate": 6.909830056250527e-06, "loss": 0.703, "mean_token_accuracy": 0.7851972669426284, "step": 865 }, { "epoch": 0.6437291897891232, "grad_norm": 0.817598798759793, "learning_rate": 6.787135671208126e-06, "loss": 0.7576, "mean_token_accuracy": 0.7701196194143121, "step": 870 }, { "epoch": 0.6474287828338883, "grad_norm": 0.7951083662310198, "learning_rate": 6.6649782852385554e-06, "loss": 0.7459, "mean_token_accuracy": 0.772123421166796, "step": 875 }, { "epoch": 0.6511283758786534, "grad_norm": 0.7271974698252879, "learning_rate": 6.543378315762634e-06, "loss": 0.7247, "mean_token_accuracy": 0.7791351612791593, "step": 880 }, { "epoch": 0.6548279689234184, "grad_norm": 0.8824398500626421, "learning_rate": 6.42235608703441e-06, "loss": 0.7606, "mean_token_accuracy": 0.7677733208877957, "step": 885 }, { "epoch": 0.6585275619681835, "grad_norm": 0.8552464856465032, "learning_rate": 6.301931826744189e-06, "loss": 0.7419, "mean_token_accuracy": 0.7743753446438143, "step": 890 }, { "epoch": 0.6622271550129486, "grad_norm": 0.8306173604508144, "learning_rate": 6.18212566263765e-06, "loss": 0.7378, "mean_token_accuracy": 0.7754989605528264, "step": 895 }, { "epoch": 0.6659267480577137, "grad_norm": 0.8046739019546305, "learning_rate": 6.0629576191517035e-06, "loss": 0.7254, "mean_token_accuracy": 0.7763354265372273, "step": 900 }, { "epoch": 0.6659267480577137, "eval_loss": 0.7582561373710632, "eval_mean_token_accuracy": 0.7673597290333886, "eval_runtime": 12.9256, "eval_samples_per_second": 9.98, "eval_steps_per_second": 2.553, "step": 900 }, { "epoch": 0.6696263411024788, "grad_norm": 0.7871630029994926, "learning_rate": 5.944447614067588e-06, "loss": 0.7365, "mean_token_accuracy": 0.7753696170648604, "step": 905 }, { "epoch": 0.6733259341472438, "grad_norm": 0.8274033791731158, "learning_rate": 5.8266154551818225e-06, "loss": 0.7148, "mean_token_accuracy": 0.7819856387093902, "step": 910 }, { "epoch": 0.6770255271920089, "grad_norm": 0.8585256390323572, "learning_rate": 5.709480836995509e-06, "loss": 0.7099, "mean_token_accuracy": 0.7823146568890846, "step": 915 }, { "epoch": 0.680725120236774, "grad_norm": 0.8245071508434019, "learning_rate": 5.593063337422595e-06, "loss": 0.7205, "mean_token_accuracy": 0.7803457311979414, "step": 920 }, { "epoch": 0.684424713281539, "grad_norm": 0.8137995619905571, "learning_rate": 5.477382414517625e-06, "loss": 0.7443, "mean_token_accuracy": 0.7742054658480322, "step": 925 }, { "epoch": 0.6881243063263041, "grad_norm": 0.8435356879873597, "learning_rate": 5.362457403223495e-06, "loss": 0.7321, "mean_token_accuracy": 0.7777223904380888, "step": 930 }, { "epoch": 0.6918238993710691, "grad_norm": 0.8275951168813332, "learning_rate": 5.248307512139818e-06, "loss": 0.7327, "mean_token_accuracy": 0.7760629303675801, "step": 935 }, { "epoch": 0.6955234924158342, "grad_norm": 0.7986090408289105, "learning_rate": 5.134951820312402e-06, "loss": 0.6797, "mean_token_accuracy": 0.792751892181402, "step": 940 }, { "epoch": 0.6992230854605993, "grad_norm": 0.8344358039184492, "learning_rate": 5.022409274044346e-06, "loss": 0.7484, "mean_token_accuracy": 0.7706997735792959, "step": 945 }, { "epoch": 0.7029226785053644, "grad_norm": 0.7590018472193605, "learning_rate": 4.910698683729371e-06, "loss": 0.7246, "mean_token_accuracy": 0.7794280766358616, "step": 950 }, { "epoch": 0.7066222715501295, "grad_norm": 0.8418460117578578, "learning_rate": 4.799838720707847e-06, "loss": 0.6786, "mean_token_accuracy": 0.7908993228344331, "step": 955 }, { "epoch": 0.7103218645948945, "grad_norm": 0.8709719228420416, "learning_rate": 4.6898479141460415e-06, "loss": 0.7067, "mean_token_accuracy": 0.7843742462359516, "step": 960 }, { "epoch": 0.7140214576396596, "grad_norm": 0.8535611787813129, "learning_rate": 4.580744647939163e-06, "loss": 0.7435, "mean_token_accuracy": 0.7730417779644787, "step": 965 }, { "epoch": 0.7177210506844247, "grad_norm": 0.8885996504303695, "learning_rate": 4.472547157638674e-06, "loss": 0.694, "mean_token_accuracy": 0.7880904268803789, "step": 970 }, { "epoch": 0.7214206437291898, "grad_norm": 0.9587623922743913, "learning_rate": 4.365273527404384e-06, "loss": 0.7401, "mean_token_accuracy": 0.7739410892854838, "step": 975 }, { "epoch": 0.7251202367739549, "grad_norm": 0.8107566238251871, "learning_rate": 4.258941686981864e-06, "loss": 0.7242, "mean_token_accuracy": 0.779501991954344, "step": 980 }, { "epoch": 0.72881982981872, "grad_norm": 0.8788408769548844, "learning_rate": 4.15356940870567e-06, "loss": 0.7382, "mean_token_accuracy": 0.7752977269534177, "step": 985 }, { "epoch": 0.732519422863485, "grad_norm": 0.8161841348783122, "learning_rate": 4.049174304528857e-06, "loss": 0.7122, "mean_token_accuracy": 0.7834159122342004, "step": 990 }, { "epoch": 0.7362190159082501, "grad_norm": 0.8045647610652492, "learning_rate": 3.945773823079315e-06, "loss": 0.7085, "mean_token_accuracy": 0.7838271814299385, "step": 995 }, { "epoch": 0.7399186089530152, "grad_norm": 0.7888951101393026, "learning_rate": 3.8433852467434175e-06, "loss": 0.7436, "mean_token_accuracy": 0.7715282648856208, "step": 1000 }, { "epoch": 0.7399186089530152, "eval_loss": 0.7540702819824219, "eval_mean_token_accuracy": 0.7684317578583987, "eval_runtime": 12.9145, "eval_samples_per_second": 9.989, "eval_steps_per_second": 2.555, "step": 1000 }, { "epoch": 0.7436182019977803, "grad_norm": 0.8476150102495247, "learning_rate": 3.742025688777413e-06, "loss": 0.755, "mean_token_accuracy": 0.769747059368865, "step": 1005 }, { "epoch": 0.7473177950425454, "grad_norm": 0.8447327457715914, "learning_rate": 3.641712090447125e-06, "loss": 0.7084, "mean_token_accuracy": 0.7835149860526573, "step": 1010 }, { "epoch": 0.7510173880873104, "grad_norm": 0.7966093391523419, "learning_rate": 3.542461218196379e-06, "loss": 0.7215, "mean_token_accuracy": 0.7790490334715525, "step": 1015 }, { "epoch": 0.7547169811320755, "grad_norm": 0.8909915448158038, "learning_rate": 3.444289660844665e-06, "loss": 0.7361, "mean_token_accuracy": 0.7758344142261316, "step": 1020 }, { "epoch": 0.7584165741768405, "grad_norm": 0.828582805131554, "learning_rate": 3.347213826814456e-06, "loss": 0.7317, "mean_token_accuracy": 0.7749168212587448, "step": 1025 }, { "epoch": 0.7621161672216056, "grad_norm": 0.8172943841676832, "learning_rate": 3.2512499413887255e-06, "loss": 0.7206, "mean_token_accuracy": 0.7795061516510636, "step": 1030 }, { "epoch": 0.7658157602663707, "grad_norm": 0.7948969347509628, "learning_rate": 3.1564140439990256e-06, "loss": 0.7534, "mean_token_accuracy": 0.7695401774020444, "step": 1035 }, { "epoch": 0.7695153533111357, "grad_norm": 0.7639527363991839, "learning_rate": 3.0627219855446667e-06, "loss": 0.6907, "mean_token_accuracy": 0.7891346245991027, "step": 1040 }, { "epoch": 0.7732149463559008, "grad_norm": 0.796687026157432, "learning_rate": 2.970189425743383e-06, "loss": 0.7365, "mean_token_accuracy": 0.7739980038896167, "step": 1045 }, { "epoch": 0.7769145394006659, "grad_norm": 0.7960037813992309, "learning_rate": 2.8788318305139808e-06, "loss": 0.7513, "mean_token_accuracy": 0.7689661221516821, "step": 1050 }, { "epoch": 0.780614132445431, "grad_norm": 0.7733589592006201, "learning_rate": 2.7886644693913333e-06, "loss": 0.7443, "mean_token_accuracy": 0.7728263504337974, "step": 1055 }, { "epoch": 0.7843137254901961, "grad_norm": 0.8565688482800564, "learning_rate": 2.6997024129742544e-06, "loss": 0.7328, "mean_token_accuracy": 0.7762133218500764, "step": 1060 }, { "epoch": 0.7880133185349611, "grad_norm": 0.8923008390952016, "learning_rate": 2.611960530406572e-06, "loss": 0.728, "mean_token_accuracy": 0.7796289219427248, "step": 1065 }, { "epoch": 0.7917129115797262, "grad_norm": 0.7873211808203069, "learning_rate": 2.5254534868919077e-06, "loss": 0.726, "mean_token_accuracy": 0.7775058698315347, "step": 1070 }, { "epoch": 0.7954125046244913, "grad_norm": 0.7719831428886607, "learning_rate": 2.4401957412425213e-06, "loss": 0.7309, "mean_token_accuracy": 0.7749563569631559, "step": 1075 }, { "epoch": 0.7991120976692564, "grad_norm": 0.835137021064406, "learning_rate": 2.3562015434626784e-06, "loss": 0.7145, "mean_token_accuracy": 0.7820543807444283, "step": 1080 }, { "epoch": 0.8028116907140215, "grad_norm": 0.7671221716084607, "learning_rate": 2.273484932366874e-06, "loss": 0.7021, "mean_token_accuracy": 0.7854601544650027, "step": 1085 }, { "epoch": 0.8065112837587866, "grad_norm": 0.7788468903413968, "learning_rate": 2.192059733233408e-06, "loss": 0.7244, "mean_token_accuracy": 0.7782192310784752, "step": 1090 }, { "epoch": 0.8102108768035516, "grad_norm": 0.8041148765506401, "learning_rate": 2.111939555493603e-06, "loss": 0.7225, "mean_token_accuracy": 0.7786757617228395, "step": 1095 }, { "epoch": 0.8139104698483167, "grad_norm": 0.9148318869053712, "learning_rate": 2.0331377904571303e-06, "loss": 0.745, "mean_token_accuracy": 0.7731225924993291, "step": 1100 }, { "epoch": 0.8139104698483167, "eval_loss": 0.7510696649551392, "eval_mean_token_accuracy": 0.7692631147293293, "eval_runtime": 12.9268, "eval_samples_per_second": 9.979, "eval_steps_per_second": 2.553, "step": 1100 }, { "epoch": 0.8176100628930818, "grad_norm": 0.8176644589533959, "learning_rate": 1.9556676090737803e-06, "loss": 0.7544, "mean_token_accuracy": 0.7705814013113146, "step": 1105 }, { "epoch": 0.8213096559378469, "grad_norm": 0.751484746704062, "learning_rate": 1.879541959732072e-06, "loss": 0.7133, "mean_token_accuracy": 0.7810593845260246, "step": 1110 }, { "epoch": 0.825009248982612, "grad_norm": 0.8448196083310406, "learning_rate": 1.8047735660950427e-06, "loss": 0.7088, "mean_token_accuracy": 0.7839263073608779, "step": 1115 }, { "epoch": 0.8287088420273769, "grad_norm": 0.7666551359898016, "learning_rate": 1.7313749249736266e-06, "loss": 0.7225, "mean_token_accuracy": 0.7800409694989299, "step": 1120 }, { "epoch": 0.832408435072142, "grad_norm": 0.7750630686515836, "learning_rate": 1.6593583042379192e-06, "loss": 0.7302, "mean_token_accuracy": 0.7764794004807236, "step": 1125 }, { "epoch": 0.8361080281169071, "grad_norm": 0.7448783923471559, "learning_rate": 1.5887357407667314e-06, "loss": 0.7303, "mean_token_accuracy": 0.7770985998568949, "step": 1130 }, { "epoch": 0.8398076211616722, "grad_norm": 0.8617250383965689, "learning_rate": 1.5195190384357405e-06, "loss": 0.7261, "mean_token_accuracy": 0.7790343243952108, "step": 1135 }, { "epoch": 0.8435072142064373, "grad_norm": 0.7315929117396289, "learning_rate": 1.4517197661445893e-06, "loss": 0.7103, "mean_token_accuracy": 0.7820174506567448, "step": 1140 }, { "epoch": 0.8472068072512023, "grad_norm": 0.7464816300853588, "learning_rate": 1.3853492558832472e-06, "loss": 0.7248, "mean_token_accuracy": 0.7778668411101556, "step": 1145 }, { "epoch": 0.8509064002959674, "grad_norm": 0.8027284787026119, "learning_rate": 1.3204186008379926e-06, "loss": 0.7142, "mean_token_accuracy": 0.7828477066310621, "step": 1150 }, { "epoch": 0.8546059933407325, "grad_norm": 0.7867579697995757, "learning_rate": 1.2569386535372807e-06, "loss": 0.7411, "mean_token_accuracy": 0.7738658084540277, "step": 1155 }, { "epoch": 0.8583055863854976, "grad_norm": 0.8069270526220779, "learning_rate": 1.1949200240378577e-06, "loss": 0.7066, "mean_token_accuracy": 0.7833467087206057, "step": 1160 }, { "epoch": 0.8620051794302627, "grad_norm": 0.8115083552759281, "learning_rate": 1.1343730781513896e-06, "loss": 0.7117, "mean_token_accuracy": 0.782034573182628, "step": 1165 }, { "epoch": 0.8657047724750278, "grad_norm": 0.7379062498914578, "learning_rate": 1.0753079357119134e-06, "loss": 0.7334, "mean_token_accuracy": 0.7753454314583148, "step": 1170 }, { "epoch": 0.8694043655197928, "grad_norm": 0.7894649463121637, "learning_rate": 1.017734468884417e-06, "loss": 0.6873, "mean_token_accuracy": 0.7897845425207853, "step": 1175 }, { "epoch": 0.8731039585645579, "grad_norm": 0.8023692788963809, "learning_rate": 9.616623005147952e-07, "loss": 0.7416, "mean_token_accuracy": 0.7721576809793975, "step": 1180 }, { "epoch": 0.876803551609323, "grad_norm": 0.7552635215864901, "learning_rate": 9.071008025214767e-07, "loss": 0.6686, "mean_token_accuracy": 0.7955690867836535, "step": 1185 }, { "epoch": 0.8805031446540881, "grad_norm": 0.7427939874056083, "learning_rate": 8.540590943290128e-07, "loss": 0.7366, "mean_token_accuracy": 0.7745106495210125, "step": 1190 }, { "epoch": 0.8842027376988532, "grad_norm": 0.7579131248407016, "learning_rate": 8.025460413438457e-07, "loss": 0.7146, "mean_token_accuracy": 0.7814567025093309, "step": 1195 }, { "epoch": 0.8879023307436182, "grad_norm": 0.8173415852977716, "learning_rate": 7.525702534725443e-07, "loss": 0.7231, "mean_token_accuracy": 0.7785201805258471, "step": 1200 }, { "epoch": 0.8879023307436182, "eval_loss": 0.749515950679779, "eval_mean_token_accuracy": 0.7696483845818836, "eval_runtime": 12.9328, "eval_samples_per_second": 9.975, "eval_steps_per_second": 2.552, "step": 1200 }, { "epoch": 0.8916019237883833, "grad_norm": 0.7711527562330054, "learning_rate": 7.041400836827439e-07, "loss": 0.7143, "mean_token_accuracy": 0.7821882194464815, "step": 1205 }, { "epoch": 0.8953015168331484, "grad_norm": 0.7623772941452016, "learning_rate": 6.572636266070265e-07, "loss": 0.7387, "mean_token_accuracy": 0.7745962709444631, "step": 1210 }, { "epoch": 0.8990011098779135, "grad_norm": 0.7652994503797836, "learning_rate": 6.119487171899807e-07, "loss": 0.6961, "mean_token_accuracy": 0.7874255931848155, "step": 1215 }, { "epoch": 0.9027007029226785, "grad_norm": 0.8269678511841839, "learning_rate": 5.682029293786673e-07, "loss": 0.7255, "mean_token_accuracy": 0.7770277914333097, "step": 1220 }, { "epoch": 0.9064002959674435, "grad_norm": 0.7302387175249154, "learning_rate": 5.26033574856708e-07, "loss": 0.7036, "mean_token_accuracy": 0.7842749808544124, "step": 1225 }, { "epoch": 0.9100998890122086, "grad_norm": 0.7426310574145085, "learning_rate": 4.854477018222103e-07, "loss": 0.7085, "mean_token_accuracy": 0.7827587579577575, "step": 1230 }, { "epoch": 0.9137994820569737, "grad_norm": 0.7805083509005499, "learning_rate": 4.464520938097294e-07, "loss": 0.6552, "mean_token_accuracy": 0.7984315941552589, "step": 1235 }, { "epoch": 0.9174990751017388, "grad_norm": 0.7838715084886818, "learning_rate": 4.0905326855646186e-07, "loss": 0.6978, "mean_token_accuracy": 0.7866137937789542, "step": 1240 }, { "epoch": 0.9211986681465039, "grad_norm": 0.764142848066667, "learning_rate": 3.732574769128738e-07, "loss": 0.7425, "mean_token_accuracy": 0.7735163959357182, "step": 1245 }, { "epoch": 0.9248982611912689, "grad_norm": 0.8510657590112316, "learning_rate": 3.390707017979311e-07, "loss": 0.714, "mean_token_accuracy": 0.7817031945118419, "step": 1250 }, { "epoch": 0.928597854236034, "grad_norm": 0.8104897922544609, "learning_rate": 3.06498657199108e-07, "loss": 0.6972, "mean_token_accuracy": 0.7874145607415896, "step": 1255 }, { "epoch": 0.9322974472807991, "grad_norm": 0.8439497662820569, "learning_rate": 2.7554678721735675e-07, "loss": 0.7267, "mean_token_accuracy": 0.7782783773976328, "step": 1260 }, { "epoch": 0.9359970403255642, "grad_norm": 0.7506475456675276, "learning_rate": 2.4622026515717654e-07, "loss": 0.7116, "mean_token_accuracy": 0.7825011111093741, "step": 1265 }, { "epoch": 0.9396966333703293, "grad_norm": 0.796131840794368, "learning_rate": 2.1852399266194312e-07, "loss": 0.7043, "mean_token_accuracy": 0.7831285774300467, "step": 1270 }, { "epoch": 0.9433962264150944, "grad_norm": 0.8874806771851743, "learning_rate": 1.9246259889464935e-07, "loss": 0.7372, "mean_token_accuracy": 0.7754287878050505, "step": 1275 }, { "epoch": 0.9470958194598594, "grad_norm": 0.7616413651005322, "learning_rate": 1.6804043976418438e-07, "loss": 0.7115, "mean_token_accuracy": 0.782469409255144, "step": 1280 }, { "epoch": 0.9507954125046245, "grad_norm": 0.7354908780262112, "learning_rate": 1.4526159719728595e-07, "loss": 0.6923, "mean_token_accuracy": 0.7877402800804598, "step": 1285 }, { "epoch": 0.9544950055493896, "grad_norm": 0.8147962775571531, "learning_rate": 1.24129878456285e-07, "loss": 0.7266, "mean_token_accuracy": 0.7786802824447104, "step": 1290 }, { "epoch": 0.9581945985941547, "grad_norm": 0.7892855143587276, "learning_rate": 1.0464881550276362e-07, "loss": 0.7106, "mean_token_accuracy": 0.7812839457332936, "step": 1295 }, { "epoch": 0.9618941916389198, "grad_norm": 0.7807996872371684, "learning_rate": 8.682166440721729e-08, "loss": 0.6851, "mean_token_accuracy": 0.7892695119285057, "step": 1300 }, { "epoch": 0.9618941916389198, "eval_loss": 0.7488865852355957, "eval_mean_token_accuracy": 0.7697332311412013, "eval_runtime": 12.9253, "eval_samples_per_second": 9.98, "eval_steps_per_second": 2.553, "step": 1300 }, { "epoch": 0.9655937846836848, "grad_norm": 0.7500272096924462, "learning_rate": 7.065140480483235e-08, "loss": 0.722, "mean_token_accuracy": 0.7809713685380442, "step": 1305 }, { "epoch": 0.9692933777284499, "grad_norm": 0.7740539516194233, "learning_rate": 5.6140739397474445e-08, "loss": 0.741, "mean_token_accuracy": 0.7735731347336795, "step": 1310 }, { "epoch": 0.9729929707732149, "grad_norm": 0.7621995001113339, "learning_rate": 4.329209350195651e-08, "loss": 0.7277, "mean_token_accuracy": 0.7768363708086847, "step": 1315 }, { "epoch": 0.97669256381798, "grad_norm": 0.7190084013715407, "learning_rate": 3.210761464466639e-08, "loss": 0.6751, "mean_token_accuracy": 0.7917913948419976, "step": 1320 }, { "epoch": 0.9803921568627451, "grad_norm": 0.832622944037137, "learning_rate": 2.2589172202635014e-08, "loss": 0.7183, "mean_token_accuracy": 0.7814518443094862, "step": 1325 }, { "epoch": 0.9840917499075101, "grad_norm": 0.8517126035217566, "learning_rate": 1.4738357091084177e-08, "loss": 0.7223, "mean_token_accuracy": 0.7779738218649835, "step": 1330 }, { "epoch": 0.9877913429522752, "grad_norm": 0.7735765973323167, "learning_rate": 8.556481497521418e-09, "loss": 0.7064, "mean_token_accuracy": 0.7825683170955192, "step": 1335 }, { "epoch": 0.9914909359970403, "grad_norm": 0.7713072693524988, "learning_rate": 4.044578662419918e-09, "loss": 0.7179, "mean_token_accuracy": 0.7801742579673625, "step": 1340 }, { "epoch": 0.9951905290418054, "grad_norm": 0.7685841540071269, "learning_rate": 1.203402706525525e-09, "loss": 0.7493, "mean_token_accuracy": 0.7706208056883479, "step": 1345 }, { "epoch": 0.9988901220865705, "grad_norm": 0.8159997368959132, "learning_rate": 3.342850480869686e-11, "loss": 0.7328, "mean_token_accuracy": 0.7759899768837795, "step": 1350 }, { "epoch": 0.9996300406955235, "mean_token_accuracy": 0.7755799181439557, "step": 1351, "total_flos": 76966677970944.0, "train_loss": 0.759784622734163, "train_runtime": 8483.4218, "train_samples_per_second": 2.549, "train_steps_per_second": 0.159 } ], "logging_steps": 5, "max_steps": 1351, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 76966677970944.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }