|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9996300406955235, |
|
"eval_steps": 100, |
|
"global_step": 1351, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003699593044765076, |
|
"grad_norm": 2.757993258887922, |
|
"learning_rate": 7.352941176470589e-07, |
|
"loss": 1.1067, |
|
"mean_token_accuracy": 0.708824381857028, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.007399186089530152, |
|
"grad_norm": 2.5907330458452984, |
|
"learning_rate": 1.4705882352941177e-06, |
|
"loss": 1.0679, |
|
"mean_token_accuracy": 0.7169299986512495, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.011098779134295227, |
|
"grad_norm": 2.241392532504099, |
|
"learning_rate": 2.2058823529411767e-06, |
|
"loss": 1.1238, |
|
"mean_token_accuracy": 0.6992935634786113, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.014798372179060304, |
|
"grad_norm": 1.5956862014648674, |
|
"learning_rate": 2.9411764705882355e-06, |
|
"loss": 1.0062, |
|
"mean_token_accuracy": 0.7288372261973354, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01849796522382538, |
|
"grad_norm": 1.6119131199982493, |
|
"learning_rate": 3.6764705882352946e-06, |
|
"loss": 1.0517, |
|
"mean_token_accuracy": 0.7142909651691335, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.022197558268590455, |
|
"grad_norm": 1.139657389593639, |
|
"learning_rate": 4.411764705882353e-06, |
|
"loss": 0.9593, |
|
"mean_token_accuracy": 0.7346081897149975, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02589715131335553, |
|
"grad_norm": 1.2790776332632598, |
|
"learning_rate": 5.147058823529411e-06, |
|
"loss": 0.9805, |
|
"mean_token_accuracy": 0.72720137806358, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.029596744358120607, |
|
"grad_norm": 1.104655854611052, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 0.9114, |
|
"mean_token_accuracy": 0.7425328465386071, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.033296337402885685, |
|
"grad_norm": 0.9408582472471563, |
|
"learning_rate": 6.61764705882353e-06, |
|
"loss": 0.8885, |
|
"mean_token_accuracy": 0.7472382565404383, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03699593044765076, |
|
"grad_norm": 1.0580888719684283, |
|
"learning_rate": 7.352941176470589e-06, |
|
"loss": 0.8658, |
|
"mean_token_accuracy": 0.7493579770562304, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.040695523492415835, |
|
"grad_norm": 0.8832977290583615, |
|
"learning_rate": 8.088235294117648e-06, |
|
"loss": 0.841, |
|
"mean_token_accuracy": 0.7568992237585881, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.04439511653718091, |
|
"grad_norm": 0.9301447232199826, |
|
"learning_rate": 8.823529411764707e-06, |
|
"loss": 0.8677, |
|
"mean_token_accuracy": 0.7502923381848673, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.048094709581945984, |
|
"grad_norm": 0.856501238425618, |
|
"learning_rate": 9.558823529411766e-06, |
|
"loss": 0.878, |
|
"mean_token_accuracy": 0.7445654933582921, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05179430262671106, |
|
"grad_norm": 1.0586539300870963, |
|
"learning_rate": 1.0294117647058823e-05, |
|
"loss": 0.8825, |
|
"mean_token_accuracy": 0.7438754593389819, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.05549389567147614, |
|
"grad_norm": 1.1122367826346538, |
|
"learning_rate": 1.1029411764705885e-05, |
|
"loss": 0.8775, |
|
"mean_token_accuracy": 0.7447408372141248, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.059193488716241215, |
|
"grad_norm": 1.0632613858567348, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.8347, |
|
"mean_token_accuracy": 0.7567175732049909, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06289308176100629, |
|
"grad_norm": 0.9476656757993303, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.8305, |
|
"mean_token_accuracy": 0.7566193958141736, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.06659267480577137, |
|
"grad_norm": 1.1083079276477652, |
|
"learning_rate": 1.323529411764706e-05, |
|
"loss": 0.8466, |
|
"mean_token_accuracy": 0.7519149864344391, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07029226785053644, |
|
"grad_norm": 0.9768809343076362, |
|
"learning_rate": 1.3970588235294118e-05, |
|
"loss": 0.7894, |
|
"mean_token_accuracy": 0.768089470723049, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.07399186089530152, |
|
"grad_norm": 1.0897420744707864, |
|
"learning_rate": 1.4705882352941179e-05, |
|
"loss": 0.842, |
|
"mean_token_accuracy": 0.7519371481531969, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07399186089530152, |
|
"eval_loss": 0.8507319688796997, |
|
"eval_mean_token_accuracy": 0.7484970684902305, |
|
"eval_runtime": 13.1139, |
|
"eval_samples_per_second": 9.837, |
|
"eval_steps_per_second": 2.516, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07769145394006659, |
|
"grad_norm": 1.0530600966285744, |
|
"learning_rate": 1.5441176470588237e-05, |
|
"loss": 0.8609, |
|
"mean_token_accuracy": 0.7470814372661423, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.08139104698483167, |
|
"grad_norm": 0.9420676582335842, |
|
"learning_rate": 1.6176470588235296e-05, |
|
"loss": 0.8056, |
|
"mean_token_accuracy": 0.7604963244668818, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.08509064002959675, |
|
"grad_norm": 0.868613480082479, |
|
"learning_rate": 1.6911764705882355e-05, |
|
"loss": 0.8432, |
|
"mean_token_accuracy": 0.7519378614781397, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.08879023307436182, |
|
"grad_norm": 1.0518122686708429, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 0.8237, |
|
"mean_token_accuracy": 0.7588873826848553, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0924898261191269, |
|
"grad_norm": 1.135357117838434, |
|
"learning_rate": 1.8382352941176472e-05, |
|
"loss": 0.8417, |
|
"mean_token_accuracy": 0.7510501482017762, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.09618941916389197, |
|
"grad_norm": 1.0511646764690201, |
|
"learning_rate": 1.911764705882353e-05, |
|
"loss": 0.8024, |
|
"mean_token_accuracy": 0.7630132146126162, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.09988901220865705, |
|
"grad_norm": 1.003659422839662, |
|
"learning_rate": 1.985294117647059e-05, |
|
"loss": 0.7846, |
|
"mean_token_accuracy": 0.7684428405908383, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.10358860525342212, |
|
"grad_norm": 0.9783718356092802, |
|
"learning_rate": 1.9999465148392906e-05, |
|
"loss": 0.7978, |
|
"mean_token_accuracy": 0.7638017765978411, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1072881982981872, |
|
"grad_norm": 0.9325940574831129, |
|
"learning_rate": 1.999729241179462e-05, |
|
"loss": 0.7899, |
|
"mean_token_accuracy": 0.7657915220582932, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.11098779134295228, |
|
"grad_norm": 0.9587245678809391, |
|
"learning_rate": 1.999344872485215e-05, |
|
"loss": 0.7935, |
|
"mean_token_accuracy": 0.7643045615369954, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.11468738438771735, |
|
"grad_norm": 1.2026644648319857, |
|
"learning_rate": 1.9987934730000457e-05, |
|
"loss": 0.7922, |
|
"mean_token_accuracy": 0.7667913563744123, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.11838697743248243, |
|
"grad_norm": 1.0527655093884445, |
|
"learning_rate": 1.998075134885022e-05, |
|
"loss": 0.8036, |
|
"mean_token_accuracy": 0.7613009885033224, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1220865704772475, |
|
"grad_norm": 1.2386325448592015, |
|
"learning_rate": 1.9971899782033853e-05, |
|
"loss": 0.8013, |
|
"mean_token_accuracy": 0.7620870662678945, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.12578616352201258, |
|
"grad_norm": 1.127408462312219, |
|
"learning_rate": 1.9961381509004785e-05, |
|
"loss": 0.808, |
|
"mean_token_accuracy": 0.7593721598276708, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.12948575656677766, |
|
"grad_norm": 0.9438483145827001, |
|
"learning_rate": 1.9949198287790215e-05, |
|
"loss": 0.7861, |
|
"mean_token_accuracy": 0.7660641312412623, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.13318534961154274, |
|
"grad_norm": 0.9318614898436913, |
|
"learning_rate": 1.9935352154697257e-05, |
|
"loss": 0.8038, |
|
"mean_token_accuracy": 0.7602947954939976, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1368849426563078, |
|
"grad_norm": 0.8876177342352581, |
|
"learning_rate": 1.9919845423972603e-05, |
|
"loss": 0.757, |
|
"mean_token_accuracy": 0.7742074952233244, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.14058453570107288, |
|
"grad_norm": 1.005618517605514, |
|
"learning_rate": 1.9902680687415704e-05, |
|
"loss": 0.7985, |
|
"mean_token_accuracy": 0.7622990063822126, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.14428412874583796, |
|
"grad_norm": 1.0909612602188234, |
|
"learning_rate": 1.9883860813945596e-05, |
|
"loss": 0.7997, |
|
"mean_token_accuracy": 0.7611327796064032, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.14798372179060304, |
|
"grad_norm": 0.9223546224550383, |
|
"learning_rate": 1.986338894912137e-05, |
|
"loss": 0.7888, |
|
"mean_token_accuracy": 0.7650975947505124, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14798372179060304, |
|
"eval_loss": 0.8212586641311646, |
|
"eval_mean_token_accuracy": 0.753984452665533, |
|
"eval_runtime": 12.914, |
|
"eval_samples_per_second": 9.989, |
|
"eval_steps_per_second": 2.555, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.15168331483536812, |
|
"grad_norm": 0.9461631972455284, |
|
"learning_rate": 1.9841268514616434e-05, |
|
"loss": 0.8206, |
|
"mean_token_accuracy": 0.7560038239175771, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.15538290788013318, |
|
"grad_norm": 1.0128847648211983, |
|
"learning_rate": 1.9817503207646606e-05, |
|
"loss": 0.7827, |
|
"mean_token_accuracy": 0.7663386260842018, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.15908250092489826, |
|
"grad_norm": 1.0613216235343779, |
|
"learning_rate": 1.979209700035216e-05, |
|
"loss": 0.8097, |
|
"mean_token_accuracy": 0.7595550837031688, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.16278209396966334, |
|
"grad_norm": 1.0811772085526155, |
|
"learning_rate": 1.976505413913393e-05, |
|
"loss": 0.7771, |
|
"mean_token_accuracy": 0.7685113508318921, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.16648168701442842, |
|
"grad_norm": 1.0032217781891688, |
|
"learning_rate": 1.9736379143943565e-05, |
|
"loss": 0.7788, |
|
"mean_token_accuracy": 0.767374709683548, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1701812800591935, |
|
"grad_norm": 1.0033846900829124, |
|
"learning_rate": 1.9706076807528044e-05, |
|
"loss": 0.7792, |
|
"mean_token_accuracy": 0.7671742181015657, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.17388087310395856, |
|
"grad_norm": 1.016761717770083, |
|
"learning_rate": 1.967415219462864e-05, |
|
"loss": 0.776, |
|
"mean_token_accuracy": 0.7692263473301846, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.17758046614872364, |
|
"grad_norm": 1.0390090395504632, |
|
"learning_rate": 1.9640610641134383e-05, |
|
"loss": 0.8125, |
|
"mean_token_accuracy": 0.7578594416116643, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.18128005919348872, |
|
"grad_norm": 0.9506480558904421, |
|
"learning_rate": 1.9605457753190224e-05, |
|
"loss": 0.774, |
|
"mean_token_accuracy": 0.7669808330485, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1849796522382538, |
|
"grad_norm": 0.9074565067919744, |
|
"learning_rate": 1.9568699406260016e-05, |
|
"loss": 0.7631, |
|
"mean_token_accuracy": 0.7712889443688947, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.18867924528301888, |
|
"grad_norm": 0.870192572015487, |
|
"learning_rate": 1.953034174414449e-05, |
|
"loss": 0.8041, |
|
"mean_token_accuracy": 0.7584920492345467, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.19237883832778394, |
|
"grad_norm": 0.9436163053750114, |
|
"learning_rate": 1.9490391177954383e-05, |
|
"loss": 0.7697, |
|
"mean_token_accuracy": 0.7687670857457188, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.19607843137254902, |
|
"grad_norm": 0.8454337167883038, |
|
"learning_rate": 1.944885438503888e-05, |
|
"loss": 0.7854, |
|
"mean_token_accuracy": 0.762450394993266, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.1997780244173141, |
|
"grad_norm": 0.8388911143997999, |
|
"learning_rate": 1.9405738307869565e-05, |
|
"loss": 0.7568, |
|
"mean_token_accuracy": 0.7739168937821217, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.20347761746207918, |
|
"grad_norm": 0.8905388426225519, |
|
"learning_rate": 1.936105015288003e-05, |
|
"loss": 0.7874, |
|
"mean_token_accuracy": 0.7657060442327474, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.20717721050684423, |
|
"grad_norm": 0.9449437240241089, |
|
"learning_rate": 1.9314797389261426e-05, |
|
"loss": 0.7964, |
|
"mean_token_accuracy": 0.7608504929094035, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.21087680355160932, |
|
"grad_norm": 0.9586909770652294, |
|
"learning_rate": 1.9266987747714036e-05, |
|
"loss": 0.7539, |
|
"mean_token_accuracy": 0.7740995999688999, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.2145763965963744, |
|
"grad_norm": 0.8457904946858513, |
|
"learning_rate": 1.9217629219155172e-05, |
|
"loss": 0.7596, |
|
"mean_token_accuracy": 0.7711086747415601, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.21827598964113948, |
|
"grad_norm": 0.9562284538878311, |
|
"learning_rate": 1.916673005338357e-05, |
|
"loss": 0.7636, |
|
"mean_token_accuracy": 0.7705879155908494, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"grad_norm": 0.9311027437584551, |
|
"learning_rate": 1.9114298757700508e-05, |
|
"loss": 0.7833, |
|
"mean_token_accuracy": 0.7640983829119388, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22197558268590456, |
|
"eval_loss": 0.8046144247055054, |
|
"eval_mean_token_accuracy": 0.7572653228255071, |
|
"eval_runtime": 12.9112, |
|
"eval_samples_per_second": 9.991, |
|
"eval_steps_per_second": 2.556, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2256751757306696, |
|
"grad_norm": 0.981751974465348, |
|
"learning_rate": 1.9060344095487916e-05, |
|
"loss": 0.7517, |
|
"mean_token_accuracy": 0.7743910330942259, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.2293747687754347, |
|
"grad_norm": 0.9758899701140303, |
|
"learning_rate": 1.9004875084743624e-05, |
|
"loss": 0.7744, |
|
"mean_token_accuracy": 0.7681139265197442, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.23307436182019978, |
|
"grad_norm": 1.0996867282039497, |
|
"learning_rate": 1.8947900996574133e-05, |
|
"loss": 0.784, |
|
"mean_token_accuracy": 0.7653104608265264, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.23677395486496486, |
|
"grad_norm": 0.8876749775333296, |
|
"learning_rate": 1.8889431353645004e-05, |
|
"loss": 0.7856, |
|
"mean_token_accuracy": 0.7637695086387486, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.24047354790972994, |
|
"grad_norm": 0.9254315942079439, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 0.7478, |
|
"mean_token_accuracy": 0.7747918700572913, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.244173140954495, |
|
"grad_norm": 0.8975251081081479, |
|
"learning_rate": 1.8768044742374008e-05, |
|
"loss": 0.7838, |
|
"mean_token_accuracy": 0.7662794783326515, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.24787273399926008, |
|
"grad_norm": 0.9659229415705355, |
|
"learning_rate": 1.870514806262544e-05, |
|
"loss": 0.8064, |
|
"mean_token_accuracy": 0.7575206306307912, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.25157232704402516, |
|
"grad_norm": 0.8730001205655563, |
|
"learning_rate": 1.8640796401912805e-05, |
|
"loss": 0.7786, |
|
"mean_token_accuracy": 0.7647639047537955, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.25527192008879024, |
|
"grad_norm": 0.8503523554051183, |
|
"learning_rate": 1.8575000515991283e-05, |
|
"loss": 0.7303, |
|
"mean_token_accuracy": 0.7790401063745367, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.2589715131335553, |
|
"grad_norm": 0.952888955967417, |
|
"learning_rate": 1.850777140200427e-05, |
|
"loss": 0.7481, |
|
"mean_token_accuracy": 0.7754539961681624, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2626711061783204, |
|
"grad_norm": 0.8607187749209038, |
|
"learning_rate": 1.843912029664531e-05, |
|
"loss": 0.7628, |
|
"mean_token_accuracy": 0.7705045859827028, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.2663706992230855, |
|
"grad_norm": 0.9612681025779204, |
|
"learning_rate": 1.8369058674280004e-05, |
|
"loss": 0.7751, |
|
"mean_token_accuracy": 0.7662138710016833, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.2700702922678505, |
|
"grad_norm": 0.9382382938099896, |
|
"learning_rate": 1.8297598245028173e-05, |
|
"loss": 0.7723, |
|
"mean_token_accuracy": 0.767681726807316, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.2737698853126156, |
|
"grad_norm": 0.9997500486677628, |
|
"learning_rate": 1.8224750952806626e-05, |
|
"loss": 0.7611, |
|
"mean_token_accuracy": 0.7707074275165302, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.27746947835738067, |
|
"grad_norm": 1.0014266441284037, |
|
"learning_rate": 1.815052897333284e-05, |
|
"loss": 0.7505, |
|
"mean_token_accuracy": 0.7740457102243052, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.28116907140214575, |
|
"grad_norm": 1.1075366477297495, |
|
"learning_rate": 1.8074944712089925e-05, |
|
"loss": 0.7641, |
|
"mean_token_accuracy": 0.7693695886355318, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.28486866444691084, |
|
"grad_norm": 0.880418326713614, |
|
"learning_rate": 1.799801080225316e-05, |
|
"loss": 0.7539, |
|
"mean_token_accuracy": 0.7710243423063697, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2885682574916759, |
|
"grad_norm": 0.9012750826548256, |
|
"learning_rate": 1.7919740102578482e-05, |
|
"loss": 0.7781, |
|
"mean_token_accuracy": 0.7661210900072783, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.292267850536441, |
|
"grad_norm": 0.9230339938349867, |
|
"learning_rate": 1.7840145695253258e-05, |
|
"loss": 0.7708, |
|
"mean_token_accuracy": 0.7679753783644399, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2959674435812061, |
|
"grad_norm": 0.8616748960380592, |
|
"learning_rate": 1.7759240883709745e-05, |
|
"loss": 0.7643, |
|
"mean_token_accuracy": 0.7698643926023239, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2959674435812061, |
|
"eval_loss": 0.7939132452011108, |
|
"eval_mean_token_accuracy": 0.7597238566071922, |
|
"eval_runtime": 12.8935, |
|
"eval_samples_per_second": 10.005, |
|
"eval_steps_per_second": 2.559, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.29966703662597116, |
|
"grad_norm": 0.8626025158912716, |
|
"learning_rate": 1.7677039190401538e-05, |
|
"loss": 0.7945, |
|
"mean_token_accuracy": 0.7610350956243586, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.30336662967073624, |
|
"grad_norm": 0.8809551406016749, |
|
"learning_rate": 1.759355435454342e-05, |
|
"loss": 0.7554, |
|
"mean_token_accuracy": 0.7720821829110623, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.30706622271550127, |
|
"grad_norm": 0.8398343511073675, |
|
"learning_rate": 1.7508800329814993e-05, |
|
"loss": 0.758, |
|
"mean_token_accuracy": 0.7709962708725867, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.31076581576026635, |
|
"grad_norm": 0.8717214162340853, |
|
"learning_rate": 1.7422791282028457e-05, |
|
"loss": 0.7456, |
|
"mean_token_accuracy": 0.7747603119006603, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.31446540880503143, |
|
"grad_norm": 0.9182650879024069, |
|
"learning_rate": 1.7335541586760928e-05, |
|
"loss": 0.7678, |
|
"mean_token_accuracy": 0.768033024075789, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.3181650018497965, |
|
"grad_norm": 0.9270374928264972, |
|
"learning_rate": 1.7247065826951694e-05, |
|
"loss": 0.7687, |
|
"mean_token_accuracy": 0.7670009758238231, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3218645948945616, |
|
"grad_norm": 0.9239947000531302, |
|
"learning_rate": 1.715737879046483e-05, |
|
"loss": 0.7807, |
|
"mean_token_accuracy": 0.7646944309939067, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3255641879393267, |
|
"grad_norm": 0.9234310473796695, |
|
"learning_rate": 1.7066495467617552e-05, |
|
"loss": 0.722, |
|
"mean_token_accuracy": 0.7810521648421277, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.32926378098409176, |
|
"grad_norm": 0.9873873391290533, |
|
"learning_rate": 1.6974431048674714e-05, |
|
"loss": 0.7773, |
|
"mean_token_accuracy": 0.7676746345471892, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.33296337402885684, |
|
"grad_norm": 0.8848520725498183, |
|
"learning_rate": 1.6881200921309914e-05, |
|
"loss": 0.7799, |
|
"mean_token_accuracy": 0.7653884936966252, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3366629670736219, |
|
"grad_norm": 0.9045509398039985, |
|
"learning_rate": 1.6786820668033596e-05, |
|
"loss": 0.7575, |
|
"mean_token_accuracy": 0.7713310238062097, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.340362560118387, |
|
"grad_norm": 0.890193659220096, |
|
"learning_rate": 1.6691306063588583e-05, |
|
"loss": 0.7643, |
|
"mean_token_accuracy": 0.7680450171610729, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.34406215316315203, |
|
"grad_norm": 0.9121601629876329, |
|
"learning_rate": 1.6594673072313478e-05, |
|
"loss": 0.7565, |
|
"mean_token_accuracy": 0.7709845040717809, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3477617462079171, |
|
"grad_norm": 0.8596218378746548, |
|
"learning_rate": 1.6496937845474375e-05, |
|
"loss": 0.7899, |
|
"mean_token_accuracy": 0.762253394394126, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3514613392526822, |
|
"grad_norm": 0.8881090780569305, |
|
"learning_rate": 1.639811671856535e-05, |
|
"loss": 0.7716, |
|
"mean_token_accuracy": 0.766323517253549, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3551609322974473, |
|
"grad_norm": 0.9129717947745375, |
|
"learning_rate": 1.6298226208578127e-05, |
|
"loss": 0.7773, |
|
"mean_token_accuracy": 0.7662860024728196, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.35886052534221236, |
|
"grad_norm": 0.8773141616181648, |
|
"learning_rate": 1.6197283011241423e-05, |
|
"loss": 0.7823, |
|
"mean_token_accuracy": 0.7632768240832929, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.36256011838697744, |
|
"grad_norm": 0.8872263111805669, |
|
"learning_rate": 1.6095303998230432e-05, |
|
"loss": 0.7683, |
|
"mean_token_accuracy": 0.7674727430979753, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3662597114317425, |
|
"grad_norm": 0.98466468959405, |
|
"learning_rate": 1.599230621434687e-05, |
|
"loss": 0.7591, |
|
"mean_token_accuracy": 0.7693043297460778, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.3699593044765076, |
|
"grad_norm": 0.8908360555035981, |
|
"learning_rate": 1.5888306874670112e-05, |
|
"loss": 0.7362, |
|
"mean_token_accuracy": 0.7770877269382522, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3699593044765076, |
|
"eval_loss": 0.784197211265564, |
|
"eval_mean_token_accuracy": 0.7620830191395729, |
|
"eval_runtime": 12.9033, |
|
"eval_samples_per_second": 9.997, |
|
"eval_steps_per_second": 2.557, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3736588975212727, |
|
"grad_norm": 0.9048447583159639, |
|
"learning_rate": 1.5783323361679865e-05, |
|
"loss": 0.7787, |
|
"mean_token_accuracy": 0.7648656868298831, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 0.9120327051425232, |
|
"learning_rate": 1.567737322235084e-05, |
|
"loss": 0.7593, |
|
"mean_token_accuracy": 0.7714510339240377, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3810580836108028, |
|
"grad_norm": 0.8195601280151146, |
|
"learning_rate": 1.557047416521996e-05, |
|
"loss": 0.7215, |
|
"mean_token_accuracy": 0.7789905700934947, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.38475767665556787, |
|
"grad_norm": 0.9650761679887094, |
|
"learning_rate": 1.546264405742654e-05, |
|
"loss": 0.7782, |
|
"mean_token_accuracy": 0.7636861487365787, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.38845726970033295, |
|
"grad_norm": 0.8237906709737827, |
|
"learning_rate": 1.535390092172597e-05, |
|
"loss": 0.7699, |
|
"mean_token_accuracy": 0.7671755622376449, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.39215686274509803, |
|
"grad_norm": 0.8299457105893924, |
|
"learning_rate": 1.5244262933477401e-05, |
|
"loss": 0.7804, |
|
"mean_token_accuracy": 0.7637937394911022, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3958564557898631, |
|
"grad_norm": 1.0313966357425606, |
|
"learning_rate": 1.5133748417605878e-05, |
|
"loss": 0.7752, |
|
"mean_token_accuracy": 0.7650544976847276, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3995560488346282, |
|
"grad_norm": 0.8476613666163046, |
|
"learning_rate": 1.5022375845539537e-05, |
|
"loss": 0.7459, |
|
"mean_token_accuracy": 0.7739755173371613, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.4032556418793933, |
|
"grad_norm": 0.8488122984336212, |
|
"learning_rate": 1.4910163832122278e-05, |
|
"loss": 0.7282, |
|
"mean_token_accuracy": 0.7793907307750887, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.40695523492415836, |
|
"grad_norm": 0.9246732865437337, |
|
"learning_rate": 1.4797131132502464e-05, |
|
"loss": 0.7387, |
|
"mean_token_accuracy": 0.7771462284436527, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.41065482796892344, |
|
"grad_norm": 0.8693436586761074, |
|
"learning_rate": 1.4683296638998192e-05, |
|
"loss": 0.7449, |
|
"mean_token_accuracy": 0.7733528571354785, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.41435442101368847, |
|
"grad_norm": 1.0132116370018853, |
|
"learning_rate": 1.4568679377939619e-05, |
|
"loss": 0.7338, |
|
"mean_token_accuracy": 0.7772718301836594, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.41805401405845355, |
|
"grad_norm": 0.8274852436061462, |
|
"learning_rate": 1.4453298506488896e-05, |
|
"loss": 0.7468, |
|
"mean_token_accuracy": 0.7725176751538921, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.42175360710321863, |
|
"grad_norm": 0.8143005464066339, |
|
"learning_rate": 1.4337173309438236e-05, |
|
"loss": 0.7423, |
|
"mean_token_accuracy": 0.7735196406247817, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.4254532001479837, |
|
"grad_norm": 0.8624822737135253, |
|
"learning_rate": 1.4220323195986649e-05, |
|
"loss": 0.7448, |
|
"mean_token_accuracy": 0.7735122705759122, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.4291527931927488, |
|
"grad_norm": 0.8889965581940739, |
|
"learning_rate": 1.4102767696495885e-05, |
|
"loss": 0.7674, |
|
"mean_token_accuracy": 0.7686958478722382, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.4328523862375139, |
|
"grad_norm": 0.7782401935752693, |
|
"learning_rate": 1.398452645922611e-05, |
|
"loss": 0.765, |
|
"mean_token_accuracy": 0.7691531672999341, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.43655197928227896, |
|
"grad_norm": 0.8590974972802418, |
|
"learning_rate": 1.3865619247051916e-05, |
|
"loss": 0.7507, |
|
"mean_token_accuracy": 0.7729800254264771, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.44025157232704404, |
|
"grad_norm": 0.7988473909688566, |
|
"learning_rate": 1.3746065934159123e-05, |
|
"loss": 0.7242, |
|
"mean_token_accuracy": 0.780331890399065, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"grad_norm": 0.9282202265668916, |
|
"learning_rate": 1.3625886502723008e-05, |
|
"loss": 0.7587, |
|
"mean_token_accuracy": 0.7696747774936822, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4439511653718091, |
|
"eval_loss": 0.7762653827667236, |
|
"eval_mean_token_accuracy": 0.7637766286530999, |
|
"eval_runtime": 12.9012, |
|
"eval_samples_per_second": 9.999, |
|
"eval_steps_per_second": 2.558, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4476507584165742, |
|
"grad_norm": 0.8375068814255916, |
|
"learning_rate": 1.3505101039568494e-05, |
|
"loss": 0.7114, |
|
"mean_token_accuracy": 0.78174785615947, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.4513503514613392, |
|
"grad_norm": 0.8406018488041918, |
|
"learning_rate": 1.3383729732812814e-05, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.7761469410168554, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.4550499445061043, |
|
"grad_norm": 0.8214051664479577, |
|
"learning_rate": 1.3261792868491267e-05, |
|
"loss": 0.754, |
|
"mean_token_accuracy": 0.7719396256522534, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.4587495375508694, |
|
"grad_norm": 0.8854429907993514, |
|
"learning_rate": 1.3139310827166613e-05, |
|
"loss": 0.7529, |
|
"mean_token_accuracy": 0.7723088441600371, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.46244913059563447, |
|
"grad_norm": 0.8135757240558104, |
|
"learning_rate": 1.3016304080522657e-05, |
|
"loss": 0.7756, |
|
"mean_token_accuracy": 0.7642865483170395, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.46614872364039955, |
|
"grad_norm": 0.8597128387115325, |
|
"learning_rate": 1.2892793187942588e-05, |
|
"loss": 0.7856, |
|
"mean_token_accuracy": 0.7615404441461953, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.46984831668516464, |
|
"grad_norm": 0.8740917081939998, |
|
"learning_rate": 1.2768798793072708e-05, |
|
"loss": 0.7599, |
|
"mean_token_accuracy": 0.7694673674218602, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.4735479097299297, |
|
"grad_norm": 0.8455035652800976, |
|
"learning_rate": 1.2644341620372025e-05, |
|
"loss": 0.7374, |
|
"mean_token_accuracy": 0.7753507196148239, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.4772475027746948, |
|
"grad_norm": 0.8453775134661862, |
|
"learning_rate": 1.2519442471648364e-05, |
|
"loss": 0.7539, |
|
"mean_token_accuracy": 0.7711800403046617, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.4809470958194599, |
|
"grad_norm": 0.8131753165282993, |
|
"learning_rate": 1.2394122222581557e-05, |
|
"loss": 0.7476, |
|
"mean_token_accuracy": 0.7740160766147529, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.48464668886422496, |
|
"grad_norm": 0.8752309330635153, |
|
"learning_rate": 1.226840181923427e-05, |
|
"loss": 0.7234, |
|
"mean_token_accuracy": 0.7799259045764162, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.48834628190899, |
|
"grad_norm": 0.9394868684222661, |
|
"learning_rate": 1.214230227455106e-05, |
|
"loss": 0.761, |
|
"mean_token_accuracy": 0.76914448788722, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.49204587495375507, |
|
"grad_norm": 0.9190997939430626, |
|
"learning_rate": 1.201584466484629e-05, |
|
"loss": 0.7429, |
|
"mean_token_accuracy": 0.7749135561574454, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.49574546799852015, |
|
"grad_norm": 0.991650882333127, |
|
"learning_rate": 1.1889050126281405e-05, |
|
"loss": 0.7624, |
|
"mean_token_accuracy": 0.7674541114303453, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.49944506104328523, |
|
"grad_norm": 0.8727170236064898, |
|
"learning_rate": 1.1761939851332241e-05, |
|
"loss": 0.7686, |
|
"mean_token_accuracy": 0.7668983696182747, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"grad_norm": 0.8423456719916217, |
|
"learning_rate": 1.1634535085246903e-05, |
|
"loss": 0.7502, |
|
"mean_token_accuracy": 0.7719282566060406, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.5068442471328154, |
|
"grad_norm": 0.8041730146404278, |
|
"learning_rate": 1.1506857122494832e-05, |
|
"loss": 0.7557, |
|
"mean_token_accuracy": 0.7698715177090566, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.5105438401775805, |
|
"grad_norm": 0.8008949159036595, |
|
"learning_rate": 1.1378927303207637e-05, |
|
"loss": 0.7555, |
|
"mean_token_accuracy": 0.7708746251231458, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.5142434332223456, |
|
"grad_norm": 0.8535115805575514, |
|
"learning_rate": 1.12507670096123e-05, |
|
"loss": 0.7427, |
|
"mean_token_accuracy": 0.7726444687618006, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.5179430262671106, |
|
"grad_norm": 0.9312344890724055, |
|
"learning_rate": 1.1122397662457352e-05, |
|
"loss": 0.7502, |
|
"mean_token_accuracy": 0.7726102122304674, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5179430262671106, |
|
"eval_loss": 0.7686853408813477, |
|
"eval_mean_token_accuracy": 0.7649829158123188, |
|
"eval_runtime": 12.9077, |
|
"eval_samples_per_second": 9.994, |
|
"eval_steps_per_second": 2.557, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.5216426193118757, |
|
"grad_norm": 0.868812002235657, |
|
"learning_rate": 1.0993840717432582e-05, |
|
"loss": 0.739, |
|
"mean_token_accuracy": 0.7753342555426331, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5253422123566408, |
|
"grad_norm": 0.873440662985193, |
|
"learning_rate": 1.0865117661582958e-05, |
|
"loss": 0.7439, |
|
"mean_token_accuracy": 0.7739453320579491, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.5290418054014059, |
|
"grad_norm": 0.862857533425593, |
|
"learning_rate": 1.0736250009717249e-05, |
|
"loss": 0.7569, |
|
"mean_token_accuracy": 0.770897334069314, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.532741398446171, |
|
"grad_norm": 0.8639562204638955, |
|
"learning_rate": 1.0607259300812047e-05, |
|
"loss": 0.7074, |
|
"mean_token_accuracy": 0.7842015210192824, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.536440991490936, |
|
"grad_norm": 0.899875919647952, |
|
"learning_rate": 1.0478167094411733e-05, |
|
"loss": 0.7363, |
|
"mean_token_accuracy": 0.7761369932840716, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.540140584535701, |
|
"grad_norm": 0.8342218548841652, |
|
"learning_rate": 1.0348994967025012e-05, |
|
"loss": 0.7417, |
|
"mean_token_accuracy": 0.7746170812078568, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5438401775804661, |
|
"grad_norm": 0.8013624939670422, |
|
"learning_rate": 1.0219764508518595e-05, |
|
"loss": 0.7384, |
|
"mean_token_accuracy": 0.7767422075955539, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5475397706252312, |
|
"grad_norm": 0.8120468940513571, |
|
"learning_rate": 1.0090497318508687e-05, |
|
"loss": 0.7566, |
|
"mean_token_accuracy": 0.7691283622927295, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5512393636699963, |
|
"grad_norm": 0.8150917404467021, |
|
"learning_rate": 9.961215002750799e-06, |
|
"loss": 0.7624, |
|
"mean_token_accuracy": 0.7672601324412168, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.5549389567147613, |
|
"grad_norm": 0.8358908477578574, |
|
"learning_rate": 9.831939169528565e-06, |
|
"loss": 0.7198, |
|
"mean_token_accuracy": 0.7803871901829526, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.5586385497595264, |
|
"grad_norm": 0.8360497025012443, |
|
"learning_rate": 9.702691426042124e-06, |
|
"loss": 0.704, |
|
"mean_token_accuracy": 0.784279945314917, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5623381428042915, |
|
"grad_norm": 0.8484683109694328, |
|
"learning_rate": 9.573493374796694e-06, |
|
"loss": 0.742, |
|
"mean_token_accuracy": 0.773911456375648, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5660377358490566, |
|
"grad_norm": 0.801814751253701, |
|
"learning_rate": 9.444366609991916e-06, |
|
"loss": 0.7475, |
|
"mean_token_accuracy": 0.7704960987519774, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5697373288938217, |
|
"grad_norm": 0.7903451176947093, |
|
"learning_rate": 9.315332713912593e-06, |
|
"loss": 0.6802, |
|
"mean_token_accuracy": 0.7910297225776487, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5734369219385868, |
|
"grad_norm": 0.7640452473763807, |
|
"learning_rate": 9.18641325332142e-06, |
|
"loss": 0.7254, |
|
"mean_token_accuracy": 0.7779958468370307, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5771365149833518, |
|
"grad_norm": 0.7764311897247947, |
|
"learning_rate": 9.057629775854314e-06, |
|
"loss": 0.7317, |
|
"mean_token_accuracy": 0.7763394937851484, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5808361080281169, |
|
"grad_norm": 0.8245708624875354, |
|
"learning_rate": 8.929003806418934e-06, |
|
"loss": 0.7376, |
|
"mean_token_accuracy": 0.7750152545715012, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.584535701072882, |
|
"grad_norm": 0.8103882551202022, |
|
"learning_rate": 8.800556843597002e-06, |
|
"loss": 0.7259, |
|
"mean_token_accuracy": 0.7782374392513656, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.7928991222424273, |
|
"learning_rate": 8.672310356051023e-06, |
|
"loss": 0.7431, |
|
"mean_token_accuracy": 0.7736308218387102, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5919348871624122, |
|
"grad_norm": 0.8289989830065249, |
|
"learning_rate": 8.544285778936004e-06, |
|
"loss": 0.7352, |
|
"mean_token_accuracy": 0.7764898230184529, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5919348871624122, |
|
"eval_loss": 0.7624932527542114, |
|
"eval_mean_token_accuracy": 0.7669661829206922, |
|
"eval_runtime": 12.9119, |
|
"eval_samples_per_second": 9.991, |
|
"eval_steps_per_second": 2.556, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5956344802071772, |
|
"grad_norm": 0.8482327309493489, |
|
"learning_rate": 8.416504510316774e-06, |
|
"loss": 0.7194, |
|
"mean_token_accuracy": 0.7805804982214577, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.5993340732519423, |
|
"grad_norm": 0.8542094892921783, |
|
"learning_rate": 8.28898790759152e-06, |
|
"loss": 0.7332, |
|
"mean_token_accuracy": 0.7760156726467715, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6030336662967074, |
|
"grad_norm": 0.8315942810202602, |
|
"learning_rate": 8.161757283922084e-06, |
|
"loss": 0.7186, |
|
"mean_token_accuracy": 0.7800488930789287, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.6067332593414725, |
|
"grad_norm": 0.8561047694162326, |
|
"learning_rate": 8.034833904671698e-06, |
|
"loss": 0.7286, |
|
"mean_token_accuracy": 0.7782032355305302, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.6104328523862376, |
|
"grad_norm": 0.8322803740232512, |
|
"learning_rate": 7.908238983850666e-06, |
|
"loss": 0.7475, |
|
"mean_token_accuracy": 0.7716142551709843, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6141324454310025, |
|
"grad_norm": 0.8093783949232168, |
|
"learning_rate": 7.781993680570656e-06, |
|
"loss": 0.7419, |
|
"mean_token_accuracy": 0.7753669766613711, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.6178320384757676, |
|
"grad_norm": 0.8251647916013486, |
|
"learning_rate": 7.656119095508155e-06, |
|
"loss": 0.7163, |
|
"mean_token_accuracy": 0.7810057582729616, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.6215316315205327, |
|
"grad_norm": 0.8936545458969576, |
|
"learning_rate": 7.530636267377706e-06, |
|
"loss": 0.7212, |
|
"mean_token_accuracy": 0.7782000869575987, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6252312245652978, |
|
"grad_norm": 0.8327340922107708, |
|
"learning_rate": 7.405566169415481e-06, |
|
"loss": 0.7417, |
|
"mean_token_accuracy": 0.7713689918387485, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.6289308176100629, |
|
"grad_norm": 0.9240689034622575, |
|
"learning_rate": 7.280929705873818e-06, |
|
"loss": 0.7671, |
|
"mean_token_accuracy": 0.7674969695134786, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.632630410654828, |
|
"grad_norm": 0.8604187816835673, |
|
"learning_rate": 7.15674770852727e-06, |
|
"loss": 0.7714, |
|
"mean_token_accuracy": 0.765074378427229, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.636330003699593, |
|
"grad_norm": 0.8575462475833365, |
|
"learning_rate": 7.033040933190776e-06, |
|
"loss": 0.7485, |
|
"mean_token_accuracy": 0.7725655493857382, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.6400295967443581, |
|
"grad_norm": 0.7868065852245846, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 0.703, |
|
"mean_token_accuracy": 0.7851972669426284, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.6437291897891232, |
|
"grad_norm": 0.817598798759793, |
|
"learning_rate": 6.787135671208126e-06, |
|
"loss": 0.7576, |
|
"mean_token_accuracy": 0.7701196194143121, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.6474287828338883, |
|
"grad_norm": 0.7951083662310198, |
|
"learning_rate": 6.6649782852385554e-06, |
|
"loss": 0.7459, |
|
"mean_token_accuracy": 0.772123421166796, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6511283758786534, |
|
"grad_norm": 0.7271974698252879, |
|
"learning_rate": 6.543378315762634e-06, |
|
"loss": 0.7247, |
|
"mean_token_accuracy": 0.7791351612791593, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6548279689234184, |
|
"grad_norm": 0.8824398500626421, |
|
"learning_rate": 6.42235608703441e-06, |
|
"loss": 0.7606, |
|
"mean_token_accuracy": 0.7677733208877957, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6585275619681835, |
|
"grad_norm": 0.8552464856465032, |
|
"learning_rate": 6.301931826744189e-06, |
|
"loss": 0.7419, |
|
"mean_token_accuracy": 0.7743753446438143, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6622271550129486, |
|
"grad_norm": 0.8306173604508144, |
|
"learning_rate": 6.18212566263765e-06, |
|
"loss": 0.7378, |
|
"mean_token_accuracy": 0.7754989605528264, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"grad_norm": 0.8046739019546305, |
|
"learning_rate": 6.0629576191517035e-06, |
|
"loss": 0.7254, |
|
"mean_token_accuracy": 0.7763354265372273, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"eval_loss": 0.7582561373710632, |
|
"eval_mean_token_accuracy": 0.7673597290333886, |
|
"eval_runtime": 12.9256, |
|
"eval_samples_per_second": 9.98, |
|
"eval_steps_per_second": 2.553, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6696263411024788, |
|
"grad_norm": 0.7871630029994926, |
|
"learning_rate": 5.944447614067588e-06, |
|
"loss": 0.7365, |
|
"mean_token_accuracy": 0.7753696170648604, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6733259341472438, |
|
"grad_norm": 0.8274033791731158, |
|
"learning_rate": 5.8266154551818225e-06, |
|
"loss": 0.7148, |
|
"mean_token_accuracy": 0.7819856387093902, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6770255271920089, |
|
"grad_norm": 0.8585256390323572, |
|
"learning_rate": 5.709480836995509e-06, |
|
"loss": 0.7099, |
|
"mean_token_accuracy": 0.7823146568890846, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.680725120236774, |
|
"grad_norm": 0.8245071508434019, |
|
"learning_rate": 5.593063337422595e-06, |
|
"loss": 0.7205, |
|
"mean_token_accuracy": 0.7803457311979414, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.684424713281539, |
|
"grad_norm": 0.8137995619905571, |
|
"learning_rate": 5.477382414517625e-06, |
|
"loss": 0.7443, |
|
"mean_token_accuracy": 0.7742054658480322, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.6881243063263041, |
|
"grad_norm": 0.8435356879873597, |
|
"learning_rate": 5.362457403223495e-06, |
|
"loss": 0.7321, |
|
"mean_token_accuracy": 0.7777223904380888, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6918238993710691, |
|
"grad_norm": 0.8275951168813332, |
|
"learning_rate": 5.248307512139818e-06, |
|
"loss": 0.7327, |
|
"mean_token_accuracy": 0.7760629303675801, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.6955234924158342, |
|
"grad_norm": 0.7986090408289105, |
|
"learning_rate": 5.134951820312402e-06, |
|
"loss": 0.6797, |
|
"mean_token_accuracy": 0.792751892181402, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6992230854605993, |
|
"grad_norm": 0.8344358039184492, |
|
"learning_rate": 5.022409274044346e-06, |
|
"loss": 0.7484, |
|
"mean_token_accuracy": 0.7706997735792959, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.7029226785053644, |
|
"grad_norm": 0.7590018472193605, |
|
"learning_rate": 4.910698683729371e-06, |
|
"loss": 0.7246, |
|
"mean_token_accuracy": 0.7794280766358616, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.7066222715501295, |
|
"grad_norm": 0.8418460117578578, |
|
"learning_rate": 4.799838720707847e-06, |
|
"loss": 0.6786, |
|
"mean_token_accuracy": 0.7908993228344331, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.7103218645948945, |
|
"grad_norm": 0.8709719228420416, |
|
"learning_rate": 4.6898479141460415e-06, |
|
"loss": 0.7067, |
|
"mean_token_accuracy": 0.7843742462359516, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.7140214576396596, |
|
"grad_norm": 0.8535611787813129, |
|
"learning_rate": 4.580744647939163e-06, |
|
"loss": 0.7435, |
|
"mean_token_accuracy": 0.7730417779644787, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.7177210506844247, |
|
"grad_norm": 0.8885996504303695, |
|
"learning_rate": 4.472547157638674e-06, |
|
"loss": 0.694, |
|
"mean_token_accuracy": 0.7880904268803789, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.7214206437291898, |
|
"grad_norm": 0.9587623922743913, |
|
"learning_rate": 4.365273527404384e-06, |
|
"loss": 0.7401, |
|
"mean_token_accuracy": 0.7739410892854838, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7251202367739549, |
|
"grad_norm": 0.8107566238251871, |
|
"learning_rate": 4.258941686981864e-06, |
|
"loss": 0.7242, |
|
"mean_token_accuracy": 0.779501991954344, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.72881982981872, |
|
"grad_norm": 0.8788408769548844, |
|
"learning_rate": 4.15356940870567e-06, |
|
"loss": 0.7382, |
|
"mean_token_accuracy": 0.7752977269534177, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.732519422863485, |
|
"grad_norm": 0.8161841348783122, |
|
"learning_rate": 4.049174304528857e-06, |
|
"loss": 0.7122, |
|
"mean_token_accuracy": 0.7834159122342004, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.7362190159082501, |
|
"grad_norm": 0.8045647610652492, |
|
"learning_rate": 3.945773823079315e-06, |
|
"loss": 0.7085, |
|
"mean_token_accuracy": 0.7838271814299385, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.7399186089530152, |
|
"grad_norm": 0.7888951101393026, |
|
"learning_rate": 3.8433852467434175e-06, |
|
"loss": 0.7436, |
|
"mean_token_accuracy": 0.7715282648856208, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7399186089530152, |
|
"eval_loss": 0.7540702819824219, |
|
"eval_mean_token_accuracy": 0.7684317578583987, |
|
"eval_runtime": 12.9145, |
|
"eval_samples_per_second": 9.989, |
|
"eval_steps_per_second": 2.555, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.7436182019977803, |
|
"grad_norm": 0.8476150102495247, |
|
"learning_rate": 3.742025688777413e-06, |
|
"loss": 0.755, |
|
"mean_token_accuracy": 0.769747059368865, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.7473177950425454, |
|
"grad_norm": 0.8447327457715914, |
|
"learning_rate": 3.641712090447125e-06, |
|
"loss": 0.7084, |
|
"mean_token_accuracy": 0.7835149860526573, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.7510173880873104, |
|
"grad_norm": 0.7966093391523419, |
|
"learning_rate": 3.542461218196379e-06, |
|
"loss": 0.7215, |
|
"mean_token_accuracy": 0.7790490334715525, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.8909915448158038, |
|
"learning_rate": 3.444289660844665e-06, |
|
"loss": 0.7361, |
|
"mean_token_accuracy": 0.7758344142261316, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7584165741768405, |
|
"grad_norm": 0.828582805131554, |
|
"learning_rate": 3.347213826814456e-06, |
|
"loss": 0.7317, |
|
"mean_token_accuracy": 0.7749168212587448, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7621161672216056, |
|
"grad_norm": 0.8172943841676832, |
|
"learning_rate": 3.2512499413887255e-06, |
|
"loss": 0.7206, |
|
"mean_token_accuracy": 0.7795061516510636, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7658157602663707, |
|
"grad_norm": 0.7948969347509628, |
|
"learning_rate": 3.1564140439990256e-06, |
|
"loss": 0.7534, |
|
"mean_token_accuracy": 0.7695401774020444, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7695153533111357, |
|
"grad_norm": 0.7639527363991839, |
|
"learning_rate": 3.0627219855446667e-06, |
|
"loss": 0.6907, |
|
"mean_token_accuracy": 0.7891346245991027, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.7732149463559008, |
|
"grad_norm": 0.796687026157432, |
|
"learning_rate": 2.970189425743383e-06, |
|
"loss": 0.7365, |
|
"mean_token_accuracy": 0.7739980038896167, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7769145394006659, |
|
"grad_norm": 0.7960037813992309, |
|
"learning_rate": 2.8788318305139808e-06, |
|
"loss": 0.7513, |
|
"mean_token_accuracy": 0.7689661221516821, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.780614132445431, |
|
"grad_norm": 0.7733589592006201, |
|
"learning_rate": 2.7886644693913333e-06, |
|
"loss": 0.7443, |
|
"mean_token_accuracy": 0.7728263504337974, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 0.8565688482800564, |
|
"learning_rate": 2.6997024129742544e-06, |
|
"loss": 0.7328, |
|
"mean_token_accuracy": 0.7762133218500764, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7880133185349611, |
|
"grad_norm": 0.8923008390952016, |
|
"learning_rate": 2.611960530406572e-06, |
|
"loss": 0.728, |
|
"mean_token_accuracy": 0.7796289219427248, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.7917129115797262, |
|
"grad_norm": 0.7873211808203069, |
|
"learning_rate": 2.5254534868919077e-06, |
|
"loss": 0.726, |
|
"mean_token_accuracy": 0.7775058698315347, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.7954125046244913, |
|
"grad_norm": 0.7719831428886607, |
|
"learning_rate": 2.4401957412425213e-06, |
|
"loss": 0.7309, |
|
"mean_token_accuracy": 0.7749563569631559, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.7991120976692564, |
|
"grad_norm": 0.835137021064406, |
|
"learning_rate": 2.3562015434626784e-06, |
|
"loss": 0.7145, |
|
"mean_token_accuracy": 0.7820543807444283, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.8028116907140215, |
|
"grad_norm": 0.7671221716084607, |
|
"learning_rate": 2.273484932366874e-06, |
|
"loss": 0.7021, |
|
"mean_token_accuracy": 0.7854601544650027, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.8065112837587866, |
|
"grad_norm": 0.7788468903413968, |
|
"learning_rate": 2.192059733233408e-06, |
|
"loss": 0.7244, |
|
"mean_token_accuracy": 0.7782192310784752, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.8102108768035516, |
|
"grad_norm": 0.8041148765506401, |
|
"learning_rate": 2.111939555493603e-06, |
|
"loss": 0.7225, |
|
"mean_token_accuracy": 0.7786757617228395, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.8139104698483167, |
|
"grad_norm": 0.9148318869053712, |
|
"learning_rate": 2.0331377904571303e-06, |
|
"loss": 0.745, |
|
"mean_token_accuracy": 0.7731225924993291, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8139104698483167, |
|
"eval_loss": 0.7510696649551392, |
|
"eval_mean_token_accuracy": 0.7692631147293293, |
|
"eval_runtime": 12.9268, |
|
"eval_samples_per_second": 9.979, |
|
"eval_steps_per_second": 2.553, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.8176100628930818, |
|
"grad_norm": 0.8176644589533959, |
|
"learning_rate": 1.9556676090737803e-06, |
|
"loss": 0.7544, |
|
"mean_token_accuracy": 0.7705814013113146, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.8213096559378469, |
|
"grad_norm": 0.751484746704062, |
|
"learning_rate": 1.879541959732072e-06, |
|
"loss": 0.7133, |
|
"mean_token_accuracy": 0.7810593845260246, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.825009248982612, |
|
"grad_norm": 0.8448196083310406, |
|
"learning_rate": 1.8047735660950427e-06, |
|
"loss": 0.7088, |
|
"mean_token_accuracy": 0.7839263073608779, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.8287088420273769, |
|
"grad_norm": 0.7666551359898016, |
|
"learning_rate": 1.7313749249736266e-06, |
|
"loss": 0.7225, |
|
"mean_token_accuracy": 0.7800409694989299, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.832408435072142, |
|
"grad_norm": 0.7750630686515836, |
|
"learning_rate": 1.6593583042379192e-06, |
|
"loss": 0.7302, |
|
"mean_token_accuracy": 0.7764794004807236, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.8361080281169071, |
|
"grad_norm": 0.7448783923471559, |
|
"learning_rate": 1.5887357407667314e-06, |
|
"loss": 0.7303, |
|
"mean_token_accuracy": 0.7770985998568949, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.8398076211616722, |
|
"grad_norm": 0.8617250383965689, |
|
"learning_rate": 1.5195190384357405e-06, |
|
"loss": 0.7261, |
|
"mean_token_accuracy": 0.7790343243952108, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.8435072142064373, |
|
"grad_norm": 0.7315929117396289, |
|
"learning_rate": 1.4517197661445893e-06, |
|
"loss": 0.7103, |
|
"mean_token_accuracy": 0.7820174506567448, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.8472068072512023, |
|
"grad_norm": 0.7464816300853588, |
|
"learning_rate": 1.3853492558832472e-06, |
|
"loss": 0.7248, |
|
"mean_token_accuracy": 0.7778668411101556, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.8509064002959674, |
|
"grad_norm": 0.8027284787026119, |
|
"learning_rate": 1.3204186008379926e-06, |
|
"loss": 0.7142, |
|
"mean_token_accuracy": 0.7828477066310621, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.8546059933407325, |
|
"grad_norm": 0.7867579697995757, |
|
"learning_rate": 1.2569386535372807e-06, |
|
"loss": 0.7411, |
|
"mean_token_accuracy": 0.7738658084540277, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.8583055863854976, |
|
"grad_norm": 0.8069270526220779, |
|
"learning_rate": 1.1949200240378577e-06, |
|
"loss": 0.7066, |
|
"mean_token_accuracy": 0.7833467087206057, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.8620051794302627, |
|
"grad_norm": 0.8115083552759281, |
|
"learning_rate": 1.1343730781513896e-06, |
|
"loss": 0.7117, |
|
"mean_token_accuracy": 0.782034573182628, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.8657047724750278, |
|
"grad_norm": 0.7379062498914578, |
|
"learning_rate": 1.0753079357119134e-06, |
|
"loss": 0.7334, |
|
"mean_token_accuracy": 0.7753454314583148, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.8694043655197928, |
|
"grad_norm": 0.7894649463121637, |
|
"learning_rate": 1.017734468884417e-06, |
|
"loss": 0.6873, |
|
"mean_token_accuracy": 0.7897845425207853, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.8731039585645579, |
|
"grad_norm": 0.8023692788963809, |
|
"learning_rate": 9.616623005147952e-07, |
|
"loss": 0.7416, |
|
"mean_token_accuracy": 0.7721576809793975, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.876803551609323, |
|
"grad_norm": 0.7552635215864901, |
|
"learning_rate": 9.071008025214767e-07, |
|
"loss": 0.6686, |
|
"mean_token_accuracy": 0.7955690867836535, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.8805031446540881, |
|
"grad_norm": 0.7427939874056083, |
|
"learning_rate": 8.540590943290128e-07, |
|
"loss": 0.7366, |
|
"mean_token_accuracy": 0.7745106495210125, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.8842027376988532, |
|
"grad_norm": 0.7579131248407016, |
|
"learning_rate": 8.025460413438457e-07, |
|
"loss": 0.7146, |
|
"mean_token_accuracy": 0.7814567025093309, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"grad_norm": 0.8173415852977716, |
|
"learning_rate": 7.525702534725443e-07, |
|
"loss": 0.7231, |
|
"mean_token_accuracy": 0.7785201805258471, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8879023307436182, |
|
"eval_loss": 0.749515950679779, |
|
"eval_mean_token_accuracy": 0.7696483845818836, |
|
"eval_runtime": 12.9328, |
|
"eval_samples_per_second": 9.975, |
|
"eval_steps_per_second": 2.552, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8916019237883833, |
|
"grad_norm": 0.7711527562330054, |
|
"learning_rate": 7.041400836827439e-07, |
|
"loss": 0.7143, |
|
"mean_token_accuracy": 0.7821882194464815, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.8953015168331484, |
|
"grad_norm": 0.7623772941452016, |
|
"learning_rate": 6.572636266070265e-07, |
|
"loss": 0.7387, |
|
"mean_token_accuracy": 0.7745962709444631, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.8990011098779135, |
|
"grad_norm": 0.7652994503797836, |
|
"learning_rate": 6.119487171899807e-07, |
|
"loss": 0.6961, |
|
"mean_token_accuracy": 0.7874255931848155, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.9027007029226785, |
|
"grad_norm": 0.8269678511841839, |
|
"learning_rate": 5.682029293786673e-07, |
|
"loss": 0.7255, |
|
"mean_token_accuracy": 0.7770277914333097, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.9064002959674435, |
|
"grad_norm": 0.7302387175249154, |
|
"learning_rate": 5.26033574856708e-07, |
|
"loss": 0.7036, |
|
"mean_token_accuracy": 0.7842749808544124, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.9100998890122086, |
|
"grad_norm": 0.7426310574145085, |
|
"learning_rate": 4.854477018222103e-07, |
|
"loss": 0.7085, |
|
"mean_token_accuracy": 0.7827587579577575, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.9137994820569737, |
|
"grad_norm": 0.7805083509005499, |
|
"learning_rate": 4.464520938097294e-07, |
|
"loss": 0.6552, |
|
"mean_token_accuracy": 0.7984315941552589, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.9174990751017388, |
|
"grad_norm": 0.7838715084886818, |
|
"learning_rate": 4.0905326855646186e-07, |
|
"loss": 0.6978, |
|
"mean_token_accuracy": 0.7866137937789542, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.9211986681465039, |
|
"grad_norm": 0.764142848066667, |
|
"learning_rate": 3.732574769128738e-07, |
|
"loss": 0.7425, |
|
"mean_token_accuracy": 0.7735163959357182, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.9248982611912689, |
|
"grad_norm": 0.8510657590112316, |
|
"learning_rate": 3.390707017979311e-07, |
|
"loss": 0.714, |
|
"mean_token_accuracy": 0.7817031945118419, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.928597854236034, |
|
"grad_norm": 0.8104897922544609, |
|
"learning_rate": 3.06498657199108e-07, |
|
"loss": 0.6972, |
|
"mean_token_accuracy": 0.7874145607415896, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.9322974472807991, |
|
"grad_norm": 0.8439497662820569, |
|
"learning_rate": 2.7554678721735675e-07, |
|
"loss": 0.7267, |
|
"mean_token_accuracy": 0.7782783773976328, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.9359970403255642, |
|
"grad_norm": 0.7506475456675276, |
|
"learning_rate": 2.4622026515717654e-07, |
|
"loss": 0.7116, |
|
"mean_token_accuracy": 0.7825011111093741, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.9396966333703293, |
|
"grad_norm": 0.796131840794368, |
|
"learning_rate": 2.1852399266194312e-07, |
|
"loss": 0.7043, |
|
"mean_token_accuracy": 0.7831285774300467, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.9433962264150944, |
|
"grad_norm": 0.8874806771851743, |
|
"learning_rate": 1.9246259889464935e-07, |
|
"loss": 0.7372, |
|
"mean_token_accuracy": 0.7754287878050505, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.9470958194598594, |
|
"grad_norm": 0.7616413651005322, |
|
"learning_rate": 1.6804043976418438e-07, |
|
"loss": 0.7115, |
|
"mean_token_accuracy": 0.782469409255144, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.9507954125046245, |
|
"grad_norm": 0.7354908780262112, |
|
"learning_rate": 1.4526159719728595e-07, |
|
"loss": 0.6923, |
|
"mean_token_accuracy": 0.7877402800804598, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.9544950055493896, |
|
"grad_norm": 0.8147962775571531, |
|
"learning_rate": 1.24129878456285e-07, |
|
"loss": 0.7266, |
|
"mean_token_accuracy": 0.7786802824447104, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.9581945985941547, |
|
"grad_norm": 0.7892855143587276, |
|
"learning_rate": 1.0464881550276362e-07, |
|
"loss": 0.7106, |
|
"mean_token_accuracy": 0.7812839457332936, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.9618941916389198, |
|
"grad_norm": 0.7807996872371684, |
|
"learning_rate": 8.682166440721729e-08, |
|
"loss": 0.6851, |
|
"mean_token_accuracy": 0.7892695119285057, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9618941916389198, |
|
"eval_loss": 0.7488865852355957, |
|
"eval_mean_token_accuracy": 0.7697332311412013, |
|
"eval_runtime": 12.9253, |
|
"eval_samples_per_second": 9.98, |
|
"eval_steps_per_second": 2.553, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.9655937846836848, |
|
"grad_norm": 0.7500272096924462, |
|
"learning_rate": 7.065140480483235e-08, |
|
"loss": 0.722, |
|
"mean_token_accuracy": 0.7809713685380442, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.9692933777284499, |
|
"grad_norm": 0.7740539516194233, |
|
"learning_rate": 5.6140739397474445e-08, |
|
"loss": 0.741, |
|
"mean_token_accuracy": 0.7735731347336795, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.9729929707732149, |
|
"grad_norm": 0.7621995001113339, |
|
"learning_rate": 4.329209350195651e-08, |
|
"loss": 0.7277, |
|
"mean_token_accuracy": 0.7768363708086847, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.97669256381798, |
|
"grad_norm": 0.7190084013715407, |
|
"learning_rate": 3.210761464466639e-08, |
|
"loss": 0.6751, |
|
"mean_token_accuracy": 0.7917913948419976, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.9803921568627451, |
|
"grad_norm": 0.832622944037137, |
|
"learning_rate": 2.2589172202635014e-08, |
|
"loss": 0.7183, |
|
"mean_token_accuracy": 0.7814518443094862, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.9840917499075101, |
|
"grad_norm": 0.8517126035217566, |
|
"learning_rate": 1.4738357091084177e-08, |
|
"loss": 0.7223, |
|
"mean_token_accuracy": 0.7779738218649835, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.9877913429522752, |
|
"grad_norm": 0.7735765973323167, |
|
"learning_rate": 8.556481497521418e-09, |
|
"loss": 0.7064, |
|
"mean_token_accuracy": 0.7825683170955192, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.9914909359970403, |
|
"grad_norm": 0.7713072693524988, |
|
"learning_rate": 4.044578662419918e-09, |
|
"loss": 0.7179, |
|
"mean_token_accuracy": 0.7801742579673625, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.9951905290418054, |
|
"grad_norm": 0.7685841540071269, |
|
"learning_rate": 1.203402706525525e-09, |
|
"loss": 0.7493, |
|
"mean_token_accuracy": 0.7706208056883479, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.9988901220865705, |
|
"grad_norm": 0.8159997368959132, |
|
"learning_rate": 3.342850480869686e-11, |
|
"loss": 0.7328, |
|
"mean_token_accuracy": 0.7759899768837795, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9996300406955235, |
|
"mean_token_accuracy": 0.7755799181439557, |
|
"step": 1351, |
|
"total_flos": 76966677970944.0, |
|
"train_loss": 0.759784622734163, |
|
"train_runtime": 8483.4218, |
|
"train_samples_per_second": 2.549, |
|
"train_steps_per_second": 0.159 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1351, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 76966677970944.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|