Qwen2.5-1.5B-Open-R1-Distill / trainer_state.json
changjiakawhi's picture
Model save
1261afc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996300406955235,
"eval_steps": 100,
"global_step": 1351,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003699593044765076,
"grad_norm": 2.757993258887922,
"learning_rate": 7.352941176470589e-07,
"loss": 1.1067,
"mean_token_accuracy": 0.708824381857028,
"step": 5
},
{
"epoch": 0.007399186089530152,
"grad_norm": 2.5907330458452984,
"learning_rate": 1.4705882352941177e-06,
"loss": 1.0679,
"mean_token_accuracy": 0.7169299986512495,
"step": 10
},
{
"epoch": 0.011098779134295227,
"grad_norm": 2.241392532504099,
"learning_rate": 2.2058823529411767e-06,
"loss": 1.1238,
"mean_token_accuracy": 0.6992935634786113,
"step": 15
},
{
"epoch": 0.014798372179060304,
"grad_norm": 1.5956862014648674,
"learning_rate": 2.9411764705882355e-06,
"loss": 1.0062,
"mean_token_accuracy": 0.7288372261973354,
"step": 20
},
{
"epoch": 0.01849796522382538,
"grad_norm": 1.6119131199982493,
"learning_rate": 3.6764705882352946e-06,
"loss": 1.0517,
"mean_token_accuracy": 0.7142909651691335,
"step": 25
},
{
"epoch": 0.022197558268590455,
"grad_norm": 1.139657389593639,
"learning_rate": 4.411764705882353e-06,
"loss": 0.9593,
"mean_token_accuracy": 0.7346081897149975,
"step": 30
},
{
"epoch": 0.02589715131335553,
"grad_norm": 1.2790776332632598,
"learning_rate": 5.147058823529411e-06,
"loss": 0.9805,
"mean_token_accuracy": 0.72720137806358,
"step": 35
},
{
"epoch": 0.029596744358120607,
"grad_norm": 1.104655854611052,
"learning_rate": 5.882352941176471e-06,
"loss": 0.9114,
"mean_token_accuracy": 0.7425328465386071,
"step": 40
},
{
"epoch": 0.033296337402885685,
"grad_norm": 0.9408582472471563,
"learning_rate": 6.61764705882353e-06,
"loss": 0.8885,
"mean_token_accuracy": 0.7472382565404383,
"step": 45
},
{
"epoch": 0.03699593044765076,
"grad_norm": 1.0580888719684283,
"learning_rate": 7.352941176470589e-06,
"loss": 0.8658,
"mean_token_accuracy": 0.7493579770562304,
"step": 50
},
{
"epoch": 0.040695523492415835,
"grad_norm": 0.8832977290583615,
"learning_rate": 8.088235294117648e-06,
"loss": 0.841,
"mean_token_accuracy": 0.7568992237585881,
"step": 55
},
{
"epoch": 0.04439511653718091,
"grad_norm": 0.9301447232199826,
"learning_rate": 8.823529411764707e-06,
"loss": 0.8677,
"mean_token_accuracy": 0.7502923381848673,
"step": 60
},
{
"epoch": 0.048094709581945984,
"grad_norm": 0.856501238425618,
"learning_rate": 9.558823529411766e-06,
"loss": 0.878,
"mean_token_accuracy": 0.7445654933582921,
"step": 65
},
{
"epoch": 0.05179430262671106,
"grad_norm": 1.0586539300870963,
"learning_rate": 1.0294117647058823e-05,
"loss": 0.8825,
"mean_token_accuracy": 0.7438754593389819,
"step": 70
},
{
"epoch": 0.05549389567147614,
"grad_norm": 1.1122367826346538,
"learning_rate": 1.1029411764705885e-05,
"loss": 0.8775,
"mean_token_accuracy": 0.7447408372141248,
"step": 75
},
{
"epoch": 0.059193488716241215,
"grad_norm": 1.0632613858567348,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.8347,
"mean_token_accuracy": 0.7567175732049909,
"step": 80
},
{
"epoch": 0.06289308176100629,
"grad_norm": 0.9476656757993303,
"learning_rate": 1.25e-05,
"loss": 0.8305,
"mean_token_accuracy": 0.7566193958141736,
"step": 85
},
{
"epoch": 0.06659267480577137,
"grad_norm": 1.1083079276477652,
"learning_rate": 1.323529411764706e-05,
"loss": 0.8466,
"mean_token_accuracy": 0.7519149864344391,
"step": 90
},
{
"epoch": 0.07029226785053644,
"grad_norm": 0.9768809343076362,
"learning_rate": 1.3970588235294118e-05,
"loss": 0.7894,
"mean_token_accuracy": 0.768089470723049,
"step": 95
},
{
"epoch": 0.07399186089530152,
"grad_norm": 1.0897420744707864,
"learning_rate": 1.4705882352941179e-05,
"loss": 0.842,
"mean_token_accuracy": 0.7519371481531969,
"step": 100
},
{
"epoch": 0.07399186089530152,
"eval_loss": 0.8507319688796997,
"eval_mean_token_accuracy": 0.7484970684902305,
"eval_runtime": 13.1139,
"eval_samples_per_second": 9.837,
"eval_steps_per_second": 2.516,
"step": 100
},
{
"epoch": 0.07769145394006659,
"grad_norm": 1.0530600966285744,
"learning_rate": 1.5441176470588237e-05,
"loss": 0.8609,
"mean_token_accuracy": 0.7470814372661423,
"step": 105
},
{
"epoch": 0.08139104698483167,
"grad_norm": 0.9420676582335842,
"learning_rate": 1.6176470588235296e-05,
"loss": 0.8056,
"mean_token_accuracy": 0.7604963244668818,
"step": 110
},
{
"epoch": 0.08509064002959675,
"grad_norm": 0.868613480082479,
"learning_rate": 1.6911764705882355e-05,
"loss": 0.8432,
"mean_token_accuracy": 0.7519378614781397,
"step": 115
},
{
"epoch": 0.08879023307436182,
"grad_norm": 1.0518122686708429,
"learning_rate": 1.7647058823529414e-05,
"loss": 0.8237,
"mean_token_accuracy": 0.7588873826848553,
"step": 120
},
{
"epoch": 0.0924898261191269,
"grad_norm": 1.135357117838434,
"learning_rate": 1.8382352941176472e-05,
"loss": 0.8417,
"mean_token_accuracy": 0.7510501482017762,
"step": 125
},
{
"epoch": 0.09618941916389197,
"grad_norm": 1.0511646764690201,
"learning_rate": 1.911764705882353e-05,
"loss": 0.8024,
"mean_token_accuracy": 0.7630132146126162,
"step": 130
},
{
"epoch": 0.09988901220865705,
"grad_norm": 1.003659422839662,
"learning_rate": 1.985294117647059e-05,
"loss": 0.7846,
"mean_token_accuracy": 0.7684428405908383,
"step": 135
},
{
"epoch": 0.10358860525342212,
"grad_norm": 0.9783718356092802,
"learning_rate": 1.9999465148392906e-05,
"loss": 0.7978,
"mean_token_accuracy": 0.7638017765978411,
"step": 140
},
{
"epoch": 0.1072881982981872,
"grad_norm": 0.9325940574831129,
"learning_rate": 1.999729241179462e-05,
"loss": 0.7899,
"mean_token_accuracy": 0.7657915220582932,
"step": 145
},
{
"epoch": 0.11098779134295228,
"grad_norm": 0.9587245678809391,
"learning_rate": 1.999344872485215e-05,
"loss": 0.7935,
"mean_token_accuracy": 0.7643045615369954,
"step": 150
},
{
"epoch": 0.11468738438771735,
"grad_norm": 1.2026644648319857,
"learning_rate": 1.9987934730000457e-05,
"loss": 0.7922,
"mean_token_accuracy": 0.7667913563744123,
"step": 155
},
{
"epoch": 0.11838697743248243,
"grad_norm": 1.0527655093884445,
"learning_rate": 1.998075134885022e-05,
"loss": 0.8036,
"mean_token_accuracy": 0.7613009885033224,
"step": 160
},
{
"epoch": 0.1220865704772475,
"grad_norm": 1.2386325448592015,
"learning_rate": 1.9971899782033853e-05,
"loss": 0.8013,
"mean_token_accuracy": 0.7620870662678945,
"step": 165
},
{
"epoch": 0.12578616352201258,
"grad_norm": 1.127408462312219,
"learning_rate": 1.9961381509004785e-05,
"loss": 0.808,
"mean_token_accuracy": 0.7593721598276708,
"step": 170
},
{
"epoch": 0.12948575656677766,
"grad_norm": 0.9438483145827001,
"learning_rate": 1.9949198287790215e-05,
"loss": 0.7861,
"mean_token_accuracy": 0.7660641312412623,
"step": 175
},
{
"epoch": 0.13318534961154274,
"grad_norm": 0.9318614898436913,
"learning_rate": 1.9935352154697257e-05,
"loss": 0.8038,
"mean_token_accuracy": 0.7602947954939976,
"step": 180
},
{
"epoch": 0.1368849426563078,
"grad_norm": 0.8876177342352581,
"learning_rate": 1.9919845423972603e-05,
"loss": 0.757,
"mean_token_accuracy": 0.7742074952233244,
"step": 185
},
{
"epoch": 0.14058453570107288,
"grad_norm": 1.005618517605514,
"learning_rate": 1.9902680687415704e-05,
"loss": 0.7985,
"mean_token_accuracy": 0.7622990063822126,
"step": 190
},
{
"epoch": 0.14428412874583796,
"grad_norm": 1.0909612602188234,
"learning_rate": 1.9883860813945596e-05,
"loss": 0.7997,
"mean_token_accuracy": 0.7611327796064032,
"step": 195
},
{
"epoch": 0.14798372179060304,
"grad_norm": 0.9223546224550383,
"learning_rate": 1.986338894912137e-05,
"loss": 0.7888,
"mean_token_accuracy": 0.7650975947505124,
"step": 200
},
{
"epoch": 0.14798372179060304,
"eval_loss": 0.8212586641311646,
"eval_mean_token_accuracy": 0.753984452665533,
"eval_runtime": 12.914,
"eval_samples_per_second": 9.989,
"eval_steps_per_second": 2.555,
"step": 200
},
{
"epoch": 0.15168331483536812,
"grad_norm": 0.9461631972455284,
"learning_rate": 1.9841268514616434e-05,
"loss": 0.8206,
"mean_token_accuracy": 0.7560038239175771,
"step": 205
},
{
"epoch": 0.15538290788013318,
"grad_norm": 1.0128847648211983,
"learning_rate": 1.9817503207646606e-05,
"loss": 0.7827,
"mean_token_accuracy": 0.7663386260842018,
"step": 210
},
{
"epoch": 0.15908250092489826,
"grad_norm": 1.0613216235343779,
"learning_rate": 1.979209700035216e-05,
"loss": 0.8097,
"mean_token_accuracy": 0.7595550837031688,
"step": 215
},
{
"epoch": 0.16278209396966334,
"grad_norm": 1.0811772085526155,
"learning_rate": 1.976505413913393e-05,
"loss": 0.7771,
"mean_token_accuracy": 0.7685113508318921,
"step": 220
},
{
"epoch": 0.16648168701442842,
"grad_norm": 1.0032217781891688,
"learning_rate": 1.9736379143943565e-05,
"loss": 0.7788,
"mean_token_accuracy": 0.767374709683548,
"step": 225
},
{
"epoch": 0.1701812800591935,
"grad_norm": 1.0033846900829124,
"learning_rate": 1.9706076807528044e-05,
"loss": 0.7792,
"mean_token_accuracy": 0.7671742181015657,
"step": 230
},
{
"epoch": 0.17388087310395856,
"grad_norm": 1.016761717770083,
"learning_rate": 1.967415219462864e-05,
"loss": 0.776,
"mean_token_accuracy": 0.7692263473301846,
"step": 235
},
{
"epoch": 0.17758046614872364,
"grad_norm": 1.0390090395504632,
"learning_rate": 1.9640610641134383e-05,
"loss": 0.8125,
"mean_token_accuracy": 0.7578594416116643,
"step": 240
},
{
"epoch": 0.18128005919348872,
"grad_norm": 0.9506480558904421,
"learning_rate": 1.9605457753190224e-05,
"loss": 0.774,
"mean_token_accuracy": 0.7669808330485,
"step": 245
},
{
"epoch": 0.1849796522382538,
"grad_norm": 0.9074565067919744,
"learning_rate": 1.9568699406260016e-05,
"loss": 0.7631,
"mean_token_accuracy": 0.7712889443688947,
"step": 250
},
{
"epoch": 0.18867924528301888,
"grad_norm": 0.870192572015487,
"learning_rate": 1.953034174414449e-05,
"loss": 0.8041,
"mean_token_accuracy": 0.7584920492345467,
"step": 255
},
{
"epoch": 0.19237883832778394,
"grad_norm": 0.9436163053750114,
"learning_rate": 1.9490391177954383e-05,
"loss": 0.7697,
"mean_token_accuracy": 0.7687670857457188,
"step": 260
},
{
"epoch": 0.19607843137254902,
"grad_norm": 0.8454337167883038,
"learning_rate": 1.944885438503888e-05,
"loss": 0.7854,
"mean_token_accuracy": 0.762450394993266,
"step": 265
},
{
"epoch": 0.1997780244173141,
"grad_norm": 0.8388911143997999,
"learning_rate": 1.9405738307869565e-05,
"loss": 0.7568,
"mean_token_accuracy": 0.7739168937821217,
"step": 270
},
{
"epoch": 0.20347761746207918,
"grad_norm": 0.8905388426225519,
"learning_rate": 1.936105015288003e-05,
"loss": 0.7874,
"mean_token_accuracy": 0.7657060442327474,
"step": 275
},
{
"epoch": 0.20717721050684423,
"grad_norm": 0.9449437240241089,
"learning_rate": 1.9314797389261426e-05,
"loss": 0.7964,
"mean_token_accuracy": 0.7608504929094035,
"step": 280
},
{
"epoch": 0.21087680355160932,
"grad_norm": 0.9586909770652294,
"learning_rate": 1.9266987747714036e-05,
"loss": 0.7539,
"mean_token_accuracy": 0.7740995999688999,
"step": 285
},
{
"epoch": 0.2145763965963744,
"grad_norm": 0.8457904946858513,
"learning_rate": 1.9217629219155172e-05,
"loss": 0.7596,
"mean_token_accuracy": 0.7711086747415601,
"step": 290
},
{
"epoch": 0.21827598964113948,
"grad_norm": 0.9562284538878311,
"learning_rate": 1.916673005338357e-05,
"loss": 0.7636,
"mean_token_accuracy": 0.7705879155908494,
"step": 295
},
{
"epoch": 0.22197558268590456,
"grad_norm": 0.9311027437584551,
"learning_rate": 1.9114298757700508e-05,
"loss": 0.7833,
"mean_token_accuracy": 0.7640983829119388,
"step": 300
},
{
"epoch": 0.22197558268590456,
"eval_loss": 0.8046144247055054,
"eval_mean_token_accuracy": 0.7572653228255071,
"eval_runtime": 12.9112,
"eval_samples_per_second": 9.991,
"eval_steps_per_second": 2.556,
"step": 300
},
{
"epoch": 0.2256751757306696,
"grad_norm": 0.981751974465348,
"learning_rate": 1.9060344095487916e-05,
"loss": 0.7517,
"mean_token_accuracy": 0.7743910330942259,
"step": 305
},
{
"epoch": 0.2293747687754347,
"grad_norm": 0.9758899701140303,
"learning_rate": 1.9004875084743624e-05,
"loss": 0.7744,
"mean_token_accuracy": 0.7681139265197442,
"step": 310
},
{
"epoch": 0.23307436182019978,
"grad_norm": 1.0996867282039497,
"learning_rate": 1.8947900996574133e-05,
"loss": 0.784,
"mean_token_accuracy": 0.7653104608265264,
"step": 315
},
{
"epoch": 0.23677395486496486,
"grad_norm": 0.8876749775333296,
"learning_rate": 1.8889431353645004e-05,
"loss": 0.7856,
"mean_token_accuracy": 0.7637695086387486,
"step": 320
},
{
"epoch": 0.24047354790972994,
"grad_norm": 0.9254315942079439,
"learning_rate": 1.8829475928589272e-05,
"loss": 0.7478,
"mean_token_accuracy": 0.7747918700572913,
"step": 325
},
{
"epoch": 0.244173140954495,
"grad_norm": 0.8975251081081479,
"learning_rate": 1.8768044742374008e-05,
"loss": 0.7838,
"mean_token_accuracy": 0.7662794783326515,
"step": 330
},
{
"epoch": 0.24787273399926008,
"grad_norm": 0.9659229415705355,
"learning_rate": 1.870514806262544e-05,
"loss": 0.8064,
"mean_token_accuracy": 0.7575206306307912,
"step": 335
},
{
"epoch": 0.25157232704402516,
"grad_norm": 0.8730001205655563,
"learning_rate": 1.8640796401912805e-05,
"loss": 0.7786,
"mean_token_accuracy": 0.7647639047537955,
"step": 340
},
{
"epoch": 0.25527192008879024,
"grad_norm": 0.8503523554051183,
"learning_rate": 1.8575000515991283e-05,
"loss": 0.7303,
"mean_token_accuracy": 0.7790401063745367,
"step": 345
},
{
"epoch": 0.2589715131335553,
"grad_norm": 0.952888955967417,
"learning_rate": 1.850777140200427e-05,
"loss": 0.7481,
"mean_token_accuracy": 0.7754539961681624,
"step": 350
},
{
"epoch": 0.2626711061783204,
"grad_norm": 0.8607187749209038,
"learning_rate": 1.843912029664531e-05,
"loss": 0.7628,
"mean_token_accuracy": 0.7705045859827028,
"step": 355
},
{
"epoch": 0.2663706992230855,
"grad_norm": 0.9612681025779204,
"learning_rate": 1.8369058674280004e-05,
"loss": 0.7751,
"mean_token_accuracy": 0.7662138710016833,
"step": 360
},
{
"epoch": 0.2700702922678505,
"grad_norm": 0.9382382938099896,
"learning_rate": 1.8297598245028173e-05,
"loss": 0.7723,
"mean_token_accuracy": 0.767681726807316,
"step": 365
},
{
"epoch": 0.2737698853126156,
"grad_norm": 0.9997500486677628,
"learning_rate": 1.8224750952806626e-05,
"loss": 0.7611,
"mean_token_accuracy": 0.7707074275165302,
"step": 370
},
{
"epoch": 0.27746947835738067,
"grad_norm": 1.0014266441284037,
"learning_rate": 1.815052897333284e-05,
"loss": 0.7505,
"mean_token_accuracy": 0.7740457102243052,
"step": 375
},
{
"epoch": 0.28116907140214575,
"grad_norm": 1.1075366477297495,
"learning_rate": 1.8074944712089925e-05,
"loss": 0.7641,
"mean_token_accuracy": 0.7693695886355318,
"step": 380
},
{
"epoch": 0.28486866444691084,
"grad_norm": 0.880418326713614,
"learning_rate": 1.799801080225316e-05,
"loss": 0.7539,
"mean_token_accuracy": 0.7710243423063697,
"step": 385
},
{
"epoch": 0.2885682574916759,
"grad_norm": 0.9012750826548256,
"learning_rate": 1.7919740102578482e-05,
"loss": 0.7781,
"mean_token_accuracy": 0.7661210900072783,
"step": 390
},
{
"epoch": 0.292267850536441,
"grad_norm": 0.9230339938349867,
"learning_rate": 1.7840145695253258e-05,
"loss": 0.7708,
"mean_token_accuracy": 0.7679753783644399,
"step": 395
},
{
"epoch": 0.2959674435812061,
"grad_norm": 0.8616748960380592,
"learning_rate": 1.7759240883709745e-05,
"loss": 0.7643,
"mean_token_accuracy": 0.7698643926023239,
"step": 400
},
{
"epoch": 0.2959674435812061,
"eval_loss": 0.7939132452011108,
"eval_mean_token_accuracy": 0.7597238566071922,
"eval_runtime": 12.8935,
"eval_samples_per_second": 10.005,
"eval_steps_per_second": 2.559,
"step": 400
},
{
"epoch": 0.29966703662597116,
"grad_norm": 0.8626025158912716,
"learning_rate": 1.7677039190401538e-05,
"loss": 0.7945,
"mean_token_accuracy": 0.7610350956243586,
"step": 405
},
{
"epoch": 0.30336662967073624,
"grad_norm": 0.8809551406016749,
"learning_rate": 1.759355435454342e-05,
"loss": 0.7554,
"mean_token_accuracy": 0.7720821829110623,
"step": 410
},
{
"epoch": 0.30706622271550127,
"grad_norm": 0.8398343511073675,
"learning_rate": 1.7508800329814993e-05,
"loss": 0.758,
"mean_token_accuracy": 0.7709962708725867,
"step": 415
},
{
"epoch": 0.31076581576026635,
"grad_norm": 0.8717214162340853,
"learning_rate": 1.7422791282028457e-05,
"loss": 0.7456,
"mean_token_accuracy": 0.7747603119006603,
"step": 420
},
{
"epoch": 0.31446540880503143,
"grad_norm": 0.9182650879024069,
"learning_rate": 1.7335541586760928e-05,
"loss": 0.7678,
"mean_token_accuracy": 0.768033024075789,
"step": 425
},
{
"epoch": 0.3181650018497965,
"grad_norm": 0.9270374928264972,
"learning_rate": 1.7247065826951694e-05,
"loss": 0.7687,
"mean_token_accuracy": 0.7670009758238231,
"step": 430
},
{
"epoch": 0.3218645948945616,
"grad_norm": 0.9239947000531302,
"learning_rate": 1.715737879046483e-05,
"loss": 0.7807,
"mean_token_accuracy": 0.7646944309939067,
"step": 435
},
{
"epoch": 0.3255641879393267,
"grad_norm": 0.9234310473796695,
"learning_rate": 1.7066495467617552e-05,
"loss": 0.722,
"mean_token_accuracy": 0.7810521648421277,
"step": 440
},
{
"epoch": 0.32926378098409176,
"grad_norm": 0.9873873391290533,
"learning_rate": 1.6974431048674714e-05,
"loss": 0.7773,
"mean_token_accuracy": 0.7676746345471892,
"step": 445
},
{
"epoch": 0.33296337402885684,
"grad_norm": 0.8848520725498183,
"learning_rate": 1.6881200921309914e-05,
"loss": 0.7799,
"mean_token_accuracy": 0.7653884936966252,
"step": 450
},
{
"epoch": 0.3366629670736219,
"grad_norm": 0.9045509398039985,
"learning_rate": 1.6786820668033596e-05,
"loss": 0.7575,
"mean_token_accuracy": 0.7713310238062097,
"step": 455
},
{
"epoch": 0.340362560118387,
"grad_norm": 0.890193659220096,
"learning_rate": 1.6691306063588583e-05,
"loss": 0.7643,
"mean_token_accuracy": 0.7680450171610729,
"step": 460
},
{
"epoch": 0.34406215316315203,
"grad_norm": 0.9121601629876329,
"learning_rate": 1.6594673072313478e-05,
"loss": 0.7565,
"mean_token_accuracy": 0.7709845040717809,
"step": 465
},
{
"epoch": 0.3477617462079171,
"grad_norm": 0.8596218378746548,
"learning_rate": 1.6496937845474375e-05,
"loss": 0.7899,
"mean_token_accuracy": 0.762253394394126,
"step": 470
},
{
"epoch": 0.3514613392526822,
"grad_norm": 0.8881090780569305,
"learning_rate": 1.639811671856535e-05,
"loss": 0.7716,
"mean_token_accuracy": 0.766323517253549,
"step": 475
},
{
"epoch": 0.3551609322974473,
"grad_norm": 0.9129717947745375,
"learning_rate": 1.6298226208578127e-05,
"loss": 0.7773,
"mean_token_accuracy": 0.7662860024728196,
"step": 480
},
{
"epoch": 0.35886052534221236,
"grad_norm": 0.8773141616181648,
"learning_rate": 1.6197283011241423e-05,
"loss": 0.7823,
"mean_token_accuracy": 0.7632768240832929,
"step": 485
},
{
"epoch": 0.36256011838697744,
"grad_norm": 0.8872263111805669,
"learning_rate": 1.6095303998230432e-05,
"loss": 0.7683,
"mean_token_accuracy": 0.7674727430979753,
"step": 490
},
{
"epoch": 0.3662597114317425,
"grad_norm": 0.98466468959405,
"learning_rate": 1.599230621434687e-05,
"loss": 0.7591,
"mean_token_accuracy": 0.7693043297460778,
"step": 495
},
{
"epoch": 0.3699593044765076,
"grad_norm": 0.8908360555035981,
"learning_rate": 1.5888306874670112e-05,
"loss": 0.7362,
"mean_token_accuracy": 0.7770877269382522,
"step": 500
},
{
"epoch": 0.3699593044765076,
"eval_loss": 0.784197211265564,
"eval_mean_token_accuracy": 0.7620830191395729,
"eval_runtime": 12.9033,
"eval_samples_per_second": 9.997,
"eval_steps_per_second": 2.557,
"step": 500
},
{
"epoch": 0.3736588975212727,
"grad_norm": 0.9048447583159639,
"learning_rate": 1.5783323361679865e-05,
"loss": 0.7787,
"mean_token_accuracy": 0.7648656868298831,
"step": 505
},
{
"epoch": 0.37735849056603776,
"grad_norm": 0.9120327051425232,
"learning_rate": 1.567737322235084e-05,
"loss": 0.7593,
"mean_token_accuracy": 0.7714510339240377,
"step": 510
},
{
"epoch": 0.3810580836108028,
"grad_norm": 0.8195601280151146,
"learning_rate": 1.557047416521996e-05,
"loss": 0.7215,
"mean_token_accuracy": 0.7789905700934947,
"step": 515
},
{
"epoch": 0.38475767665556787,
"grad_norm": 0.9650761679887094,
"learning_rate": 1.546264405742654e-05,
"loss": 0.7782,
"mean_token_accuracy": 0.7636861487365787,
"step": 520
},
{
"epoch": 0.38845726970033295,
"grad_norm": 0.8237906709737827,
"learning_rate": 1.535390092172597e-05,
"loss": 0.7699,
"mean_token_accuracy": 0.7671755622376449,
"step": 525
},
{
"epoch": 0.39215686274509803,
"grad_norm": 0.8299457105893924,
"learning_rate": 1.5244262933477401e-05,
"loss": 0.7804,
"mean_token_accuracy": 0.7637937394911022,
"step": 530
},
{
"epoch": 0.3958564557898631,
"grad_norm": 1.0313966357425606,
"learning_rate": 1.5133748417605878e-05,
"loss": 0.7752,
"mean_token_accuracy": 0.7650544976847276,
"step": 535
},
{
"epoch": 0.3995560488346282,
"grad_norm": 0.8476613666163046,
"learning_rate": 1.5022375845539537e-05,
"loss": 0.7459,
"mean_token_accuracy": 0.7739755173371613,
"step": 540
},
{
"epoch": 0.4032556418793933,
"grad_norm": 0.8488122984336212,
"learning_rate": 1.4910163832122278e-05,
"loss": 0.7282,
"mean_token_accuracy": 0.7793907307750887,
"step": 545
},
{
"epoch": 0.40695523492415836,
"grad_norm": 0.9246732865437337,
"learning_rate": 1.4797131132502464e-05,
"loss": 0.7387,
"mean_token_accuracy": 0.7771462284436527,
"step": 550
},
{
"epoch": 0.41065482796892344,
"grad_norm": 0.8693436586761074,
"learning_rate": 1.4683296638998192e-05,
"loss": 0.7449,
"mean_token_accuracy": 0.7733528571354785,
"step": 555
},
{
"epoch": 0.41435442101368847,
"grad_norm": 1.0132116370018853,
"learning_rate": 1.4568679377939619e-05,
"loss": 0.7338,
"mean_token_accuracy": 0.7772718301836594,
"step": 560
},
{
"epoch": 0.41805401405845355,
"grad_norm": 0.8274852436061462,
"learning_rate": 1.4453298506488896e-05,
"loss": 0.7468,
"mean_token_accuracy": 0.7725176751538921,
"step": 565
},
{
"epoch": 0.42175360710321863,
"grad_norm": 0.8143005464066339,
"learning_rate": 1.4337173309438236e-05,
"loss": 0.7423,
"mean_token_accuracy": 0.7735196406247817,
"step": 570
},
{
"epoch": 0.4254532001479837,
"grad_norm": 0.8624822737135253,
"learning_rate": 1.4220323195986649e-05,
"loss": 0.7448,
"mean_token_accuracy": 0.7735122705759122,
"step": 575
},
{
"epoch": 0.4291527931927488,
"grad_norm": 0.8889965581940739,
"learning_rate": 1.4102767696495885e-05,
"loss": 0.7674,
"mean_token_accuracy": 0.7686958478722382,
"step": 580
},
{
"epoch": 0.4328523862375139,
"grad_norm": 0.7782401935752693,
"learning_rate": 1.398452645922611e-05,
"loss": 0.765,
"mean_token_accuracy": 0.7691531672999341,
"step": 585
},
{
"epoch": 0.43655197928227896,
"grad_norm": 0.8590974972802418,
"learning_rate": 1.3865619247051916e-05,
"loss": 0.7507,
"mean_token_accuracy": 0.7729800254264771,
"step": 590
},
{
"epoch": 0.44025157232704404,
"grad_norm": 0.7988473909688566,
"learning_rate": 1.3746065934159123e-05,
"loss": 0.7242,
"mean_token_accuracy": 0.780331890399065,
"step": 595
},
{
"epoch": 0.4439511653718091,
"grad_norm": 0.9282202265668916,
"learning_rate": 1.3625886502723008e-05,
"loss": 0.7587,
"mean_token_accuracy": 0.7696747774936822,
"step": 600
},
{
"epoch": 0.4439511653718091,
"eval_loss": 0.7762653827667236,
"eval_mean_token_accuracy": 0.7637766286530999,
"eval_runtime": 12.9012,
"eval_samples_per_second": 9.999,
"eval_steps_per_second": 2.558,
"step": 600
},
{
"epoch": 0.4476507584165742,
"grad_norm": 0.8375068814255916,
"learning_rate": 1.3505101039568494e-05,
"loss": 0.7114,
"mean_token_accuracy": 0.78174785615947,
"step": 605
},
{
"epoch": 0.4513503514613392,
"grad_norm": 0.8406018488041918,
"learning_rate": 1.3383729732812814e-05,
"loss": 0.7309,
"mean_token_accuracy": 0.7761469410168554,
"step": 610
},
{
"epoch": 0.4550499445061043,
"grad_norm": 0.8214051664479577,
"learning_rate": 1.3261792868491267e-05,
"loss": 0.754,
"mean_token_accuracy": 0.7719396256522534,
"step": 615
},
{
"epoch": 0.4587495375508694,
"grad_norm": 0.8854429907993514,
"learning_rate": 1.3139310827166613e-05,
"loss": 0.7529,
"mean_token_accuracy": 0.7723088441600371,
"step": 620
},
{
"epoch": 0.46244913059563447,
"grad_norm": 0.8135757240558104,
"learning_rate": 1.3016304080522657e-05,
"loss": 0.7756,
"mean_token_accuracy": 0.7642865483170395,
"step": 625
},
{
"epoch": 0.46614872364039955,
"grad_norm": 0.8597128387115325,
"learning_rate": 1.2892793187942588e-05,
"loss": 0.7856,
"mean_token_accuracy": 0.7615404441461953,
"step": 630
},
{
"epoch": 0.46984831668516464,
"grad_norm": 0.8740917081939998,
"learning_rate": 1.2768798793072708e-05,
"loss": 0.7599,
"mean_token_accuracy": 0.7694673674218602,
"step": 635
},
{
"epoch": 0.4735479097299297,
"grad_norm": 0.8455035652800976,
"learning_rate": 1.2644341620372025e-05,
"loss": 0.7374,
"mean_token_accuracy": 0.7753507196148239,
"step": 640
},
{
"epoch": 0.4772475027746948,
"grad_norm": 0.8453775134661862,
"learning_rate": 1.2519442471648364e-05,
"loss": 0.7539,
"mean_token_accuracy": 0.7711800403046617,
"step": 645
},
{
"epoch": 0.4809470958194599,
"grad_norm": 0.8131753165282993,
"learning_rate": 1.2394122222581557e-05,
"loss": 0.7476,
"mean_token_accuracy": 0.7740160766147529,
"step": 650
},
{
"epoch": 0.48464668886422496,
"grad_norm": 0.8752309330635153,
"learning_rate": 1.226840181923427e-05,
"loss": 0.7234,
"mean_token_accuracy": 0.7799259045764162,
"step": 655
},
{
"epoch": 0.48834628190899,
"grad_norm": 0.9394868684222661,
"learning_rate": 1.214230227455106e-05,
"loss": 0.761,
"mean_token_accuracy": 0.76914448788722,
"step": 660
},
{
"epoch": 0.49204587495375507,
"grad_norm": 0.9190997939430626,
"learning_rate": 1.201584466484629e-05,
"loss": 0.7429,
"mean_token_accuracy": 0.7749135561574454,
"step": 665
},
{
"epoch": 0.49574546799852015,
"grad_norm": 0.991650882333127,
"learning_rate": 1.1889050126281405e-05,
"loss": 0.7624,
"mean_token_accuracy": 0.7674541114303453,
"step": 670
},
{
"epoch": 0.49944506104328523,
"grad_norm": 0.8727170236064898,
"learning_rate": 1.1761939851332241e-05,
"loss": 0.7686,
"mean_token_accuracy": 0.7668983696182747,
"step": 675
},
{
"epoch": 0.5031446540880503,
"grad_norm": 0.8423456719916217,
"learning_rate": 1.1634535085246903e-05,
"loss": 0.7502,
"mean_token_accuracy": 0.7719282566060406,
"step": 680
},
{
"epoch": 0.5068442471328154,
"grad_norm": 0.8041730146404278,
"learning_rate": 1.1506857122494832e-05,
"loss": 0.7557,
"mean_token_accuracy": 0.7698715177090566,
"step": 685
},
{
"epoch": 0.5105438401775805,
"grad_norm": 0.8008949159036595,
"learning_rate": 1.1378927303207637e-05,
"loss": 0.7555,
"mean_token_accuracy": 0.7708746251231458,
"step": 690
},
{
"epoch": 0.5142434332223456,
"grad_norm": 0.8535115805575514,
"learning_rate": 1.12507670096123e-05,
"loss": 0.7427,
"mean_token_accuracy": 0.7726444687618006,
"step": 695
},
{
"epoch": 0.5179430262671106,
"grad_norm": 0.9312344890724055,
"learning_rate": 1.1122397662457352e-05,
"loss": 0.7502,
"mean_token_accuracy": 0.7726102122304674,
"step": 700
},
{
"epoch": 0.5179430262671106,
"eval_loss": 0.7686853408813477,
"eval_mean_token_accuracy": 0.7649829158123188,
"eval_runtime": 12.9077,
"eval_samples_per_second": 9.994,
"eval_steps_per_second": 2.557,
"step": 700
},
{
"epoch": 0.5216426193118757,
"grad_norm": 0.868812002235657,
"learning_rate": 1.0993840717432582e-05,
"loss": 0.739,
"mean_token_accuracy": 0.7753342555426331,
"step": 705
},
{
"epoch": 0.5253422123566408,
"grad_norm": 0.873440662985193,
"learning_rate": 1.0865117661582958e-05,
"loss": 0.7439,
"mean_token_accuracy": 0.7739453320579491,
"step": 710
},
{
"epoch": 0.5290418054014059,
"grad_norm": 0.862857533425593,
"learning_rate": 1.0736250009717249e-05,
"loss": 0.7569,
"mean_token_accuracy": 0.770897334069314,
"step": 715
},
{
"epoch": 0.532741398446171,
"grad_norm": 0.8639562204638955,
"learning_rate": 1.0607259300812047e-05,
"loss": 0.7074,
"mean_token_accuracy": 0.7842015210192824,
"step": 720
},
{
"epoch": 0.536440991490936,
"grad_norm": 0.899875919647952,
"learning_rate": 1.0478167094411733e-05,
"loss": 0.7363,
"mean_token_accuracy": 0.7761369932840716,
"step": 725
},
{
"epoch": 0.540140584535701,
"grad_norm": 0.8342218548841652,
"learning_rate": 1.0348994967025012e-05,
"loss": 0.7417,
"mean_token_accuracy": 0.7746170812078568,
"step": 730
},
{
"epoch": 0.5438401775804661,
"grad_norm": 0.8013624939670422,
"learning_rate": 1.0219764508518595e-05,
"loss": 0.7384,
"mean_token_accuracy": 0.7767422075955539,
"step": 735
},
{
"epoch": 0.5475397706252312,
"grad_norm": 0.8120468940513571,
"learning_rate": 1.0090497318508687e-05,
"loss": 0.7566,
"mean_token_accuracy": 0.7691283622927295,
"step": 740
},
{
"epoch": 0.5512393636699963,
"grad_norm": 0.8150917404467021,
"learning_rate": 9.961215002750799e-06,
"loss": 0.7624,
"mean_token_accuracy": 0.7672601324412168,
"step": 745
},
{
"epoch": 0.5549389567147613,
"grad_norm": 0.8358908477578574,
"learning_rate": 9.831939169528565e-06,
"loss": 0.7198,
"mean_token_accuracy": 0.7803871901829526,
"step": 750
},
{
"epoch": 0.5586385497595264,
"grad_norm": 0.8360497025012443,
"learning_rate": 9.702691426042124e-06,
"loss": 0.704,
"mean_token_accuracy": 0.784279945314917,
"step": 755
},
{
"epoch": 0.5623381428042915,
"grad_norm": 0.8484683109694328,
"learning_rate": 9.573493374796694e-06,
"loss": 0.742,
"mean_token_accuracy": 0.773911456375648,
"step": 760
},
{
"epoch": 0.5660377358490566,
"grad_norm": 0.801814751253701,
"learning_rate": 9.444366609991916e-06,
"loss": 0.7475,
"mean_token_accuracy": 0.7704960987519774,
"step": 765
},
{
"epoch": 0.5697373288938217,
"grad_norm": 0.7903451176947093,
"learning_rate": 9.315332713912593e-06,
"loss": 0.6802,
"mean_token_accuracy": 0.7910297225776487,
"step": 770
},
{
"epoch": 0.5734369219385868,
"grad_norm": 0.7640452473763807,
"learning_rate": 9.18641325332142e-06,
"loss": 0.7254,
"mean_token_accuracy": 0.7779958468370307,
"step": 775
},
{
"epoch": 0.5771365149833518,
"grad_norm": 0.7764311897247947,
"learning_rate": 9.057629775854314e-06,
"loss": 0.7317,
"mean_token_accuracy": 0.7763394937851484,
"step": 780
},
{
"epoch": 0.5808361080281169,
"grad_norm": 0.8245708624875354,
"learning_rate": 8.929003806418934e-06,
"loss": 0.7376,
"mean_token_accuracy": 0.7750152545715012,
"step": 785
},
{
"epoch": 0.584535701072882,
"grad_norm": 0.8103882551202022,
"learning_rate": 8.800556843597002e-06,
"loss": 0.7259,
"mean_token_accuracy": 0.7782374392513656,
"step": 790
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.7928991222424273,
"learning_rate": 8.672310356051023e-06,
"loss": 0.7431,
"mean_token_accuracy": 0.7736308218387102,
"step": 795
},
{
"epoch": 0.5919348871624122,
"grad_norm": 0.8289989830065249,
"learning_rate": 8.544285778936004e-06,
"loss": 0.7352,
"mean_token_accuracy": 0.7764898230184529,
"step": 800
},
{
"epoch": 0.5919348871624122,
"eval_loss": 0.7624932527542114,
"eval_mean_token_accuracy": 0.7669661829206922,
"eval_runtime": 12.9119,
"eval_samples_per_second": 9.991,
"eval_steps_per_second": 2.556,
"step": 800
},
{
"epoch": 0.5956344802071772,
"grad_norm": 0.8482327309493489,
"learning_rate": 8.416504510316774e-06,
"loss": 0.7194,
"mean_token_accuracy": 0.7805804982214577,
"step": 805
},
{
"epoch": 0.5993340732519423,
"grad_norm": 0.8542094892921783,
"learning_rate": 8.28898790759152e-06,
"loss": 0.7332,
"mean_token_accuracy": 0.7760156726467715,
"step": 810
},
{
"epoch": 0.6030336662967074,
"grad_norm": 0.8315942810202602,
"learning_rate": 8.161757283922084e-06,
"loss": 0.7186,
"mean_token_accuracy": 0.7800488930789287,
"step": 815
},
{
"epoch": 0.6067332593414725,
"grad_norm": 0.8561047694162326,
"learning_rate": 8.034833904671698e-06,
"loss": 0.7286,
"mean_token_accuracy": 0.7782032355305302,
"step": 820
},
{
"epoch": 0.6104328523862376,
"grad_norm": 0.8322803740232512,
"learning_rate": 7.908238983850666e-06,
"loss": 0.7475,
"mean_token_accuracy": 0.7716142551709843,
"step": 825
},
{
"epoch": 0.6141324454310025,
"grad_norm": 0.8093783949232168,
"learning_rate": 7.781993680570656e-06,
"loss": 0.7419,
"mean_token_accuracy": 0.7753669766613711,
"step": 830
},
{
"epoch": 0.6178320384757676,
"grad_norm": 0.8251647916013486,
"learning_rate": 7.656119095508155e-06,
"loss": 0.7163,
"mean_token_accuracy": 0.7810057582729616,
"step": 835
},
{
"epoch": 0.6215316315205327,
"grad_norm": 0.8936545458969576,
"learning_rate": 7.530636267377706e-06,
"loss": 0.7212,
"mean_token_accuracy": 0.7782000869575987,
"step": 840
},
{
"epoch": 0.6252312245652978,
"grad_norm": 0.8327340922107708,
"learning_rate": 7.405566169415481e-06,
"loss": 0.7417,
"mean_token_accuracy": 0.7713689918387485,
"step": 845
},
{
"epoch": 0.6289308176100629,
"grad_norm": 0.9240689034622575,
"learning_rate": 7.280929705873818e-06,
"loss": 0.7671,
"mean_token_accuracy": 0.7674969695134786,
"step": 850
},
{
"epoch": 0.632630410654828,
"grad_norm": 0.8604187816835673,
"learning_rate": 7.15674770852727e-06,
"loss": 0.7714,
"mean_token_accuracy": 0.765074378427229,
"step": 855
},
{
"epoch": 0.636330003699593,
"grad_norm": 0.8575462475833365,
"learning_rate": 7.033040933190776e-06,
"loss": 0.7485,
"mean_token_accuracy": 0.7725655493857382,
"step": 860
},
{
"epoch": 0.6400295967443581,
"grad_norm": 0.7868065852245846,
"learning_rate": 6.909830056250527e-06,
"loss": 0.703,
"mean_token_accuracy": 0.7851972669426284,
"step": 865
},
{
"epoch": 0.6437291897891232,
"grad_norm": 0.817598798759793,
"learning_rate": 6.787135671208126e-06,
"loss": 0.7576,
"mean_token_accuracy": 0.7701196194143121,
"step": 870
},
{
"epoch": 0.6474287828338883,
"grad_norm": 0.7951083662310198,
"learning_rate": 6.6649782852385554e-06,
"loss": 0.7459,
"mean_token_accuracy": 0.772123421166796,
"step": 875
},
{
"epoch": 0.6511283758786534,
"grad_norm": 0.7271974698252879,
"learning_rate": 6.543378315762634e-06,
"loss": 0.7247,
"mean_token_accuracy": 0.7791351612791593,
"step": 880
},
{
"epoch": 0.6548279689234184,
"grad_norm": 0.8824398500626421,
"learning_rate": 6.42235608703441e-06,
"loss": 0.7606,
"mean_token_accuracy": 0.7677733208877957,
"step": 885
},
{
"epoch": 0.6585275619681835,
"grad_norm": 0.8552464856465032,
"learning_rate": 6.301931826744189e-06,
"loss": 0.7419,
"mean_token_accuracy": 0.7743753446438143,
"step": 890
},
{
"epoch": 0.6622271550129486,
"grad_norm": 0.8306173604508144,
"learning_rate": 6.18212566263765e-06,
"loss": 0.7378,
"mean_token_accuracy": 0.7754989605528264,
"step": 895
},
{
"epoch": 0.6659267480577137,
"grad_norm": 0.8046739019546305,
"learning_rate": 6.0629576191517035e-06,
"loss": 0.7254,
"mean_token_accuracy": 0.7763354265372273,
"step": 900
},
{
"epoch": 0.6659267480577137,
"eval_loss": 0.7582561373710632,
"eval_mean_token_accuracy": 0.7673597290333886,
"eval_runtime": 12.9256,
"eval_samples_per_second": 9.98,
"eval_steps_per_second": 2.553,
"step": 900
},
{
"epoch": 0.6696263411024788,
"grad_norm": 0.7871630029994926,
"learning_rate": 5.944447614067588e-06,
"loss": 0.7365,
"mean_token_accuracy": 0.7753696170648604,
"step": 905
},
{
"epoch": 0.6733259341472438,
"grad_norm": 0.8274033791731158,
"learning_rate": 5.8266154551818225e-06,
"loss": 0.7148,
"mean_token_accuracy": 0.7819856387093902,
"step": 910
},
{
"epoch": 0.6770255271920089,
"grad_norm": 0.8585256390323572,
"learning_rate": 5.709480836995509e-06,
"loss": 0.7099,
"mean_token_accuracy": 0.7823146568890846,
"step": 915
},
{
"epoch": 0.680725120236774,
"grad_norm": 0.8245071508434019,
"learning_rate": 5.593063337422595e-06,
"loss": 0.7205,
"mean_token_accuracy": 0.7803457311979414,
"step": 920
},
{
"epoch": 0.684424713281539,
"grad_norm": 0.8137995619905571,
"learning_rate": 5.477382414517625e-06,
"loss": 0.7443,
"mean_token_accuracy": 0.7742054658480322,
"step": 925
},
{
"epoch": 0.6881243063263041,
"grad_norm": 0.8435356879873597,
"learning_rate": 5.362457403223495e-06,
"loss": 0.7321,
"mean_token_accuracy": 0.7777223904380888,
"step": 930
},
{
"epoch": 0.6918238993710691,
"grad_norm": 0.8275951168813332,
"learning_rate": 5.248307512139818e-06,
"loss": 0.7327,
"mean_token_accuracy": 0.7760629303675801,
"step": 935
},
{
"epoch": 0.6955234924158342,
"grad_norm": 0.7986090408289105,
"learning_rate": 5.134951820312402e-06,
"loss": 0.6797,
"mean_token_accuracy": 0.792751892181402,
"step": 940
},
{
"epoch": 0.6992230854605993,
"grad_norm": 0.8344358039184492,
"learning_rate": 5.022409274044346e-06,
"loss": 0.7484,
"mean_token_accuracy": 0.7706997735792959,
"step": 945
},
{
"epoch": 0.7029226785053644,
"grad_norm": 0.7590018472193605,
"learning_rate": 4.910698683729371e-06,
"loss": 0.7246,
"mean_token_accuracy": 0.7794280766358616,
"step": 950
},
{
"epoch": 0.7066222715501295,
"grad_norm": 0.8418460117578578,
"learning_rate": 4.799838720707847e-06,
"loss": 0.6786,
"mean_token_accuracy": 0.7908993228344331,
"step": 955
},
{
"epoch": 0.7103218645948945,
"grad_norm": 0.8709719228420416,
"learning_rate": 4.6898479141460415e-06,
"loss": 0.7067,
"mean_token_accuracy": 0.7843742462359516,
"step": 960
},
{
"epoch": 0.7140214576396596,
"grad_norm": 0.8535611787813129,
"learning_rate": 4.580744647939163e-06,
"loss": 0.7435,
"mean_token_accuracy": 0.7730417779644787,
"step": 965
},
{
"epoch": 0.7177210506844247,
"grad_norm": 0.8885996504303695,
"learning_rate": 4.472547157638674e-06,
"loss": 0.694,
"mean_token_accuracy": 0.7880904268803789,
"step": 970
},
{
"epoch": 0.7214206437291898,
"grad_norm": 0.9587623922743913,
"learning_rate": 4.365273527404384e-06,
"loss": 0.7401,
"mean_token_accuracy": 0.7739410892854838,
"step": 975
},
{
"epoch": 0.7251202367739549,
"grad_norm": 0.8107566238251871,
"learning_rate": 4.258941686981864e-06,
"loss": 0.7242,
"mean_token_accuracy": 0.779501991954344,
"step": 980
},
{
"epoch": 0.72881982981872,
"grad_norm": 0.8788408769548844,
"learning_rate": 4.15356940870567e-06,
"loss": 0.7382,
"mean_token_accuracy": 0.7752977269534177,
"step": 985
},
{
"epoch": 0.732519422863485,
"grad_norm": 0.8161841348783122,
"learning_rate": 4.049174304528857e-06,
"loss": 0.7122,
"mean_token_accuracy": 0.7834159122342004,
"step": 990
},
{
"epoch": 0.7362190159082501,
"grad_norm": 0.8045647610652492,
"learning_rate": 3.945773823079315e-06,
"loss": 0.7085,
"mean_token_accuracy": 0.7838271814299385,
"step": 995
},
{
"epoch": 0.7399186089530152,
"grad_norm": 0.7888951101393026,
"learning_rate": 3.8433852467434175e-06,
"loss": 0.7436,
"mean_token_accuracy": 0.7715282648856208,
"step": 1000
},
{
"epoch": 0.7399186089530152,
"eval_loss": 0.7540702819824219,
"eval_mean_token_accuracy": 0.7684317578583987,
"eval_runtime": 12.9145,
"eval_samples_per_second": 9.989,
"eval_steps_per_second": 2.555,
"step": 1000
},
{
"epoch": 0.7436182019977803,
"grad_norm": 0.8476150102495247,
"learning_rate": 3.742025688777413e-06,
"loss": 0.755,
"mean_token_accuracy": 0.769747059368865,
"step": 1005
},
{
"epoch": 0.7473177950425454,
"grad_norm": 0.8447327457715914,
"learning_rate": 3.641712090447125e-06,
"loss": 0.7084,
"mean_token_accuracy": 0.7835149860526573,
"step": 1010
},
{
"epoch": 0.7510173880873104,
"grad_norm": 0.7966093391523419,
"learning_rate": 3.542461218196379e-06,
"loss": 0.7215,
"mean_token_accuracy": 0.7790490334715525,
"step": 1015
},
{
"epoch": 0.7547169811320755,
"grad_norm": 0.8909915448158038,
"learning_rate": 3.444289660844665e-06,
"loss": 0.7361,
"mean_token_accuracy": 0.7758344142261316,
"step": 1020
},
{
"epoch": 0.7584165741768405,
"grad_norm": 0.828582805131554,
"learning_rate": 3.347213826814456e-06,
"loss": 0.7317,
"mean_token_accuracy": 0.7749168212587448,
"step": 1025
},
{
"epoch": 0.7621161672216056,
"grad_norm": 0.8172943841676832,
"learning_rate": 3.2512499413887255e-06,
"loss": 0.7206,
"mean_token_accuracy": 0.7795061516510636,
"step": 1030
},
{
"epoch": 0.7658157602663707,
"grad_norm": 0.7948969347509628,
"learning_rate": 3.1564140439990256e-06,
"loss": 0.7534,
"mean_token_accuracy": 0.7695401774020444,
"step": 1035
},
{
"epoch": 0.7695153533111357,
"grad_norm": 0.7639527363991839,
"learning_rate": 3.0627219855446667e-06,
"loss": 0.6907,
"mean_token_accuracy": 0.7891346245991027,
"step": 1040
},
{
"epoch": 0.7732149463559008,
"grad_norm": 0.796687026157432,
"learning_rate": 2.970189425743383e-06,
"loss": 0.7365,
"mean_token_accuracy": 0.7739980038896167,
"step": 1045
},
{
"epoch": 0.7769145394006659,
"grad_norm": 0.7960037813992309,
"learning_rate": 2.8788318305139808e-06,
"loss": 0.7513,
"mean_token_accuracy": 0.7689661221516821,
"step": 1050
},
{
"epoch": 0.780614132445431,
"grad_norm": 0.7733589592006201,
"learning_rate": 2.7886644693913333e-06,
"loss": 0.7443,
"mean_token_accuracy": 0.7728263504337974,
"step": 1055
},
{
"epoch": 0.7843137254901961,
"grad_norm": 0.8565688482800564,
"learning_rate": 2.6997024129742544e-06,
"loss": 0.7328,
"mean_token_accuracy": 0.7762133218500764,
"step": 1060
},
{
"epoch": 0.7880133185349611,
"grad_norm": 0.8923008390952016,
"learning_rate": 2.611960530406572e-06,
"loss": 0.728,
"mean_token_accuracy": 0.7796289219427248,
"step": 1065
},
{
"epoch": 0.7917129115797262,
"grad_norm": 0.7873211808203069,
"learning_rate": 2.5254534868919077e-06,
"loss": 0.726,
"mean_token_accuracy": 0.7775058698315347,
"step": 1070
},
{
"epoch": 0.7954125046244913,
"grad_norm": 0.7719831428886607,
"learning_rate": 2.4401957412425213e-06,
"loss": 0.7309,
"mean_token_accuracy": 0.7749563569631559,
"step": 1075
},
{
"epoch": 0.7991120976692564,
"grad_norm": 0.835137021064406,
"learning_rate": 2.3562015434626784e-06,
"loss": 0.7145,
"mean_token_accuracy": 0.7820543807444283,
"step": 1080
},
{
"epoch": 0.8028116907140215,
"grad_norm": 0.7671221716084607,
"learning_rate": 2.273484932366874e-06,
"loss": 0.7021,
"mean_token_accuracy": 0.7854601544650027,
"step": 1085
},
{
"epoch": 0.8065112837587866,
"grad_norm": 0.7788468903413968,
"learning_rate": 2.192059733233408e-06,
"loss": 0.7244,
"mean_token_accuracy": 0.7782192310784752,
"step": 1090
},
{
"epoch": 0.8102108768035516,
"grad_norm": 0.8041148765506401,
"learning_rate": 2.111939555493603e-06,
"loss": 0.7225,
"mean_token_accuracy": 0.7786757617228395,
"step": 1095
},
{
"epoch": 0.8139104698483167,
"grad_norm": 0.9148318869053712,
"learning_rate": 2.0331377904571303e-06,
"loss": 0.745,
"mean_token_accuracy": 0.7731225924993291,
"step": 1100
},
{
"epoch": 0.8139104698483167,
"eval_loss": 0.7510696649551392,
"eval_mean_token_accuracy": 0.7692631147293293,
"eval_runtime": 12.9268,
"eval_samples_per_second": 9.979,
"eval_steps_per_second": 2.553,
"step": 1100
},
{
"epoch": 0.8176100628930818,
"grad_norm": 0.8176644589533959,
"learning_rate": 1.9556676090737803e-06,
"loss": 0.7544,
"mean_token_accuracy": 0.7705814013113146,
"step": 1105
},
{
"epoch": 0.8213096559378469,
"grad_norm": 0.751484746704062,
"learning_rate": 1.879541959732072e-06,
"loss": 0.7133,
"mean_token_accuracy": 0.7810593845260246,
"step": 1110
},
{
"epoch": 0.825009248982612,
"grad_norm": 0.8448196083310406,
"learning_rate": 1.8047735660950427e-06,
"loss": 0.7088,
"mean_token_accuracy": 0.7839263073608779,
"step": 1115
},
{
"epoch": 0.8287088420273769,
"grad_norm": 0.7666551359898016,
"learning_rate": 1.7313749249736266e-06,
"loss": 0.7225,
"mean_token_accuracy": 0.7800409694989299,
"step": 1120
},
{
"epoch": 0.832408435072142,
"grad_norm": 0.7750630686515836,
"learning_rate": 1.6593583042379192e-06,
"loss": 0.7302,
"mean_token_accuracy": 0.7764794004807236,
"step": 1125
},
{
"epoch": 0.8361080281169071,
"grad_norm": 0.7448783923471559,
"learning_rate": 1.5887357407667314e-06,
"loss": 0.7303,
"mean_token_accuracy": 0.7770985998568949,
"step": 1130
},
{
"epoch": 0.8398076211616722,
"grad_norm": 0.8617250383965689,
"learning_rate": 1.5195190384357405e-06,
"loss": 0.7261,
"mean_token_accuracy": 0.7790343243952108,
"step": 1135
},
{
"epoch": 0.8435072142064373,
"grad_norm": 0.7315929117396289,
"learning_rate": 1.4517197661445893e-06,
"loss": 0.7103,
"mean_token_accuracy": 0.7820174506567448,
"step": 1140
},
{
"epoch": 0.8472068072512023,
"grad_norm": 0.7464816300853588,
"learning_rate": 1.3853492558832472e-06,
"loss": 0.7248,
"mean_token_accuracy": 0.7778668411101556,
"step": 1145
},
{
"epoch": 0.8509064002959674,
"grad_norm": 0.8027284787026119,
"learning_rate": 1.3204186008379926e-06,
"loss": 0.7142,
"mean_token_accuracy": 0.7828477066310621,
"step": 1150
},
{
"epoch": 0.8546059933407325,
"grad_norm": 0.7867579697995757,
"learning_rate": 1.2569386535372807e-06,
"loss": 0.7411,
"mean_token_accuracy": 0.7738658084540277,
"step": 1155
},
{
"epoch": 0.8583055863854976,
"grad_norm": 0.8069270526220779,
"learning_rate": 1.1949200240378577e-06,
"loss": 0.7066,
"mean_token_accuracy": 0.7833467087206057,
"step": 1160
},
{
"epoch": 0.8620051794302627,
"grad_norm": 0.8115083552759281,
"learning_rate": 1.1343730781513896e-06,
"loss": 0.7117,
"mean_token_accuracy": 0.782034573182628,
"step": 1165
},
{
"epoch": 0.8657047724750278,
"grad_norm": 0.7379062498914578,
"learning_rate": 1.0753079357119134e-06,
"loss": 0.7334,
"mean_token_accuracy": 0.7753454314583148,
"step": 1170
},
{
"epoch": 0.8694043655197928,
"grad_norm": 0.7894649463121637,
"learning_rate": 1.017734468884417e-06,
"loss": 0.6873,
"mean_token_accuracy": 0.7897845425207853,
"step": 1175
},
{
"epoch": 0.8731039585645579,
"grad_norm": 0.8023692788963809,
"learning_rate": 9.616623005147952e-07,
"loss": 0.7416,
"mean_token_accuracy": 0.7721576809793975,
"step": 1180
},
{
"epoch": 0.876803551609323,
"grad_norm": 0.7552635215864901,
"learning_rate": 9.071008025214767e-07,
"loss": 0.6686,
"mean_token_accuracy": 0.7955690867836535,
"step": 1185
},
{
"epoch": 0.8805031446540881,
"grad_norm": 0.7427939874056083,
"learning_rate": 8.540590943290128e-07,
"loss": 0.7366,
"mean_token_accuracy": 0.7745106495210125,
"step": 1190
},
{
"epoch": 0.8842027376988532,
"grad_norm": 0.7579131248407016,
"learning_rate": 8.025460413438457e-07,
"loss": 0.7146,
"mean_token_accuracy": 0.7814567025093309,
"step": 1195
},
{
"epoch": 0.8879023307436182,
"grad_norm": 0.8173415852977716,
"learning_rate": 7.525702534725443e-07,
"loss": 0.7231,
"mean_token_accuracy": 0.7785201805258471,
"step": 1200
},
{
"epoch": 0.8879023307436182,
"eval_loss": 0.749515950679779,
"eval_mean_token_accuracy": 0.7696483845818836,
"eval_runtime": 12.9328,
"eval_samples_per_second": 9.975,
"eval_steps_per_second": 2.552,
"step": 1200
},
{
"epoch": 0.8916019237883833,
"grad_norm": 0.7711527562330054,
"learning_rate": 7.041400836827439e-07,
"loss": 0.7143,
"mean_token_accuracy": 0.7821882194464815,
"step": 1205
},
{
"epoch": 0.8953015168331484,
"grad_norm": 0.7623772941452016,
"learning_rate": 6.572636266070265e-07,
"loss": 0.7387,
"mean_token_accuracy": 0.7745962709444631,
"step": 1210
},
{
"epoch": 0.8990011098779135,
"grad_norm": 0.7652994503797836,
"learning_rate": 6.119487171899807e-07,
"loss": 0.6961,
"mean_token_accuracy": 0.7874255931848155,
"step": 1215
},
{
"epoch": 0.9027007029226785,
"grad_norm": 0.8269678511841839,
"learning_rate": 5.682029293786673e-07,
"loss": 0.7255,
"mean_token_accuracy": 0.7770277914333097,
"step": 1220
},
{
"epoch": 0.9064002959674435,
"grad_norm": 0.7302387175249154,
"learning_rate": 5.26033574856708e-07,
"loss": 0.7036,
"mean_token_accuracy": 0.7842749808544124,
"step": 1225
},
{
"epoch": 0.9100998890122086,
"grad_norm": 0.7426310574145085,
"learning_rate": 4.854477018222103e-07,
"loss": 0.7085,
"mean_token_accuracy": 0.7827587579577575,
"step": 1230
},
{
"epoch": 0.9137994820569737,
"grad_norm": 0.7805083509005499,
"learning_rate": 4.464520938097294e-07,
"loss": 0.6552,
"mean_token_accuracy": 0.7984315941552589,
"step": 1235
},
{
"epoch": 0.9174990751017388,
"grad_norm": 0.7838715084886818,
"learning_rate": 4.0905326855646186e-07,
"loss": 0.6978,
"mean_token_accuracy": 0.7866137937789542,
"step": 1240
},
{
"epoch": 0.9211986681465039,
"grad_norm": 0.764142848066667,
"learning_rate": 3.732574769128738e-07,
"loss": 0.7425,
"mean_token_accuracy": 0.7735163959357182,
"step": 1245
},
{
"epoch": 0.9248982611912689,
"grad_norm": 0.8510657590112316,
"learning_rate": 3.390707017979311e-07,
"loss": 0.714,
"mean_token_accuracy": 0.7817031945118419,
"step": 1250
},
{
"epoch": 0.928597854236034,
"grad_norm": 0.8104897922544609,
"learning_rate": 3.06498657199108e-07,
"loss": 0.6972,
"mean_token_accuracy": 0.7874145607415896,
"step": 1255
},
{
"epoch": 0.9322974472807991,
"grad_norm": 0.8439497662820569,
"learning_rate": 2.7554678721735675e-07,
"loss": 0.7267,
"mean_token_accuracy": 0.7782783773976328,
"step": 1260
},
{
"epoch": 0.9359970403255642,
"grad_norm": 0.7506475456675276,
"learning_rate": 2.4622026515717654e-07,
"loss": 0.7116,
"mean_token_accuracy": 0.7825011111093741,
"step": 1265
},
{
"epoch": 0.9396966333703293,
"grad_norm": 0.796131840794368,
"learning_rate": 2.1852399266194312e-07,
"loss": 0.7043,
"mean_token_accuracy": 0.7831285774300467,
"step": 1270
},
{
"epoch": 0.9433962264150944,
"grad_norm": 0.8874806771851743,
"learning_rate": 1.9246259889464935e-07,
"loss": 0.7372,
"mean_token_accuracy": 0.7754287878050505,
"step": 1275
},
{
"epoch": 0.9470958194598594,
"grad_norm": 0.7616413651005322,
"learning_rate": 1.6804043976418438e-07,
"loss": 0.7115,
"mean_token_accuracy": 0.782469409255144,
"step": 1280
},
{
"epoch": 0.9507954125046245,
"grad_norm": 0.7354908780262112,
"learning_rate": 1.4526159719728595e-07,
"loss": 0.6923,
"mean_token_accuracy": 0.7877402800804598,
"step": 1285
},
{
"epoch": 0.9544950055493896,
"grad_norm": 0.8147962775571531,
"learning_rate": 1.24129878456285e-07,
"loss": 0.7266,
"mean_token_accuracy": 0.7786802824447104,
"step": 1290
},
{
"epoch": 0.9581945985941547,
"grad_norm": 0.7892855143587276,
"learning_rate": 1.0464881550276362e-07,
"loss": 0.7106,
"mean_token_accuracy": 0.7812839457332936,
"step": 1295
},
{
"epoch": 0.9618941916389198,
"grad_norm": 0.7807996872371684,
"learning_rate": 8.682166440721729e-08,
"loss": 0.6851,
"mean_token_accuracy": 0.7892695119285057,
"step": 1300
},
{
"epoch": 0.9618941916389198,
"eval_loss": 0.7488865852355957,
"eval_mean_token_accuracy": 0.7697332311412013,
"eval_runtime": 12.9253,
"eval_samples_per_second": 9.98,
"eval_steps_per_second": 2.553,
"step": 1300
},
{
"epoch": 0.9655937846836848,
"grad_norm": 0.7500272096924462,
"learning_rate": 7.065140480483235e-08,
"loss": 0.722,
"mean_token_accuracy": 0.7809713685380442,
"step": 1305
},
{
"epoch": 0.9692933777284499,
"grad_norm": 0.7740539516194233,
"learning_rate": 5.6140739397474445e-08,
"loss": 0.741,
"mean_token_accuracy": 0.7735731347336795,
"step": 1310
},
{
"epoch": 0.9729929707732149,
"grad_norm": 0.7621995001113339,
"learning_rate": 4.329209350195651e-08,
"loss": 0.7277,
"mean_token_accuracy": 0.7768363708086847,
"step": 1315
},
{
"epoch": 0.97669256381798,
"grad_norm": 0.7190084013715407,
"learning_rate": 3.210761464466639e-08,
"loss": 0.6751,
"mean_token_accuracy": 0.7917913948419976,
"step": 1320
},
{
"epoch": 0.9803921568627451,
"grad_norm": 0.832622944037137,
"learning_rate": 2.2589172202635014e-08,
"loss": 0.7183,
"mean_token_accuracy": 0.7814518443094862,
"step": 1325
},
{
"epoch": 0.9840917499075101,
"grad_norm": 0.8517126035217566,
"learning_rate": 1.4738357091084177e-08,
"loss": 0.7223,
"mean_token_accuracy": 0.7779738218649835,
"step": 1330
},
{
"epoch": 0.9877913429522752,
"grad_norm": 0.7735765973323167,
"learning_rate": 8.556481497521418e-09,
"loss": 0.7064,
"mean_token_accuracy": 0.7825683170955192,
"step": 1335
},
{
"epoch": 0.9914909359970403,
"grad_norm": 0.7713072693524988,
"learning_rate": 4.044578662419918e-09,
"loss": 0.7179,
"mean_token_accuracy": 0.7801742579673625,
"step": 1340
},
{
"epoch": 0.9951905290418054,
"grad_norm": 0.7685841540071269,
"learning_rate": 1.203402706525525e-09,
"loss": 0.7493,
"mean_token_accuracy": 0.7706208056883479,
"step": 1345
},
{
"epoch": 0.9988901220865705,
"grad_norm": 0.8159997368959132,
"learning_rate": 3.342850480869686e-11,
"loss": 0.7328,
"mean_token_accuracy": 0.7759899768837795,
"step": 1350
},
{
"epoch": 0.9996300406955235,
"mean_token_accuracy": 0.7755799181439557,
"step": 1351,
"total_flos": 76966677970944.0,
"train_loss": 0.759784622734163,
"train_runtime": 8483.4218,
"train_samples_per_second": 2.549,
"train_steps_per_second": 0.159
}
],
"logging_steps": 5,
"max_steps": 1351,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 76966677970944.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}