gpthinker / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 31635,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00948316737790422,
"grad_norm": 3.985076904296875,
"learning_rate": 9.7e-06,
"loss": 2.4371,
"step": 100
},
{
"epoch": 0.01896633475580844,
"grad_norm": 3.8551318645477295,
"learning_rate": 1.97e-05,
"loss": 2.1056,
"step": 200
},
{
"epoch": 0.02844950213371266,
"grad_norm": 4.302079200744629,
"learning_rate": 2.97e-05,
"loss": 1.9608,
"step": 300
},
{
"epoch": 0.03793266951161688,
"grad_norm": 3.31756329536438,
"learning_rate": 3.97e-05,
"loss": 1.8338,
"step": 400
},
{
"epoch": 0.0474158368895211,
"grad_norm": 2.4619405269622803,
"learning_rate": 4.97e-05,
"loss": 1.7855,
"step": 500
},
{
"epoch": 0.0474158368895211,
"eval_loss": 1.6501274108886719,
"eval_runtime": 72.2019,
"eval_samples_per_second": 129.83,
"eval_steps_per_second": 16.232,
"step": 500
},
{
"epoch": 0.05689900426742532,
"grad_norm": 2.553483724594116,
"learning_rate": 4.9844226754456404e-05,
"loss": 1.7277,
"step": 600
},
{
"epoch": 0.06638217164532954,
"grad_norm": 2.0428194999694824,
"learning_rate": 4.9683635779669185e-05,
"loss": 1.6971,
"step": 700
},
{
"epoch": 0.07586533902323377,
"grad_norm": 1.9449608325958252,
"learning_rate": 4.9523044804881966e-05,
"loss": 1.6537,
"step": 800
},
{
"epoch": 0.08534850640113797,
"grad_norm": 2.5439252853393555,
"learning_rate": 4.9362453830094753e-05,
"loss": 1.6464,
"step": 900
},
{
"epoch": 0.0948316737790422,
"grad_norm": 2.118544578552246,
"learning_rate": 4.9201862855307534e-05,
"loss": 1.5804,
"step": 1000
},
{
"epoch": 0.0948316737790422,
"eval_loss": 1.5088456869125366,
"eval_runtime": 72.0739,
"eval_samples_per_second": 130.061,
"eval_steps_per_second": 16.261,
"step": 1000
},
{
"epoch": 0.10431484115694642,
"grad_norm": 1.8551363945007324,
"learning_rate": 4.9041271880520315e-05,
"loss": 1.6341,
"step": 1100
},
{
"epoch": 0.11379800853485064,
"grad_norm": 1.9903297424316406,
"learning_rate": 4.88806809057331e-05,
"loss": 1.5718,
"step": 1200
},
{
"epoch": 0.12328117591275486,
"grad_norm": 2.2142210006713867,
"learning_rate": 4.8720089930945884e-05,
"loss": 1.5718,
"step": 1300
},
{
"epoch": 0.1327643432906591,
"grad_norm": 2.2737417221069336,
"learning_rate": 4.8559498956158664e-05,
"loss": 1.5137,
"step": 1400
},
{
"epoch": 0.1422475106685633,
"grad_norm": 2.3361587524414062,
"learning_rate": 4.839890798137145e-05,
"loss": 1.5332,
"step": 1500
},
{
"epoch": 0.1422475106685633,
"eval_loss": 1.4451285600662231,
"eval_runtime": 72.138,
"eval_samples_per_second": 129.945,
"eval_steps_per_second": 16.247,
"step": 1500
},
{
"epoch": 0.15173067804646753,
"grad_norm": 2.335610866546631,
"learning_rate": 4.823831700658423e-05,
"loss": 1.5669,
"step": 1600
},
{
"epoch": 0.16121384542437173,
"grad_norm": 1.811543583869934,
"learning_rate": 4.8077726031797014e-05,
"loss": 1.4985,
"step": 1700
},
{
"epoch": 0.17069701280227595,
"grad_norm": 2.1588528156280518,
"learning_rate": 4.79171350570098e-05,
"loss": 1.4979,
"step": 1800
},
{
"epoch": 0.18018018018018017,
"grad_norm": 1.7643985748291016,
"learning_rate": 4.775654408222258e-05,
"loss": 1.5246,
"step": 1900
},
{
"epoch": 0.1896633475580844,
"grad_norm": 1.9193495512008667,
"learning_rate": 4.759595310743536e-05,
"loss": 1.4915,
"step": 2000
},
{
"epoch": 0.1896633475580844,
"eval_loss": 1.403477430343628,
"eval_runtime": 71.9579,
"eval_samples_per_second": 130.271,
"eval_steps_per_second": 16.287,
"step": 2000
},
{
"epoch": 0.19914651493598862,
"grad_norm": 1.8307377099990845,
"learning_rate": 4.743536213264815e-05,
"loss": 1.5009,
"step": 2100
},
{
"epoch": 0.20862968231389284,
"grad_norm": 1.7923104763031006,
"learning_rate": 4.727477115786093e-05,
"loss": 1.4968,
"step": 2200
},
{
"epoch": 0.21811284969179706,
"grad_norm": 1.925938367843628,
"learning_rate": 4.711418018307371e-05,
"loss": 1.4696,
"step": 2300
},
{
"epoch": 0.22759601706970128,
"grad_norm": 2.106110095977783,
"learning_rate": 4.69535892082865e-05,
"loss": 1.4853,
"step": 2400
},
{
"epoch": 0.2370791844476055,
"grad_norm": 2.345017433166504,
"learning_rate": 4.679299823349928e-05,
"loss": 1.4868,
"step": 2500
},
{
"epoch": 0.2370791844476055,
"eval_loss": 1.3772392272949219,
"eval_runtime": 72.0321,
"eval_samples_per_second": 130.136,
"eval_steps_per_second": 16.271,
"step": 2500
},
{
"epoch": 0.24656235182550973,
"grad_norm": 1.5003846883773804,
"learning_rate": 4.663240725871206e-05,
"loss": 1.4641,
"step": 2600
},
{
"epoch": 0.25604551920341395,
"grad_norm": 1.8472124338150024,
"learning_rate": 4.647181628392485e-05,
"loss": 1.4594,
"step": 2700
},
{
"epoch": 0.2655286865813182,
"grad_norm": 1.8818256855010986,
"learning_rate": 4.631122530913763e-05,
"loss": 1.4547,
"step": 2800
},
{
"epoch": 0.2750118539592224,
"grad_norm": 1.5926233530044556,
"learning_rate": 4.615063433435041e-05,
"loss": 1.4414,
"step": 2900
},
{
"epoch": 0.2844950213371266,
"grad_norm": 1.505327820777893,
"learning_rate": 4.59900433595632e-05,
"loss": 1.4165,
"step": 3000
},
{
"epoch": 0.2844950213371266,
"eval_loss": 1.3518378734588623,
"eval_runtime": 71.9886,
"eval_samples_per_second": 130.215,
"eval_steps_per_second": 16.28,
"step": 3000
},
{
"epoch": 0.29397818871503084,
"grad_norm": 1.77092707157135,
"learning_rate": 4.582945238477598e-05,
"loss": 1.4222,
"step": 3100
},
{
"epoch": 0.30346135609293506,
"grad_norm": 2.265411376953125,
"learning_rate": 4.566886140998876e-05,
"loss": 1.3973,
"step": 3200
},
{
"epoch": 0.3129445234708393,
"grad_norm": 1.4207345247268677,
"learning_rate": 4.550827043520154e-05,
"loss": 1.4423,
"step": 3300
},
{
"epoch": 0.32242769084874345,
"grad_norm": 1.72047758102417,
"learning_rate": 4.534767946041433e-05,
"loss": 1.3939,
"step": 3400
},
{
"epoch": 0.3319108582266477,
"grad_norm": 1.7695670127868652,
"learning_rate": 4.518708848562711e-05,
"loss": 1.3911,
"step": 3500
},
{
"epoch": 0.3319108582266477,
"eval_loss": 1.3347505331039429,
"eval_runtime": 72.0526,
"eval_samples_per_second": 130.099,
"eval_steps_per_second": 16.266,
"step": 3500
},
{
"epoch": 0.3413940256045519,
"grad_norm": 1.93614661693573,
"learning_rate": 4.502649751083989e-05,
"loss": 1.405,
"step": 3600
},
{
"epoch": 0.3508771929824561,
"grad_norm": 1.4412301778793335,
"learning_rate": 4.486590653605268e-05,
"loss": 1.421,
"step": 3700
},
{
"epoch": 0.36036036036036034,
"grad_norm": 1.5761134624481201,
"learning_rate": 4.470531556126546e-05,
"loss": 1.3758,
"step": 3800
},
{
"epoch": 0.36984352773826457,
"grad_norm": 1.7923239469528198,
"learning_rate": 4.454472458647824e-05,
"loss": 1.4087,
"step": 3900
},
{
"epoch": 0.3793266951161688,
"grad_norm": 2.2492587566375732,
"learning_rate": 4.438413361169103e-05,
"loss": 1.3797,
"step": 4000
},
{
"epoch": 0.3793266951161688,
"eval_loss": 1.3214360475540161,
"eval_runtime": 72.0741,
"eval_samples_per_second": 130.061,
"eval_steps_per_second": 16.261,
"step": 4000
},
{
"epoch": 0.388809862494073,
"grad_norm": 1.978060245513916,
"learning_rate": 4.422354263690381e-05,
"loss": 1.4024,
"step": 4100
},
{
"epoch": 0.39829302987197723,
"grad_norm": 1.7838459014892578,
"learning_rate": 4.406295166211659e-05,
"loss": 1.4047,
"step": 4200
},
{
"epoch": 0.40777619724988146,
"grad_norm": 1.682637333869934,
"learning_rate": 4.3902360687329377e-05,
"loss": 1.3709,
"step": 4300
},
{
"epoch": 0.4172593646277857,
"grad_norm": 1.5510674715042114,
"learning_rate": 4.374176971254216e-05,
"loss": 1.4175,
"step": 4400
},
{
"epoch": 0.4267425320056899,
"grad_norm": 1.7401492595672607,
"learning_rate": 4.358117873775494e-05,
"loss": 1.3801,
"step": 4500
},
{
"epoch": 0.4267425320056899,
"eval_loss": 1.3049076795578003,
"eval_runtime": 72.1294,
"eval_samples_per_second": 129.961,
"eval_steps_per_second": 16.249,
"step": 4500
},
{
"epoch": 0.4362256993835941,
"grad_norm": 1.6590989828109741,
"learning_rate": 4.3420587762967726e-05,
"loss": 1.3827,
"step": 4600
},
{
"epoch": 0.44570886676149835,
"grad_norm": 1.5440171957015991,
"learning_rate": 4.325999678818051e-05,
"loss": 1.3617,
"step": 4700
},
{
"epoch": 0.45519203413940257,
"grad_norm": 1.716539978981018,
"learning_rate": 4.309940581339329e-05,
"loss": 1.3463,
"step": 4800
},
{
"epoch": 0.4646752015173068,
"grad_norm": 1.3042521476745605,
"learning_rate": 4.2938814838606075e-05,
"loss": 1.3456,
"step": 4900
},
{
"epoch": 0.474158368895211,
"grad_norm": 1.3467687368392944,
"learning_rate": 4.2778223863818856e-05,
"loss": 1.3559,
"step": 5000
},
{
"epoch": 0.474158368895211,
"eval_loss": 1.2918757200241089,
"eval_runtime": 72.0072,
"eval_samples_per_second": 130.181,
"eval_steps_per_second": 16.276,
"step": 5000
},
{
"epoch": 0.48364153627311524,
"grad_norm": 1.3807010650634766,
"learning_rate": 4.261763288903164e-05,
"loss": 1.3507,
"step": 5100
},
{
"epoch": 0.49312470365101946,
"grad_norm": 1.3885177373886108,
"learning_rate": 4.2457041914244425e-05,
"loss": 1.3552,
"step": 5200
},
{
"epoch": 0.5026078710289237,
"grad_norm": 1.2807698249816895,
"learning_rate": 4.2296450939457205e-05,
"loss": 1.3642,
"step": 5300
},
{
"epoch": 0.5120910384068279,
"grad_norm": 1.4009428024291992,
"learning_rate": 4.2135859964669986e-05,
"loss": 1.3781,
"step": 5400
},
{
"epoch": 0.5215742057847321,
"grad_norm": 1.3763035535812378,
"learning_rate": 4.1975268989882774e-05,
"loss": 1.3717,
"step": 5500
},
{
"epoch": 0.5215742057847321,
"eval_loss": 1.280537724494934,
"eval_runtime": 72.1115,
"eval_samples_per_second": 129.993,
"eval_steps_per_second": 16.253,
"step": 5500
},
{
"epoch": 0.5310573731626363,
"grad_norm": 1.5511786937713623,
"learning_rate": 4.1814678015095555e-05,
"loss": 1.3502,
"step": 5600
},
{
"epoch": 0.5405405405405406,
"grad_norm": 1.4995437860488892,
"learning_rate": 4.1654087040308336e-05,
"loss": 1.3599,
"step": 5700
},
{
"epoch": 0.5500237079184448,
"grad_norm": 1.3496274948120117,
"learning_rate": 4.149349606552112e-05,
"loss": 1.3421,
"step": 5800
},
{
"epoch": 0.559506875296349,
"grad_norm": 1.3634631633758545,
"learning_rate": 4.1332905090733904e-05,
"loss": 1.3617,
"step": 5900
},
{
"epoch": 0.5689900426742532,
"grad_norm": 1.5579423904418945,
"learning_rate": 4.1172314115946685e-05,
"loss": 1.3604,
"step": 6000
},
{
"epoch": 0.5689900426742532,
"eval_loss": 1.2698478698730469,
"eval_runtime": 72.1231,
"eval_samples_per_second": 129.972,
"eval_steps_per_second": 16.25,
"step": 6000
},
{
"epoch": 0.5784732100521575,
"grad_norm": 1.380241870880127,
"learning_rate": 4.101332905090734e-05,
"loss": 1.3379,
"step": 6100
},
{
"epoch": 0.5879563774300617,
"grad_norm": 1.764551043510437,
"learning_rate": 4.085273807612012e-05,
"loss": 1.3208,
"step": 6200
},
{
"epoch": 0.5974395448079659,
"grad_norm": 1.627012848854065,
"learning_rate": 4.069214710133291e-05,
"loss": 1.3448,
"step": 6300
},
{
"epoch": 0.6069227121858701,
"grad_norm": 1.539115071296692,
"learning_rate": 4.053155612654569e-05,
"loss": 1.3422,
"step": 6400
},
{
"epoch": 0.6164058795637744,
"grad_norm": 1.4698444604873657,
"learning_rate": 4.037257106150635e-05,
"loss": 1.3264,
"step": 6500
},
{
"epoch": 0.6164058795637744,
"eval_loss": 1.259299635887146,
"eval_runtime": 72.1176,
"eval_samples_per_second": 129.982,
"eval_steps_per_second": 16.251,
"step": 6500
},
{
"epoch": 0.6258890469416786,
"grad_norm": 1.8150815963745117,
"learning_rate": 4.021198008671913e-05,
"loss": 1.3262,
"step": 6600
},
{
"epoch": 0.6353722143195828,
"grad_norm": 1.4278889894485474,
"learning_rate": 4.005138911193191e-05,
"loss": 1.334,
"step": 6700
},
{
"epoch": 0.6448553816974869,
"grad_norm": 1.4713215827941895,
"learning_rate": 3.98907981371447e-05,
"loss": 1.2924,
"step": 6800
},
{
"epoch": 0.6543385490753911,
"grad_norm": 1.626541018486023,
"learning_rate": 3.9731813072105354e-05,
"loss": 1.3057,
"step": 6900
},
{
"epoch": 0.6638217164532954,
"grad_norm": 1.7835373878479004,
"learning_rate": 3.9571222097318134e-05,
"loss": 1.328,
"step": 7000
},
{
"epoch": 0.6638217164532954,
"eval_loss": 1.252388834953308,
"eval_runtime": 72.2427,
"eval_samples_per_second": 129.757,
"eval_steps_per_second": 16.223,
"step": 7000
},
{
"epoch": 0.6733048838311996,
"grad_norm": 1.8675563335418701,
"learning_rate": 3.9410631122530915e-05,
"loss": 1.322,
"step": 7100
},
{
"epoch": 0.6827880512091038,
"grad_norm": 1.5719430446624756,
"learning_rate": 3.92500401477437e-05,
"loss": 1.3464,
"step": 7200
},
{
"epoch": 0.692271218587008,
"grad_norm": 1.5038641691207886,
"learning_rate": 3.9089449172956484e-05,
"loss": 1.3315,
"step": 7300
},
{
"epoch": 0.7017543859649122,
"grad_norm": 1.777970314025879,
"learning_rate": 3.8928858198169265e-05,
"loss": 1.3549,
"step": 7400
},
{
"epoch": 0.7112375533428165,
"grad_norm": 1.8796472549438477,
"learning_rate": 3.8768267223382045e-05,
"loss": 1.2907,
"step": 7500
},
{
"epoch": 0.7112375533428165,
"eval_loss": 1.2450358867645264,
"eval_runtime": 72.1657,
"eval_samples_per_second": 129.895,
"eval_steps_per_second": 16.24,
"step": 7500
},
{
"epoch": 0.7207207207207207,
"grad_norm": 1.7477796077728271,
"learning_rate": 3.860767624859483e-05,
"loss": 1.3196,
"step": 7600
},
{
"epoch": 0.7302038880986249,
"grad_norm": 1.6598505973815918,
"learning_rate": 3.8447085273807614e-05,
"loss": 1.2799,
"step": 7700
},
{
"epoch": 0.7396870554765291,
"grad_norm": 1.7319283485412598,
"learning_rate": 3.8286494299020395e-05,
"loss": 1.3354,
"step": 7800
},
{
"epoch": 0.7491702228544334,
"grad_norm": 1.847347617149353,
"learning_rate": 3.812590332423318e-05,
"loss": 1.3034,
"step": 7900
},
{
"epoch": 0.7586533902323376,
"grad_norm": 1.6584995985031128,
"learning_rate": 3.796531234944596e-05,
"loss": 1.3092,
"step": 8000
},
{
"epoch": 0.7586533902323376,
"eval_loss": 1.2385543584823608,
"eval_runtime": 72.1594,
"eval_samples_per_second": 129.907,
"eval_steps_per_second": 16.242,
"step": 8000
},
{
"epoch": 0.7681365576102418,
"grad_norm": 1.581036925315857,
"learning_rate": 3.7804721374658744e-05,
"loss": 1.3064,
"step": 8100
},
{
"epoch": 0.777619724988146,
"grad_norm": 1.6824501752853394,
"learning_rate": 3.764413039987153e-05,
"loss": 1.3039,
"step": 8200
},
{
"epoch": 0.7871028923660502,
"grad_norm": 1.4804019927978516,
"learning_rate": 3.748353942508431e-05,
"loss": 1.2774,
"step": 8300
},
{
"epoch": 0.7965860597439545,
"grad_norm": 1.5401322841644287,
"learning_rate": 3.732294845029709e-05,
"loss": 1.3042,
"step": 8400
},
{
"epoch": 0.8060692271218587,
"grad_norm": 1.9226937294006348,
"learning_rate": 3.716235747550988e-05,
"loss": 1.3186,
"step": 8500
},
{
"epoch": 0.8060692271218587,
"eval_loss": 1.2315117120742798,
"eval_runtime": 72.0639,
"eval_samples_per_second": 130.079,
"eval_steps_per_second": 16.263,
"step": 8500
},
{
"epoch": 0.8155523944997629,
"grad_norm": 1.3993178606033325,
"learning_rate": 3.700176650072266e-05,
"loss": 1.3074,
"step": 8600
},
{
"epoch": 0.8250355618776671,
"grad_norm": 1.6044120788574219,
"learning_rate": 3.684117552593544e-05,
"loss": 1.2681,
"step": 8700
},
{
"epoch": 0.8345187292555714,
"grad_norm": 1.6285070180892944,
"learning_rate": 3.668058455114823e-05,
"loss": 1.3198,
"step": 8800
},
{
"epoch": 0.8440018966334756,
"grad_norm": 2.002086639404297,
"learning_rate": 3.651999357636101e-05,
"loss": 1.3227,
"step": 8900
},
{
"epoch": 0.8534850640113798,
"grad_norm": 1.5941271781921387,
"learning_rate": 3.635940260157379e-05,
"loss": 1.2914,
"step": 9000
},
{
"epoch": 0.8534850640113798,
"eval_loss": 1.2264697551727295,
"eval_runtime": 72.0482,
"eval_samples_per_second": 130.107,
"eval_steps_per_second": 16.267,
"step": 9000
},
{
"epoch": 0.862968231389284,
"grad_norm": 1.5721193552017212,
"learning_rate": 3.619881162678658e-05,
"loss": 1.3268,
"step": 9100
},
{
"epoch": 0.8724513987671882,
"grad_norm": 1.7066916227340698,
"learning_rate": 3.603822065199936e-05,
"loss": 1.2845,
"step": 9200
},
{
"epoch": 0.8819345661450925,
"grad_norm": 1.5683172941207886,
"learning_rate": 3.587762967721214e-05,
"loss": 1.2779,
"step": 9300
},
{
"epoch": 0.8914177335229967,
"grad_norm": 1.7200586795806885,
"learning_rate": 3.571703870242493e-05,
"loss": 1.3161,
"step": 9400
},
{
"epoch": 0.9009009009009009,
"grad_norm": 1.4963386058807373,
"learning_rate": 3.555644772763771e-05,
"loss": 1.2668,
"step": 9500
},
{
"epoch": 0.9009009009009009,
"eval_loss": 1.2190866470336914,
"eval_runtime": 72.0991,
"eval_samples_per_second": 130.015,
"eval_steps_per_second": 16.255,
"step": 9500
},
{
"epoch": 0.9103840682788051,
"grad_norm": 1.5414083003997803,
"learning_rate": 3.539585675285049e-05,
"loss": 1.3185,
"step": 9600
},
{
"epoch": 0.9198672356567094,
"grad_norm": 1.46302330493927,
"learning_rate": 3.523526577806328e-05,
"loss": 1.2485,
"step": 9700
},
{
"epoch": 0.9293504030346136,
"grad_norm": 1.4815856218338013,
"learning_rate": 3.507467480327606e-05,
"loss": 1.2912,
"step": 9800
},
{
"epoch": 0.9388335704125178,
"grad_norm": 1.5166754722595215,
"learning_rate": 3.491408382848884e-05,
"loss": 1.2722,
"step": 9900
},
{
"epoch": 0.948316737790422,
"grad_norm": 1.9628846645355225,
"learning_rate": 3.475349285370163e-05,
"loss": 1.2538,
"step": 10000
},
{
"epoch": 0.948316737790422,
"eval_loss": 1.2150416374206543,
"eval_runtime": 72.1513,
"eval_samples_per_second": 129.921,
"eval_steps_per_second": 16.244,
"step": 10000
},
{
"epoch": 0.9577999051683262,
"grad_norm": 1.6791901588439941,
"learning_rate": 3.459290187891441e-05,
"loss": 1.2624,
"step": 10100
},
{
"epoch": 0.9672830725462305,
"grad_norm": 1.5026668310165405,
"learning_rate": 3.443231090412719e-05,
"loss": 1.2696,
"step": 10200
},
{
"epoch": 0.9767662399241347,
"grad_norm": 1.176558017730713,
"learning_rate": 3.427171992933998e-05,
"loss": 1.29,
"step": 10300
},
{
"epoch": 0.9862494073020389,
"grad_norm": 1.5698468685150146,
"learning_rate": 3.411112895455276e-05,
"loss": 1.2874,
"step": 10400
},
{
"epoch": 0.9957325746799431,
"grad_norm": 1.4970085620880127,
"learning_rate": 3.395053797976554e-05,
"loss": 1.2874,
"step": 10500
},
{
"epoch": 0.9957325746799431,
"eval_loss": 1.2110899686813354,
"eval_runtime": 72.0475,
"eval_samples_per_second": 130.109,
"eval_steps_per_second": 16.267,
"step": 10500
},
{
"epoch": 1.0052157420578474,
"grad_norm": 1.284839391708374,
"learning_rate": 3.3789947004978326e-05,
"loss": 1.2793,
"step": 10600
},
{
"epoch": 1.0146989094357515,
"grad_norm": 1.680851697921753,
"learning_rate": 3.362935603019111e-05,
"loss": 1.2487,
"step": 10700
},
{
"epoch": 1.0241820768136558,
"grad_norm": 1.659610629081726,
"learning_rate": 3.346876505540389e-05,
"loss": 1.2454,
"step": 10800
},
{
"epoch": 1.03366524419156,
"grad_norm": 1.6641312837600708,
"learning_rate": 3.330817408061667e-05,
"loss": 1.2323,
"step": 10900
},
{
"epoch": 1.0431484115694643,
"grad_norm": 1.481063723564148,
"learning_rate": 3.3147583105829456e-05,
"loss": 1.2646,
"step": 11000
},
{
"epoch": 1.0431484115694643,
"eval_loss": 1.2060637474060059,
"eval_runtime": 71.9819,
"eval_samples_per_second": 130.227,
"eval_steps_per_second": 16.282,
"step": 11000
},
{
"epoch": 1.0526315789473684,
"grad_norm": 1.699491024017334,
"learning_rate": 3.298699213104224e-05,
"loss": 1.2828,
"step": 11100
},
{
"epoch": 1.0621147463252727,
"grad_norm": 2.0708415508270264,
"learning_rate": 3.282640115625502e-05,
"loss": 1.2648,
"step": 11200
},
{
"epoch": 1.0715979137031768,
"grad_norm": 1.4921772480010986,
"learning_rate": 3.266741609121567e-05,
"loss": 1.2611,
"step": 11300
},
{
"epoch": 1.0810810810810811,
"grad_norm": 1.744384765625,
"learning_rate": 3.250682511642846e-05,
"loss": 1.2435,
"step": 11400
},
{
"epoch": 1.0905642484589853,
"grad_norm": 1.1988921165466309,
"learning_rate": 3.234623414164124e-05,
"loss": 1.2525,
"step": 11500
},
{
"epoch": 1.0905642484589853,
"eval_loss": 1.2018728256225586,
"eval_runtime": 71.9385,
"eval_samples_per_second": 130.306,
"eval_steps_per_second": 16.292,
"step": 11500
},
{
"epoch": 1.1000474158368896,
"grad_norm": 1.5618336200714111,
"learning_rate": 3.218564316685402e-05,
"loss": 1.2387,
"step": 11600
},
{
"epoch": 1.1095305832147937,
"grad_norm": 1.512651801109314,
"learning_rate": 3.202505219206681e-05,
"loss": 1.2507,
"step": 11700
},
{
"epoch": 1.119013750592698,
"grad_norm": 2.1945042610168457,
"learning_rate": 3.186446121727959e-05,
"loss": 1.2316,
"step": 11800
},
{
"epoch": 1.1284969179706021,
"grad_norm": 1.3046265840530396,
"learning_rate": 3.170387024249237e-05,
"loss": 1.2352,
"step": 11900
},
{
"epoch": 1.1379800853485065,
"grad_norm": 1.5922869443893433,
"learning_rate": 3.154327926770516e-05,
"loss": 1.2361,
"step": 12000
},
{
"epoch": 1.1379800853485065,
"eval_loss": 1.1982355117797852,
"eval_runtime": 72.0166,
"eval_samples_per_second": 130.164,
"eval_steps_per_second": 16.274,
"step": 12000
},
{
"epoch": 1.1474632527264106,
"grad_norm": 1.2342475652694702,
"learning_rate": 3.138268829291794e-05,
"loss": 1.2318,
"step": 12100
},
{
"epoch": 1.156946420104315,
"grad_norm": 1.630129337310791,
"learning_rate": 3.122209731813072e-05,
"loss": 1.2185,
"step": 12200
},
{
"epoch": 1.166429587482219,
"grad_norm": 1.4030356407165527,
"learning_rate": 3.106150634334351e-05,
"loss": 1.2635,
"step": 12300
},
{
"epoch": 1.1759127548601234,
"grad_norm": 1.372003436088562,
"learning_rate": 3.090091536855629e-05,
"loss": 1.2131,
"step": 12400
},
{
"epoch": 1.1853959222380275,
"grad_norm": 1.1380951404571533,
"learning_rate": 3.074032439376907e-05,
"loss": 1.2553,
"step": 12500
},
{
"epoch": 1.1853959222380275,
"eval_loss": 1.1942973136901855,
"eval_runtime": 71.9892,
"eval_samples_per_second": 130.214,
"eval_steps_per_second": 16.28,
"step": 12500
},
{
"epoch": 1.1948790896159318,
"grad_norm": 1.8760716915130615,
"learning_rate": 3.057973341898186e-05,
"loss": 1.2479,
"step": 12600
},
{
"epoch": 1.204362256993836,
"grad_norm": 1.7070045471191406,
"learning_rate": 3.0419142444194638e-05,
"loss": 1.2283,
"step": 12700
},
{
"epoch": 1.2138454243717403,
"grad_norm": 1.6677838563919067,
"learning_rate": 3.025855146940742e-05,
"loss": 1.2527,
"step": 12800
},
{
"epoch": 1.2233285917496444,
"grad_norm": 1.5015747547149658,
"learning_rate": 3.0097960494620203e-05,
"loss": 1.2402,
"step": 12900
},
{
"epoch": 1.2328117591275487,
"grad_norm": 1.613587737083435,
"learning_rate": 2.9937369519832987e-05,
"loss": 1.2288,
"step": 13000
},
{
"epoch": 1.2328117591275487,
"eval_loss": 1.1904593706130981,
"eval_runtime": 72.0827,
"eval_samples_per_second": 130.045,
"eval_steps_per_second": 16.259,
"step": 13000
},
{
"epoch": 1.2422949265054528,
"grad_norm": 1.7170720100402832,
"learning_rate": 2.9776778545045768e-05,
"loss": 1.2199,
"step": 13100
},
{
"epoch": 1.251778093883357,
"grad_norm": 1.3260998725891113,
"learning_rate": 2.9616187570258552e-05,
"loss": 1.2575,
"step": 13200
},
{
"epoch": 1.2612612612612613,
"grad_norm": 1.450626254081726,
"learning_rate": 2.9455596595471337e-05,
"loss": 1.2267,
"step": 13300
},
{
"epoch": 1.2707444286391656,
"grad_norm": 1.51180899143219,
"learning_rate": 2.9295005620684118e-05,
"loss": 1.2546,
"step": 13400
},
{
"epoch": 1.2802275960170697,
"grad_norm": 1.846704125404358,
"learning_rate": 2.9134414645896902e-05,
"loss": 1.2216,
"step": 13500
},
{
"epoch": 1.2802275960170697,
"eval_loss": 1.1853208541870117,
"eval_runtime": 72.0024,
"eval_samples_per_second": 130.19,
"eval_steps_per_second": 16.277,
"step": 13500
},
{
"epoch": 1.2897107633949738,
"grad_norm": 1.5088779926300049,
"learning_rate": 2.8973823671109686e-05,
"loss": 1.2028,
"step": 13600
},
{
"epoch": 1.2991939307728781,
"grad_norm": 1.2047330141067505,
"learning_rate": 2.8813232696322467e-05,
"loss": 1.2326,
"step": 13700
},
{
"epoch": 1.3086770981507825,
"grad_norm": 1.6895666122436523,
"learning_rate": 2.865264172153525e-05,
"loss": 1.2032,
"step": 13800
},
{
"epoch": 1.3181602655286866,
"grad_norm": 1.3885574340820312,
"learning_rate": 2.8492050746748032e-05,
"loss": 1.2438,
"step": 13900
},
{
"epoch": 1.3276434329065907,
"grad_norm": 1.5129587650299072,
"learning_rate": 2.8331459771960816e-05,
"loss": 1.2099,
"step": 14000
},
{
"epoch": 1.3276434329065907,
"eval_loss": 1.1841365098953247,
"eval_runtime": 72.0289,
"eval_samples_per_second": 130.142,
"eval_steps_per_second": 16.271,
"step": 14000
},
{
"epoch": 1.337126600284495,
"grad_norm": 1.5244189500808716,
"learning_rate": 2.81708687971736e-05,
"loss": 1.2528,
"step": 14100
},
{
"epoch": 1.3466097676623994,
"grad_norm": 1.6656090021133423,
"learning_rate": 2.801027782238638e-05,
"loss": 1.2437,
"step": 14200
},
{
"epoch": 1.3560929350403035,
"grad_norm": 1.6365015506744385,
"learning_rate": 2.7849686847599165e-05,
"loss": 1.2481,
"step": 14300
},
{
"epoch": 1.3655761024182076,
"grad_norm": 1.729038953781128,
"learning_rate": 2.768909587281195e-05,
"loss": 1.2363,
"step": 14400
},
{
"epoch": 1.375059269796112,
"grad_norm": 1.663041114807129,
"learning_rate": 2.752850489802473e-05,
"loss": 1.2371,
"step": 14500
},
{
"epoch": 1.375059269796112,
"eval_loss": 1.1793495416641235,
"eval_runtime": 72.0339,
"eval_samples_per_second": 130.133,
"eval_steps_per_second": 16.27,
"step": 14500
},
{
"epoch": 1.384542437174016,
"grad_norm": 1.5626816749572754,
"learning_rate": 2.7367913923237515e-05,
"loss": 1.2287,
"step": 14600
},
{
"epoch": 1.3940256045519204,
"grad_norm": 1.2476764917373657,
"learning_rate": 2.72073229484503e-05,
"loss": 1.2129,
"step": 14700
},
{
"epoch": 1.4035087719298245,
"grad_norm": 1.4796671867370605,
"learning_rate": 2.704673197366308e-05,
"loss": 1.2143,
"step": 14800
},
{
"epoch": 1.4129919393077288,
"grad_norm": 1.8260607719421387,
"learning_rate": 2.6886140998875864e-05,
"loss": 1.2411,
"step": 14900
},
{
"epoch": 1.422475106685633,
"grad_norm": 1.6393589973449707,
"learning_rate": 2.6725550024088648e-05,
"loss": 1.2128,
"step": 15000
},
{
"epoch": 1.422475106685633,
"eval_loss": 1.1766639947891235,
"eval_runtime": 72.0436,
"eval_samples_per_second": 130.116,
"eval_steps_per_second": 16.268,
"step": 15000
},
{
"epoch": 1.4319582740635373,
"grad_norm": 1.2327754497528076,
"learning_rate": 2.656495904930143e-05,
"loss": 1.2218,
"step": 15100
},
{
"epoch": 1.4414414414414414,
"grad_norm": 1.4845291376113892,
"learning_rate": 2.6405973984262084e-05,
"loss": 1.2158,
"step": 15200
},
{
"epoch": 1.4509246088193457,
"grad_norm": 1.5115349292755127,
"learning_rate": 2.6245383009474868e-05,
"loss": 1.2597,
"step": 15300
},
{
"epoch": 1.4604077761972498,
"grad_norm": 1.2558484077453613,
"learning_rate": 2.608479203468765e-05,
"loss": 1.2293,
"step": 15400
},
{
"epoch": 1.4698909435751542,
"grad_norm": 1.412372350692749,
"learning_rate": 2.5924201059900433e-05,
"loss": 1.2078,
"step": 15500
},
{
"epoch": 1.4698909435751542,
"eval_loss": 1.175757646560669,
"eval_runtime": 72.1719,
"eval_samples_per_second": 129.884,
"eval_steps_per_second": 16.239,
"step": 15500
},
{
"epoch": 1.4793741109530583,
"grad_norm": 1.1586443185806274,
"learning_rate": 2.5763610085113217e-05,
"loss": 1.2167,
"step": 15600
},
{
"epoch": 1.4888572783309626,
"grad_norm": 1.535499095916748,
"learning_rate": 2.5603019110325998e-05,
"loss": 1.2177,
"step": 15700
},
{
"epoch": 1.4983404457088667,
"grad_norm": 1.3925201892852783,
"learning_rate": 2.5442428135538782e-05,
"loss": 1.2089,
"step": 15800
},
{
"epoch": 1.5078236130867708,
"grad_norm": 1.239797592163086,
"learning_rate": 2.5281837160751563e-05,
"loss": 1.2183,
"step": 15900
},
{
"epoch": 1.5173067804646752,
"grad_norm": 1.4727925062179565,
"learning_rate": 2.5121246185964347e-05,
"loss": 1.2382,
"step": 16000
},
{
"epoch": 1.5173067804646752,
"eval_loss": 1.1705734729766846,
"eval_runtime": 72.2315,
"eval_samples_per_second": 129.777,
"eval_steps_per_second": 16.226,
"step": 16000
},
{
"epoch": 1.5267899478425795,
"grad_norm": 1.9122114181518555,
"learning_rate": 2.4960655211177135e-05,
"loss": 1.2062,
"step": 16100
},
{
"epoch": 1.5362731152204836,
"grad_norm": 1.705417275428772,
"learning_rate": 2.4800064236389916e-05,
"loss": 1.2002,
"step": 16200
},
{
"epoch": 1.5457562825983877,
"grad_norm": 1.4141908884048462,
"learning_rate": 2.46394732616027e-05,
"loss": 1.2323,
"step": 16300
},
{
"epoch": 1.555239449976292,
"grad_norm": 2.050583839416504,
"learning_rate": 2.4478882286815484e-05,
"loss": 1.2145,
"step": 16400
},
{
"epoch": 1.5647226173541964,
"grad_norm": 1.495006799697876,
"learning_rate": 2.4318291312028265e-05,
"loss": 1.2041,
"step": 16500
},
{
"epoch": 1.5647226173541964,
"eval_loss": 1.1694616079330444,
"eval_runtime": 71.9712,
"eval_samples_per_second": 130.247,
"eval_steps_per_second": 16.284,
"step": 16500
},
{
"epoch": 1.5742057847321005,
"grad_norm": 1.4379011392593384,
"learning_rate": 2.415770033724105e-05,
"loss": 1.2045,
"step": 16600
},
{
"epoch": 1.5836889521100046,
"grad_norm": 1.6558938026428223,
"learning_rate": 2.399710936245383e-05,
"loss": 1.2234,
"step": 16700
},
{
"epoch": 1.593172119487909,
"grad_norm": 1.6931570768356323,
"learning_rate": 2.3836518387666614e-05,
"loss": 1.2061,
"step": 16800
},
{
"epoch": 1.6026552868658133,
"grad_norm": 1.445521593093872,
"learning_rate": 2.36759274128794e-05,
"loss": 1.2243,
"step": 16900
},
{
"epoch": 1.6121384542437174,
"grad_norm": 1.4067689180374146,
"learning_rate": 2.351533643809218e-05,
"loss": 1.2154,
"step": 17000
},
{
"epoch": 1.6121384542437174,
"eval_loss": 1.1659753322601318,
"eval_runtime": 72.1888,
"eval_samples_per_second": 129.854,
"eval_steps_per_second": 16.235,
"step": 17000
},
{
"epoch": 1.6216216216216215,
"grad_norm": 1.0550585985183716,
"learning_rate": 2.3354745463304964e-05,
"loss": 1.2333,
"step": 17100
},
{
"epoch": 1.6311047889995258,
"grad_norm": 1.5547784566879272,
"learning_rate": 2.3194154488517748e-05,
"loss": 1.2088,
"step": 17200
},
{
"epoch": 1.6405879563774302,
"grad_norm": 2.006110191345215,
"learning_rate": 2.303356351373053e-05,
"loss": 1.1881,
"step": 17300
},
{
"epoch": 1.6500711237553343,
"grad_norm": 1.6522830724716187,
"learning_rate": 2.2872972538943313e-05,
"loss": 1.2158,
"step": 17400
},
{
"epoch": 1.6595542911332384,
"grad_norm": 1.2928231954574585,
"learning_rate": 2.2712381564156097e-05,
"loss": 1.2303,
"step": 17500
},
{
"epoch": 1.6595542911332384,
"eval_loss": 1.1643718481063843,
"eval_runtime": 72.2381,
"eval_samples_per_second": 129.765,
"eval_steps_per_second": 16.224,
"step": 17500
},
{
"epoch": 1.6690374585111427,
"grad_norm": 1.38106107711792,
"learning_rate": 2.2551790589368878e-05,
"loss": 1.1969,
"step": 17600
},
{
"epoch": 1.678520625889047,
"grad_norm": 1.3726710081100464,
"learning_rate": 2.2391199614581662e-05,
"loss": 1.2122,
"step": 17700
},
{
"epoch": 1.6880037932669512,
"grad_norm": 1.2017816305160522,
"learning_rate": 2.2230608639794447e-05,
"loss": 1.2331,
"step": 17800
},
{
"epoch": 1.6974869606448553,
"grad_norm": 1.329315423965454,
"learning_rate": 2.2070017665007227e-05,
"loss": 1.2339,
"step": 17900
},
{
"epoch": 1.7069701280227596,
"grad_norm": 1.5352445840835571,
"learning_rate": 2.190942669022001e-05,
"loss": 1.2429,
"step": 18000
},
{
"epoch": 1.7069701280227596,
"eval_loss": 1.1619985103607178,
"eval_runtime": 72.1286,
"eval_samples_per_second": 129.962,
"eval_steps_per_second": 16.249,
"step": 18000
},
{
"epoch": 1.716453295400664,
"grad_norm": 1.5836015939712524,
"learning_rate": 2.1748835715432796e-05,
"loss": 1.1925,
"step": 18100
},
{
"epoch": 1.725936462778568,
"grad_norm": 1.7755178213119507,
"learning_rate": 2.1588244740645577e-05,
"loss": 1.2146,
"step": 18200
},
{
"epoch": 1.7354196301564722,
"grad_norm": 1.3868217468261719,
"learning_rate": 2.142765376585836e-05,
"loss": 1.2082,
"step": 18300
},
{
"epoch": 1.7449027975343765,
"grad_norm": 1.320333480834961,
"learning_rate": 2.1267062791071142e-05,
"loss": 1.213,
"step": 18400
},
{
"epoch": 1.7543859649122808,
"grad_norm": 1.5032850503921509,
"learning_rate": 2.1106471816283926e-05,
"loss": 1.2048,
"step": 18500
},
{
"epoch": 1.7543859649122808,
"eval_loss": 1.1578137874603271,
"eval_runtime": 72.0841,
"eval_samples_per_second": 130.043,
"eval_steps_per_second": 16.259,
"step": 18500
},
{
"epoch": 1.763869132290185,
"grad_norm": 1.5423904657363892,
"learning_rate": 2.094588084149671e-05,
"loss": 1.2282,
"step": 18600
},
{
"epoch": 1.773352299668089,
"grad_norm": 1.439765453338623,
"learning_rate": 2.078528986670949e-05,
"loss": 1.2171,
"step": 18700
},
{
"epoch": 1.7828354670459934,
"grad_norm": 1.573088526725769,
"learning_rate": 2.0624698891922275e-05,
"loss": 1.2149,
"step": 18800
},
{
"epoch": 1.7923186344238977,
"grad_norm": 1.4882514476776123,
"learning_rate": 2.046410791713506e-05,
"loss": 1.2278,
"step": 18900
},
{
"epoch": 1.8018018018018018,
"grad_norm": 1.9028195142745972,
"learning_rate": 2.030351694234784e-05,
"loss": 1.2247,
"step": 19000
},
{
"epoch": 1.8018018018018018,
"eval_loss": 1.157362937927246,
"eval_runtime": 72.1036,
"eval_samples_per_second": 130.007,
"eval_steps_per_second": 16.254,
"step": 19000
},
{
"epoch": 1.811284969179706,
"grad_norm": 1.289600133895874,
"learning_rate": 2.0142925967560625e-05,
"loss": 1.215,
"step": 19100
},
{
"epoch": 1.8207681365576103,
"grad_norm": 1.4183131456375122,
"learning_rate": 1.998233499277341e-05,
"loss": 1.2284,
"step": 19200
},
{
"epoch": 1.8302513039355146,
"grad_norm": 1.235146403312683,
"learning_rate": 1.982174401798619e-05,
"loss": 1.2067,
"step": 19300
},
{
"epoch": 1.8397344713134187,
"grad_norm": 1.486122488975525,
"learning_rate": 1.9661153043198974e-05,
"loss": 1.183,
"step": 19400
},
{
"epoch": 1.8492176386913228,
"grad_norm": 1.4615782499313354,
"learning_rate": 1.9500562068411758e-05,
"loss": 1.1847,
"step": 19500
},
{
"epoch": 1.8492176386913228,
"eval_loss": 1.1544617414474487,
"eval_runtime": 72.1411,
"eval_samples_per_second": 129.94,
"eval_steps_per_second": 16.246,
"step": 19500
},
{
"epoch": 1.8587008060692272,
"grad_norm": 1.3062597513198853,
"learning_rate": 1.933997109362454e-05,
"loss": 1.1998,
"step": 19600
},
{
"epoch": 1.8681839734471315,
"grad_norm": 1.7676483392715454,
"learning_rate": 1.9180986028585193e-05,
"loss": 1.1985,
"step": 19700
},
{
"epoch": 1.8776671408250356,
"grad_norm": 1.55678129196167,
"learning_rate": 1.9020395053797978e-05,
"loss": 1.2155,
"step": 19800
},
{
"epoch": 1.8871503082029397,
"grad_norm": 1.2260453701019287,
"learning_rate": 1.885980407901076e-05,
"loss": 1.2282,
"step": 19900
},
{
"epoch": 1.896633475580844,
"grad_norm": 1.6828114986419678,
"learning_rate": 1.8699213104223543e-05,
"loss": 1.2183,
"step": 20000
},
{
"epoch": 1.896633475580844,
"eval_loss": 1.1521168947219849,
"eval_runtime": 72.1018,
"eval_samples_per_second": 130.011,
"eval_steps_per_second": 16.255,
"step": 20000
},
{
"epoch": 1.9061166429587484,
"grad_norm": 1.6691786050796509,
"learning_rate": 1.8538622129436327e-05,
"loss": 1.1651,
"step": 20100
},
{
"epoch": 1.9155998103366523,
"grad_norm": 1.4728951454162598,
"learning_rate": 1.8378031154649108e-05,
"loss": 1.2022,
"step": 20200
},
{
"epoch": 1.9250829777145566,
"grad_norm": 1.6341995000839233,
"learning_rate": 1.8217440179861892e-05,
"loss": 1.1777,
"step": 20300
},
{
"epoch": 1.934566145092461,
"grad_norm": 1.4492669105529785,
"learning_rate": 1.8056849205074676e-05,
"loss": 1.2081,
"step": 20400
},
{
"epoch": 1.944049312470365,
"grad_norm": 1.6642097234725952,
"learning_rate": 1.7896258230287457e-05,
"loss": 1.1848,
"step": 20500
},
{
"epoch": 1.944049312470365,
"eval_loss": 1.150140404701233,
"eval_runtime": 72.0779,
"eval_samples_per_second": 130.054,
"eval_steps_per_second": 16.26,
"step": 20500
},
{
"epoch": 1.9535324798482692,
"grad_norm": 1.8986822366714478,
"learning_rate": 1.773566725550024e-05,
"loss": 1.2223,
"step": 20600
},
{
"epoch": 1.9630156472261735,
"grad_norm": 1.390931248664856,
"learning_rate": 1.7575076280713022e-05,
"loss": 1.2068,
"step": 20700
},
{
"epoch": 1.9724988146040778,
"grad_norm": 1.3856289386749268,
"learning_rate": 1.7414485305925806e-05,
"loss": 1.1828,
"step": 20800
},
{
"epoch": 1.981981981981982,
"grad_norm": 1.2241305112838745,
"learning_rate": 1.725389433113859e-05,
"loss": 1.1938,
"step": 20900
},
{
"epoch": 1.991465149359886,
"grad_norm": 1.5855077505111694,
"learning_rate": 1.709330335635137e-05,
"loss": 1.206,
"step": 21000
},
{
"epoch": 1.991465149359886,
"eval_loss": 1.1497843265533447,
"eval_runtime": 72.1674,
"eval_samples_per_second": 129.893,
"eval_steps_per_second": 16.24,
"step": 21000
},
{
"epoch": 2.0009483167377904,
"grad_norm": 2.0832741260528564,
"learning_rate": 1.6932712381564156e-05,
"loss": 1.1805,
"step": 21100
},
{
"epoch": 2.0104314841156947,
"grad_norm": 1.893350601196289,
"learning_rate": 1.677212140677694e-05,
"loss": 1.1757,
"step": 21200
},
{
"epoch": 2.019914651493599,
"grad_norm": 1.346118688583374,
"learning_rate": 1.661153043198972e-05,
"loss": 1.1938,
"step": 21300
},
{
"epoch": 2.029397818871503,
"grad_norm": 1.658034086227417,
"learning_rate": 1.6450939457202505e-05,
"loss": 1.1773,
"step": 21400
},
{
"epoch": 2.0388809862494073,
"grad_norm": 1.4759783744812012,
"learning_rate": 1.629034848241529e-05,
"loss": 1.1735,
"step": 21500
},
{
"epoch": 2.0388809862494073,
"eval_loss": 1.1474945545196533,
"eval_runtime": 71.9179,
"eval_samples_per_second": 130.343,
"eval_steps_per_second": 16.296,
"step": 21500
},
{
"epoch": 2.0483641536273116,
"grad_norm": 1.2887206077575684,
"learning_rate": 1.612975750762807e-05,
"loss": 1.1701,
"step": 21600
},
{
"epoch": 2.057847321005216,
"grad_norm": 1.552646279335022,
"learning_rate": 1.5969166532840854e-05,
"loss": 1.1734,
"step": 21700
},
{
"epoch": 2.06733048838312,
"grad_norm": 1.6683566570281982,
"learning_rate": 1.581018146780151e-05,
"loss": 1.1883,
"step": 21800
},
{
"epoch": 2.076813655761024,
"grad_norm": 1.4613324403762817,
"learning_rate": 1.5649590493014293e-05,
"loss": 1.1845,
"step": 21900
},
{
"epoch": 2.0862968231389285,
"grad_norm": 1.5622040033340454,
"learning_rate": 1.5488999518227077e-05,
"loss": 1.1584,
"step": 22000
},
{
"epoch": 2.0862968231389285,
"eval_loss": 1.1467849016189575,
"eval_runtime": 72.0497,
"eval_samples_per_second": 130.105,
"eval_steps_per_second": 16.267,
"step": 22000
},
{
"epoch": 2.095779990516833,
"grad_norm": 1.721030831336975,
"learning_rate": 1.5328408543439858e-05,
"loss": 1.2018,
"step": 22100
},
{
"epoch": 2.1052631578947367,
"grad_norm": 1.3872593641281128,
"learning_rate": 1.5167817568652642e-05,
"loss": 1.1659,
"step": 22200
},
{
"epoch": 2.114746325272641,
"grad_norm": 1.655704140663147,
"learning_rate": 1.5007226593865425e-05,
"loss": 1.1503,
"step": 22300
},
{
"epoch": 2.1242294926505454,
"grad_norm": 1.5672900676727295,
"learning_rate": 1.4848241528826081e-05,
"loss": 1.1879,
"step": 22400
},
{
"epoch": 2.1337126600284497,
"grad_norm": 1.6815894842147827,
"learning_rate": 1.4687650554038865e-05,
"loss": 1.1719,
"step": 22500
},
{
"epoch": 2.1337126600284497,
"eval_loss": 1.1450951099395752,
"eval_runtime": 72.1598,
"eval_samples_per_second": 129.906,
"eval_steps_per_second": 16.242,
"step": 22500
},
{
"epoch": 2.1431958274063536,
"grad_norm": 1.040648102760315,
"learning_rate": 1.4527059579251648e-05,
"loss": 1.1629,
"step": 22600
},
{
"epoch": 2.152678994784258,
"grad_norm": 1.5001453161239624,
"learning_rate": 1.436646860446443e-05,
"loss": 1.1796,
"step": 22700
},
{
"epoch": 2.1621621621621623,
"grad_norm": 1.7325968742370605,
"learning_rate": 1.4205877629677215e-05,
"loss": 1.1757,
"step": 22800
},
{
"epoch": 2.171645329540066,
"grad_norm": 1.7485188245773315,
"learning_rate": 1.4045286654889997e-05,
"loss": 1.1485,
"step": 22900
},
{
"epoch": 2.1811284969179705,
"grad_norm": 1.4972156286239624,
"learning_rate": 1.388469568010278e-05,
"loss": 1.1667,
"step": 23000
},
{
"epoch": 2.1811284969179705,
"eval_loss": 1.144049048423767,
"eval_runtime": 72.1218,
"eval_samples_per_second": 129.975,
"eval_steps_per_second": 16.25,
"step": 23000
},
{
"epoch": 2.190611664295875,
"grad_norm": 1.2919082641601562,
"learning_rate": 1.3724104705315564e-05,
"loss": 1.1764,
"step": 23100
},
{
"epoch": 2.200094831673779,
"grad_norm": 1.6442806720733643,
"learning_rate": 1.3563513730528346e-05,
"loss": 1.174,
"step": 23200
},
{
"epoch": 2.209577999051683,
"grad_norm": 1.480901837348938,
"learning_rate": 1.3402922755741129e-05,
"loss": 1.1666,
"step": 23300
},
{
"epoch": 2.2190611664295874,
"grad_norm": 1.6193006038665771,
"learning_rate": 1.3242331780953911e-05,
"loss": 1.1975,
"step": 23400
},
{
"epoch": 2.2285443338074917,
"grad_norm": 1.2970917224884033,
"learning_rate": 1.3081740806166696e-05,
"loss": 1.1579,
"step": 23500
},
{
"epoch": 2.2285443338074917,
"eval_loss": 1.1433159112930298,
"eval_runtime": 72.0832,
"eval_samples_per_second": 130.044,
"eval_steps_per_second": 16.259,
"step": 23500
},
{
"epoch": 2.238027501185396,
"grad_norm": 1.4054538011550903,
"learning_rate": 1.2921149831379478e-05,
"loss": 1.1779,
"step": 23600
},
{
"epoch": 2.2475106685633,
"grad_norm": 1.5161010026931763,
"learning_rate": 1.276055885659226e-05,
"loss": 1.1709,
"step": 23700
},
{
"epoch": 2.2569938359412043,
"grad_norm": 2.040818929672241,
"learning_rate": 1.2599967881805045e-05,
"loss": 1.1692,
"step": 23800
},
{
"epoch": 2.2664770033191086,
"grad_norm": 1.3812401294708252,
"learning_rate": 1.2439376907017826e-05,
"loss": 1.1733,
"step": 23900
},
{
"epoch": 2.275960170697013,
"grad_norm": 2.113886833190918,
"learning_rate": 1.2278785932230608e-05,
"loss": 1.1682,
"step": 24000
},
{
"epoch": 2.275960170697013,
"eval_loss": 1.1404303312301636,
"eval_runtime": 72.1649,
"eval_samples_per_second": 129.897,
"eval_steps_per_second": 16.241,
"step": 24000
},
{
"epoch": 2.285443338074917,
"grad_norm": 1.3256770372390747,
"learning_rate": 1.2118194957443393e-05,
"loss": 1.1847,
"step": 24100
},
{
"epoch": 2.294926505452821,
"grad_norm": 1.4699623584747314,
"learning_rate": 1.1957603982656175e-05,
"loss": 1.1576,
"step": 24200
},
{
"epoch": 2.3044096728307255,
"grad_norm": 1.5492583513259888,
"learning_rate": 1.1797013007868958e-05,
"loss": 1.1532,
"step": 24300
},
{
"epoch": 2.31389284020863,
"grad_norm": 1.409488558769226,
"learning_rate": 1.1636422033081742e-05,
"loss": 1.1626,
"step": 24400
},
{
"epoch": 2.3233760075865337,
"grad_norm": 1.642247200012207,
"learning_rate": 1.1475831058294524e-05,
"loss": 1.1943,
"step": 24500
},
{
"epoch": 2.3233760075865337,
"eval_loss": 1.139186978340149,
"eval_runtime": 72.1131,
"eval_samples_per_second": 129.99,
"eval_steps_per_second": 16.252,
"step": 24500
},
{
"epoch": 2.332859174964438,
"grad_norm": 1.4776501655578613,
"learning_rate": 1.1315240083507307e-05,
"loss": 1.1566,
"step": 24600
},
{
"epoch": 2.3423423423423424,
"grad_norm": 1.475188136100769,
"learning_rate": 1.115464910872009e-05,
"loss": 1.1743,
"step": 24700
},
{
"epoch": 2.3518255097202467,
"grad_norm": 1.48451828956604,
"learning_rate": 1.0994058133932874e-05,
"loss": 1.1539,
"step": 24800
},
{
"epoch": 2.3613086770981506,
"grad_norm": 1.4650864601135254,
"learning_rate": 1.0833467159145656e-05,
"loss": 1.2073,
"step": 24900
},
{
"epoch": 2.370791844476055,
"grad_norm": 1.71983003616333,
"learning_rate": 1.0672876184358439e-05,
"loss": 1.2021,
"step": 25000
},
{
"epoch": 2.370791844476055,
"eval_loss": 1.1377766132354736,
"eval_runtime": 71.9749,
"eval_samples_per_second": 130.24,
"eval_steps_per_second": 16.283,
"step": 25000
},
{
"epoch": 2.3802750118539593,
"grad_norm": 1.3838121891021729,
"learning_rate": 1.0512285209571223e-05,
"loss": 1.1791,
"step": 25100
},
{
"epoch": 2.3897581792318636,
"grad_norm": 1.8836325407028198,
"learning_rate": 1.0351694234784006e-05,
"loss": 1.1834,
"step": 25200
},
{
"epoch": 2.3992413466097675,
"grad_norm": 1.3679293394088745,
"learning_rate": 1.0191103259996788e-05,
"loss": 1.183,
"step": 25300
},
{
"epoch": 2.408724513987672,
"grad_norm": 1.5593743324279785,
"learning_rate": 1.003051228520957e-05,
"loss": 1.1703,
"step": 25400
},
{
"epoch": 2.418207681365576,
"grad_norm": 1.4257512092590332,
"learning_rate": 9.869921310422355e-06,
"loss": 1.172,
"step": 25500
},
{
"epoch": 2.418207681365576,
"eval_loss": 1.1378742456436157,
"eval_runtime": 72.0363,
"eval_samples_per_second": 130.129,
"eval_steps_per_second": 16.27,
"step": 25500
},
{
"epoch": 2.4276908487434805,
"grad_norm": 1.771941065788269,
"learning_rate": 9.709330335635137e-06,
"loss": 1.1676,
"step": 25600
},
{
"epoch": 2.4371740161213844,
"grad_norm": 1.7247157096862793,
"learning_rate": 9.54873936084792e-06,
"loss": 1.1753,
"step": 25700
},
{
"epoch": 2.4466571834992887,
"grad_norm": 1.5509614944458008,
"learning_rate": 9.388148386060704e-06,
"loss": 1.1705,
"step": 25800
},
{
"epoch": 2.456140350877193,
"grad_norm": 1.8205307722091675,
"learning_rate": 9.227557411273487e-06,
"loss": 1.1938,
"step": 25900
},
{
"epoch": 2.4656235182550974,
"grad_norm": 1.501631498336792,
"learning_rate": 9.06696643648627e-06,
"loss": 1.1737,
"step": 26000
},
{
"epoch": 2.4656235182550974,
"eval_loss": 1.1362242698669434,
"eval_runtime": 71.9608,
"eval_samples_per_second": 130.265,
"eval_steps_per_second": 16.287,
"step": 26000
},
{
"epoch": 2.4751066856330013,
"grad_norm": 1.4233213663101196,
"learning_rate": 8.906375461699054e-06,
"loss": 1.1728,
"step": 26100
},
{
"epoch": 2.4845898530109056,
"grad_norm": 1.597785472869873,
"learning_rate": 8.745784486911836e-06,
"loss": 1.1559,
"step": 26200
},
{
"epoch": 2.49407302038881,
"grad_norm": 1.2396786212921143,
"learning_rate": 8.585193512124619e-06,
"loss": 1.1645,
"step": 26300
},
{
"epoch": 2.503556187766714,
"grad_norm": 1.643211841583252,
"learning_rate": 8.424602537337401e-06,
"loss": 1.1948,
"step": 26400
},
{
"epoch": 2.513039355144618,
"grad_norm": 1.688436508178711,
"learning_rate": 8.264011562550185e-06,
"loss": 1.1875,
"step": 26500
},
{
"epoch": 2.513039355144618,
"eval_loss": 1.134669303894043,
"eval_runtime": 72.1082,
"eval_samples_per_second": 129.999,
"eval_steps_per_second": 16.253,
"step": 26500
},
{
"epoch": 2.5225225225225225,
"grad_norm": 1.6127384901046753,
"learning_rate": 8.103420587762968e-06,
"loss": 1.1657,
"step": 26600
},
{
"epoch": 2.532005689900427,
"grad_norm": 2.12892484664917,
"learning_rate": 7.944435522723622e-06,
"loss": 1.1636,
"step": 26700
},
{
"epoch": 2.541488857278331,
"grad_norm": 1.173686146736145,
"learning_rate": 7.783844547936407e-06,
"loss": 1.1866,
"step": 26800
},
{
"epoch": 2.550972024656235,
"grad_norm": 1.4527802467346191,
"learning_rate": 7.623253573149189e-06,
"loss": 1.1755,
"step": 26900
},
{
"epoch": 2.5604551920341394,
"grad_norm": 1.6228667497634888,
"learning_rate": 7.462662598361972e-06,
"loss": 1.1427,
"step": 27000
},
{
"epoch": 2.5604551920341394,
"eval_loss": 1.134996771812439,
"eval_runtime": 72.1853,
"eval_samples_per_second": 129.86,
"eval_steps_per_second": 16.236,
"step": 27000
},
{
"epoch": 2.5699383594120437,
"grad_norm": 1.5179518461227417,
"learning_rate": 7.302071623574755e-06,
"loss": 1.1496,
"step": 27100
},
{
"epoch": 2.5794215267899476,
"grad_norm": 1.2633978128433228,
"learning_rate": 7.141480648787538e-06,
"loss": 1.1633,
"step": 27200
},
{
"epoch": 2.588904694167852,
"grad_norm": 1.3050264120101929,
"learning_rate": 6.980889674000321e-06,
"loss": 1.1614,
"step": 27300
},
{
"epoch": 2.5983878615457563,
"grad_norm": 1.432268500328064,
"learning_rate": 6.820298699213104e-06,
"loss": 1.1684,
"step": 27400
},
{
"epoch": 2.6078710289236606,
"grad_norm": 1.6904171705245972,
"learning_rate": 6.659707724425887e-06,
"loss": 1.1673,
"step": 27500
},
{
"epoch": 2.6078710289236606,
"eval_loss": 1.1333271265029907,
"eval_runtime": 72.187,
"eval_samples_per_second": 129.857,
"eval_steps_per_second": 16.236,
"step": 27500
},
{
"epoch": 2.617354196301565,
"grad_norm": 1.2229042053222656,
"learning_rate": 6.49911674963867e-06,
"loss": 1.1793,
"step": 27600
},
{
"epoch": 2.626837363679469,
"grad_norm": 1.7409764528274536,
"learning_rate": 6.338525774851453e-06,
"loss": 1.1963,
"step": 27700
},
{
"epoch": 2.636320531057373,
"grad_norm": 1.4706058502197266,
"learning_rate": 6.177934800064237e-06,
"loss": 1.1836,
"step": 27800
},
{
"epoch": 2.6458036984352775,
"grad_norm": 1.3871138095855713,
"learning_rate": 6.01734382527702e-06,
"loss": 1.1669,
"step": 27900
},
{
"epoch": 2.6552868658131814,
"grad_norm": 1.5841022729873657,
"learning_rate": 5.856752850489803e-06,
"loss": 1.1765,
"step": 28000
},
{
"epoch": 2.6552868658131814,
"eval_loss": 1.1325418949127197,
"eval_runtime": 72.1699,
"eval_samples_per_second": 129.888,
"eval_steps_per_second": 16.239,
"step": 28000
},
{
"epoch": 2.6647700331910857,
"grad_norm": 1.2488940954208374,
"learning_rate": 5.696161875702586e-06,
"loss": 1.1581,
"step": 28100
},
{
"epoch": 2.67425320056899,
"grad_norm": 1.633123517036438,
"learning_rate": 5.535570900915369e-06,
"loss": 1.1829,
"step": 28200
},
{
"epoch": 2.6837363679468944,
"grad_norm": 1.558030366897583,
"learning_rate": 5.374979926128152e-06,
"loss": 1.1816,
"step": 28300
},
{
"epoch": 2.6932195353247987,
"grad_norm": 1.5178041458129883,
"learning_rate": 5.214388951340935e-06,
"loss": 1.1789,
"step": 28400
},
{
"epoch": 2.7027027027027026,
"grad_norm": 1.8317012786865234,
"learning_rate": 5.053797976553718e-06,
"loss": 1.1612,
"step": 28500
},
{
"epoch": 2.7027027027027026,
"eval_loss": 1.1320453882217407,
"eval_runtime": 72.3445,
"eval_samples_per_second": 129.575,
"eval_steps_per_second": 16.2,
"step": 28500
},
{
"epoch": 2.712185870080607,
"grad_norm": 1.4248275756835938,
"learning_rate": 4.893207001766502e-06,
"loss": 1.1583,
"step": 28600
},
{
"epoch": 2.7216690374585113,
"grad_norm": 1.3696835041046143,
"learning_rate": 4.732616026979284e-06,
"loss": 1.1302,
"step": 28700
},
{
"epoch": 2.731152204836415,
"grad_norm": 1.4212887287139893,
"learning_rate": 4.5720250521920675e-06,
"loss": 1.1396,
"step": 28800
},
{
"epoch": 2.7406353722143195,
"grad_norm": 1.6230417490005493,
"learning_rate": 4.41143407740485e-06,
"loss": 1.167,
"step": 28900
},
{
"epoch": 2.750118539592224,
"grad_norm": 1.4556254148483276,
"learning_rate": 4.252449012365505e-06,
"loss": 1.2229,
"step": 29000
},
{
"epoch": 2.750118539592224,
"eval_loss": 1.1307094097137451,
"eval_runtime": 72.2019,
"eval_samples_per_second": 129.83,
"eval_steps_per_second": 16.232,
"step": 29000
},
{
"epoch": 2.759601706970128,
"grad_norm": 1.399604082107544,
"learning_rate": 4.091858037578288e-06,
"loss": 1.183,
"step": 29100
},
{
"epoch": 2.769084874348032,
"grad_norm": 1.3562369346618652,
"learning_rate": 3.931267062791071e-06,
"loss": 1.1729,
"step": 29200
},
{
"epoch": 2.7785680417259364,
"grad_norm": 1.4427545070648193,
"learning_rate": 3.7706760880038542e-06,
"loss": 1.1636,
"step": 29300
},
{
"epoch": 2.7880512091038407,
"grad_norm": 1.6153539419174194,
"learning_rate": 3.610085113216637e-06,
"loss": 1.1608,
"step": 29400
},
{
"epoch": 2.797534376481745,
"grad_norm": 1.553841233253479,
"learning_rate": 3.44949413842942e-06,
"loss": 1.1727,
"step": 29500
},
{
"epoch": 2.797534376481745,
"eval_loss": 1.1305798292160034,
"eval_runtime": 72.4369,
"eval_samples_per_second": 129.409,
"eval_steps_per_second": 16.18,
"step": 29500
},
{
"epoch": 2.807017543859649,
"grad_norm": 1.4503796100616455,
"learning_rate": 3.2889031636422036e-06,
"loss": 1.1533,
"step": 29600
},
{
"epoch": 2.8165007112375533,
"grad_norm": 2.3234095573425293,
"learning_rate": 3.1283121888549865e-06,
"loss": 1.1849,
"step": 29700
},
{
"epoch": 2.8259838786154576,
"grad_norm": 1.6692347526550293,
"learning_rate": 2.9677212140677695e-06,
"loss": 1.1629,
"step": 29800
},
{
"epoch": 2.8354670459933615,
"grad_norm": 1.6683822870254517,
"learning_rate": 2.8071302392805524e-06,
"loss": 1.1584,
"step": 29900
},
{
"epoch": 2.844950213371266,
"grad_norm": 1.371102213859558,
"learning_rate": 2.6465392644933354e-06,
"loss": 1.1208,
"step": 30000
},
{
"epoch": 2.844950213371266,
"eval_loss": 1.1299171447753906,
"eval_runtime": 72.1885,
"eval_samples_per_second": 129.854,
"eval_steps_per_second": 16.235,
"step": 30000
},
{
"epoch": 2.85443338074917,
"grad_norm": 1.9285227060317993,
"learning_rate": 2.4859482897061184e-06,
"loss": 1.1871,
"step": 30100
},
{
"epoch": 2.8639165481270745,
"grad_norm": 1.5394768714904785,
"learning_rate": 2.3253573149189017e-06,
"loss": 1.1786,
"step": 30200
},
{
"epoch": 2.873399715504979,
"grad_norm": 1.606779932975769,
"learning_rate": 2.1647663401316847e-06,
"loss": 1.181,
"step": 30300
},
{
"epoch": 2.8828828828828827,
"grad_norm": 1.6637898683547974,
"learning_rate": 2.0041753653444677e-06,
"loss": 1.1435,
"step": 30400
},
{
"epoch": 2.892366050260787,
"grad_norm": 1.4190491437911987,
"learning_rate": 1.8435843905572506e-06,
"loss": 1.158,
"step": 30500
},
{
"epoch": 2.892366050260787,
"eval_loss": 1.129961371421814,
"eval_runtime": 72.2984,
"eval_samples_per_second": 129.657,
"eval_steps_per_second": 16.211,
"step": 30500
},
{
"epoch": 2.9018492176386914,
"grad_norm": 1.3839406967163086,
"learning_rate": 1.6829934157700338e-06,
"loss": 1.1716,
"step": 30600
},
{
"epoch": 2.9113323850165953,
"grad_norm": 1.2562811374664307,
"learning_rate": 1.5224024409828168e-06,
"loss": 1.1466,
"step": 30700
},
{
"epoch": 2.9208155523944996,
"grad_norm": 1.4180203676223755,
"learning_rate": 1.3618114661955997e-06,
"loss": 1.1405,
"step": 30800
},
{
"epoch": 2.930298719772404,
"grad_norm": 1.7891360521316528,
"learning_rate": 1.2012204914083829e-06,
"loss": 1.1591,
"step": 30900
},
{
"epoch": 2.9397818871503083,
"grad_norm": 1.7551426887512207,
"learning_rate": 1.0406295166211659e-06,
"loss": 1.1833,
"step": 31000
},
{
"epoch": 2.9397818871503083,
"eval_loss": 1.129394292831421,
"eval_runtime": 72.2972,
"eval_samples_per_second": 129.659,
"eval_steps_per_second": 16.211,
"step": 31000
},
{
"epoch": 2.9492650545282126,
"grad_norm": 1.4321238994598389,
"learning_rate": 8.800385418339489e-07,
"loss": 1.1879,
"step": 31100
},
{
"epoch": 2.9587482219061165,
"grad_norm": 1.732853651046753,
"learning_rate": 7.210534767946041e-07,
"loss": 1.1682,
"step": 31200
},
{
"epoch": 2.968231389284021,
"grad_norm": 1.473656415939331,
"learning_rate": 5.604625020073872e-07,
"loss": 1.1708,
"step": 31300
},
{
"epoch": 2.977714556661925,
"grad_norm": 1.2021667957305908,
"learning_rate": 3.998715272201702e-07,
"loss": 1.1679,
"step": 31400
},
{
"epoch": 2.987197724039829,
"grad_norm": 1.4972681999206543,
"learning_rate": 2.4088646218082545e-07,
"loss": 1.1678,
"step": 31500
},
{
"epoch": 2.987197724039829,
"eval_loss": 1.129324197769165,
"eval_runtime": 72.2219,
"eval_samples_per_second": 129.794,
"eval_steps_per_second": 16.228,
"step": 31500
},
{
"epoch": 2.9966808914177334,
"grad_norm": 1.7410774230957031,
"learning_rate": 8.029548739360848e-08,
"loss": 1.1645,
"step": 31600
}
],
"logging_steps": 100,
"max_steps": 31635,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.32254007164928e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
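
For reference, a minimal sketch of how a trainer_state.json like the one above could be loaded and summarized. This is not part of the repo: the local file path and the use of pandas/matplotlib are assumptions for illustration; the key names ("log_history", "loss", "eval_loss", "step", "global_step", "epoch") are taken directly from the file contents shown here.

# Minimal sketch (assumption: the file has been downloaded locally as
# "trainer_state.json"; pandas and matplotlib are available).
import json

import pandas as pd
import matplotlib.pyplot as plt

with open("trainer_state.json") as f:  # hypothetical local path
    state = json.load(f)

# log_history mixes training entries (with "loss") and eval entries
# (with "eval_loss"); split them by which column is populated.
log = pd.DataFrame(state["log_history"])
train = log.dropna(subset=["loss"])
evals = log.dropna(subset=["eval_loss"])

print(f"global_step: {state['global_step']}, epoch: {state['epoch']}")
print("final train loss:", train["loss"].iloc[-1])
print("final eval_loss:", evals["eval_loss"].iloc[-1])

# Plot both curves against the optimizer step.
plt.plot(train["step"], train["loss"], label="train loss")
plt.plot(evals["step"], evals["eval_loss"], label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.show()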