{
"best_metric": 2.743666648864746,
"best_model_checkpoint": "miner_id_24/checkpoint-1350",
"epoch": 0.028963959180126862,
"eval_steps": 150,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.9309306120084574e-05,
"eval_loss": 3.0466506481170654,
"eval_runtime": 330.8111,
"eval_samples_per_second": 131.834,
"eval_steps_per_second": 32.958,
"step": 1
},
{
"epoch": 0.00019309306120084574,
"grad_norm": 17.619115829467773,
"learning_rate": 0.0001,
"loss": 10.3433,
"step": 10
},
{
"epoch": 0.0003861861224016915,
"grad_norm": 18.392913818359375,
"learning_rate": 0.0001,
"loss": 10.498,
"step": 20
},
{
"epoch": 0.0005792791836025373,
"grad_norm": 18.08405113220215,
"learning_rate": 0.0001,
"loss": 11.2262,
"step": 30
},
{
"epoch": 0.000772372244803383,
"grad_norm": 17.59492301940918,
"learning_rate": 0.0001,
"loss": 12.7306,
"step": 40
},
{
"epoch": 0.0009654653060042288,
"grad_norm": 37.0333251953125,
"learning_rate": 0.0001,
"loss": 15.2312,
"step": 50
},
{
"epoch": 0.0011585583672050746,
"grad_norm": 13.556602478027344,
"learning_rate": 0.0001,
"loss": 10.201,
"step": 60
},
{
"epoch": 0.0013516514284059203,
"grad_norm": 16.682050704956055,
"learning_rate": 0.0001,
"loss": 9.9751,
"step": 70
},
{
"epoch": 0.001544744489606766,
"grad_norm": 20.121063232421875,
"learning_rate": 0.0001,
"loss": 10.9688,
"step": 80
},
{
"epoch": 0.0017378375508076116,
"grad_norm": 17.53766632080078,
"learning_rate": 0.0001,
"loss": 12.1271,
"step": 90
},
{
"epoch": 0.0019309306120084576,
"grad_norm": 33.57805252075195,
"learning_rate": 0.0001,
"loss": 14.9867,
"step": 100
},
{
"epoch": 0.002124023673209303,
"grad_norm": 19.4736385345459,
"learning_rate": 0.0001,
"loss": 9.7775,
"step": 110
},
{
"epoch": 0.002317116734410149,
"grad_norm": 21.09294319152832,
"learning_rate": 0.0001,
"loss": 10.0451,
"step": 120
},
{
"epoch": 0.002510209795610995,
"grad_norm": 16.475950241088867,
"learning_rate": 0.0001,
"loss": 10.4069,
"step": 130
},
{
"epoch": 0.0027033028568118405,
"grad_norm": 24.737951278686523,
"learning_rate": 0.0001,
"loss": 11.6022,
"step": 140
},
{
"epoch": 0.0028963959180126862,
"grad_norm": 38.39281463623047,
"learning_rate": 0.0001,
"loss": 14.6671,
"step": 150
},
{
"epoch": 0.0028963959180126862,
"eval_loss": 2.891888380050659,
"eval_runtime": 329.6932,
"eval_samples_per_second": 132.281,
"eval_steps_per_second": 33.07,
"step": 150
},
{
"epoch": 0.003089488979213532,
"grad_norm": 13.83809757232666,
"learning_rate": 0.0001,
"loss": 10.1709,
"step": 160
},
{
"epoch": 0.0032825820404143776,
"grad_norm": 17.797067642211914,
"learning_rate": 0.0001,
"loss": 10.6159,
"step": 170
},
{
"epoch": 0.0034756751016152233,
"grad_norm": 17.597434997558594,
"learning_rate": 0.0001,
"loss": 10.8431,
"step": 180
},
{
"epoch": 0.003668768162816069,
"grad_norm": 20.05368995666504,
"learning_rate": 0.0001,
"loss": 12.1154,
"step": 190
},
{
"epoch": 0.003861861224016915,
"grad_norm": 88.2765121459961,
"learning_rate": 0.0001,
"loss": 14.0078,
"step": 200
},
{
"epoch": 0.00405495428521776,
"grad_norm": 19.025007247924805,
"learning_rate": 0.0001,
"loss": 10.7787,
"step": 210
},
{
"epoch": 0.004248047346418606,
"grad_norm": 16.219017028808594,
"learning_rate": 0.0001,
"loss": 10.2578,
"step": 220
},
{
"epoch": 0.004441140407619453,
"grad_norm": 19.783754348754883,
"learning_rate": 0.0001,
"loss": 10.7105,
"step": 230
},
{
"epoch": 0.004634233468820298,
"grad_norm": 15.023333549499512,
"learning_rate": 0.0001,
"loss": 11.5988,
"step": 240
},
{
"epoch": 0.004827326530021144,
"grad_norm": 33.46087646484375,
"learning_rate": 0.0001,
"loss": 13.6889,
"step": 250
},
{
"epoch": 0.00502041959122199,
"grad_norm": 11.918452262878418,
"learning_rate": 0.0001,
"loss": 9.8579,
"step": 260
},
{
"epoch": 0.005213512652422835,
"grad_norm": 11.418573379516602,
"learning_rate": 0.0001,
"loss": 9.7857,
"step": 270
},
{
"epoch": 0.005406605713623681,
"grad_norm": 12.96380615234375,
"learning_rate": 0.0001,
"loss": 10.7496,
"step": 280
},
{
"epoch": 0.005599698774824527,
"grad_norm": 20.210433959960938,
"learning_rate": 0.0001,
"loss": 11.8999,
"step": 290
},
{
"epoch": 0.0057927918360253725,
"grad_norm": 39.24753952026367,
"learning_rate": 0.0001,
"loss": 13.6284,
"step": 300
},
{
"epoch": 0.0057927918360253725,
"eval_loss": 2.8597412109375,
"eval_runtime": 332.7905,
"eval_samples_per_second": 131.049,
"eval_steps_per_second": 32.762,
"step": 300
},
{
"epoch": 0.005985884897226218,
"grad_norm": 8.188671112060547,
"learning_rate": 0.0001,
"loss": 10.261,
"step": 310
},
{
"epoch": 0.006178977958427064,
"grad_norm": 13.034982681274414,
"learning_rate": 0.0001,
"loss": 10.359,
"step": 320
},
{
"epoch": 0.0063720710196279095,
"grad_norm": 10.07932186126709,
"learning_rate": 0.0001,
"loss": 10.7766,
"step": 330
},
{
"epoch": 0.006565164080828755,
"grad_norm": 14.22851848602295,
"learning_rate": 0.0001,
"loss": 11.6996,
"step": 340
},
{
"epoch": 0.006758257142029601,
"grad_norm": 29.241413116455078,
"learning_rate": 0.0001,
"loss": 14.314,
"step": 350
},
{
"epoch": 0.006951350203230447,
"grad_norm": 14.328963279724121,
"learning_rate": 0.0001,
"loss": 10.5319,
"step": 360
},
{
"epoch": 0.007144443264431292,
"grad_norm": 11.2478609085083,
"learning_rate": 0.0001,
"loss": 10.2886,
"step": 370
},
{
"epoch": 0.007337536325632138,
"grad_norm": 11.479792594909668,
"learning_rate": 0.0001,
"loss": 10.5397,
"step": 380
},
{
"epoch": 0.0075306293868329845,
"grad_norm": 16.216135025024414,
"learning_rate": 0.0001,
"loss": 11.4221,
"step": 390
},
{
"epoch": 0.00772372244803383,
"grad_norm": 40.46977996826172,
"learning_rate": 0.0001,
"loss": 13.6267,
"step": 400
},
{
"epoch": 0.007916815509234676,
"grad_norm": 7.014930725097656,
"learning_rate": 0.0001,
"loss": 9.8201,
"step": 410
},
{
"epoch": 0.00810990857043552,
"grad_norm": 9.932273864746094,
"learning_rate": 0.0001,
"loss": 10.2837,
"step": 420
},
{
"epoch": 0.008303001631636367,
"grad_norm": 10.231171607971191,
"learning_rate": 0.0001,
"loss": 10.6085,
"step": 430
},
{
"epoch": 0.008496094692837212,
"grad_norm": 14.54490852355957,
"learning_rate": 0.0001,
"loss": 11.7488,
"step": 440
},
{
"epoch": 0.008689187754038059,
"grad_norm": 48.76640701293945,
"learning_rate": 0.0001,
"loss": 14.019,
"step": 450
},
{
"epoch": 0.008689187754038059,
"eval_loss": 2.8116047382354736,
"eval_runtime": 332.18,
"eval_samples_per_second": 131.29,
"eval_steps_per_second": 32.823,
"step": 450
},
{
"epoch": 0.008882280815238905,
"grad_norm": 12.235373497009277,
"learning_rate": 0.0001,
"loss": 10.1831,
"step": 460
},
{
"epoch": 0.00907537387643975,
"grad_norm": 10.786988258361816,
"learning_rate": 0.0001,
"loss": 10.3065,
"step": 470
},
{
"epoch": 0.009268466937640597,
"grad_norm": 14.656991004943848,
"learning_rate": 0.0001,
"loss": 10.4513,
"step": 480
},
{
"epoch": 0.009461559998841441,
"grad_norm": 16.273334503173828,
"learning_rate": 0.0001,
"loss": 11.6948,
"step": 490
},
{
"epoch": 0.009654653060042288,
"grad_norm": 25.74225425720215,
"learning_rate": 0.0001,
"loss": 13.6766,
"step": 500
},
{
"epoch": 0.009847746121243133,
"grad_norm": 7.88370943069458,
"learning_rate": 0.0001,
"loss": 10.2527,
"step": 510
},
{
"epoch": 0.01004083918244398,
"grad_norm": 12.4476318359375,
"learning_rate": 0.0001,
"loss": 10.1829,
"step": 520
},
{
"epoch": 0.010233932243644824,
"grad_norm": 11.75400447845459,
"learning_rate": 0.0001,
"loss": 10.2371,
"step": 530
},
{
"epoch": 0.01042702530484567,
"grad_norm": 28.32915496826172,
"learning_rate": 0.0001,
"loss": 11.4662,
"step": 540
},
{
"epoch": 0.010620118366046516,
"grad_norm": 26.942928314208984,
"learning_rate": 0.0001,
"loss": 13.5862,
"step": 550
},
{
"epoch": 0.010813211427247362,
"grad_norm": 10.243542671203613,
"learning_rate": 0.0001,
"loss": 9.9785,
"step": 560
},
{
"epoch": 0.011006304488448207,
"grad_norm": 10.225203514099121,
"learning_rate": 0.0001,
"loss": 9.6087,
"step": 570
},
{
"epoch": 0.011199397549649054,
"grad_norm": 10.439690589904785,
"learning_rate": 0.0001,
"loss": 10.6501,
"step": 580
},
{
"epoch": 0.011392490610849898,
"grad_norm": 13.22080135345459,
"learning_rate": 0.0001,
"loss": 11.5253,
"step": 590
},
{
"epoch": 0.011585583672050745,
"grad_norm": 28.21418571472168,
"learning_rate": 0.0001,
"loss": 13.6289,
"step": 600
},
{
"epoch": 0.011585583672050745,
"eval_loss": 2.7912118434906006,
"eval_runtime": 333.6509,
"eval_samples_per_second": 130.711,
"eval_steps_per_second": 32.678,
"step": 600
},
{
"epoch": 0.011778676733251591,
"grad_norm": 10.044217109680176,
"learning_rate": 0.0001,
"loss": 10.1053,
"step": 610
},
{
"epoch": 0.011971769794452436,
"grad_norm": 12.914101600646973,
"learning_rate": 0.0001,
"loss": 10.0,
"step": 620
},
{
"epoch": 0.012164862855653283,
"grad_norm": 11.664057731628418,
"learning_rate": 0.0001,
"loss": 10.3879,
"step": 630
},
{
"epoch": 0.012357955916854128,
"grad_norm": 13.065714836120605,
"learning_rate": 0.0001,
"loss": 11.5201,
"step": 640
},
{
"epoch": 0.012551048978054974,
"grad_norm": 33.389984130859375,
"learning_rate": 0.0001,
"loss": 13.5861,
"step": 650
},
{
"epoch": 0.012744142039255819,
"grad_norm": 11.64077091217041,
"learning_rate": 0.0001,
"loss": 10.2282,
"step": 660
},
{
"epoch": 0.012937235100456666,
"grad_norm": 10.479111671447754,
"learning_rate": 0.0001,
"loss": 10.0161,
"step": 670
},
{
"epoch": 0.01313032816165751,
"grad_norm": 35.55380630493164,
"learning_rate": 0.0001,
"loss": 10.6205,
"step": 680
},
{
"epoch": 0.013323421222858357,
"grad_norm": 13.887811660766602,
"learning_rate": 0.0001,
"loss": 11.7771,
"step": 690
},
{
"epoch": 0.013516514284059202,
"grad_norm": 27.93064308166504,
"learning_rate": 0.0001,
"loss": 13.5786,
"step": 700
},
{
"epoch": 0.013709607345260048,
"grad_norm": 11.809444427490234,
"learning_rate": 0.0001,
"loss": 10.0317,
"step": 710
},
{
"epoch": 0.013902700406460893,
"grad_norm": 10.49244499206543,
"learning_rate": 0.0001,
"loss": 10.3579,
"step": 720
},
{
"epoch": 0.01409579346766174,
"grad_norm": 13.255899429321289,
"learning_rate": 0.0001,
"loss": 10.6709,
"step": 730
},
{
"epoch": 0.014288886528862585,
"grad_norm": 16.088109970092773,
"learning_rate": 0.0001,
"loss": 11.7847,
"step": 740
},
{
"epoch": 0.014481979590063431,
"grad_norm": 43.44777297973633,
"learning_rate": 0.0001,
"loss": 13.5673,
"step": 750
},
{
"epoch": 0.014481979590063431,
"eval_loss": 2.783721446990967,
"eval_runtime": 332.0115,
"eval_samples_per_second": 131.357,
"eval_steps_per_second": 32.839,
"step": 750
},
{
"epoch": 0.014675072651264276,
"grad_norm": 22.703245162963867,
"learning_rate": 0.0001,
"loss": 10.1566,
"step": 760
},
{
"epoch": 0.014868165712465123,
"grad_norm": 15.799572944641113,
"learning_rate": 0.0001,
"loss": 10.1408,
"step": 770
},
{
"epoch": 0.015061258773665969,
"grad_norm": 14.325783729553223,
"learning_rate": 0.0001,
"loss": 10.6599,
"step": 780
},
{
"epoch": 0.015254351834866814,
"grad_norm": 16.357566833496094,
"learning_rate": 0.0001,
"loss": 11.5767,
"step": 790
},
{
"epoch": 0.01544744489606766,
"grad_norm": 38.33959197998047,
"learning_rate": 0.0001,
"loss": 13.598,
"step": 800
},
{
"epoch": 0.015640537957268507,
"grad_norm": 14.860721588134766,
"learning_rate": 0.0001,
"loss": 10.2319,
"step": 810
},
{
"epoch": 0.015833631018469352,
"grad_norm": 9.913568496704102,
"learning_rate": 0.0001,
"loss": 10.0176,
"step": 820
},
{
"epoch": 0.016026724079670197,
"grad_norm": 8.6304931640625,
"learning_rate": 0.0001,
"loss": 10.7353,
"step": 830
},
{
"epoch": 0.01621981714087104,
"grad_norm": 12.518475532531738,
"learning_rate": 0.0001,
"loss": 11.1827,
"step": 840
},
{
"epoch": 0.01641291020207189,
"grad_norm": 26.06439781188965,
"learning_rate": 0.0001,
"loss": 13.8205,
"step": 850
},
{
"epoch": 0.016606003263272735,
"grad_norm": 7.715124130249023,
"learning_rate": 0.0001,
"loss": 10.1191,
"step": 860
},
{
"epoch": 0.01679909632447358,
"grad_norm": 8.914071083068848,
"learning_rate": 0.0001,
"loss": 10.2188,
"step": 870
},
{
"epoch": 0.016992189385674424,
"grad_norm": 14.51268196105957,
"learning_rate": 0.0001,
"loss": 10.603,
"step": 880
},
{
"epoch": 0.017185282446875273,
"grad_norm": 12.584301948547363,
"learning_rate": 0.0001,
"loss": 11.345,
"step": 890
},
{
"epoch": 0.017378375508076117,
"grad_norm": 21.21172523498535,
"learning_rate": 0.0001,
"loss": 13.4406,
"step": 900
},
{
"epoch": 0.017378375508076117,
"eval_loss": 2.7644200325012207,
"eval_runtime": 328.7945,
"eval_samples_per_second": 132.642,
"eval_steps_per_second": 33.161,
"step": 900
},
{
"epoch": 0.017571468569276962,
"grad_norm": 10.428869247436523,
"learning_rate": 0.0001,
"loss": 9.9264,
"step": 910
},
{
"epoch": 0.01776456163047781,
"grad_norm": 7.778339385986328,
"learning_rate": 0.0001,
"loss": 9.967,
"step": 920
},
{
"epoch": 0.017957654691678655,
"grad_norm": 8.562960624694824,
"learning_rate": 0.0001,
"loss": 10.4493,
"step": 930
},
{
"epoch": 0.0181507477528795,
"grad_norm": 10.700459480285645,
"learning_rate": 0.0001,
"loss": 11.288,
"step": 940
},
{
"epoch": 0.018343840814080345,
"grad_norm": 27.12032699584961,
"learning_rate": 0.0001,
"loss": 13.3088,
"step": 950
},
{
"epoch": 0.018536933875281193,
"grad_norm": 7.569306373596191,
"learning_rate": 0.0001,
"loss": 10.0856,
"step": 960
},
{
"epoch": 0.018730026936482038,
"grad_norm": 8.432563781738281,
"learning_rate": 0.0001,
"loss": 9.8671,
"step": 970
},
{
"epoch": 0.018923119997682883,
"grad_norm": 9.893686294555664,
"learning_rate": 0.0001,
"loss": 10.1824,
"step": 980
},
{
"epoch": 0.019116213058883728,
"grad_norm": 13.275106430053711,
"learning_rate": 0.0001,
"loss": 11.4841,
"step": 990
},
{
"epoch": 0.019309306120084576,
"grad_norm": 23.891876220703125,
"learning_rate": 0.0001,
"loss": 13.3897,
"step": 1000
},
{
"epoch": 0.01950239918128542,
"grad_norm": 8.6790771484375,
"learning_rate": 0.0001,
"loss": 9.6344,
"step": 1010
},
{
"epoch": 0.019695492242486266,
"grad_norm": 10.20168399810791,
"learning_rate": 0.0001,
"loss": 9.6956,
"step": 1020
},
{
"epoch": 0.01988858530368711,
"grad_norm": 9.794075012207031,
"learning_rate": 0.0001,
"loss": 10.6165,
"step": 1030
},
{
"epoch": 0.02008167836488796,
"grad_norm": 13.980074882507324,
"learning_rate": 0.0001,
"loss": 11.8835,
"step": 1040
},
{
"epoch": 0.020274771426088804,
"grad_norm": 31.785160064697266,
"learning_rate": 0.0001,
"loss": 13.106,
"step": 1050
},
{
"epoch": 0.020274771426088804,
"eval_loss": 2.7701072692871094,
"eval_runtime": 337.2345,
"eval_samples_per_second": 129.322,
"eval_steps_per_second": 32.331,
"step": 1050
},
{
"epoch": 0.02046786448728965,
"grad_norm": 7.864306926727295,
"learning_rate": 0.0001,
"loss": 10.2408,
"step": 1060
},
{
"epoch": 0.020660957548490497,
"grad_norm": 8.912210464477539,
"learning_rate": 0.0001,
"loss": 9.9443,
"step": 1070
},
{
"epoch": 0.02085405060969134,
"grad_norm": 9.936532974243164,
"learning_rate": 0.0001,
"loss": 10.8666,
"step": 1080
},
{
"epoch": 0.021047143670892186,
"grad_norm": 11.113265991210938,
"learning_rate": 0.0001,
"loss": 11.2062,
"step": 1090
},
{
"epoch": 0.02124023673209303,
"grad_norm": 30.588726043701172,
"learning_rate": 0.0001,
"loss": 13.0672,
"step": 1100
},
{
"epoch": 0.02143332979329388,
"grad_norm": 8.092857360839844,
"learning_rate": 0.0001,
"loss": 10.0935,
"step": 1110
},
{
"epoch": 0.021626422854494724,
"grad_norm": 11.335000991821289,
"learning_rate": 0.0001,
"loss": 9.9106,
"step": 1120
},
{
"epoch": 0.02181951591569557,
"grad_norm": 12.082404136657715,
"learning_rate": 0.0001,
"loss": 10.4083,
"step": 1130
},
{
"epoch": 0.022012608976896414,
"grad_norm": 14.026704788208008,
"learning_rate": 0.0001,
"loss": 11.4077,
"step": 1140
},
{
"epoch": 0.022205702038097262,
"grad_norm": 32.28516387939453,
"learning_rate": 0.0001,
"loss": 13.261,
"step": 1150
},
{
"epoch": 0.022398795099298107,
"grad_norm": 10.577113151550293,
"learning_rate": 0.0001,
"loss": 9.9731,
"step": 1160
},
{
"epoch": 0.022591888160498952,
"grad_norm": 9.278578758239746,
"learning_rate": 0.0001,
"loss": 9.6304,
"step": 1170
},
{
"epoch": 0.022784981221699797,
"grad_norm": 10.110872268676758,
"learning_rate": 0.0001,
"loss": 10.4382,
"step": 1180
},
{
"epoch": 0.022978074282900645,
"grad_norm": 12.344923973083496,
"learning_rate": 0.0001,
"loss": 11.3848,
"step": 1190
},
{
"epoch": 0.02317116734410149,
"grad_norm": 27.385581970214844,
"learning_rate": 0.0001,
"loss": 13.5253,
"step": 1200
},
{
"epoch": 0.02317116734410149,
"eval_loss": 2.7528042793273926,
"eval_runtime": 333.2124,
"eval_samples_per_second": 130.883,
"eval_steps_per_second": 32.721,
"step": 1200
},
{
"epoch": 0.023364260405302335,
"grad_norm": 11.155953407287598,
"learning_rate": 0.0001,
"loss": 9.898,
"step": 1210
},
{
"epoch": 0.023557353466503183,
"grad_norm": 10.302549362182617,
"learning_rate": 0.0001,
"loss": 10.058,
"step": 1220
},
{
"epoch": 0.023750446527704028,
"grad_norm": 11.402131080627441,
"learning_rate": 0.0001,
"loss": 10.707,
"step": 1230
},
{
"epoch": 0.023943539588904873,
"grad_norm": 13.274065971374512,
"learning_rate": 0.0001,
"loss": 11.5189,
"step": 1240
},
{
"epoch": 0.024136632650105717,
"grad_norm": 22.315773010253906,
"learning_rate": 0.0001,
"loss": 13.4052,
"step": 1250
},
{
"epoch": 0.024329725711306566,
"grad_norm": 8.078880310058594,
"learning_rate": 0.0001,
"loss": 9.8327,
"step": 1260
},
{
"epoch": 0.02452281877250741,
"grad_norm": 7.619024753570557,
"learning_rate": 0.0001,
"loss": 9.8486,
"step": 1270
},
{
"epoch": 0.024715911833708255,
"grad_norm": 10.089761734008789,
"learning_rate": 0.0001,
"loss": 10.4484,
"step": 1280
},
{
"epoch": 0.0249090048949091,
"grad_norm": 12.1000394821167,
"learning_rate": 0.0001,
"loss": 11.3575,
"step": 1290
},
{
"epoch": 0.02510209795610995,
"grad_norm": 23.944225311279297,
"learning_rate": 0.0001,
"loss": 13.2124,
"step": 1300
},
{
"epoch": 0.025295191017310793,
"grad_norm": 6.652422904968262,
"learning_rate": 0.0001,
"loss": 9.6957,
"step": 1310
},
{
"epoch": 0.025488284078511638,
"grad_norm": 10.389664649963379,
"learning_rate": 0.0001,
"loss": 9.8489,
"step": 1320
},
{
"epoch": 0.025681377139712483,
"grad_norm": 10.09528636932373,
"learning_rate": 0.0001,
"loss": 10.5627,
"step": 1330
},
{
"epoch": 0.02587447020091333,
"grad_norm": 13.287434577941895,
"learning_rate": 0.0001,
"loss": 11.8588,
"step": 1340
},
{
"epoch": 0.026067563262114176,
"grad_norm": 23.052188873291016,
"learning_rate": 0.0001,
"loss": 12.9806,
"step": 1350
},
{
"epoch": 0.026067563262114176,
"eval_loss": 2.743666648864746,
"eval_runtime": 333.1086,
"eval_samples_per_second": 130.924,
"eval_steps_per_second": 32.731,
"step": 1350
},
{
"epoch": 0.02626065632331502,
"grad_norm": 6.833707809448242,
"learning_rate": 0.0001,
"loss": 10.0071,
"step": 1360
},
{
"epoch": 0.026453749384515866,
"grad_norm": 8.814143180847168,
"learning_rate": 0.0001,
"loss": 9.8842,
"step": 1370
},
{
"epoch": 0.026646842445716714,
"grad_norm": 10.0423583984375,
"learning_rate": 0.0001,
"loss": 10.6022,
"step": 1380
},
{
"epoch": 0.02683993550691756,
"grad_norm": 11.296242713928223,
"learning_rate": 0.0001,
"loss": 11.2607,
"step": 1390
},
{
"epoch": 0.027033028568118404,
"grad_norm": 21.390296936035156,
"learning_rate": 0.0001,
"loss": 12.9834,
"step": 1400
},
{
"epoch": 0.027226121629319252,
"grad_norm": 8.968457221984863,
"learning_rate": 0.0001,
"loss": 10.0666,
"step": 1410
},
{
"epoch": 0.027419214690520097,
"grad_norm": 11.966334342956543,
"learning_rate": 0.0001,
"loss": 10.0537,
"step": 1420
},
{
"epoch": 0.02761230775172094,
"grad_norm": 13.229506492614746,
"learning_rate": 0.0001,
"loss": 10.3221,
"step": 1430
},
{
"epoch": 0.027805400812921786,
"grad_norm": 20.169370651245117,
"learning_rate": 0.0001,
"loss": 11.0951,
"step": 1440
},
{
"epoch": 0.027998493874122635,
"grad_norm": 43.70033264160156,
"learning_rate": 0.0001,
"loss": 13.0941,
"step": 1450
},
{
"epoch": 0.02819158693532348,
"grad_norm": 9.215312957763672,
"learning_rate": 0.0001,
"loss": 10.0646,
"step": 1460
},
{
"epoch": 0.028384679996524324,
"grad_norm": 20.890764236450195,
"learning_rate": 0.0001,
"loss": 9.7248,
"step": 1470
},
{
"epoch": 0.02857777305772517,
"grad_norm": 12.14108943939209,
"learning_rate": 0.0001,
"loss": 10.3096,
"step": 1480
},
{
"epoch": 0.028770866118926017,
"grad_norm": 15.33022689819336,
"learning_rate": 0.0001,
"loss": 11.5878,
"step": 1490
},
{
"epoch": 0.028963959180126862,
"grad_norm": 29.498929977416992,
"learning_rate": 0.0001,
"loss": 13.2048,
"step": 1500
},
{
"epoch": 0.028963959180126862,
"eval_loss": 2.7453110218048096,
"eval_runtime": 336.37,
"eval_samples_per_second": 129.655,
"eval_steps_per_second": 32.414,
"step": 1500
}
],
"logging_steps": 10,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.0944605381853184e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}