|
{ |
|
"best_metric": 2.743666648864746, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-1350", |
|
"epoch": 0.028963959180126862, |
|
"eval_steps": 150, |
|
"global_step": 1500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.9309306120084574e-05, |
|
"eval_loss": 3.0466506481170654, |
|
"eval_runtime": 330.8111, |
|
"eval_samples_per_second": 131.834, |
|
"eval_steps_per_second": 32.958, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00019309306120084574, |
|
"grad_norm": 17.619115829467773, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3433, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0003861861224016915, |
|
"grad_norm": 18.392913818359375, |
|
"learning_rate": 0.0001, |
|
"loss": 10.498, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0005792791836025373, |
|
"grad_norm": 18.08405113220215, |
|
"learning_rate": 0.0001, |
|
"loss": 11.2262, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.000772372244803383, |
|
"grad_norm": 17.59492301940918, |
|
"learning_rate": 0.0001, |
|
"loss": 12.7306, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0009654653060042288, |
|
"grad_norm": 37.0333251953125, |
|
"learning_rate": 0.0001, |
|
"loss": 15.2312, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0011585583672050746, |
|
"grad_norm": 13.556602478027344, |
|
"learning_rate": 0.0001, |
|
"loss": 10.201, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0013516514284059203, |
|
"grad_norm": 16.682050704956055, |
|
"learning_rate": 0.0001, |
|
"loss": 9.9751, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.001544744489606766, |
|
"grad_norm": 20.121063232421875, |
|
"learning_rate": 0.0001, |
|
"loss": 10.9688, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0017378375508076116, |
|
"grad_norm": 17.53766632080078, |
|
"learning_rate": 0.0001, |
|
"loss": 12.1271, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0019309306120084576, |
|
"grad_norm": 33.57805252075195, |
|
"learning_rate": 0.0001, |
|
"loss": 14.9867, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.002124023673209303, |
|
"grad_norm": 19.4736385345459, |
|
"learning_rate": 0.0001, |
|
"loss": 9.7775, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.002317116734410149, |
|
"grad_norm": 21.09294319152832, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0451, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.002510209795610995, |
|
"grad_norm": 16.475950241088867, |
|
"learning_rate": 0.0001, |
|
"loss": 10.4069, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0027033028568118405, |
|
"grad_norm": 24.737951278686523, |
|
"learning_rate": 0.0001, |
|
"loss": 11.6022, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0028963959180126862, |
|
"grad_norm": 38.39281463623047, |
|
"learning_rate": 0.0001, |
|
"loss": 14.6671, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0028963959180126862, |
|
"eval_loss": 2.891888380050659, |
|
"eval_runtime": 329.6932, |
|
"eval_samples_per_second": 132.281, |
|
"eval_steps_per_second": 33.07, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.003089488979213532, |
|
"grad_norm": 13.83809757232666, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1709, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0032825820404143776, |
|
"grad_norm": 17.797067642211914, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6159, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0034756751016152233, |
|
"grad_norm": 17.597434997558594, |
|
"learning_rate": 0.0001, |
|
"loss": 10.8431, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.003668768162816069, |
|
"grad_norm": 20.05368995666504, |
|
"learning_rate": 0.0001, |
|
"loss": 12.1154, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.003861861224016915, |
|
"grad_norm": 88.2765121459961, |
|
"learning_rate": 0.0001, |
|
"loss": 14.0078, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00405495428521776, |
|
"grad_norm": 19.025007247924805, |
|
"learning_rate": 0.0001, |
|
"loss": 10.7787, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.004248047346418606, |
|
"grad_norm": 16.219017028808594, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2578, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.004441140407619453, |
|
"grad_norm": 19.783754348754883, |
|
"learning_rate": 0.0001, |
|
"loss": 10.7105, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.004634233468820298, |
|
"grad_norm": 15.023333549499512, |
|
"learning_rate": 0.0001, |
|
"loss": 11.5988, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.004827326530021144, |
|
"grad_norm": 33.46087646484375, |
|
"learning_rate": 0.0001, |
|
"loss": 13.6889, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.00502041959122199, |
|
"grad_norm": 11.918452262878418, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8579, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.005213512652422835, |
|
"grad_norm": 11.418573379516602, |
|
"learning_rate": 0.0001, |
|
"loss": 9.7857, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.005406605713623681, |
|
"grad_norm": 12.96380615234375, |
|
"learning_rate": 0.0001, |
|
"loss": 10.7496, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.005599698774824527, |
|
"grad_norm": 20.210433959960938, |
|
"learning_rate": 0.0001, |
|
"loss": 11.8999, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0057927918360253725, |
|
"grad_norm": 39.24753952026367, |
|
"learning_rate": 0.0001, |
|
"loss": 13.6284, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0057927918360253725, |
|
"eval_loss": 2.8597412109375, |
|
"eval_runtime": 332.7905, |
|
"eval_samples_per_second": 131.049, |
|
"eval_steps_per_second": 32.762, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.005985884897226218, |
|
"grad_norm": 8.188671112060547, |
|
"learning_rate": 0.0001, |
|
"loss": 10.261, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.006178977958427064, |
|
"grad_norm": 13.034982681274414, |
|
"learning_rate": 0.0001, |
|
"loss": 10.359, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0063720710196279095, |
|
"grad_norm": 10.07932186126709, |
|
"learning_rate": 0.0001, |
|
"loss": 10.7766, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.006565164080828755, |
|
"grad_norm": 14.22851848602295, |
|
"learning_rate": 0.0001, |
|
"loss": 11.6996, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.006758257142029601, |
|
"grad_norm": 29.241413116455078, |
|
"learning_rate": 0.0001, |
|
"loss": 14.314, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.006951350203230447, |
|
"grad_norm": 14.328963279724121, |
|
"learning_rate": 0.0001, |
|
"loss": 10.5319, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.007144443264431292, |
|
"grad_norm": 11.2478609085083, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2886, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.007337536325632138, |
|
"grad_norm": 11.479792594909668, |
|
"learning_rate": 0.0001, |
|
"loss": 10.5397, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0075306293868329845, |
|
"grad_norm": 16.216135025024414, |
|
"learning_rate": 0.0001, |
|
"loss": 11.4221, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.00772372244803383, |
|
"grad_norm": 40.46977996826172, |
|
"learning_rate": 0.0001, |
|
"loss": 13.6267, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.007916815509234676, |
|
"grad_norm": 7.014930725097656, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8201, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.00810990857043552, |
|
"grad_norm": 9.932273864746094, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2837, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.008303001631636367, |
|
"grad_norm": 10.231171607971191, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6085, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.008496094692837212, |
|
"grad_norm": 14.54490852355957, |
|
"learning_rate": 0.0001, |
|
"loss": 11.7488, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.008689187754038059, |
|
"grad_norm": 48.76640701293945, |
|
"learning_rate": 0.0001, |
|
"loss": 14.019, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.008689187754038059, |
|
"eval_loss": 2.8116047382354736, |
|
"eval_runtime": 332.18, |
|
"eval_samples_per_second": 131.29, |
|
"eval_steps_per_second": 32.823, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.008882280815238905, |
|
"grad_norm": 12.235373497009277, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1831, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.00907537387643975, |
|
"grad_norm": 10.786988258361816, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3065, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.009268466937640597, |
|
"grad_norm": 14.656991004943848, |
|
"learning_rate": 0.0001, |
|
"loss": 10.4513, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.009461559998841441, |
|
"grad_norm": 16.273334503173828, |
|
"learning_rate": 0.0001, |
|
"loss": 11.6948, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.009654653060042288, |
|
"grad_norm": 25.74225425720215, |
|
"learning_rate": 0.0001, |
|
"loss": 13.6766, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.009847746121243133, |
|
"grad_norm": 7.88370943069458, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2527, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.01004083918244398, |
|
"grad_norm": 12.4476318359375, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1829, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.010233932243644824, |
|
"grad_norm": 11.75400447845459, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2371, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.01042702530484567, |
|
"grad_norm": 28.32915496826172, |
|
"learning_rate": 0.0001, |
|
"loss": 11.4662, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.010620118366046516, |
|
"grad_norm": 26.942928314208984, |
|
"learning_rate": 0.0001, |
|
"loss": 13.5862, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.010813211427247362, |
|
"grad_norm": 10.243542671203613, |
|
"learning_rate": 0.0001, |
|
"loss": 9.9785, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.011006304488448207, |
|
"grad_norm": 10.225203514099121, |
|
"learning_rate": 0.0001, |
|
"loss": 9.6087, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.011199397549649054, |
|
"grad_norm": 10.439690589904785, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6501, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.011392490610849898, |
|
"grad_norm": 13.22080135345459, |
|
"learning_rate": 0.0001, |
|
"loss": 11.5253, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.011585583672050745, |
|
"grad_norm": 28.21418571472168, |
|
"learning_rate": 0.0001, |
|
"loss": 13.6289, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.011585583672050745, |
|
"eval_loss": 2.7912118434906006, |
|
"eval_runtime": 333.6509, |
|
"eval_samples_per_second": 130.711, |
|
"eval_steps_per_second": 32.678, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.011778676733251591, |
|
"grad_norm": 10.044217109680176, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1053, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.011971769794452436, |
|
"grad_norm": 12.914101600646973, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.012164862855653283, |
|
"grad_norm": 11.664057731628418, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3879, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.012357955916854128, |
|
"grad_norm": 13.065714836120605, |
|
"learning_rate": 0.0001, |
|
"loss": 11.5201, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.012551048978054974, |
|
"grad_norm": 33.389984130859375, |
|
"learning_rate": 0.0001, |
|
"loss": 13.5861, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.012744142039255819, |
|
"grad_norm": 11.64077091217041, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2282, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.012937235100456666, |
|
"grad_norm": 10.479111671447754, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0161, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.01313032816165751, |
|
"grad_norm": 35.55380630493164, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6205, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.013323421222858357, |
|
"grad_norm": 13.887811660766602, |
|
"learning_rate": 0.0001, |
|
"loss": 11.7771, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.013516514284059202, |
|
"grad_norm": 27.93064308166504, |
|
"learning_rate": 0.0001, |
|
"loss": 13.5786, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.013709607345260048, |
|
"grad_norm": 11.809444427490234, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0317, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.013902700406460893, |
|
"grad_norm": 10.49244499206543, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3579, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.01409579346766174, |
|
"grad_norm": 13.255899429321289, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6709, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.014288886528862585, |
|
"grad_norm": 16.088109970092773, |
|
"learning_rate": 0.0001, |
|
"loss": 11.7847, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.014481979590063431, |
|
"grad_norm": 43.44777297973633, |
|
"learning_rate": 0.0001, |
|
"loss": 13.5673, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.014481979590063431, |
|
"eval_loss": 2.783721446990967, |
|
"eval_runtime": 332.0115, |
|
"eval_samples_per_second": 131.357, |
|
"eval_steps_per_second": 32.839, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.014675072651264276, |
|
"grad_norm": 22.703245162963867, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1566, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.014868165712465123, |
|
"grad_norm": 15.799572944641113, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1408, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.015061258773665969, |
|
"grad_norm": 14.325783729553223, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6599, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.015254351834866814, |
|
"grad_norm": 16.357566833496094, |
|
"learning_rate": 0.0001, |
|
"loss": 11.5767, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.01544744489606766, |
|
"grad_norm": 38.33959197998047, |
|
"learning_rate": 0.0001, |
|
"loss": 13.598, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.015640537957268507, |
|
"grad_norm": 14.860721588134766, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2319, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.015833631018469352, |
|
"grad_norm": 9.913568496704102, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0176, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.016026724079670197, |
|
"grad_norm": 8.6304931640625, |
|
"learning_rate": 0.0001, |
|
"loss": 10.7353, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.01621981714087104, |
|
"grad_norm": 12.518475532531738, |
|
"learning_rate": 0.0001, |
|
"loss": 11.1827, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.01641291020207189, |
|
"grad_norm": 26.06439781188965, |
|
"learning_rate": 0.0001, |
|
"loss": 13.8205, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.016606003263272735, |
|
"grad_norm": 7.715124130249023, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1191, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.01679909632447358, |
|
"grad_norm": 8.914071083068848, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2188, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.016992189385674424, |
|
"grad_norm": 14.51268196105957, |
|
"learning_rate": 0.0001, |
|
"loss": 10.603, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.017185282446875273, |
|
"grad_norm": 12.584301948547363, |
|
"learning_rate": 0.0001, |
|
"loss": 11.345, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.017378375508076117, |
|
"grad_norm": 21.21172523498535, |
|
"learning_rate": 0.0001, |
|
"loss": 13.4406, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.017378375508076117, |
|
"eval_loss": 2.7644200325012207, |
|
"eval_runtime": 328.7945, |
|
"eval_samples_per_second": 132.642, |
|
"eval_steps_per_second": 33.161, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.017571468569276962, |
|
"grad_norm": 10.428869247436523, |
|
"learning_rate": 0.0001, |
|
"loss": 9.9264, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.01776456163047781, |
|
"grad_norm": 7.778339385986328, |
|
"learning_rate": 0.0001, |
|
"loss": 9.967, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.017957654691678655, |
|
"grad_norm": 8.562960624694824, |
|
"learning_rate": 0.0001, |
|
"loss": 10.4493, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.0181507477528795, |
|
"grad_norm": 10.700459480285645, |
|
"learning_rate": 0.0001, |
|
"loss": 11.288, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.018343840814080345, |
|
"grad_norm": 27.12032699584961, |
|
"learning_rate": 0.0001, |
|
"loss": 13.3088, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.018536933875281193, |
|
"grad_norm": 7.569306373596191, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0856, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.018730026936482038, |
|
"grad_norm": 8.432563781738281, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8671, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.018923119997682883, |
|
"grad_norm": 9.893686294555664, |
|
"learning_rate": 0.0001, |
|
"loss": 10.1824, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.019116213058883728, |
|
"grad_norm": 13.275106430053711, |
|
"learning_rate": 0.0001, |
|
"loss": 11.4841, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.019309306120084576, |
|
"grad_norm": 23.891876220703125, |
|
"learning_rate": 0.0001, |
|
"loss": 13.3897, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01950239918128542, |
|
"grad_norm": 8.6790771484375, |
|
"learning_rate": 0.0001, |
|
"loss": 9.6344, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.019695492242486266, |
|
"grad_norm": 10.20168399810791, |
|
"learning_rate": 0.0001, |
|
"loss": 9.6956, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.01988858530368711, |
|
"grad_norm": 9.794075012207031, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6165, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.02008167836488796, |
|
"grad_norm": 13.980074882507324, |
|
"learning_rate": 0.0001, |
|
"loss": 11.8835, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.020274771426088804, |
|
"grad_norm": 31.785160064697266, |
|
"learning_rate": 0.0001, |
|
"loss": 13.106, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.020274771426088804, |
|
"eval_loss": 2.7701072692871094, |
|
"eval_runtime": 337.2345, |
|
"eval_samples_per_second": 129.322, |
|
"eval_steps_per_second": 32.331, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.02046786448728965, |
|
"grad_norm": 7.864306926727295, |
|
"learning_rate": 0.0001, |
|
"loss": 10.2408, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.020660957548490497, |
|
"grad_norm": 8.912210464477539, |
|
"learning_rate": 0.0001, |
|
"loss": 9.9443, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.02085405060969134, |
|
"grad_norm": 9.936532974243164, |
|
"learning_rate": 0.0001, |
|
"loss": 10.8666, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.021047143670892186, |
|
"grad_norm": 11.113265991210938, |
|
"learning_rate": 0.0001, |
|
"loss": 11.2062, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.02124023673209303, |
|
"grad_norm": 30.588726043701172, |
|
"learning_rate": 0.0001, |
|
"loss": 13.0672, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.02143332979329388, |
|
"grad_norm": 8.092857360839844, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0935, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.021626422854494724, |
|
"grad_norm": 11.335000991821289, |
|
"learning_rate": 0.0001, |
|
"loss": 9.9106, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.02181951591569557, |
|
"grad_norm": 12.082404136657715, |
|
"learning_rate": 0.0001, |
|
"loss": 10.4083, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.022012608976896414, |
|
"grad_norm": 14.026704788208008, |
|
"learning_rate": 0.0001, |
|
"loss": 11.4077, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.022205702038097262, |
|
"grad_norm": 32.28516387939453, |
|
"learning_rate": 0.0001, |
|
"loss": 13.261, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.022398795099298107, |
|
"grad_norm": 10.577113151550293, |
|
"learning_rate": 0.0001, |
|
"loss": 9.9731, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.022591888160498952, |
|
"grad_norm": 9.278578758239746, |
|
"learning_rate": 0.0001, |
|
"loss": 9.6304, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.022784981221699797, |
|
"grad_norm": 10.110872268676758, |
|
"learning_rate": 0.0001, |
|
"loss": 10.4382, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.022978074282900645, |
|
"grad_norm": 12.344923973083496, |
|
"learning_rate": 0.0001, |
|
"loss": 11.3848, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.02317116734410149, |
|
"grad_norm": 27.385581970214844, |
|
"learning_rate": 0.0001, |
|
"loss": 13.5253, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.02317116734410149, |
|
"eval_loss": 2.7528042793273926, |
|
"eval_runtime": 333.2124, |
|
"eval_samples_per_second": 130.883, |
|
"eval_steps_per_second": 32.721, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.023364260405302335, |
|
"grad_norm": 11.155953407287598, |
|
"learning_rate": 0.0001, |
|
"loss": 9.898, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.023557353466503183, |
|
"grad_norm": 10.302549362182617, |
|
"learning_rate": 0.0001, |
|
"loss": 10.058, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.023750446527704028, |
|
"grad_norm": 11.402131080627441, |
|
"learning_rate": 0.0001, |
|
"loss": 10.707, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.023943539588904873, |
|
"grad_norm": 13.274065971374512, |
|
"learning_rate": 0.0001, |
|
"loss": 11.5189, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.024136632650105717, |
|
"grad_norm": 22.315773010253906, |
|
"learning_rate": 0.0001, |
|
"loss": 13.4052, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.024329725711306566, |
|
"grad_norm": 8.078880310058594, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8327, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.02452281877250741, |
|
"grad_norm": 7.619024753570557, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8486, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.024715911833708255, |
|
"grad_norm": 10.089761734008789, |
|
"learning_rate": 0.0001, |
|
"loss": 10.4484, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.0249090048949091, |
|
"grad_norm": 12.1000394821167, |
|
"learning_rate": 0.0001, |
|
"loss": 11.3575, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.02510209795610995, |
|
"grad_norm": 23.944225311279297, |
|
"learning_rate": 0.0001, |
|
"loss": 13.2124, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.025295191017310793, |
|
"grad_norm": 6.652422904968262, |
|
"learning_rate": 0.0001, |
|
"loss": 9.6957, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.025488284078511638, |
|
"grad_norm": 10.389664649963379, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8489, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.025681377139712483, |
|
"grad_norm": 10.09528636932373, |
|
"learning_rate": 0.0001, |
|
"loss": 10.5627, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.02587447020091333, |
|
"grad_norm": 13.287434577941895, |
|
"learning_rate": 0.0001, |
|
"loss": 11.8588, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.026067563262114176, |
|
"grad_norm": 23.052188873291016, |
|
"learning_rate": 0.0001, |
|
"loss": 12.9806, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.026067563262114176, |
|
"eval_loss": 2.743666648864746, |
|
"eval_runtime": 333.1086, |
|
"eval_samples_per_second": 130.924, |
|
"eval_steps_per_second": 32.731, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.02626065632331502, |
|
"grad_norm": 6.833707809448242, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0071, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.026453749384515866, |
|
"grad_norm": 8.814143180847168, |
|
"learning_rate": 0.0001, |
|
"loss": 9.8842, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.026646842445716714, |
|
"grad_norm": 10.0423583984375, |
|
"learning_rate": 0.0001, |
|
"loss": 10.6022, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.02683993550691756, |
|
"grad_norm": 11.296242713928223, |
|
"learning_rate": 0.0001, |
|
"loss": 11.2607, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.027033028568118404, |
|
"grad_norm": 21.390296936035156, |
|
"learning_rate": 0.0001, |
|
"loss": 12.9834, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.027226121629319252, |
|
"grad_norm": 8.968457221984863, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0666, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.027419214690520097, |
|
"grad_norm": 11.966334342956543, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0537, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.02761230775172094, |
|
"grad_norm": 13.229506492614746, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3221, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.027805400812921786, |
|
"grad_norm": 20.169370651245117, |
|
"learning_rate": 0.0001, |
|
"loss": 11.0951, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.027998493874122635, |
|
"grad_norm": 43.70033264160156, |
|
"learning_rate": 0.0001, |
|
"loss": 13.0941, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.02819158693532348, |
|
"grad_norm": 9.215312957763672, |
|
"learning_rate": 0.0001, |
|
"loss": 10.0646, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.028384679996524324, |
|
"grad_norm": 20.890764236450195, |
|
"learning_rate": 0.0001, |
|
"loss": 9.7248, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.02857777305772517, |
|
"grad_norm": 12.14108943939209, |
|
"learning_rate": 0.0001, |
|
"loss": 10.3096, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.028770866118926017, |
|
"grad_norm": 15.33022689819336, |
|
"learning_rate": 0.0001, |
|
"loss": 11.5878, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.028963959180126862, |
|
"grad_norm": 29.498929977416992, |
|
"learning_rate": 0.0001, |
|
"loss": 13.2048, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.028963959180126862, |
|
"eval_loss": 2.7453110218048096, |
|
"eval_runtime": 336.37, |
|
"eval_samples_per_second": 129.655, |
|
"eval_steps_per_second": 32.414, |
|
"step": 1500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.0944605381853184e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|