{
  "best_metric": 2.7592430114746094,
  "best_model_checkpoint": "miner_id_24/checkpoint-1350",
  "epoch": 0.028963959180126862,
  "eval_steps": 150,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.9309306120084574e-05,
      "eval_loss": 3.049531936645508,
      "eval_runtime": 325.6693,
      "eval_samples_per_second": 133.915,
      "eval_steps_per_second": 33.479,
      "step": 1
    },
    {
      "epoch": 0.00019309306120084574,
      "grad_norm": 17.593538284301758,
      "learning_rate": 0.0001,
      "loss": 10.3405,
      "step": 10
    },
    {
      "epoch": 0.0003861861224016915,
      "grad_norm": 25.346281051635742,
      "learning_rate": 0.0001,
      "loss": 10.4921,
      "step": 20
    },
    {
      "epoch": 0.0005792791836025373,
      "grad_norm": 29.28500747680664,
      "learning_rate": 0.0001,
      "loss": 11.2535,
      "step": 30
    },
    {
      "epoch": 0.000772372244803383,
      "grad_norm": 25.79044532775879,
      "learning_rate": 0.0001,
      "loss": 12.771,
      "step": 40
    },
    {
      "epoch": 0.0009654653060042288,
      "grad_norm": 29.85230827331543,
      "learning_rate": 0.0001,
      "loss": 15.3004,
      "step": 50
    },
    {
      "epoch": 0.0011585583672050746,
      "grad_norm": 18.505237579345703,
      "learning_rate": 0.0001,
      "loss": 10.1685,
      "step": 60
    },
    {
      "epoch": 0.0013516514284059203,
      "grad_norm": 20.607799530029297,
      "learning_rate": 0.0001,
      "loss": 10.0184,
      "step": 70
    },
    {
      "epoch": 0.001544744489606766,
      "grad_norm": 28.849403381347656,
      "learning_rate": 0.0001,
      "loss": 10.9882,
      "step": 80
    },
    {
      "epoch": 0.0017378375508076116,
      "grad_norm": 22.832693099975586,
      "learning_rate": 0.0001,
      "loss": 12.2328,
      "step": 90
    },
    {
      "epoch": 0.0019309306120084576,
      "grad_norm": 36.24290084838867,
      "learning_rate": 0.0001,
      "loss": 15.052,
      "step": 100
    },
    {
      "epoch": 0.002124023673209303,
      "grad_norm": 26.353515625,
      "learning_rate": 0.0001,
      "loss": 9.8579,
      "step": 110
    },
    {
      "epoch": 0.002317116734410149,
      "grad_norm": 36.15186309814453,
      "learning_rate": 0.0001,
      "loss": 10.1559,
      "step": 120
    },
    {
      "epoch": 0.002510209795610995,
      "grad_norm": 25.59376335144043,
      "learning_rate": 0.0001,
      "loss": 10.5048,
      "step": 130
    },
    {
      "epoch": 0.0027033028568118405,
      "grad_norm": 45.6728630065918,
      "learning_rate": 0.0001,
      "loss": 11.7695,
      "step": 140
    },
    {
      "epoch": 0.0028963959180126862,
      "grad_norm": 67.35182189941406,
      "learning_rate": 0.0001,
      "loss": 14.9163,
      "step": 150
    },
    {
      "epoch": 0.0028963959180126862,
      "eval_loss": 2.915919542312622,
      "eval_runtime": 325.4265,
      "eval_samples_per_second": 134.015,
      "eval_steps_per_second": 33.504,
      "step": 150
    },
    {
      "epoch": 0.003089488979213532,
      "grad_norm": 23.811389923095703,
      "learning_rate": 0.0001,
      "loss": 10.1832,
      "step": 160
    },
    {
      "epoch": 0.0032825820404143776,
      "grad_norm": 33.760498046875,
      "learning_rate": 0.0001,
      "loss": 10.6903,
      "step": 170
    },
    {
      "epoch": 0.0034756751016152233,
      "grad_norm": 36.039459228515625,
      "learning_rate": 0.0001,
      "loss": 10.9393,
      "step": 180
    },
    {
      "epoch": 0.003668768162816069,
      "grad_norm": 47.975990295410156,
      "learning_rate": 0.0001,
      "loss": 12.2215,
      "step": 190
    },
    {
      "epoch": 0.003861861224016915,
      "grad_norm": 44.83605194091797,
      "learning_rate": 0.0001,
      "loss": 14.0036,
      "step": 200
    },
    {
      "epoch": 0.00405495428521776,
      "grad_norm": 27.559823989868164,
      "learning_rate": 0.0001,
      "loss": 10.8778,
      "step": 210
    },
    {
      "epoch": 0.004248047346418606,
      "grad_norm": 29.685842514038086,
      "learning_rate": 0.0001,
      "loss": 10.3398,
      "step": 220
    },
    {
      "epoch": 0.004441140407619453,
      "grad_norm": 32.39301681518555,
      "learning_rate": 0.0001,
      "loss": 10.8218,
      "step": 230
    },
    {
      "epoch": 0.004634233468820298,
      "grad_norm": 26.63850212097168,
      "learning_rate": 0.0001,
      "loss": 11.7435,
      "step": 240
    },
    {
      "epoch": 0.004827326530021144,
      "grad_norm": 42.46340560913086,
      "learning_rate": 0.0001,
      "loss": 13.7549,
      "step": 250
    },
    {
      "epoch": 0.00502041959122199,
      "grad_norm": 41.16633605957031,
      "learning_rate": 0.0001,
      "loss": 10.0357,
      "step": 260
    },
    {
      "epoch": 0.005213512652422835,
      "grad_norm": 29.798715591430664,
      "learning_rate": 0.0001,
      "loss": 9.9566,
      "step": 270
    },
    {
      "epoch": 0.005406605713623681,
      "grad_norm": 26.712194442749023,
      "learning_rate": 0.0001,
      "loss": 10.8858,
      "step": 280
    },
    {
      "epoch": 0.005599698774824527,
      "grad_norm": 24.423017501831055,
      "learning_rate": 0.0001,
      "loss": 12.0604,
      "step": 290
    },
    {
      "epoch": 0.0057927918360253725,
      "grad_norm": 32.55687713623047,
      "learning_rate": 0.0001,
      "loss": 13.7669,
      "step": 300
    },
    {
      "epoch": 0.0057927918360253725,
      "eval_loss": 2.8850886821746826,
      "eval_runtime": 326.0923,
      "eval_samples_per_second": 133.741,
      "eval_steps_per_second": 33.435,
      "step": 300
    },
    {
      "epoch": 0.005985884897226218,
      "grad_norm": 17.198266983032227,
      "learning_rate": 0.0001,
      "loss": 10.4193,
      "step": 310
    },
    {
      "epoch": 0.006178977958427064,
      "grad_norm": 18.960155487060547,
      "learning_rate": 0.0001,
      "loss": 10.4948,
      "step": 320
    },
    {
      "epoch": 0.0063720710196279095,
      "grad_norm": 15.217443466186523,
      "learning_rate": 0.0001,
      "loss": 10.8575,
      "step": 330
    },
    {
      "epoch": 0.006565164080828755,
      "grad_norm": 17.301410675048828,
      "learning_rate": 0.0001,
      "loss": 11.7949,
      "step": 340
    },
    {
      "epoch": 0.006758257142029601,
      "grad_norm": 44.01465606689453,
      "learning_rate": 0.0001,
      "loss": 14.4235,
      "step": 350
    },
    {
      "epoch": 0.006951350203230447,
      "grad_norm": 18.623449325561523,
      "learning_rate": 0.0001,
      "loss": 10.6294,
      "step": 360
    },
    {
      "epoch": 0.007144443264431292,
      "grad_norm": 15.859786987304688,
      "learning_rate": 0.0001,
      "loss": 10.3981,
      "step": 370
    },
    {
      "epoch": 0.007337536325632138,
      "grad_norm": 15.925846099853516,
      "learning_rate": 0.0001,
      "loss": 10.6106,
      "step": 380
    },
    {
      "epoch": 0.0075306293868329845,
      "grad_norm": 23.462480545043945,
      "learning_rate": 0.0001,
      "loss": 11.4959,
      "step": 390
    },
    {
      "epoch": 0.00772372244803383,
      "grad_norm": 34.727569580078125,
      "learning_rate": 0.0001,
      "loss": 13.6495,
      "step": 400
    },
    {
      "epoch": 0.007916815509234676,
      "grad_norm": 13.482560157775879,
      "learning_rate": 0.0001,
      "loss": 9.9286,
      "step": 410
    },
    {
      "epoch": 0.00810990857043552,
      "grad_norm": 19.61520004272461,
      "learning_rate": 0.0001,
      "loss": 10.4189,
      "step": 420
    },
    {
      "epoch": 0.008303001631636367,
      "grad_norm": 20.92690658569336,
      "learning_rate": 0.0001,
      "loss": 10.7233,
      "step": 430
    },
    {
      "epoch": 0.008496094692837212,
      "grad_norm": 40.37583541870117,
      "learning_rate": 0.0001,
      "loss": 11.906,
      "step": 440
    },
    {
      "epoch": 0.008689187754038059,
      "grad_norm": 54.6865234375,
      "learning_rate": 0.0001,
      "loss": 14.2049,
      "step": 450
    },
    {
      "epoch": 0.008689187754038059,
      "eval_loss": 2.856499433517456,
      "eval_runtime": 326.449,
      "eval_samples_per_second": 133.595,
      "eval_steps_per_second": 33.399,
      "step": 450
    },
    {
      "epoch": 0.008882280815238905,
      "grad_norm": 31.493087768554688,
      "learning_rate": 0.0001,
      "loss": 10.422,
      "step": 460
    },
    {
      "epoch": 0.00907537387643975,
      "grad_norm": 25.92237091064453,
      "learning_rate": 0.0001,
      "loss": 10.5121,
      "step": 470
    },
    {
      "epoch": 0.009268466937640597,
      "grad_norm": 23.767501831054688,
      "learning_rate": 0.0001,
      "loss": 10.6667,
      "step": 480
    },
    {
      "epoch": 0.009461559998841441,
      "grad_norm": 29.84052848815918,
      "learning_rate": 0.0001,
      "loss": 11.8893,
      "step": 490
    },
    {
      "epoch": 0.009654653060042288,
      "grad_norm": 42.1278190612793,
      "learning_rate": 0.0001,
      "loss": 13.8487,
      "step": 500
    },
    {
      "epoch": 0.009847746121243133,
      "grad_norm": 18.233877182006836,
      "learning_rate": 0.0001,
      "loss": 10.4124,
      "step": 510
    },
    {
      "epoch": 0.01004083918244398,
      "grad_norm": 24.33700180053711,
      "learning_rate": 0.0001,
      "loss": 10.3656,
      "step": 520
    },
    {
      "epoch": 0.010233932243644824,
      "grad_norm": 21.17763900756836,
      "learning_rate": 0.0001,
      "loss": 10.4647,
      "step": 530
    },
    {
      "epoch": 0.01042702530484567,
      "grad_norm": 47.72334289550781,
      "learning_rate": 0.0001,
      "loss": 11.6587,
      "step": 540
    },
    {
      "epoch": 0.010620118366046516,
      "grad_norm": 32.690364837646484,
      "learning_rate": 0.0001,
      "loss": 13.7886,
      "step": 550
    },
    {
      "epoch": 0.010813211427247362,
      "grad_norm": 14.042930603027344,
      "learning_rate": 0.0001,
      "loss": 10.1801,
      "step": 560
    },
    {
      "epoch": 0.011006304488448207,
      "grad_norm": 13.466276168823242,
      "learning_rate": 0.0001,
      "loss": 9.7874,
      "step": 570
    },
    {
      "epoch": 0.011199397549649054,
      "grad_norm": 13.901522636413574,
      "learning_rate": 0.0001,
      "loss": 10.7971,
      "step": 580
    },
    {
      "epoch": 0.011392490610849898,
      "grad_norm": 15.882288932800293,
      "learning_rate": 0.0001,
      "loss": 11.6839,
      "step": 590
    },
    {
      "epoch": 0.011585583672050745,
      "grad_norm": 31.395444869995117,
      "learning_rate": 0.0001,
      "loss": 13.7779,
      "step": 600
    },
    {
      "epoch": 0.011585583672050745,
      "eval_loss": 2.848609447479248,
      "eval_runtime": 328.4742,
      "eval_samples_per_second": 132.771,
      "eval_steps_per_second": 33.193,
      "step": 600
    },
    {
      "epoch": 0.011778676733251591,
      "grad_norm": 16.243173599243164,
      "learning_rate": 0.0001,
      "loss": 10.3243,
      "step": 610
    },
    {
      "epoch": 0.011971769794452436,
      "grad_norm": 10.895001411437988,
      "learning_rate": 0.0001,
      "loss": 10.1884,
      "step": 620
    },
    {
      "epoch": 0.012164862855653283,
      "grad_norm": 12.364214897155762,
      "learning_rate": 0.0001,
      "loss": 10.5667,
      "step": 630
    },
    {
      "epoch": 0.012357955916854128,
      "grad_norm": 14.893481254577637,
      "learning_rate": 0.0001,
      "loss": 11.7055,
      "step": 640
    },
    {
      "epoch": 0.012551048978054974,
      "grad_norm": 31.42432403564453,
      "learning_rate": 0.0001,
      "loss": 13.6669,
      "step": 650
    },
    {
      "epoch": 0.012744142039255819,
      "grad_norm": 11.641709327697754,
      "learning_rate": 0.0001,
      "loss": 10.4052,
      "step": 660
    },
    {
      "epoch": 0.012937235100456666,
      "grad_norm": 12.934191703796387,
      "learning_rate": 0.0001,
      "loss": 10.1611,
      "step": 670
    },
    {
      "epoch": 0.01313032816165751,
      "grad_norm": 28.218975067138672,
      "learning_rate": 0.0001,
      "loss": 10.7762,
      "step": 680
    },
    {
      "epoch": 0.013323421222858357,
      "grad_norm": 16.593141555786133,
      "learning_rate": 0.0001,
      "loss": 11.8962,
      "step": 690
    },
    {
      "epoch": 0.013516514284059202,
      "grad_norm": 23.61168098449707,
      "learning_rate": 0.0001,
      "loss": 13.6789,
      "step": 700
    },
    {
      "epoch": 0.013709607345260048,
      "grad_norm": 11.673768043518066,
      "learning_rate": 0.0001,
      "loss": 10.1343,
      "step": 710
    },
    {
      "epoch": 0.013902700406460893,
      "grad_norm": 13.041866302490234,
      "learning_rate": 0.0001,
      "loss": 10.4441,
      "step": 720
    },
    {
      "epoch": 0.01409579346766174,
      "grad_norm": 15.724223136901855,
      "learning_rate": 0.0001,
      "loss": 10.759,
      "step": 730
    },
    {
      "epoch": 0.014288886528862585,
      "grad_norm": 15.264642715454102,
      "learning_rate": 0.0001,
      "loss": 11.8888,
      "step": 740
    },
    {
      "epoch": 0.014481979590063431,
      "grad_norm": 37.832698822021484,
      "learning_rate": 0.0001,
      "loss": 13.6694,
      "step": 750
    },
    {
      "epoch": 0.014481979590063431,
      "eval_loss": 2.8133044242858887,
      "eval_runtime": 330.5652,
      "eval_samples_per_second": 131.932,
      "eval_steps_per_second": 32.983,
      "step": 750
    },
    {
      "epoch": 0.014675072651264276,
      "grad_norm": 13.753046035766602,
      "learning_rate": 0.0001,
      "loss": 10.2511,
      "step": 760
    },
    {
      "epoch": 0.014868165712465123,
      "grad_norm": 12.730433464050293,
      "learning_rate": 0.0001,
      "loss": 10.2344,
      "step": 770
    },
    {
      "epoch": 0.015061258773665969,
      "grad_norm": 18.035993576049805,
      "learning_rate": 0.0001,
      "loss": 10.7321,
      "step": 780
    },
    {
      "epoch": 0.015254351834866814,
      "grad_norm": 20.784934997558594,
      "learning_rate": 0.0001,
      "loss": 11.6791,
      "step": 790
    },
    {
      "epoch": 0.01544744489606766,
      "grad_norm": 29.870254516601562,
      "learning_rate": 0.0001,
      "loss": 13.6885,
      "step": 800
    },
    {
      "epoch": 0.015640537957268507,
      "grad_norm": 16.11637306213379,
      "learning_rate": 0.0001,
      "loss": 10.3119,
      "step": 810
    },
    {
      "epoch": 0.015833631018469352,
      "grad_norm": 12.562838554382324,
      "learning_rate": 0.0001,
      "loss": 10.0703,
      "step": 820
    },
    {
      "epoch": 0.016026724079670197,
      "grad_norm": 11.796956062316895,
      "learning_rate": 0.0001,
      "loss": 10.7967,
      "step": 830
    },
    {
      "epoch": 0.01621981714087104,
      "grad_norm": 18.159549713134766,
      "learning_rate": 0.0001,
      "loss": 11.2847,
      "step": 840
    },
    {
      "epoch": 0.01641291020207189,
      "grad_norm": 22.098344802856445,
      "learning_rate": 0.0001,
      "loss": 13.9348,
      "step": 850
    },
    {
      "epoch": 0.016606003263272735,
      "grad_norm": 10.987359046936035,
      "learning_rate": 0.0001,
      "loss": 10.1845,
      "step": 860
    },
    {
      "epoch": 0.01679909632447358,
      "grad_norm": 22.796899795532227,
      "learning_rate": 0.0001,
      "loss": 10.2937,
      "step": 870
    },
    {
      "epoch": 0.016992189385674424,
      "grad_norm": 14.24060344696045,
      "learning_rate": 0.0001,
      "loss": 10.6819,
      "step": 880
    },
    {
      "epoch": 0.017185282446875273,
      "grad_norm": 14.327189445495605,
      "learning_rate": 0.0001,
      "loss": 11.411,
      "step": 890
    },
    {
      "epoch": 0.017378375508076117,
      "grad_norm": 25.815399169921875,
      "learning_rate": 0.0001,
      "loss": 13.5363,
      "step": 900
    },
    {
      "epoch": 0.017378375508076117,
      "eval_loss": 2.7856554985046387,
      "eval_runtime": 328.0056,
      "eval_samples_per_second": 132.961,
      "eval_steps_per_second": 33.24,
      "step": 900
    },
    {
      "epoch": 0.017571468569276962,
      "grad_norm": 14.01270580291748,
      "learning_rate": 0.0001,
      "loss": 10.0269,
      "step": 910
    },
    {
      "epoch": 0.01776456163047781,
      "grad_norm": 12.662365913391113,
      "learning_rate": 0.0001,
      "loss": 10.0288,
      "step": 920
    },
    {
      "epoch": 0.017957654691678655,
      "grad_norm": 11.21884822845459,
      "learning_rate": 0.0001,
      "loss": 10.5588,
      "step": 930
    },
    {
      "epoch": 0.0181507477528795,
      "grad_norm": 14.193058013916016,
      "learning_rate": 0.0001,
      "loss": 11.3954,
      "step": 940
    },
    {
      "epoch": 0.018343840814080345,
      "grad_norm": 33.61137008666992,
      "learning_rate": 0.0001,
      "loss": 13.412,
      "step": 950
    },
    {
      "epoch": 0.018536933875281193,
      "grad_norm": 8.29185962677002,
      "learning_rate": 0.0001,
      "loss": 10.1405,
      "step": 960
    },
    {
      "epoch": 0.018730026936482038,
      "grad_norm": 9.83952522277832,
      "learning_rate": 0.0001,
      "loss": 9.9334,
      "step": 970
    },
    {
      "epoch": 0.018923119997682883,
      "grad_norm": 13.321242332458496,
      "learning_rate": 0.0001,
      "loss": 10.2569,
      "step": 980
    },
    {
      "epoch": 0.019116213058883728,
      "grad_norm": 11.480195045471191,
      "learning_rate": 0.0001,
      "loss": 11.5664,
      "step": 990
    },
    {
      "epoch": 0.019309306120084576,
      "grad_norm": 31.19932746887207,
      "learning_rate": 0.0001,
      "loss": 13.4732,
      "step": 1000
    },
    {
      "epoch": 0.01950239918128542,
      "grad_norm": 8.18012809753418,
      "learning_rate": 0.0001,
      "loss": 9.6682,
      "step": 1010
    },
    {
      "epoch": 0.019695492242486266,
      "grad_norm": 13.29211139678955,
      "learning_rate": 0.0001,
      "loss": 9.7695,
      "step": 1020
    },
    {
      "epoch": 0.01988858530368711,
      "grad_norm": 10.953620910644531,
      "learning_rate": 0.0001,
      "loss": 10.6678,
      "step": 1030
    },
    {
      "epoch": 0.02008167836488796,
      "grad_norm": 13.800012588500977,
      "learning_rate": 0.0001,
      "loss": 11.9463,
      "step": 1040
    },
    {
      "epoch": 0.020274771426088804,
      "grad_norm": 38.13134765625,
      "learning_rate": 0.0001,
      "loss": 13.1925,
      "step": 1050
    },
    {
      "epoch": 0.020274771426088804,
      "eval_loss": 2.779891014099121,
      "eval_runtime": 331.3018,
      "eval_samples_per_second": 131.638,
      "eval_steps_per_second": 32.91,
      "step": 1050
    },
    {
      "epoch": 0.02046786448728965,
      "grad_norm": 9.869668006896973,
      "learning_rate": 0.0001,
      "loss": 10.3068,
      "step": 1060
    },
    {
      "epoch": 0.020660957548490497,
      "grad_norm": 16.215045928955078,
      "learning_rate": 0.0001,
      "loss": 10.0212,
      "step": 1070
    },
    {
      "epoch": 0.02085405060969134,
      "grad_norm": 13.320286750793457,
      "learning_rate": 0.0001,
      "loss": 10.9438,
      "step": 1080
    },
    {
      "epoch": 0.021047143670892186,
      "grad_norm": 17.76883888244629,
      "learning_rate": 0.0001,
      "loss": 11.2812,
      "step": 1090
    },
    {
      "epoch": 0.02124023673209303,
      "grad_norm": 34.5752067565918,
      "learning_rate": 0.0001,
      "loss": 13.2006,
      "step": 1100
    },
    {
      "epoch": 0.02143332979329388,
      "grad_norm": 13.33773422241211,
      "learning_rate": 0.0001,
      "loss": 10.145,
      "step": 1110
    },
    {
      "epoch": 0.021626422854494724,
      "grad_norm": 13.584674835205078,
      "learning_rate": 0.0001,
      "loss": 9.9768,
      "step": 1120
    },
    {
      "epoch": 0.02181951591569557,
      "grad_norm": 16.620107650756836,
      "learning_rate": 0.0001,
      "loss": 10.4809,
      "step": 1130
    },
    {
      "epoch": 0.022012608976896414,
      "grad_norm": 20.761350631713867,
      "learning_rate": 0.0001,
      "loss": 11.4802,
      "step": 1140
    },
    {
      "epoch": 0.022205702038097262,
      "grad_norm": 42.86295700073242,
      "learning_rate": 0.0001,
      "loss": 13.3743,
      "step": 1150
    },
    {
      "epoch": 0.022398795099298107,
      "grad_norm": 14.914039611816406,
      "learning_rate": 0.0001,
      "loss": 9.9647,
      "step": 1160
    },
    {
      "epoch": 0.022591888160498952,
      "grad_norm": 18.681671142578125,
      "learning_rate": 0.0001,
      "loss": 9.6733,
      "step": 1170
    },
    {
      "epoch": 0.022784981221699797,
      "grad_norm": 16.26973533630371,
      "learning_rate": 0.0001,
      "loss": 10.5368,
      "step": 1180
    },
    {
      "epoch": 0.022978074282900645,
      "grad_norm": 18.916370391845703,
      "learning_rate": 0.0001,
      "loss": 11.4679,
      "step": 1190
    },
    {
      "epoch": 0.02317116734410149,
      "grad_norm": 43.991939544677734,
      "learning_rate": 0.0001,
      "loss": 13.6126,
      "step": 1200
    },
    {
      "epoch": 0.02317116734410149,
      "eval_loss": 2.76770281791687,
      "eval_runtime": 324.3376,
      "eval_samples_per_second": 134.465,
      "eval_steps_per_second": 33.616,
      "step": 1200
    },
    {
      "epoch": 0.023364260405302335,
      "grad_norm": 19.019067764282227,
      "learning_rate": 0.0001,
      "loss": 9.9333,
      "step": 1210
    },
    {
      "epoch": 0.023557353466503183,
      "grad_norm": 14.600743293762207,
      "learning_rate": 0.0001,
      "loss": 10.1374,
      "step": 1220
    },
    {
      "epoch": 0.023750446527704028,
      "grad_norm": 16.908267974853516,
      "learning_rate": 0.0001,
      "loss": 10.7748,
      "step": 1230
    },
    {
      "epoch": 0.023943539588904873,
      "grad_norm": 25.043933868408203,
      "learning_rate": 0.0001,
      "loss": 11.5941,
      "step": 1240
    },
    {
      "epoch": 0.024136632650105717,
      "grad_norm": 31.190759658813477,
      "learning_rate": 0.0001,
      "loss": 13.5434,
      "step": 1250
    },
    {
      "epoch": 0.024329725711306566,
      "grad_norm": 11.28165054321289,
      "learning_rate": 0.0001,
      "loss": 9.8626,
      "step": 1260
    },
    {
      "epoch": 0.02452281877250741,
      "grad_norm": 12.803401947021484,
      "learning_rate": 0.0001,
      "loss": 9.9016,
      "step": 1270
    },
    {
      "epoch": 0.024715911833708255,
      "grad_norm": 10.985638618469238,
      "learning_rate": 0.0001,
      "loss": 10.5182,
      "step": 1280
    },
    {
      "epoch": 0.0249090048949091,
      "grad_norm": 14.558553695678711,
      "learning_rate": 0.0001,
      "loss": 11.4006,
      "step": 1290
    },
    {
      "epoch": 0.02510209795610995,
      "grad_norm": 28.10079002380371,
      "learning_rate": 0.0001,
      "loss": 13.2948,
      "step": 1300
    },
    {
      "epoch": 0.025295191017310793,
      "grad_norm": 10.139495849609375,
      "learning_rate": 0.0001,
      "loss": 9.7414,
      "step": 1310
    },
    {
      "epoch": 0.025488284078511638,
      "grad_norm": 12.202920913696289,
      "learning_rate": 0.0001,
      "loss": 9.915,
      "step": 1320
    },
    {
      "epoch": 0.025681377139712483,
      "grad_norm": 14.947649002075195,
      "learning_rate": 0.0001,
      "loss": 10.6419,
      "step": 1330
    },
    {
      "epoch": 0.02587447020091333,
      "grad_norm": 16.732439041137695,
      "learning_rate": 0.0001,
      "loss": 11.9169,
      "step": 1340
    },
    {
      "epoch": 0.026067563262114176,
      "grad_norm": 32.5897216796875,
      "learning_rate": 0.0001,
      "loss": 13.0356,
      "step": 1350
    },
    {
      "epoch": 0.026067563262114176,
      "eval_loss": 2.7592430114746094,
      "eval_runtime": 326.8015,
      "eval_samples_per_second": 133.451,
      "eval_steps_per_second": 33.363,
      "step": 1350
    },
    {
      "epoch": 0.02626065632331502,
      "grad_norm": 10.175602912902832,
      "learning_rate": 0.0001,
      "loss": 10.0645,
      "step": 1360
    },
    {
      "epoch": 0.026453749384515866,
      "grad_norm": 13.354422569274902,
      "learning_rate": 0.0001,
      "loss": 9.9626,
      "step": 1370
    },
    {
      "epoch": 0.026646842445716714,
      "grad_norm": 14.017874717712402,
      "learning_rate": 0.0001,
      "loss": 10.6526,
      "step": 1380
    },
    {
      "epoch": 0.02683993550691756,
      "grad_norm": 15.533336639404297,
      "learning_rate": 0.0001,
      "loss": 11.3396,
      "step": 1390
    },
    {
      "epoch": 0.027033028568118404,
      "grad_norm": 21.672401428222656,
      "learning_rate": 0.0001,
      "loss": 13.0943,
      "step": 1400
    },
    {
      "epoch": 0.027226121629319252,
      "grad_norm": 11.563956260681152,
      "learning_rate": 0.0001,
      "loss": 10.1492,
      "step": 1410
    },
    {
      "epoch": 0.027419214690520097,
      "grad_norm": 9.616212844848633,
      "learning_rate": 0.0001,
      "loss": 10.1169,
      "step": 1420
    },
    {
      "epoch": 0.02761230775172094,
      "grad_norm": 14.188048362731934,
      "learning_rate": 0.0001,
      "loss": 10.3995,
      "step": 1430
    },
    {
      "epoch": 0.027805400812921786,
      "grad_norm": 13.804783821105957,
      "learning_rate": 0.0001,
      "loss": 11.1481,
      "step": 1440
    },
    {
      "epoch": 0.027998493874122635,
      "grad_norm": 23.23021125793457,
      "learning_rate": 0.0001,
      "loss": 13.1522,
      "step": 1450
    },
    {
      "epoch": 0.02819158693532348,
      "grad_norm": 7.562139511108398,
      "learning_rate": 0.0001,
      "loss": 10.1138,
      "step": 1460
    },
    {
      "epoch": 0.028384679996524324,
      "grad_norm": 26.057680130004883,
      "learning_rate": 0.0001,
      "loss": 9.7856,
      "step": 1470
    },
    {
      "epoch": 0.02857777305772517,
      "grad_norm": 12.248312950134277,
      "learning_rate": 0.0001,
      "loss": 10.4052,
      "step": 1480
    },
    {
      "epoch": 0.028770866118926017,
      "grad_norm": 14.60325813293457,
      "learning_rate": 0.0001,
      "loss": 11.6516,
      "step": 1490
    },
    {
      "epoch": 0.028963959180126862,
      "grad_norm": 52.02046585083008,
      "learning_rate": 0.0001,
      "loss": 13.2193,
      "step": 1500
    },
    {
      "epoch": 0.028963959180126862,
      "eval_loss": 2.770695209503174,
      "eval_runtime": 330.0972,
      "eval_samples_per_second": 132.119,
      "eval_steps_per_second": 33.03,
      "step": 1500
    }
  ],
  "logging_steps": 10,
  "max_steps": 1500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 150,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 1
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.0944605381853184e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}