|
{ |
|
"best_metric": 0.8746353387832642, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.23889154323936931, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00047778308647873863, |
|
"eval_loss": 4.255342483520508, |
|
"eval_runtime": 12.2428, |
|
"eval_samples_per_second": 72.043, |
|
"eval_steps_per_second": 18.051, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004777830864787387, |
|
"grad_norm": 93.89539337158203, |
|
"learning_rate": 4.2600000000000005e-05, |
|
"loss": 6.865, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009555661729574774, |
|
"grad_norm": 192.6894073486328, |
|
"learning_rate": 8.520000000000001e-05, |
|
"loss": 4.7086, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01433349259436216, |
|
"grad_norm": 185.80467224121094, |
|
"learning_rate": 0.0001278, |
|
"loss": 3.7399, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019111323459149548, |
|
"grad_norm": 120.97589874267578, |
|
"learning_rate": 0.00017040000000000002, |
|
"loss": 3.3705, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.023889154323936932, |
|
"grad_norm": 87.03988647460938, |
|
"learning_rate": 0.000213, |
|
"loss": 3.1617, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.023889154323936932, |
|
"eval_loss": 1.551166296005249, |
|
"eval_runtime": 12.1727, |
|
"eval_samples_per_second": 72.457, |
|
"eval_steps_per_second": 18.155, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02866698518872432, |
|
"grad_norm": 99.68428039550781, |
|
"learning_rate": 0.00021274057135267128, |
|
"loss": 2.759, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.033444816053511704, |
|
"grad_norm": 31.421096801757812, |
|
"learning_rate": 0.00021196354932097723, |
|
"loss": 3.1152, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.038222646918299095, |
|
"grad_norm": 17.666555404663086, |
|
"learning_rate": 0.0002106727194781503, |
|
"loss": 2.3276, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04300047778308648, |
|
"grad_norm": 21.751888275146484, |
|
"learning_rate": 0.00020887437061743096, |
|
"loss": 2.3262, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.047778308647873864, |
|
"grad_norm": 14.296065330505371, |
|
"learning_rate": 0.00020657726411369925, |
|
"loss": 2.4236, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.047778308647873864, |
|
"eval_loss": 1.1035537719726562, |
|
"eval_runtime": 12.2578, |
|
"eval_samples_per_second": 71.954, |
|
"eval_steps_per_second": 18.029, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05255613951266125, |
|
"grad_norm": 31.222625732421875, |
|
"learning_rate": 0.000203792591238937, |
|
"loss": 1.611, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05733397037744864, |
|
"grad_norm": 108.65278625488281, |
|
"learning_rate": 0.0002005339186394757, |
|
"loss": 2.0965, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.062111801242236024, |
|
"grad_norm": 33.067230224609375, |
|
"learning_rate": 0.00019681712224065936, |
|
"loss": 2.2948, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06688963210702341, |
|
"grad_norm": 33.02948760986328, |
|
"learning_rate": 0.0001926603099009319, |
|
"loss": 2.8421, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0716674629718108, |
|
"grad_norm": 15.799519538879395, |
|
"learning_rate": 0.00018808373319217114, |
|
"loss": 2.8939, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0716674629718108, |
|
"eval_loss": 1.215044379234314, |
|
"eval_runtime": 12.0789, |
|
"eval_samples_per_second": 73.02, |
|
"eval_steps_per_second": 18.296, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07644529383659819, |
|
"grad_norm": 27.139089584350586, |
|
"learning_rate": 0.00018310968873606635, |
|
"loss": 1.9604, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08122312470138557, |
|
"grad_norm": 37.57059097290039, |
|
"learning_rate": 0.0001777624095772184, |
|
"loss": 1.692, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08600095556617296, |
|
"grad_norm": 101.7945556640625, |
|
"learning_rate": 0.0001720679471221826, |
|
"loss": 3.0551, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09077878643096035, |
|
"grad_norm": 65.89530181884766, |
|
"learning_rate": 0.00016605404421963453, |
|
"loss": 2.2188, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09555661729574773, |
|
"grad_norm": 30.504783630371094, |
|
"learning_rate": 0.00015975, |
|
"loss": 2.4033, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09555661729574773, |
|
"eval_loss": 1.0835368633270264, |
|
"eval_runtime": 12.393, |
|
"eval_samples_per_second": 71.169, |
|
"eval_steps_per_second": 17.833, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10033444816053512, |
|
"grad_norm": 52.85816192626953, |
|
"learning_rate": 0.00015318652713303674, |
|
"loss": 1.599, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1051122790253225, |
|
"grad_norm": 19.591501235961914, |
|
"learning_rate": 0.00014639560219879464, |
|
"loss": 1.7043, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"grad_norm": 81.96508026123047, |
|
"learning_rate": 0.0001394103099009319, |
|
"loss": 2.1468, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11466794075489728, |
|
"grad_norm": 30.87833595275879, |
|
"learning_rate": 0.0001322646818813646, |
|
"loss": 2.161, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11944577161968466, |
|
"grad_norm": 42.904701232910156, |
|
"learning_rate": 0.0001249935309215281, |
|
"loss": 2.5615, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11944577161968466, |
|
"eval_loss": 1.1374989748001099, |
|
"eval_runtime": 12.4118, |
|
"eval_samples_per_second": 71.061, |
|
"eval_steps_per_second": 17.806, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12422360248447205, |
|
"grad_norm": 26.0584716796875, |
|
"learning_rate": 0.0001176322813380051, |
|
"loss": 1.8027, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12900143334925943, |
|
"grad_norm": 19.178482055664062, |
|
"learning_rate": 0.00011021679639881638, |
|
"loss": 1.6998, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13377926421404682, |
|
"grad_norm": 24.075414657592773, |
|
"learning_rate": 0.00010278320360118368, |
|
"loss": 2.3066, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1385570950788342, |
|
"grad_norm": 19.714353561401367, |
|
"learning_rate": 9.536771866199493e-05, |
|
"loss": 2.4782, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1433349259436216, |
|
"grad_norm": 13.794386863708496, |
|
"learning_rate": 8.800646907847192e-05, |
|
"loss": 2.4282, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1433349259436216, |
|
"eval_loss": 1.0520588159561157, |
|
"eval_runtime": 12.5021, |
|
"eval_samples_per_second": 70.548, |
|
"eval_steps_per_second": 17.677, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.148112756808409, |
|
"grad_norm": 67.06897735595703, |
|
"learning_rate": 8.07353181186354e-05, |
|
"loss": 1.7239, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15289058767319638, |
|
"grad_norm": 51.10677719116211, |
|
"learning_rate": 7.35896900990681e-05, |
|
"loss": 1.6737, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15766841853798375, |
|
"grad_norm": 54.609580993652344, |
|
"learning_rate": 6.660439780120536e-05, |
|
"loss": 2.0017, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16244624940277114, |
|
"grad_norm": 39.43727111816406, |
|
"learning_rate": 5.981347286696324e-05, |
|
"loss": 2.1488, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"grad_norm": 30.269664764404297, |
|
"learning_rate": 5.325000000000002e-05, |
|
"loss": 2.3871, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"eval_loss": 0.9656849503517151, |
|
"eval_runtime": 12.1772, |
|
"eval_samples_per_second": 72.43, |
|
"eval_steps_per_second": 18.149, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17200191113234592, |
|
"grad_norm": 30.008037567138672, |
|
"learning_rate": 4.6945955780365475e-05, |
|
"loss": 1.4687, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1767797419971333, |
|
"grad_norm": 30.746850967407227, |
|
"learning_rate": 4.0932052877817393e-05, |
|
"loss": 1.5686, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1815575728619207, |
|
"grad_norm": 29.426549911499023, |
|
"learning_rate": 3.523759042278163e-05, |
|
"loss": 1.6911, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18633540372670807, |
|
"grad_norm": 33.3541145324707, |
|
"learning_rate": 2.989031126393367e-05, |
|
"loss": 2.0111, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19111323459149546, |
|
"grad_norm": 15.817224502563477, |
|
"learning_rate": 2.4916266807828855e-05, |
|
"loss": 2.2564, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19111323459149546, |
|
"eval_loss": 0.9130141735076904, |
|
"eval_runtime": 12.0386, |
|
"eval_samples_per_second": 73.264, |
|
"eval_steps_per_second": 18.358, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19589106545628285, |
|
"grad_norm": 40.40257263183594, |
|
"learning_rate": 2.033969009906811e-05, |
|
"loss": 1.4211, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.20066889632107024, |
|
"grad_norm": 28.919811248779297, |
|
"learning_rate": 1.6182877759340637e-05, |
|
"loss": 1.5233, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.20544672718585763, |
|
"grad_norm": 18.431303024291992, |
|
"learning_rate": 1.2466081360524275e-05, |
|
"loss": 1.7242, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.210224558050645, |
|
"grad_norm": 15.968626022338867, |
|
"learning_rate": 9.207408761062996e-06, |
|
"loss": 1.9384, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.21500238891543239, |
|
"grad_norm": 35.02082061767578, |
|
"learning_rate": 6.422735886300764e-06, |
|
"loss": 2.1484, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21500238891543239, |
|
"eval_loss": 0.8792083859443665, |
|
"eval_runtime": 12.2699, |
|
"eval_samples_per_second": 71.883, |
|
"eval_steps_per_second": 18.011, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 28.86961555480957, |
|
"learning_rate": 4.125629382569038e-06, |
|
"loss": 1.3139, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22455805064500717, |
|
"grad_norm": 38.519046783447266, |
|
"learning_rate": 2.327280521849694e-06, |
|
"loss": 1.4227, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22933588150979456, |
|
"grad_norm": 20.339561462402344, |
|
"learning_rate": 1.0364506790227565e-06, |
|
"loss": 1.6595, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23411371237458195, |
|
"grad_norm": 55.318058013916016, |
|
"learning_rate": 2.5942864732872295e-07, |
|
"loss": 1.9445, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23889154323936931, |
|
"grad_norm": 15.225406646728516, |
|
"learning_rate": 0.0, |
|
"loss": 2.0612, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23889154323936931, |
|
"eval_loss": 0.8746353387832642, |
|
"eval_runtime": 12.2579, |
|
"eval_samples_per_second": 71.953, |
|
"eval_steps_per_second": 18.029, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4657011228672000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|