|
{ |
|
"best_metric": 0.832525908946991, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.23889154323936931, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00047778308647873863, |
|
"eval_loss": 4.254161834716797, |
|
"eval_runtime": 11.5108, |
|
"eval_samples_per_second": 76.624, |
|
"eval_steps_per_second": 19.199, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004777830864787387, |
|
"grad_norm": 86.97021484375, |
|
"learning_rate": 4.34e-05, |
|
"loss": 6.7643, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009555661729574774, |
|
"grad_norm": 114.20069885253906, |
|
"learning_rate": 8.68e-05, |
|
"loss": 4.2519, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01433349259436216, |
|
"grad_norm": 87.37146759033203, |
|
"learning_rate": 0.0001302, |
|
"loss": 3.5983, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019111323459149548, |
|
"grad_norm": 114.74700927734375, |
|
"learning_rate": 0.0001736, |
|
"loss": 3.2439, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.023889154323936932, |
|
"grad_norm": 97.87152099609375, |
|
"learning_rate": 0.000217, |
|
"loss": 2.683, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.023889154323936932, |
|
"eval_loss": 1.3938980102539062, |
|
"eval_runtime": 11.5211, |
|
"eval_samples_per_second": 76.555, |
|
"eval_steps_per_second": 19.182, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02866698518872432, |
|
"grad_norm": 68.76221466064453, |
|
"learning_rate": 0.00021673569945319091, |
|
"loss": 2.163, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.033444816053511704, |
|
"grad_norm": 85.8083267211914, |
|
"learning_rate": 0.00021594408545846038, |
|
"loss": 1.941, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.038222646918299095, |
|
"grad_norm": 49.4107780456543, |
|
"learning_rate": 0.0002146290146796179, |
|
"loss": 2.4934, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04300047778308648, |
|
"grad_norm": 45.38887405395508, |
|
"learning_rate": 0.0002127968940093076, |
|
"loss": 2.7001, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.047778308647873864, |
|
"grad_norm": 114.47595977783203, |
|
"learning_rate": 0.00021045664935527106, |
|
"loss": 2.8237, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.047778308647873864, |
|
"eval_loss": 1.3519835472106934, |
|
"eval_runtime": 11.475, |
|
"eval_samples_per_second": 76.863, |
|
"eval_steps_per_second": 19.259, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05255613951266125, |
|
"grad_norm": 163.51150512695312, |
|
"learning_rate": 0.00020761968215422217, |
|
"loss": 2.9201, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05733397037744864, |
|
"grad_norm": 42.164276123046875, |
|
"learning_rate": 0.00020429981382519356, |
|
"loss": 2.8884, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.062111801242236024, |
|
"grad_norm": 30.880720138549805, |
|
"learning_rate": 0.00020051321843297219, |
|
"loss": 2.5363, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06688963210702341, |
|
"grad_norm": 61.331912994384766, |
|
"learning_rate": 0.0001962783438896818, |
|
"loss": 2.4063, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0716674629718108, |
|
"grad_norm": 70.56568145751953, |
|
"learning_rate": 0.0001916158220784091, |
|
"loss": 2.5402, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0716674629718108, |
|
"eval_loss": 1.248333215713501, |
|
"eval_runtime": 11.4863, |
|
"eval_samples_per_second": 76.787, |
|
"eval_steps_per_second": 19.24, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07644529383659819, |
|
"grad_norm": 33.299007415771484, |
|
"learning_rate": 0.00018654836833674362, |
|
"loss": 1.8457, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08122312470138557, |
|
"grad_norm": 48.84244918823242, |
|
"learning_rate": 0.0001811006707899361, |
|
"loss": 2.259, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08600095556617296, |
|
"grad_norm": 16.119049072265625, |
|
"learning_rate": 0.0001752992700728339, |
|
"loss": 2.4819, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09077878643096035, |
|
"grad_norm": 32.0130500793457, |
|
"learning_rate": 0.00016917243002657602, |
|
"loss": 2.4841, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09555661729574773, |
|
"grad_norm": 70.85590362548828, |
|
"learning_rate": 0.00016275, |
|
"loss": 2.8376, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09555661729574773, |
|
"eval_loss": 1.1818195581436157, |
|
"eval_runtime": 11.5022, |
|
"eval_samples_per_second": 76.681, |
|
"eval_steps_per_second": 19.214, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10033444816053512, |
|
"grad_norm": 50.72034454345703, |
|
"learning_rate": 0.0001560632694266149, |
|
"loss": 1.869, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1051122790253225, |
|
"grad_norm": 42.09200668334961, |
|
"learning_rate": 0.00014914481538562646, |
|
"loss": 1.7881, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"grad_norm": 118.14753723144531, |
|
"learning_rate": 0.0001420283438896818, |
|
"loss": 2.0519, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11466794075489728, |
|
"grad_norm": 24.315568923950195, |
|
"learning_rate": 0.00013474852567256393, |
|
"loss": 2.1679, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11944577161968466, |
|
"grad_norm": 70.42790985107422, |
|
"learning_rate": 0.00012734082727686196, |
|
"loss": 2.4192, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11944577161968466, |
|
"eval_loss": 1.1315124034881592, |
|
"eval_runtime": 11.4436, |
|
"eval_samples_per_second": 77.074, |
|
"eval_steps_per_second": 19.312, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12422360248447205, |
|
"grad_norm": 41.314945220947266, |
|
"learning_rate": 0.0001198413382645404, |
|
"loss": 1.7405, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12900143334925943, |
|
"grad_norm": 21.883790969848633, |
|
"learning_rate": 0.00011228659539222137, |
|
"loss": 1.6183, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13377926421404682, |
|
"grad_norm": 14.944008827209473, |
|
"learning_rate": 0.00010471340460777866, |
|
"loss": 1.7443, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1385570950788342, |
|
"grad_norm": 38.64680099487305, |
|
"learning_rate": 9.715866173545961e-05, |
|
"loss": 2.1661, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1433349259436216, |
|
"grad_norm": 34.455081939697266, |
|
"learning_rate": 8.965917272313806e-05, |
|
"loss": 2.2251, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1433349259436216, |
|
"eval_loss": 0.9748860597610474, |
|
"eval_runtime": 11.386, |
|
"eval_samples_per_second": 77.464, |
|
"eval_steps_per_second": 19.41, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.148112756808409, |
|
"grad_norm": 20.522775650024414, |
|
"learning_rate": 8.225147432743606e-05, |
|
"loss": 1.4992, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15289058767319638, |
|
"grad_norm": 75.11093139648438, |
|
"learning_rate": 7.497165611031821e-05, |
|
"loss": 1.4972, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15766841853798375, |
|
"grad_norm": 37.47844696044922, |
|
"learning_rate": 6.785518461437353e-05, |
|
"loss": 1.8537, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16244624940277114, |
|
"grad_norm": 23.639490127563477, |
|
"learning_rate": 6.093673057338509e-05, |
|
"loss": 2.1694, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"grad_norm": 54.70100021362305, |
|
"learning_rate": 5.4250000000000024e-05, |
|
"loss": 2.3268, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"eval_loss": 0.9435171484947205, |
|
"eval_runtime": 11.5124, |
|
"eval_samples_per_second": 76.613, |
|
"eval_steps_per_second": 19.197, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17200191113234592, |
|
"grad_norm": 34.21720886230469, |
|
"learning_rate": 4.782756997342398e-05, |
|
"loss": 1.3658, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1767797419971333, |
|
"grad_norm": 22.63213348388672, |
|
"learning_rate": 4.170072992716607e-05, |
|
"loss": 1.4596, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1815575728619207, |
|
"grad_norm": 13.130209922790527, |
|
"learning_rate": 3.5899329210063916e-05, |
|
"loss": 1.6314, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18633540372670807, |
|
"grad_norm": 13.130266189575195, |
|
"learning_rate": 3.045163166325637e-05, |
|
"loss": 1.9832, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19111323459149546, |
|
"grad_norm": 28.62923812866211, |
|
"learning_rate": 2.5384177921590895e-05, |
|
"loss": 2.1456, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19111323459149546, |
|
"eval_loss": 0.8851467370986938, |
|
"eval_runtime": 11.5188, |
|
"eval_samples_per_second": 76.57, |
|
"eval_steps_per_second": 19.186, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19589106545628285, |
|
"grad_norm": 12.487257957458496, |
|
"learning_rate": 2.0721656110318213e-05, |
|
"loss": 1.1234, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.20066889632107024, |
|
"grad_norm": 12.503289222717285, |
|
"learning_rate": 1.6486781567027783e-05, |
|
"loss": 1.2566, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.20544672718585763, |
|
"grad_norm": 9.571052551269531, |
|
"learning_rate": 1.2700186174806422e-05, |
|
"loss": 1.6166, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.210224558050645, |
|
"grad_norm": 11.375801086425781, |
|
"learning_rate": 9.380317845777794e-06, |
|
"loss": 1.9494, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.21500238891543239, |
|
"grad_norm": 11.005843162536621, |
|
"learning_rate": 6.543350644728947e-06, |
|
"loss": 2.0351, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21500238891543239, |
|
"eval_loss": 0.8364555239677429, |
|
"eval_runtime": 11.4227, |
|
"eval_samples_per_second": 77.215, |
|
"eval_steps_per_second": 19.347, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 26.848785400390625, |
|
"learning_rate": 4.2031059906924e-06, |
|
"loss": 1.0386, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22455805064500717, |
|
"grad_norm": 13.840141296386719, |
|
"learning_rate": 2.3709853203820825e-06, |
|
"loss": 1.2618, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22933588150979456, |
|
"grad_norm": 37.794471740722656, |
|
"learning_rate": 1.0559145415396157e-06, |
|
"loss": 1.6539, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23411371237458195, |
|
"grad_norm": 38.33562088012695, |
|
"learning_rate": 2.643005468090745e-07, |
|
"loss": 1.7703, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23889154323936931, |
|
"grad_norm": 19.850322723388672, |
|
"learning_rate": 0.0, |
|
"loss": 2.0542, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23889154323936931, |
|
"eval_loss": 0.832525908946991, |
|
"eval_runtime": 11.6134, |
|
"eval_samples_per_second": 75.947, |
|
"eval_steps_per_second": 19.03, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4657011228672000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|