{ "best_metric": 2.7872583866119385, "best_model_checkpoint": "miner_id_24/checkpoint-450", "epoch": 0.4317789291882556, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008635578583765112, "eval_loss": 4.047359466552734, "eval_runtime": 6.7214, "eval_samples_per_second": 72.604, "eval_steps_per_second": 18.151, "step": 1 }, { "epoch": 0.008635578583765112, "grad_norm": 71.40580749511719, "learning_rate": 4.3e-05, "loss": 7.2404, "step": 10 }, { "epoch": 0.017271157167530225, "grad_norm": 61.08443832397461, "learning_rate": 8.6e-05, "loss": 7.6144, "step": 20 }, { "epoch": 0.025906735751295335, "grad_norm": 68.64509582519531, "learning_rate": 0.000129, "loss": 7.7104, "step": 30 }, { "epoch": 0.03454231433506045, "grad_norm": 51.64717102050781, "learning_rate": 0.000172, "loss": 7.3387, "step": 40 }, { "epoch": 0.04317789291882556, "grad_norm": 60.46714782714844, "learning_rate": 0.000215, "loss": 6.4608, "step": 50 }, { "epoch": 0.04317789291882556, "eval_loss": 3.1346349716186523, "eval_runtime": 6.6605, "eval_samples_per_second": 73.268, "eval_steps_per_second": 18.317, "step": 50 }, { "epoch": 0.05181347150259067, "grad_norm": 73.30271911621094, "learning_rate": 0.0002147381354029311, "loss": 6.3668, "step": 60 }, { "epoch": 0.06044905008635579, "grad_norm": 62.13903045654297, "learning_rate": 0.0002139538173897188, "loss": 6.2347, "step": 70 }, { "epoch": 0.0690846286701209, "grad_norm": 66.6341323852539, "learning_rate": 0.0002126508670788841, "loss": 5.7629, "step": 80 }, { "epoch": 0.07772020725388601, "grad_norm": 149.75143432617188, "learning_rate": 0.00021083563231336926, "loss": 6.0598, "step": 90 }, { "epoch": 0.08635578583765112, "grad_norm": 133.63043212890625, "learning_rate": 0.00020851695673448515, "loss": 6.6264, "step": 100 }, { "epoch": 0.08635578583765112, "eval_loss": 3.3024439811706543, "eval_runtime": 6.8401, "eval_samples_per_second": 71.344, "eval_steps_per_second": 17.836, "step": 100 }, { "epoch": 0.09499136442141623, "grad_norm": 265.6074523925781, "learning_rate": 0.00020570613669657956, "loss": 6.845, "step": 110 }, { "epoch": 0.10362694300518134, "grad_norm": 188.36181640625, "learning_rate": 0.00020241686623233464, "loss": 7.5856, "step": 120 }, { "epoch": 0.11226252158894647, "grad_norm": 72.51764678955078, "learning_rate": 0.00019866517033681577, "loss": 6.734, "step": 130 }, { "epoch": 0.12089810017271158, "grad_norm": 188.24923706054688, "learning_rate": 0.00019446932689530684, "loss": 6.4476, "step": 140 }, { "epoch": 0.12953367875647667, "grad_norm": 139.26271057128906, "learning_rate": 0.0001898497776352901, "loss": 6.255, "step": 150 }, { "epoch": 0.12953367875647667, "eval_loss": 3.182236909866333, "eval_runtime": 6.7075, "eval_samples_per_second": 72.754, "eval_steps_per_second": 18.189, "step": 150 }, { "epoch": 0.1381692573402418, "grad_norm": 203.13534545898438, "learning_rate": 0.000184829028536405, "loss": 6.0828, "step": 160 }, { "epoch": 0.14680483592400692, "grad_norm": 112.05760955810547, "learning_rate": 0.00017943154018357726, "loss": 6.1603, "step": 170 }, { "epoch": 0.15544041450777202, "grad_norm": 72.94593048095703, "learning_rate": 0.00017368360859750824, "loss": 6.1336, "step": 180 }, { "epoch": 0.16407599309153714, "grad_norm": 101.9844970703125, "learning_rate": 0.00016761323712310527, "loss": 6.1751, "step": 190 }, { "epoch": 0.17271157167530224, "grad_norm": 106.93225860595703, "learning_rate": 0.00016125, "loss": 6.8375, "step": 200 }, { "epoch": 0.17271157167530224, "eval_loss": 3.004676342010498, "eval_runtime": 6.7213, "eval_samples_per_second": 72.605, "eval_steps_per_second": 18.151, "step": 200 }, { "epoch": 0.18134715025906736, "grad_norm": 68.26008605957031, "learning_rate": 0.0001546248982798258, "loss": 5.7141, "step": 210 }, { "epoch": 0.18998272884283246, "grad_norm": 127.45634460449219, "learning_rate": 0.00014777020879221055, "loss": 5.7091, "step": 220 }, { "epoch": 0.19861830742659758, "grad_norm": 119.57172393798828, "learning_rate": 0.00014071932689530684, "loss": 6.3644, "step": 230 }, { "epoch": 0.20725388601036268, "grad_norm": 101.10028839111328, "learning_rate": 0.00013350660377696428, "loss": 6.5899, "step": 240 }, { "epoch": 0.2158894645941278, "grad_norm": 69.55671691894531, "learning_rate": 0.00012616717909919503, "loss": 6.1797, "step": 250 }, { "epoch": 0.2158894645941278, "eval_loss": 2.9973599910736084, "eval_runtime": 7.7072, "eval_samples_per_second": 63.318, "eval_steps_per_second": 15.829, "step": 250 }, { "epoch": 0.22452504317789293, "grad_norm": 81.67117309570312, "learning_rate": 0.00011873680980127275, "loss": 5.8035, "step": 260 }, { "epoch": 0.23316062176165803, "grad_norm": 60.442134857177734, "learning_rate": 0.00011125169589551887, "loss": 5.969, "step": 270 }, { "epoch": 0.24179620034542315, "grad_norm": 94.2229232788086, "learning_rate": 0.00010374830410448118, "loss": 5.6845, "step": 280 }, { "epoch": 0.2504317789291883, "grad_norm": 301.8155822753906, "learning_rate": 9.626319019872726e-05, "loss": 5.8336, "step": 290 }, { "epoch": 0.25906735751295334, "grad_norm": 53.06270217895508, "learning_rate": 8.883282090080499e-05, "loss": 6.1913, "step": 300 }, { "epoch": 0.25906735751295334, "eval_loss": 2.8754076957702637, "eval_runtime": 7.8782, "eval_samples_per_second": 61.943, "eval_steps_per_second": 15.486, "step": 300 }, { "epoch": 0.26770293609671847, "grad_norm": 85.92414093017578, "learning_rate": 8.149339622303573e-05, "loss": 5.7197, "step": 310 }, { "epoch": 0.2763385146804836, "grad_norm": 98.81600189208984, "learning_rate": 7.428067310469316e-05, "loss": 5.9757, "step": 320 }, { "epoch": 0.2849740932642487, "grad_norm": 83.21562194824219, "learning_rate": 6.722979120778945e-05, "loss": 5.7417, "step": 330 }, { "epoch": 0.29360967184801384, "grad_norm": 113.16553497314453, "learning_rate": 6.0375101720174165e-05, "loss": 5.7729, "step": 340 }, { "epoch": 0.3022452504317789, "grad_norm": 65.47755432128906, "learning_rate": 5.3750000000000026e-05, "loss": 5.8536, "step": 350 }, { "epoch": 0.3022452504317789, "eval_loss": 2.931809902191162, "eval_runtime": 6.7527, "eval_samples_per_second": 72.268, "eval_steps_per_second": 18.067, "step": 350 }, { "epoch": 0.31088082901554404, "grad_norm": 62.19725799560547, "learning_rate": 4.738676287689473e-05, "loss": 5.8092, "step": 360 }, { "epoch": 0.31951640759930916, "grad_norm": 71.66590881347656, "learning_rate": 4.131639140249173e-05, "loss": 5.8552, "step": 370 }, { "epoch": 0.3281519861830743, "grad_norm": 76.8243408203125, "learning_rate": 3.5568459816422774e-05, "loss": 5.9306, "step": 380 }, { "epoch": 0.33678756476683935, "grad_norm": 78.27678680419922, "learning_rate": 3.017097146359502e-05, "loss": 6.0458, "step": 390 }, { "epoch": 0.3454231433506045, "grad_norm": 51.60271072387695, "learning_rate": 2.5150222364709875e-05, "loss": 6.0825, "step": 400 }, { "epoch": 0.3454231433506045, "eval_loss": 2.8580236434936523, "eval_runtime": 6.8437, "eval_samples_per_second": 71.307, "eval_steps_per_second": 17.827, "step": 400 }, { "epoch": 0.3540587219343696, "grad_norm": 175.19671630859375, "learning_rate": 2.053067310469316e-05, "loss": 5.6827, "step": 410 }, { "epoch": 0.3626943005181347, "grad_norm": 77.0377197265625, "learning_rate": 1.633482966318421e-05, "loss": 5.8634, "step": 420 }, { "epoch": 0.37132987910189985, "grad_norm": 80.0809555053711, "learning_rate": 1.2583133767665349e-05, "loss": 5.5702, "step": 430 }, { "epoch": 0.3799654576856649, "grad_norm": 80.28479766845703, "learning_rate": 9.293863303420395e-06, "loss": 5.7111, "step": 440 }, { "epoch": 0.38860103626943004, "grad_norm": 83.0961685180664, "learning_rate": 6.483043265514856e-06, "loss": 6.3589, "step": 450 }, { "epoch": 0.38860103626943004, "eval_loss": 2.7872583866119385, "eval_runtime": 6.7184, "eval_samples_per_second": 72.636, "eval_steps_per_second": 18.159, "step": 450 }, { "epoch": 0.39723661485319517, "grad_norm": 76.11356353759766, "learning_rate": 4.164367686630719e-06, "loss": 5.5934, "step": 460 }, { "epoch": 0.4058721934369603, "grad_norm": 125.73208618164062, "learning_rate": 2.3491329211158885e-06, "loss": 5.7158, "step": 470 }, { "epoch": 0.41450777202072536, "grad_norm": 44.78662872314453, "learning_rate": 1.046182610281186e-06, "loss": 5.8467, "step": 480 }, { "epoch": 0.4231433506044905, "grad_norm": 100.00250244140625, "learning_rate": 2.6186459706889876e-07, "loss": 5.2767, "step": 490 }, { "epoch": 0.4317789291882556, "grad_norm": 38.51133728027344, "learning_rate": 0.0, "loss": 5.9893, "step": 500 }, { "epoch": 0.4317789291882556, "eval_loss": 2.801614284515381, "eval_runtime": 6.8376, "eval_samples_per_second": 71.37, "eval_steps_per_second": 17.842, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4425082994688000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }