{ "best_metric": 3.1905126571655273, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.4317789291882556, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008635578583765112, "eval_loss": 4.056301116943359, "eval_runtime": 6.6134, "eval_samples_per_second": 73.789, "eval_steps_per_second": 18.447, "step": 1 }, { "epoch": 0.008635578583765112, "grad_norm": 75.20758056640625, "learning_rate": 4.2000000000000004e-05, "loss": 7.2673, "step": 10 }, { "epoch": 0.017271157167530225, "grad_norm": 82.5886001586914, "learning_rate": 8.400000000000001e-05, "loss": 6.9928, "step": 20 }, { "epoch": 0.025906735751295335, "grad_norm": 934.9874877929688, "learning_rate": 0.000126, "loss": 7.1944, "step": 30 }, { "epoch": 0.03454231433506045, "grad_norm": 220.6071319580078, "learning_rate": 0.00016800000000000002, "loss": 7.0499, "step": 40 }, { "epoch": 0.04317789291882556, "grad_norm": 720.1281127929688, "learning_rate": 0.00021, "loss": 8.7892, "step": 50 }, { "epoch": 0.04317789291882556, "eval_loss": 5.3484110832214355, "eval_runtime": 6.7066, "eval_samples_per_second": 72.764, "eval_steps_per_second": 18.191, "step": 50 }, { "epoch": 0.05181347150259067, "grad_norm": 62.76759719848633, "learning_rate": 0.00020974422527728155, "loss": 13.9468, "step": 60 }, { "epoch": 0.06044905008635579, "grad_norm": 154.0032501220703, "learning_rate": 0.0002089781472178649, "loss": 10.8076, "step": 70 }, { "epoch": 0.0690846286701209, "grad_norm": 666.8218994140625, "learning_rate": 0.0002077054980770496, "loss": 7.8264, "step": 80 }, { "epoch": 0.07772020725388601, "grad_norm": 109.74002838134766, "learning_rate": 0.00020593247807352348, "loss": 9.7935, "step": 90 }, { "epoch": 0.08635578583765112, "grad_norm": 324.6034851074219, "learning_rate": 0.00020366772518252038, "loss": 9.2869, "step": 100 }, { "epoch": 0.08635578583765112, "eval_loss": 4.111455917358398, "eval_runtime": 6.6401, "eval_samples_per_second": 73.493, "eval_steps_per_second": 18.373, "step": 100 }, { "epoch": 0.09499136442141623, "grad_norm": 116.87757110595703, "learning_rate": 0.0002009222730524731, "loss": 9.3815, "step": 110 }, { "epoch": 0.10362694300518134, "grad_norm": 203.64956665039062, "learning_rate": 0.00019770949725018733, "loss": 8.3639, "step": 120 }, { "epoch": 0.11226252158894647, "grad_norm": 709.5479736328125, "learning_rate": 0.00019404505009642473, "loss": 9.0807, "step": 130 }, { "epoch": 0.12089810017271158, "grad_norm": 485.1632385253906, "learning_rate": 0.0001899467844093695, "loss": 9.3948, "step": 140 }, { "epoch": 0.12953367875647667, "grad_norm": 68.30054473876953, "learning_rate": 0.00018543466652749268, "loss": 9.8128, "step": 150 }, { "epoch": 0.12953367875647667, "eval_loss": 4.509408950805664, "eval_runtime": 6.6584, "eval_samples_per_second": 73.291, "eval_steps_per_second": 18.323, "step": 150 }, { "epoch": 0.1381692573402418, "grad_norm": 49.328487396240234, "learning_rate": 0.00018053067903555837, "loss": 8.1567, "step": 160 }, { "epoch": 0.14680483592400692, "grad_norm": 51.78285598754883, "learning_rate": 0.00017525871366768012, "loss": 6.7827, "step": 170 }, { "epoch": 0.15544041450777202, "grad_norm": 26.78425407409668, "learning_rate": 0.00016964445490919413, "loss": 6.8905, "step": 180 }, { "epoch": 0.16407599309153714, "grad_norm": 114.72111511230469, "learning_rate": 0.00016371525486442843, "loss": 6.8923, "step": 190 }, { "epoch": 0.17271157167530224, "grad_norm": 136.30235290527344, "learning_rate": 0.0001575, "loss": 6.9788, "step": 200 }, { "epoch": 0.17271157167530224, "eval_loss": 3.53438401222229, "eval_runtime": 6.669, "eval_samples_per_second": 73.175, "eval_steps_per_second": 18.294, "step": 200 }, { "epoch": 0.18134715025906736, "grad_norm": 531.1268920898438, "learning_rate": 0.00015102897041285315, "loss": 7.6385, "step": 210 }, { "epoch": 0.18998272884283246, "grad_norm": 769.6386108398438, "learning_rate": 0.00014433369230867077, "loss": 9.0432, "step": 220 }, { "epoch": 0.19861830742659758, "grad_norm": 86.73228454589844, "learning_rate": 0.0001374467844093695, "loss": 8.1367, "step": 230 }, { "epoch": 0.20725388601036268, "grad_norm": 71.22868347167969, "learning_rate": 0.0001304017990379651, "loss": 7.3369, "step": 240 }, { "epoch": 0.2158894645941278, "grad_norm": 92.49053192138672, "learning_rate": 0.0001232330586550277, "loss": 6.71, "step": 250 }, { "epoch": 0.2158894645941278, "eval_loss": 3.505357503890991, "eval_runtime": 6.6649, "eval_samples_per_second": 73.219, "eval_steps_per_second": 18.305, "step": 250 }, { "epoch": 0.22452504317789293, "grad_norm": 113.52294158935547, "learning_rate": 0.00011597548864310363, "loss": 6.9686, "step": 260 }, { "epoch": 0.23316062176165803, "grad_norm": 181.00125122070312, "learning_rate": 0.00010866444715376263, "loss": 6.5956, "step": 270 }, { "epoch": 0.24179620034542315, "grad_norm": 295.9447326660156, "learning_rate": 0.00010133555284623744, "loss": 6.5293, "step": 280 }, { "epoch": 0.2504317789291883, "grad_norm": 189.24530029296875, "learning_rate": 9.402451135689641e-05, "loss": 6.727, "step": 290 }, { "epoch": 0.25906735751295334, "grad_norm": 142.2465362548828, "learning_rate": 8.676694134497232e-05, "loss": 6.7211, "step": 300 }, { "epoch": 0.25906735751295334, "eval_loss": 3.346792221069336, "eval_runtime": 6.6861, "eval_samples_per_second": 72.987, "eval_steps_per_second": 18.247, "step": 300 }, { "epoch": 0.26770293609671847, "grad_norm": 250.88473510742188, "learning_rate": 7.95982009620349e-05, "loss": 6.4387, "step": 310 }, { "epoch": 0.2763385146804836, "grad_norm": 167.4871368408203, "learning_rate": 7.255321559063053e-05, "loss": 6.3941, "step": 320 }, { "epoch": 0.2849740932642487, "grad_norm": 257.501220703125, "learning_rate": 6.566630769132923e-05, "loss": 6.8217, "step": 330 }, { "epoch": 0.29360967184801384, "grad_norm": 157.9259033203125, "learning_rate": 5.897102958714686e-05, "loss": 6.7069, "step": 340 }, { "epoch": 0.3022452504317789, "grad_norm": 94.20719909667969, "learning_rate": 5.250000000000002e-05, "loss": 6.9331, "step": 350 }, { "epoch": 0.3022452504317789, "eval_loss": 3.275651454925537, "eval_runtime": 6.7141, "eval_samples_per_second": 72.683, "eval_steps_per_second": 18.171, "step": 350 }, { "epoch": 0.31088082901554404, "grad_norm": 112.30079650878906, "learning_rate": 4.62847451355716e-05, "loss": 7.1002, "step": 360 }, { "epoch": 0.31951640759930916, "grad_norm": 145.58631896972656, "learning_rate": 4.035554509080588e-05, "loss": 6.4866, "step": 370 }, { "epoch": 0.3281519861830743, "grad_norm": 114.67252349853516, "learning_rate": 3.474128633231992e-05, "loss": 6.2346, "step": 380 }, { "epoch": 0.33678756476683935, "grad_norm": 68.26001739501953, "learning_rate": 2.946932096444165e-05, "loss": 5.992, "step": 390 }, { "epoch": 0.3454231433506045, "grad_norm": 128.07357788085938, "learning_rate": 2.456533347250732e-05, "loss": 6.4847, "step": 400 }, { "epoch": 0.3454231433506045, "eval_loss": 3.2016987800598145, "eval_runtime": 6.6861, "eval_samples_per_second": 72.987, "eval_steps_per_second": 18.247, "step": 400 }, { "epoch": 0.3540587219343696, "grad_norm": 95.15899658203125, "learning_rate": 2.005321559063053e-05, "loss": 6.611, "step": 410 }, { "epoch": 0.3626943005181347, "grad_norm": 90.93391418457031, "learning_rate": 1.5954949903575276e-05, "loss": 6.4129, "step": 420 }, { "epoch": 0.37132987910189985, "grad_norm": 197.74005126953125, "learning_rate": 1.2290502749812666e-05, "loss": 6.6629, "step": 430 }, { "epoch": 0.3799654576856649, "grad_norm": 122.5657730102539, "learning_rate": 9.077726947526898e-06, "loss": 6.6915, "step": 440 }, { "epoch": 0.38860103626943004, "grad_norm": 114.84415435791016, "learning_rate": 6.332274817479627e-06, "loss": 6.6208, "step": 450 }, { "epoch": 0.38860103626943004, "eval_loss": 3.2092630863189697, "eval_runtime": 6.7137, "eval_samples_per_second": 72.687, "eval_steps_per_second": 18.172, "step": 450 }, { "epoch": 0.39723661485319517, "grad_norm": 135.63075256347656, "learning_rate": 4.067521926476516e-06, "loss": 6.3582, "step": 460 }, { "epoch": 0.4058721934369603, "grad_norm": 60.413612365722656, "learning_rate": 2.294501922950403e-06, "loss": 6.2606, "step": 470 }, { "epoch": 0.41450777202072536, "grad_norm": 293.9761962890625, "learning_rate": 1.021852782135112e-06, "loss": 6.8701, "step": 480 }, { "epoch": 0.4231433506044905, "grad_norm": 201.35853576660156, "learning_rate": 2.5577472271845927e-07, "loss": 6.5198, "step": 490 }, { "epoch": 0.4317789291882556, "grad_norm": 332.1162414550781, "learning_rate": 0.0, "loss": 6.4061, "step": 500 }, { "epoch": 0.4317789291882556, "eval_loss": 3.1905126571655273, "eval_runtime": 6.702, "eval_samples_per_second": 72.814, "eval_steps_per_second": 18.203, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4657011228672000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }