|
{ |
|
"best_metric": 3.1905126571655273, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.4317789291882556, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008635578583765112, |
|
"eval_loss": 4.056301116943359, |
|
"eval_runtime": 6.6134, |
|
"eval_samples_per_second": 73.789, |
|
"eval_steps_per_second": 18.447, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008635578583765112, |
|
"grad_norm": 75.20758056640625, |
|
"learning_rate": 4.2000000000000004e-05, |
|
"loss": 7.2673, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.017271157167530225, |
|
"grad_norm": 82.5886001586914, |
|
"learning_rate": 8.400000000000001e-05, |
|
"loss": 6.9928, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.025906735751295335, |
|
"grad_norm": 934.9874877929688, |
|
"learning_rate": 0.000126, |
|
"loss": 7.1944, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03454231433506045, |
|
"grad_norm": 220.6071319580078, |
|
"learning_rate": 0.00016800000000000002, |
|
"loss": 7.0499, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04317789291882556, |
|
"grad_norm": 720.1281127929688, |
|
"learning_rate": 0.00021, |
|
"loss": 8.7892, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04317789291882556, |
|
"eval_loss": 5.3484110832214355, |
|
"eval_runtime": 6.7066, |
|
"eval_samples_per_second": 72.764, |
|
"eval_steps_per_second": 18.191, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05181347150259067, |
|
"grad_norm": 62.76759719848633, |
|
"learning_rate": 0.00020974422527728155, |
|
"loss": 13.9468, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06044905008635579, |
|
"grad_norm": 154.0032501220703, |
|
"learning_rate": 0.0002089781472178649, |
|
"loss": 10.8076, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0690846286701209, |
|
"grad_norm": 666.8218994140625, |
|
"learning_rate": 0.0002077054980770496, |
|
"loss": 7.8264, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07772020725388601, |
|
"grad_norm": 109.74002838134766, |
|
"learning_rate": 0.00020593247807352348, |
|
"loss": 9.7935, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08635578583765112, |
|
"grad_norm": 324.6034851074219, |
|
"learning_rate": 0.00020366772518252038, |
|
"loss": 9.2869, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08635578583765112, |
|
"eval_loss": 4.111455917358398, |
|
"eval_runtime": 6.6401, |
|
"eval_samples_per_second": 73.493, |
|
"eval_steps_per_second": 18.373, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09499136442141623, |
|
"grad_norm": 116.87757110595703, |
|
"learning_rate": 0.0002009222730524731, |
|
"loss": 9.3815, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.10362694300518134, |
|
"grad_norm": 203.64956665039062, |
|
"learning_rate": 0.00019770949725018733, |
|
"loss": 8.3639, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.11226252158894647, |
|
"grad_norm": 709.5479736328125, |
|
"learning_rate": 0.00019404505009642473, |
|
"loss": 9.0807, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12089810017271158, |
|
"grad_norm": 485.1632385253906, |
|
"learning_rate": 0.0001899467844093695, |
|
"loss": 9.3948, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12953367875647667, |
|
"grad_norm": 68.30054473876953, |
|
"learning_rate": 0.00018543466652749268, |
|
"loss": 9.8128, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12953367875647667, |
|
"eval_loss": 4.509408950805664, |
|
"eval_runtime": 6.6584, |
|
"eval_samples_per_second": 73.291, |
|
"eval_steps_per_second": 18.323, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1381692573402418, |
|
"grad_norm": 49.328487396240234, |
|
"learning_rate": 0.00018053067903555837, |
|
"loss": 8.1567, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14680483592400692, |
|
"grad_norm": 51.78285598754883, |
|
"learning_rate": 0.00017525871366768012, |
|
"loss": 6.7827, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.15544041450777202, |
|
"grad_norm": 26.78425407409668, |
|
"learning_rate": 0.00016964445490919413, |
|
"loss": 6.8905, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.16407599309153714, |
|
"grad_norm": 114.72111511230469, |
|
"learning_rate": 0.00016371525486442843, |
|
"loss": 6.8923, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.17271157167530224, |
|
"grad_norm": 136.30235290527344, |
|
"learning_rate": 0.0001575, |
|
"loss": 6.9788, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17271157167530224, |
|
"eval_loss": 3.53438401222229, |
|
"eval_runtime": 6.669, |
|
"eval_samples_per_second": 73.175, |
|
"eval_steps_per_second": 18.294, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.18134715025906736, |
|
"grad_norm": 531.1268920898438, |
|
"learning_rate": 0.00015102897041285315, |
|
"loss": 7.6385, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18998272884283246, |
|
"grad_norm": 769.6386108398438, |
|
"learning_rate": 0.00014433369230867077, |
|
"loss": 9.0432, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.19861830742659758, |
|
"grad_norm": 86.73228454589844, |
|
"learning_rate": 0.0001374467844093695, |
|
"loss": 8.1367, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.20725388601036268, |
|
"grad_norm": 71.22868347167969, |
|
"learning_rate": 0.0001304017990379651, |
|
"loss": 7.3369, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2158894645941278, |
|
"grad_norm": 92.49053192138672, |
|
"learning_rate": 0.0001232330586550277, |
|
"loss": 6.71, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2158894645941278, |
|
"eval_loss": 3.505357503890991, |
|
"eval_runtime": 6.6649, |
|
"eval_samples_per_second": 73.219, |
|
"eval_steps_per_second": 18.305, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.22452504317789293, |
|
"grad_norm": 113.52294158935547, |
|
"learning_rate": 0.00011597548864310363, |
|
"loss": 6.9686, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.23316062176165803, |
|
"grad_norm": 181.00125122070312, |
|
"learning_rate": 0.00010866444715376263, |
|
"loss": 6.5956, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.24179620034542315, |
|
"grad_norm": 295.9447326660156, |
|
"learning_rate": 0.00010133555284623744, |
|
"loss": 6.5293, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2504317789291883, |
|
"grad_norm": 189.24530029296875, |
|
"learning_rate": 9.402451135689641e-05, |
|
"loss": 6.727, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"grad_norm": 142.2465362548828, |
|
"learning_rate": 8.676694134497232e-05, |
|
"loss": 6.7211, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"eval_loss": 3.346792221069336, |
|
"eval_runtime": 6.6861, |
|
"eval_samples_per_second": 72.987, |
|
"eval_steps_per_second": 18.247, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.26770293609671847, |
|
"grad_norm": 250.88473510742188, |
|
"learning_rate": 7.95982009620349e-05, |
|
"loss": 6.4387, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2763385146804836, |
|
"grad_norm": 167.4871368408203, |
|
"learning_rate": 7.255321559063053e-05, |
|
"loss": 6.3941, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2849740932642487, |
|
"grad_norm": 257.501220703125, |
|
"learning_rate": 6.566630769132923e-05, |
|
"loss": 6.8217, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.29360967184801384, |
|
"grad_norm": 157.9259033203125, |
|
"learning_rate": 5.897102958714686e-05, |
|
"loss": 6.7069, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3022452504317789, |
|
"grad_norm": 94.20719909667969, |
|
"learning_rate": 5.250000000000002e-05, |
|
"loss": 6.9331, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3022452504317789, |
|
"eval_loss": 3.275651454925537, |
|
"eval_runtime": 6.7141, |
|
"eval_samples_per_second": 72.683, |
|
"eval_steps_per_second": 18.171, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.31088082901554404, |
|
"grad_norm": 112.30079650878906, |
|
"learning_rate": 4.62847451355716e-05, |
|
"loss": 7.1002, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.31951640759930916, |
|
"grad_norm": 145.58631896972656, |
|
"learning_rate": 4.035554509080588e-05, |
|
"loss": 6.4866, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3281519861830743, |
|
"grad_norm": 114.67252349853516, |
|
"learning_rate": 3.474128633231992e-05, |
|
"loss": 6.2346, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33678756476683935, |
|
"grad_norm": 68.26001739501953, |
|
"learning_rate": 2.946932096444165e-05, |
|
"loss": 5.992, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3454231433506045, |
|
"grad_norm": 128.07357788085938, |
|
"learning_rate": 2.456533347250732e-05, |
|
"loss": 6.4847, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3454231433506045, |
|
"eval_loss": 3.2016987800598145, |
|
"eval_runtime": 6.6861, |
|
"eval_samples_per_second": 72.987, |
|
"eval_steps_per_second": 18.247, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3540587219343696, |
|
"grad_norm": 95.15899658203125, |
|
"learning_rate": 2.005321559063053e-05, |
|
"loss": 6.611, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3626943005181347, |
|
"grad_norm": 90.93391418457031, |
|
"learning_rate": 1.5954949903575276e-05, |
|
"loss": 6.4129, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.37132987910189985, |
|
"grad_norm": 197.74005126953125, |
|
"learning_rate": 1.2290502749812666e-05, |
|
"loss": 6.6629, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3799654576856649, |
|
"grad_norm": 122.5657730102539, |
|
"learning_rate": 9.077726947526898e-06, |
|
"loss": 6.6915, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.38860103626943004, |
|
"grad_norm": 114.84415435791016, |
|
"learning_rate": 6.332274817479627e-06, |
|
"loss": 6.6208, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.38860103626943004, |
|
"eval_loss": 3.2092630863189697, |
|
"eval_runtime": 6.7137, |
|
"eval_samples_per_second": 72.687, |
|
"eval_steps_per_second": 18.172, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.39723661485319517, |
|
"grad_norm": 135.63075256347656, |
|
"learning_rate": 4.067521926476516e-06, |
|
"loss": 6.3582, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4058721934369603, |
|
"grad_norm": 60.413612365722656, |
|
"learning_rate": 2.294501922950403e-06, |
|
"loss": 6.2606, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.41450777202072536, |
|
"grad_norm": 293.9761962890625, |
|
"learning_rate": 1.021852782135112e-06, |
|
"loss": 6.8701, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4231433506044905, |
|
"grad_norm": 201.35853576660156, |
|
"learning_rate": 2.5577472271845927e-07, |
|
"loss": 6.5198, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4317789291882556, |
|
"grad_norm": 332.1162414550781, |
|
"learning_rate": 0.0, |
|
"loss": 6.4061, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4317789291882556, |
|
"eval_loss": 3.1905126571655273, |
|
"eval_runtime": 6.702, |
|
"eval_samples_per_second": 72.814, |
|
"eval_steps_per_second": 18.203, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4657011228672000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|