|
{ |
|
"best_metric": 2.7872583866119385, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-450", |
|
"epoch": 0.4317789291882556, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0008635578583765112, |
|
"eval_loss": 4.047359466552734, |
|
"eval_runtime": 6.7214, |
|
"eval_samples_per_second": 72.604, |
|
"eval_steps_per_second": 18.151, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008635578583765112, |
|
"grad_norm": 71.40580749511719, |
|
"learning_rate": 4.3e-05, |
|
"loss": 7.2404, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.017271157167530225, |
|
"grad_norm": 61.08443832397461, |
|
"learning_rate": 8.6e-05, |
|
"loss": 7.6144, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.025906735751295335, |
|
"grad_norm": 68.64509582519531, |
|
"learning_rate": 0.000129, |
|
"loss": 7.7104, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03454231433506045, |
|
"grad_norm": 51.64717102050781, |
|
"learning_rate": 0.000172, |
|
"loss": 7.3387, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04317789291882556, |
|
"grad_norm": 60.46714782714844, |
|
"learning_rate": 0.000215, |
|
"loss": 6.4608, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04317789291882556, |
|
"eval_loss": 3.1346349716186523, |
|
"eval_runtime": 6.6605, |
|
"eval_samples_per_second": 73.268, |
|
"eval_steps_per_second": 18.317, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05181347150259067, |
|
"grad_norm": 73.30271911621094, |
|
"learning_rate": 0.0002147381354029311, |
|
"loss": 6.3668, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06044905008635579, |
|
"grad_norm": 62.13903045654297, |
|
"learning_rate": 0.0002139538173897188, |
|
"loss": 6.2347, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0690846286701209, |
|
"grad_norm": 66.6341323852539, |
|
"learning_rate": 0.0002126508670788841, |
|
"loss": 5.7629, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.07772020725388601, |
|
"grad_norm": 149.75143432617188, |
|
"learning_rate": 0.00021083563231336926, |
|
"loss": 6.0598, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08635578583765112, |
|
"grad_norm": 133.63043212890625, |
|
"learning_rate": 0.00020851695673448515, |
|
"loss": 6.6264, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08635578583765112, |
|
"eval_loss": 3.3024439811706543, |
|
"eval_runtime": 6.8401, |
|
"eval_samples_per_second": 71.344, |
|
"eval_steps_per_second": 17.836, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09499136442141623, |
|
"grad_norm": 265.6074523925781, |
|
"learning_rate": 0.00020570613669657956, |
|
"loss": 6.845, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.10362694300518134, |
|
"grad_norm": 188.36181640625, |
|
"learning_rate": 0.00020241686623233464, |
|
"loss": 7.5856, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.11226252158894647, |
|
"grad_norm": 72.51764678955078, |
|
"learning_rate": 0.00019866517033681577, |
|
"loss": 6.734, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.12089810017271158, |
|
"grad_norm": 188.24923706054688, |
|
"learning_rate": 0.00019446932689530684, |
|
"loss": 6.4476, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12953367875647667, |
|
"grad_norm": 139.26271057128906, |
|
"learning_rate": 0.0001898497776352901, |
|
"loss": 6.255, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12953367875647667, |
|
"eval_loss": 3.182236909866333, |
|
"eval_runtime": 6.7075, |
|
"eval_samples_per_second": 72.754, |
|
"eval_steps_per_second": 18.189, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.1381692573402418, |
|
"grad_norm": 203.13534545898438, |
|
"learning_rate": 0.000184829028536405, |
|
"loss": 6.0828, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.14680483592400692, |
|
"grad_norm": 112.05760955810547, |
|
"learning_rate": 0.00017943154018357726, |
|
"loss": 6.1603, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.15544041450777202, |
|
"grad_norm": 72.94593048095703, |
|
"learning_rate": 0.00017368360859750824, |
|
"loss": 6.1336, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.16407599309153714, |
|
"grad_norm": 101.9844970703125, |
|
"learning_rate": 0.00016761323712310527, |
|
"loss": 6.1751, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.17271157167530224, |
|
"grad_norm": 106.93225860595703, |
|
"learning_rate": 0.00016125, |
|
"loss": 6.8375, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.17271157167530224, |
|
"eval_loss": 3.004676342010498, |
|
"eval_runtime": 6.7213, |
|
"eval_samples_per_second": 72.605, |
|
"eval_steps_per_second": 18.151, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.18134715025906736, |
|
"grad_norm": 68.26008605957031, |
|
"learning_rate": 0.0001546248982798258, |
|
"loss": 5.7141, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.18998272884283246, |
|
"grad_norm": 127.45634460449219, |
|
"learning_rate": 0.00014777020879221055, |
|
"loss": 5.7091, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.19861830742659758, |
|
"grad_norm": 119.57172393798828, |
|
"learning_rate": 0.00014071932689530684, |
|
"loss": 6.3644, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.20725388601036268, |
|
"grad_norm": 101.10028839111328, |
|
"learning_rate": 0.00013350660377696428, |
|
"loss": 6.5899, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2158894645941278, |
|
"grad_norm": 69.55671691894531, |
|
"learning_rate": 0.00012616717909919503, |
|
"loss": 6.1797, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2158894645941278, |
|
"eval_loss": 2.9973599910736084, |
|
"eval_runtime": 7.7072, |
|
"eval_samples_per_second": 63.318, |
|
"eval_steps_per_second": 15.829, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.22452504317789293, |
|
"grad_norm": 81.67117309570312, |
|
"learning_rate": 0.00011873680980127275, |
|
"loss": 5.8035, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.23316062176165803, |
|
"grad_norm": 60.442134857177734, |
|
"learning_rate": 0.00011125169589551887, |
|
"loss": 5.969, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.24179620034542315, |
|
"grad_norm": 94.2229232788086, |
|
"learning_rate": 0.00010374830410448118, |
|
"loss": 5.6845, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2504317789291883, |
|
"grad_norm": 301.8155822753906, |
|
"learning_rate": 9.626319019872726e-05, |
|
"loss": 5.8336, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"grad_norm": 53.06270217895508, |
|
"learning_rate": 8.883282090080499e-05, |
|
"loss": 6.1913, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.25906735751295334, |
|
"eval_loss": 2.8754076957702637, |
|
"eval_runtime": 7.8782, |
|
"eval_samples_per_second": 61.943, |
|
"eval_steps_per_second": 15.486, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.26770293609671847, |
|
"grad_norm": 85.92414093017578, |
|
"learning_rate": 8.149339622303573e-05, |
|
"loss": 5.7197, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.2763385146804836, |
|
"grad_norm": 98.81600189208984, |
|
"learning_rate": 7.428067310469316e-05, |
|
"loss": 5.9757, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.2849740932642487, |
|
"grad_norm": 83.21562194824219, |
|
"learning_rate": 6.722979120778945e-05, |
|
"loss": 5.7417, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.29360967184801384, |
|
"grad_norm": 113.16553497314453, |
|
"learning_rate": 6.0375101720174165e-05, |
|
"loss": 5.7729, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3022452504317789, |
|
"grad_norm": 65.47755432128906, |
|
"learning_rate": 5.3750000000000026e-05, |
|
"loss": 5.8536, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3022452504317789, |
|
"eval_loss": 2.931809902191162, |
|
"eval_runtime": 6.7527, |
|
"eval_samples_per_second": 72.268, |
|
"eval_steps_per_second": 18.067, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.31088082901554404, |
|
"grad_norm": 62.19725799560547, |
|
"learning_rate": 4.738676287689473e-05, |
|
"loss": 5.8092, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.31951640759930916, |
|
"grad_norm": 71.66590881347656, |
|
"learning_rate": 4.131639140249173e-05, |
|
"loss": 5.8552, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.3281519861830743, |
|
"grad_norm": 76.8243408203125, |
|
"learning_rate": 3.5568459816422774e-05, |
|
"loss": 5.9306, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.33678756476683935, |
|
"grad_norm": 78.27678680419922, |
|
"learning_rate": 3.017097146359502e-05, |
|
"loss": 6.0458, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3454231433506045, |
|
"grad_norm": 51.60271072387695, |
|
"learning_rate": 2.5150222364709875e-05, |
|
"loss": 6.0825, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3454231433506045, |
|
"eval_loss": 2.8580236434936523, |
|
"eval_runtime": 6.8437, |
|
"eval_samples_per_second": 71.307, |
|
"eval_steps_per_second": 17.827, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.3540587219343696, |
|
"grad_norm": 175.19671630859375, |
|
"learning_rate": 2.053067310469316e-05, |
|
"loss": 5.6827, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.3626943005181347, |
|
"grad_norm": 77.0377197265625, |
|
"learning_rate": 1.633482966318421e-05, |
|
"loss": 5.8634, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.37132987910189985, |
|
"grad_norm": 80.0809555053711, |
|
"learning_rate": 1.2583133767665349e-05, |
|
"loss": 5.5702, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.3799654576856649, |
|
"grad_norm": 80.28479766845703, |
|
"learning_rate": 9.293863303420395e-06, |
|
"loss": 5.7111, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.38860103626943004, |
|
"grad_norm": 83.0961685180664, |
|
"learning_rate": 6.483043265514856e-06, |
|
"loss": 6.3589, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.38860103626943004, |
|
"eval_loss": 2.7872583866119385, |
|
"eval_runtime": 6.7184, |
|
"eval_samples_per_second": 72.636, |
|
"eval_steps_per_second": 18.159, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.39723661485319517, |
|
"grad_norm": 76.11356353759766, |
|
"learning_rate": 4.164367686630719e-06, |
|
"loss": 5.5934, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.4058721934369603, |
|
"grad_norm": 125.73208618164062, |
|
"learning_rate": 2.3491329211158885e-06, |
|
"loss": 5.7158, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.41450777202072536, |
|
"grad_norm": 44.78662872314453, |
|
"learning_rate": 1.046182610281186e-06, |
|
"loss": 5.8467, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4231433506044905, |
|
"grad_norm": 100.00250244140625, |
|
"learning_rate": 2.6186459706889876e-07, |
|
"loss": 5.2767, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4317789291882556, |
|
"grad_norm": 38.51133728027344, |
|
"learning_rate": 0.0, |
|
"loss": 5.9893, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4317789291882556, |
|
"eval_loss": 2.801614284515381, |
|
"eval_runtime": 6.8376, |
|
"eval_samples_per_second": 71.37, |
|
"eval_steps_per_second": 17.842, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 1 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4425082994688000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|