|
{ |
|
"best_metric": 0.8320402503013611, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.23889154323936931, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00047778308647873863, |
|
"eval_loss": 4.255329608917236, |
|
"eval_runtime": 12.7971, |
|
"eval_samples_per_second": 68.922, |
|
"eval_steps_per_second": 17.27, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004777830864787387, |
|
"grad_norm": 138.58535766601562, |
|
"learning_rate": 4.2800000000000004e-05, |
|
"loss": 6.9317, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009555661729574774, |
|
"grad_norm": 131.84646606445312, |
|
"learning_rate": 8.560000000000001e-05, |
|
"loss": 4.5285, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01433349259436216, |
|
"grad_norm": 46.312652587890625, |
|
"learning_rate": 0.0001284, |
|
"loss": 3.299, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.019111323459149548, |
|
"grad_norm": 38.920799255371094, |
|
"learning_rate": 0.00017120000000000001, |
|
"loss": 2.7357, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.023889154323936932, |
|
"grad_norm": 42.04233169555664, |
|
"learning_rate": 0.000214, |
|
"loss": 2.9014, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.023889154323936932, |
|
"eval_loss": 1.3917949199676514, |
|
"eval_runtime": 12.3884, |
|
"eval_samples_per_second": 71.196, |
|
"eval_steps_per_second": 17.839, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02866698518872432, |
|
"grad_norm": 102.38275146484375, |
|
"learning_rate": 0.00021373935337780118, |
|
"loss": 2.403, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.033444816053511704, |
|
"grad_norm": 527.3989868164062, |
|
"learning_rate": 0.00021295868335534802, |
|
"loss": 2.5263, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.038222646918299095, |
|
"grad_norm": 188.73924255371094, |
|
"learning_rate": 0.0002116617932785172, |
|
"loss": 3.2151, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04300047778308648, |
|
"grad_norm": 119.0323486328125, |
|
"learning_rate": 0.00020985500146540012, |
|
"loss": 3.5938, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.047778308647873864, |
|
"grad_norm": 170.87159729003906, |
|
"learning_rate": 0.0002075471104240922, |
|
"loss": 4.0067, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.047778308647873864, |
|
"eval_loss": 2.311336040496826, |
|
"eval_runtime": 13.4445, |
|
"eval_samples_per_second": 65.603, |
|
"eval_steps_per_second": 16.438, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05255613951266125, |
|
"grad_norm": 32.53251647949219, |
|
"learning_rate": 0.00020474936396775828, |
|
"loss": 10.3052, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05733397037744864, |
|
"grad_norm": 65.40621185302734, |
|
"learning_rate": 0.00020147539243590517, |
|
"loss": 7.4235, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.062111801242236024, |
|
"grad_norm": 20.591093063354492, |
|
"learning_rate": 0.00019774114628873756, |
|
"loss": 3.2373, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06688963210702341, |
|
"grad_norm": 31.703140258789062, |
|
"learning_rate": 0.00019356481839811937, |
|
"loss": 2.8812, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0716674629718108, |
|
"grad_norm": 34.521854400634766, |
|
"learning_rate": 0.00018896675541373064, |
|
"loss": 3.2874, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0716674629718108, |
|
"eval_loss": 1.4771889448165894, |
|
"eval_runtime": 12.9239, |
|
"eval_samples_per_second": 68.246, |
|
"eval_steps_per_second": 17.1, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07644529383659819, |
|
"grad_norm": 28.358295440673828, |
|
"learning_rate": 0.00018396935863623567, |
|
"loss": 2.4476, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08122312470138557, |
|
"grad_norm": 21.966550827026367, |
|
"learning_rate": 0.00017859697488039784, |
|
"loss": 2.4135, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08600095556617296, |
|
"grad_norm": 32.07853317260742, |
|
"learning_rate": 0.00017287577785984542, |
|
"loss": 2.2956, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.09077878643096035, |
|
"grad_norm": 17.3485107421875, |
|
"learning_rate": 0.0001668336406713699, |
|
"loss": 2.2849, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.09555661729574773, |
|
"grad_norm": 14.762402534484863, |
|
"learning_rate": 0.0001605, |
|
"loss": 2.4335, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09555661729574773, |
|
"eval_loss": 1.0888148546218872, |
|
"eval_runtime": 12.7061, |
|
"eval_samples_per_second": 69.415, |
|
"eval_steps_per_second": 17.393, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10033444816053512, |
|
"grad_norm": 140.28663635253906, |
|
"learning_rate": 0.00015390571270643128, |
|
"loss": 1.7442, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1051122790253225, |
|
"grad_norm": 134.5390167236328, |
|
"learning_rate": 0.0001470829054955026, |
|
"loss": 1.9716, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"grad_norm": 25.001611709594727, |
|
"learning_rate": 0.00014006481839811937, |
|
"loss": 2.0693, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.11466794075489728, |
|
"grad_norm": 33.87446594238281, |
|
"learning_rate": 0.00013288564282916442, |
|
"loss": 2.4731, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11944577161968466, |
|
"grad_norm": 56.331687927246094, |
|
"learning_rate": 0.00012558035501036158, |
|
"loss": 2.7371, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11944577161968466, |
|
"eval_loss": 1.124778389930725, |
|
"eval_runtime": 12.2143, |
|
"eval_samples_per_second": 72.21, |
|
"eval_steps_per_second": 18.094, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.12422360248447205, |
|
"grad_norm": 20.330785751342773, |
|
"learning_rate": 0.00011818454556963892, |
|
"loss": 1.6651, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12900143334925943, |
|
"grad_norm": 14.653603553771973, |
|
"learning_rate": 0.00011073424614716762, |
|
"loss": 1.6375, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.13377926421404682, |
|
"grad_norm": 9.812971115112305, |
|
"learning_rate": 0.00010326575385283242, |
|
"loss": 2.0153, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1385570950788342, |
|
"grad_norm": 15.295802116394043, |
|
"learning_rate": 9.58154544303611e-05, |
|
"loss": 2.2795, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1433349259436216, |
|
"grad_norm": 12.265714645385742, |
|
"learning_rate": 8.841964498963846e-05, |
|
"loss": 2.2539, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1433349259436216, |
|
"eval_loss": 0.9771943092346191, |
|
"eval_runtime": 12.2008, |
|
"eval_samples_per_second": 72.291, |
|
"eval_steps_per_second": 18.114, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.148112756808409, |
|
"grad_norm": 14.376371383666992, |
|
"learning_rate": 8.111435717083556e-05, |
|
"loss": 1.5526, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.15289058767319638, |
|
"grad_norm": 9.20234203338623, |
|
"learning_rate": 7.393518160188063e-05, |
|
"loss": 1.572, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.15766841853798375, |
|
"grad_norm": 11.955562591552734, |
|
"learning_rate": 6.69170945044974e-05, |
|
"loss": 1.8717, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.16244624940277114, |
|
"grad_norm": 8.291068077087402, |
|
"learning_rate": 6.009428729356871e-05, |
|
"loss": 1.9432, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"grad_norm": 21.525508880615234, |
|
"learning_rate": 5.3500000000000026e-05, |
|
"loss": 2.3061, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.16722408026755853, |
|
"eval_loss": 0.9378606677055359, |
|
"eval_runtime": 12.1993, |
|
"eval_samples_per_second": 72.3, |
|
"eval_steps_per_second": 18.116, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.17200191113234592, |
|
"grad_norm": 11.051887512207031, |
|
"learning_rate": 4.7166359328630106e-05, |
|
"loss": 1.5252, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1767797419971333, |
|
"grad_norm": 16.316925048828125, |
|
"learning_rate": 4.112422214015456e-05, |
|
"loss": 1.4291, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1815575728619207, |
|
"grad_norm": 34.651214599609375, |
|
"learning_rate": 3.5403025119602206e-05, |
|
"loss": 1.7447, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.18633540372670807, |
|
"grad_norm": 15.149808883666992, |
|
"learning_rate": 3.0030641363764346e-05, |
|
"loss": 1.9199, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.19111323459149546, |
|
"grad_norm": 13.764430046081543, |
|
"learning_rate": 2.5033244586269365e-05, |
|
"loss": 2.0217, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19111323459149546, |
|
"eval_loss": 0.8595229387283325, |
|
"eval_runtime": 12.2031, |
|
"eval_samples_per_second": 72.277, |
|
"eval_steps_per_second": 18.11, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19589106545628285, |
|
"grad_norm": 24.64581871032715, |
|
"learning_rate": 2.0435181601880635e-05, |
|
"loss": 1.1874, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.20066889632107024, |
|
"grad_norm": 17.642654418945312, |
|
"learning_rate": 1.625885371126242e-05, |
|
"loss": 1.5195, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.20544672718585763, |
|
"grad_norm": 17.003334045410156, |
|
"learning_rate": 1.2524607564094813e-05, |
|
"loss": 1.8292, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.210224558050645, |
|
"grad_norm": 13.311195373535156, |
|
"learning_rate": 9.250636032241695e-06, |
|
"loss": 1.8517, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.21500238891543239, |
|
"grad_norm": 12.355936050415039, |
|
"learning_rate": 6.45288957590781e-06, |
|
"loss": 2.2005, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21500238891543239, |
|
"eval_loss": 0.837338387966156, |
|
"eval_runtime": 12.301, |
|
"eval_samples_per_second": 71.701, |
|
"eval_steps_per_second": 17.966, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 11.863606452941895, |
|
"learning_rate": 4.144998534599878e-06, |
|
"loss": 1.1789, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.22455805064500717, |
|
"grad_norm": 20.274431228637695, |
|
"learning_rate": 2.3382067214827915e-06, |
|
"loss": 1.3979, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.22933588150979456, |
|
"grad_norm": 10.406599998474121, |
|
"learning_rate": 1.0413166446519713e-06, |
|
"loss": 1.7026, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.23411371237458195, |
|
"grad_norm": 9.674276351928711, |
|
"learning_rate": 2.6064662219881083e-07, |
|
"loss": 1.941, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.23889154323936931, |
|
"grad_norm": 12.152741432189941, |
|
"learning_rate": 0.0, |
|
"loss": 2.1046, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23889154323936931, |
|
"eval_loss": 0.8320402503013611, |
|
"eval_runtime": 12.1319, |
|
"eval_samples_per_second": 72.701, |
|
"eval_steps_per_second": 18.216, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4657011228672000.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|