|
{ |
|
"best_metric": 2.872375965118408, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.004827326530021144, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 9.654653060042287e-06, |
|
"eval_loss": 3.0490081310272217, |
|
"eval_runtime": 599.4901, |
|
"eval_samples_per_second": 72.748, |
|
"eval_steps_per_second": 18.187, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 9.654653060042287e-05, |
|
"grad_norm": 9.069512367248535, |
|
"learning_rate": 4.12e-05, |
|
"loss": 5.2993, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00019309306120084574, |
|
"grad_norm": 12.16044807434082, |
|
"learning_rate": 8.24e-05, |
|
"loss": 5.2249, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00028963959180126864, |
|
"grad_norm": 14.883082389831543, |
|
"learning_rate": 0.0001236, |
|
"loss": 5.539, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0003861861224016915, |
|
"grad_norm": 16.756444931030273, |
|
"learning_rate": 0.0001648, |
|
"loss": 6.6809, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0004827326530021144, |
|
"grad_norm": 19.068893432617188, |
|
"learning_rate": 0.000206, |
|
"loss": 7.8567, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0004827326530021144, |
|
"eval_loss": 3.0399818420410156, |
|
"eval_runtime": 598.5166, |
|
"eval_samples_per_second": 72.867, |
|
"eval_steps_per_second": 18.217, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0005792791836025373, |
|
"grad_norm": 26.745712280273438, |
|
"learning_rate": 0.0002057490971767619, |
|
"loss": 5.1007, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0006758257142029601, |
|
"grad_norm": 39.823272705078125, |
|
"learning_rate": 0.00020499761108038175, |
|
"loss": 5.3979, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.000772372244803383, |
|
"grad_norm": 35.91092300415039, |
|
"learning_rate": 0.00020374920287558198, |
|
"loss": 5.5911, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0008689187754038058, |
|
"grad_norm": 69.34488677978516, |
|
"learning_rate": 0.00020200995468164684, |
|
"loss": 6.2759, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0009654653060042288, |
|
"grad_norm": 22.20043182373047, |
|
"learning_rate": 0.00019978833994094855, |
|
"loss": 7.4368, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0009654653060042288, |
|
"eval_loss": 3.062361240386963, |
|
"eval_runtime": 601.2603, |
|
"eval_samples_per_second": 72.534, |
|
"eval_steps_per_second": 18.134, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0010620118366046515, |
|
"grad_norm": 32.1593132019043, |
|
"learning_rate": 0.00019709518213718787, |
|
"loss": 5.3581, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0011585583672050746, |
|
"grad_norm": 26.68710708618164, |
|
"learning_rate": 0.00019394360206446948, |
|
"loss": 5.1333, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0012551048978054974, |
|
"grad_norm": 22.648284912109375, |
|
"learning_rate": 0.00019034895390411186, |
|
"loss": 5.7831, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0013516514284059203, |
|
"grad_norm": 23.975393295288086, |
|
"learning_rate": 0.0001863287504206196, |
|
"loss": 6.2112, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0014481979590063431, |
|
"grad_norm": 48.25133514404297, |
|
"learning_rate": 0.00018190257764125471, |
|
"loss": 7.9916, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0014481979590063431, |
|
"eval_loss": 3.3823323249816895, |
|
"eval_runtime": 600.8813, |
|
"eval_samples_per_second": 72.58, |
|
"eval_steps_per_second": 18.145, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.001544744489606766, |
|
"grad_norm": 12.734755516052246, |
|
"learning_rate": 0.00017709199943488106, |
|
"loss": 5.746, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0016412910202071888, |
|
"grad_norm": 19.810941696166992, |
|
"learning_rate": 0.00017192045245496238, |
|
"loss": 5.7286, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0017378375508076116, |
|
"grad_norm": 19.0549259185791, |
|
"learning_rate": 0.00016641313195854277, |
|
"loss": 5.7489, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0018343840814080345, |
|
"grad_norm": 13.497769355773926, |
|
"learning_rate": 0.0001605968690574869, |
|
"loss": 6.5552, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0019309306120084576, |
|
"grad_norm": 48.43019485473633, |
|
"learning_rate": 0.0001545, |
|
"loss": 7.6411, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0019309306120084576, |
|
"eval_loss": 3.024033784866333, |
|
"eval_runtime": 598.9829, |
|
"eval_samples_per_second": 72.81, |
|
"eval_steps_per_second": 18.203, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00202747714260888, |
|
"grad_norm": 16.620399475097656, |
|
"learning_rate": 0.00014815222811927496, |
|
"loss": 5.3396, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.002124023673209303, |
|
"grad_norm": 14.036992073059082, |
|
"learning_rate": 0.00014158447912183896, |
|
"loss": 5.2628, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0022205702038097263, |
|
"grad_norm": 13.502534866333008, |
|
"learning_rate": 0.00013482875042061958, |
|
"loss": 5.545, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.002317116734410149, |
|
"grad_norm": 15.779216766357422, |
|
"learning_rate": 0.00012791795524676576, |
|
"loss": 5.9849, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.002413663265010572, |
|
"grad_norm": 23.224111557006836, |
|
"learning_rate": 0.00012088576229969385, |
|
"loss": 7.1386, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.002413663265010572, |
|
"eval_loss": 2.9750823974609375, |
|
"eval_runtime": 594.6214, |
|
"eval_samples_per_second": 73.344, |
|
"eval_steps_per_second": 18.336, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.002510209795610995, |
|
"grad_norm": 10.431509017944336, |
|
"learning_rate": 0.0001137664317165683, |
|
"loss": 5.563, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0026067563262114177, |
|
"grad_norm": 12.937127113342285, |
|
"learning_rate": 0.00010659464816035761, |
|
"loss": 5.4473, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0027033028568118405, |
|
"grad_norm": 10.73960018157959, |
|
"learning_rate": 9.940535183964242e-05, |
|
"loss": 5.499, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0027998493874122634, |
|
"grad_norm": 12.294966697692871, |
|
"learning_rate": 9.22335682834317e-05, |
|
"loss": 6.223, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0028963959180126862, |
|
"grad_norm": 25.903270721435547, |
|
"learning_rate": 8.511423770030617e-05, |
|
"loss": 7.2314, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0028963959180126862, |
|
"eval_loss": 2.911633014678955, |
|
"eval_runtime": 591.9056, |
|
"eval_samples_per_second": 73.681, |
|
"eval_steps_per_second": 18.42, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.002992942448613109, |
|
"grad_norm": 13.487019538879395, |
|
"learning_rate": 7.808204475323423e-05, |
|
"loss": 5.2185, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.003089488979213532, |
|
"grad_norm": 19.71841049194336, |
|
"learning_rate": 7.117124957938042e-05, |
|
"loss": 5.3138, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0031860355098139548, |
|
"grad_norm": 18.53927230834961, |
|
"learning_rate": 6.441552087816105e-05, |
|
"loss": 5.4884, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0032825820404143776, |
|
"grad_norm": 17.105976104736328, |
|
"learning_rate": 5.784777188072502e-05, |
|
"loss": 5.8266, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0033791285710148005, |
|
"grad_norm": 32.33679962158203, |
|
"learning_rate": 5.150000000000002e-05, |
|
"loss": 6.5896, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0033791285710148005, |
|
"eval_loss": 2.8866002559661865, |
|
"eval_runtime": 591.3638, |
|
"eval_samples_per_second": 73.748, |
|
"eval_steps_per_second": 18.437, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0034756751016152233, |
|
"grad_norm": 18.319353103637695, |
|
"learning_rate": 4.540313094251309e-05, |
|
"loss": 5.55, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.003572221632215646, |
|
"grad_norm": 17.83504295349121, |
|
"learning_rate": 3.958686804145719e-05, |
|
"loss": 5.0744, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.003668768162816069, |
|
"grad_norm": 22.843358993530273, |
|
"learning_rate": 3.4079547545037634e-05, |
|
"loss": 5.3819, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0037653146934164923, |
|
"grad_norm": 28.16783332824707, |
|
"learning_rate": 2.8908000565118947e-05, |
|
"loss": 5.9155, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.003861861224016915, |
|
"grad_norm": 37.42588424682617, |
|
"learning_rate": 2.4097422358745275e-05, |
|
"loss": 6.9787, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.003861861224016915, |
|
"eval_loss": 2.877124786376953, |
|
"eval_runtime": 595.4552, |
|
"eval_samples_per_second": 73.241, |
|
"eval_steps_per_second": 18.31, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.003958407754617338, |
|
"grad_norm": 13.854421615600586, |
|
"learning_rate": 1.9671249579380422e-05, |
|
"loss": 5.1705, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.00405495428521776, |
|
"grad_norm": 16.01457977294922, |
|
"learning_rate": 1.5651046095888127e-05, |
|
"loss": 5.1288, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.004151500815818184, |
|
"grad_norm": 13.360790252685547, |
|
"learning_rate": 1.205639793553052e-05, |
|
"loss": 5.3445, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.004248047346418606, |
|
"grad_norm": 15.994139671325684, |
|
"learning_rate": 8.904817862812098e-06, |
|
"loss": 5.7003, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.004344593877019029, |
|
"grad_norm": 25.979543685913086, |
|
"learning_rate": 6.211660059051443e-06, |
|
"loss": 6.8299, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.004344593877019029, |
|
"eval_loss": 2.874143123626709, |
|
"eval_runtime": 602.9673, |
|
"eval_samples_per_second": 72.329, |
|
"eval_steps_per_second": 18.082, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.004441140407619453, |
|
"grad_norm": 12.285646438598633, |
|
"learning_rate": 3.990045318353154e-06, |
|
"loss": 5.1388, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.004537686938219875, |
|
"grad_norm": 9.6546049118042, |
|
"learning_rate": 2.250797124418014e-06, |
|
"loss": 5.3243, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.004634233468820298, |
|
"grad_norm": 15.964618682861328, |
|
"learning_rate": 1.0023889196182526e-06, |
|
"loss": 5.4568, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.004730779999420721, |
|
"grad_norm": 16.29486656188965, |
|
"learning_rate": 2.5090282323810766e-07, |
|
"loss": 6.1612, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.004827326530021144, |
|
"grad_norm": 21.052181243896484, |
|
"learning_rate": 0.0, |
|
"loss": 6.7968, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.004827326530021144, |
|
"eval_loss": 2.872375965118408, |
|
"eval_runtime": 592.4933, |
|
"eval_samples_per_second": 73.608, |
|
"eval_steps_per_second": 18.402, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5128671190843392.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|