{ "best_metric": 2.872375965118408, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.004827326530021144, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.654653060042287e-06, "eval_loss": 3.0490081310272217, "eval_runtime": 599.4901, "eval_samples_per_second": 72.748, "eval_steps_per_second": 18.187, "step": 1 }, { "epoch": 9.654653060042287e-05, "grad_norm": 9.069512367248535, "learning_rate": 4.12e-05, "loss": 5.2993, "step": 10 }, { "epoch": 0.00019309306120084574, "grad_norm": 12.16044807434082, "learning_rate": 8.24e-05, "loss": 5.2249, "step": 20 }, { "epoch": 0.00028963959180126864, "grad_norm": 14.883082389831543, "learning_rate": 0.0001236, "loss": 5.539, "step": 30 }, { "epoch": 0.0003861861224016915, "grad_norm": 16.756444931030273, "learning_rate": 0.0001648, "loss": 6.6809, "step": 40 }, { "epoch": 0.0004827326530021144, "grad_norm": 19.068893432617188, "learning_rate": 0.000206, "loss": 7.8567, "step": 50 }, { "epoch": 0.0004827326530021144, "eval_loss": 3.0399818420410156, "eval_runtime": 598.5166, "eval_samples_per_second": 72.867, "eval_steps_per_second": 18.217, "step": 50 }, { "epoch": 0.0005792791836025373, "grad_norm": 26.745712280273438, "learning_rate": 0.0002057490971767619, "loss": 5.1007, "step": 60 }, { "epoch": 0.0006758257142029601, "grad_norm": 39.823272705078125, "learning_rate": 0.00020499761108038175, "loss": 5.3979, "step": 70 }, { "epoch": 0.000772372244803383, "grad_norm": 35.91092300415039, "learning_rate": 0.00020374920287558198, "loss": 5.5911, "step": 80 }, { "epoch": 0.0008689187754038058, "grad_norm": 69.34488677978516, "learning_rate": 0.00020200995468164684, "loss": 6.2759, "step": 90 }, { "epoch": 0.0009654653060042288, "grad_norm": 22.20043182373047, "learning_rate": 0.00019978833994094855, "loss": 7.4368, "step": 100 }, { "epoch": 0.0009654653060042288, "eval_loss": 3.062361240386963, "eval_runtime": 601.2603, "eval_samples_per_second": 72.534, "eval_steps_per_second": 18.134, "step": 100 }, { "epoch": 0.0010620118366046515, "grad_norm": 32.1593132019043, "learning_rate": 0.00019709518213718787, "loss": 5.3581, "step": 110 }, { "epoch": 0.0011585583672050746, "grad_norm": 26.68710708618164, "learning_rate": 0.00019394360206446948, "loss": 5.1333, "step": 120 }, { "epoch": 0.0012551048978054974, "grad_norm": 22.648284912109375, "learning_rate": 0.00019034895390411186, "loss": 5.7831, "step": 130 }, { "epoch": 0.0013516514284059203, "grad_norm": 23.975393295288086, "learning_rate": 0.0001863287504206196, "loss": 6.2112, "step": 140 }, { "epoch": 0.0014481979590063431, "grad_norm": 48.25133514404297, "learning_rate": 0.00018190257764125471, "loss": 7.9916, "step": 150 }, { "epoch": 0.0014481979590063431, "eval_loss": 3.3823323249816895, "eval_runtime": 600.8813, "eval_samples_per_second": 72.58, "eval_steps_per_second": 18.145, "step": 150 }, { "epoch": 0.001544744489606766, "grad_norm": 12.734755516052246, "learning_rate": 0.00017709199943488106, "loss": 5.746, "step": 160 }, { "epoch": 0.0016412910202071888, "grad_norm": 19.810941696166992, "learning_rate": 0.00017192045245496238, "loss": 5.7286, "step": 170 }, { "epoch": 0.0017378375508076116, "grad_norm": 19.0549259185791, "learning_rate": 0.00016641313195854277, "loss": 5.7489, "step": 180 }, { "epoch": 0.0018343840814080345, "grad_norm": 13.497769355773926, "learning_rate": 0.0001605968690574869, "loss": 6.5552, "step": 190 }, { "epoch": 0.0019309306120084576, 
"grad_norm": 48.43019485473633, "learning_rate": 0.0001545, "loss": 7.6411, "step": 200 }, { "epoch": 0.0019309306120084576, "eval_loss": 3.024033784866333, "eval_runtime": 598.9829, "eval_samples_per_second": 72.81, "eval_steps_per_second": 18.203, "step": 200 }, { "epoch": 0.00202747714260888, "grad_norm": 16.620399475097656, "learning_rate": 0.00014815222811927496, "loss": 5.3396, "step": 210 }, { "epoch": 0.002124023673209303, "grad_norm": 14.036992073059082, "learning_rate": 0.00014158447912183896, "loss": 5.2628, "step": 220 }, { "epoch": 0.0022205702038097263, "grad_norm": 13.502534866333008, "learning_rate": 0.00013482875042061958, "loss": 5.545, "step": 230 }, { "epoch": 0.002317116734410149, "grad_norm": 15.779216766357422, "learning_rate": 0.00012791795524676576, "loss": 5.9849, "step": 240 }, { "epoch": 0.002413663265010572, "grad_norm": 23.224111557006836, "learning_rate": 0.00012088576229969385, "loss": 7.1386, "step": 250 }, { "epoch": 0.002413663265010572, "eval_loss": 2.9750823974609375, "eval_runtime": 594.6214, "eval_samples_per_second": 73.344, "eval_steps_per_second": 18.336, "step": 250 }, { "epoch": 0.002510209795610995, "grad_norm": 10.431509017944336, "learning_rate": 0.0001137664317165683, "loss": 5.563, "step": 260 }, { "epoch": 0.0026067563262114177, "grad_norm": 12.937127113342285, "learning_rate": 0.00010659464816035761, "loss": 5.4473, "step": 270 }, { "epoch": 0.0027033028568118405, "grad_norm": 10.73960018157959, "learning_rate": 9.940535183964242e-05, "loss": 5.499, "step": 280 }, { "epoch": 0.0027998493874122634, "grad_norm": 12.294966697692871, "learning_rate": 9.22335682834317e-05, "loss": 6.223, "step": 290 }, { "epoch": 0.0028963959180126862, "grad_norm": 25.903270721435547, "learning_rate": 8.511423770030617e-05, "loss": 7.2314, "step": 300 }, { "epoch": 0.0028963959180126862, "eval_loss": 2.911633014678955, "eval_runtime": 591.9056, "eval_samples_per_second": 73.681, "eval_steps_per_second": 18.42, "step": 300 }, { "epoch": 0.002992942448613109, "grad_norm": 13.487019538879395, "learning_rate": 7.808204475323423e-05, "loss": 5.2185, "step": 310 }, { "epoch": 0.003089488979213532, "grad_norm": 19.71841049194336, "learning_rate": 7.117124957938042e-05, "loss": 5.3138, "step": 320 }, { "epoch": 0.0031860355098139548, "grad_norm": 18.53927230834961, "learning_rate": 6.441552087816105e-05, "loss": 5.4884, "step": 330 }, { "epoch": 0.0032825820404143776, "grad_norm": 17.105976104736328, "learning_rate": 5.784777188072502e-05, "loss": 5.8266, "step": 340 }, { "epoch": 0.0033791285710148005, "grad_norm": 32.33679962158203, "learning_rate": 5.150000000000002e-05, "loss": 6.5896, "step": 350 }, { "epoch": 0.0033791285710148005, "eval_loss": 2.8866002559661865, "eval_runtime": 591.3638, "eval_samples_per_second": 73.748, "eval_steps_per_second": 18.437, "step": 350 }, { "epoch": 0.0034756751016152233, "grad_norm": 18.319353103637695, "learning_rate": 4.540313094251309e-05, "loss": 5.55, "step": 360 }, { "epoch": 0.003572221632215646, "grad_norm": 17.83504295349121, "learning_rate": 3.958686804145719e-05, "loss": 5.0744, "step": 370 }, { "epoch": 0.003668768162816069, "grad_norm": 22.843358993530273, "learning_rate": 3.4079547545037634e-05, "loss": 5.3819, "step": 380 }, { "epoch": 0.0037653146934164923, "grad_norm": 28.16783332824707, "learning_rate": 2.8908000565118947e-05, "loss": 5.9155, "step": 390 }, { "epoch": 0.003861861224016915, "grad_norm": 37.42588424682617, "learning_rate": 2.4097422358745275e-05, "loss": 6.9787, "step": 400 }, { "epoch": 
0.003861861224016915, "eval_loss": 2.877124786376953, "eval_runtime": 595.4552, "eval_samples_per_second": 73.241, "eval_steps_per_second": 18.31, "step": 400 }, { "epoch": 0.003958407754617338, "grad_norm": 13.854421615600586, "learning_rate": 1.9671249579380422e-05, "loss": 5.1705, "step": 410 }, { "epoch": 0.00405495428521776, "grad_norm": 16.01457977294922, "learning_rate": 1.5651046095888127e-05, "loss": 5.1288, "step": 420 }, { "epoch": 0.004151500815818184, "grad_norm": 13.360790252685547, "learning_rate": 1.205639793553052e-05, "loss": 5.3445, "step": 430 }, { "epoch": 0.004248047346418606, "grad_norm": 15.994139671325684, "learning_rate": 8.904817862812098e-06, "loss": 5.7003, "step": 440 }, { "epoch": 0.004344593877019029, "grad_norm": 25.979543685913086, "learning_rate": 6.211660059051443e-06, "loss": 6.8299, "step": 450 }, { "epoch": 0.004344593877019029, "eval_loss": 2.874143123626709, "eval_runtime": 602.9673, "eval_samples_per_second": 72.329, "eval_steps_per_second": 18.082, "step": 450 }, { "epoch": 0.004441140407619453, "grad_norm": 12.285646438598633, "learning_rate": 3.990045318353154e-06, "loss": 5.1388, "step": 460 }, { "epoch": 0.004537686938219875, "grad_norm": 9.6546049118042, "learning_rate": 2.250797124418014e-06, "loss": 5.3243, "step": 470 }, { "epoch": 0.004634233468820298, "grad_norm": 15.964618682861328, "learning_rate": 1.0023889196182526e-06, "loss": 5.4568, "step": 480 }, { "epoch": 0.004730779999420721, "grad_norm": 16.29486656188965, "learning_rate": 2.5090282323810766e-07, "loss": 6.1612, "step": 490 }, { "epoch": 0.004827326530021144, "grad_norm": 21.052181243896484, "learning_rate": 0.0, "loss": 6.7968, "step": 500 }, { "epoch": 0.004827326530021144, "eval_loss": 2.872375965118408, "eval_runtime": 592.4933, "eval_samples_per_second": 73.608, "eval_steps_per_second": 18.402, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5128671190843392.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }