{ "best_metric": 0.8746353387832642, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.23889154323936931, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047778308647873863, "eval_loss": 4.255342483520508, "eval_runtime": 12.2428, "eval_samples_per_second": 72.043, "eval_steps_per_second": 18.051, "step": 1 }, { "epoch": 0.004777830864787387, "grad_norm": 93.89539337158203, "learning_rate": 4.2600000000000005e-05, "loss": 6.865, "step": 10 }, { "epoch": 0.009555661729574774, "grad_norm": 192.6894073486328, "learning_rate": 8.520000000000001e-05, "loss": 4.7086, "step": 20 }, { "epoch": 0.01433349259436216, "grad_norm": 185.80467224121094, "learning_rate": 0.0001278, "loss": 3.7399, "step": 30 }, { "epoch": 0.019111323459149548, "grad_norm": 120.97589874267578, "learning_rate": 0.00017040000000000002, "loss": 3.3705, "step": 40 }, { "epoch": 0.023889154323936932, "grad_norm": 87.03988647460938, "learning_rate": 0.000213, "loss": 3.1617, "step": 50 }, { "epoch": 0.023889154323936932, "eval_loss": 1.551166296005249, "eval_runtime": 12.1727, "eval_samples_per_second": 72.457, "eval_steps_per_second": 18.155, "step": 50 }, { "epoch": 0.02866698518872432, "grad_norm": 99.68428039550781, "learning_rate": 0.00021274057135267128, "loss": 2.759, "step": 60 }, { "epoch": 0.033444816053511704, "grad_norm": 31.421096801757812, "learning_rate": 0.00021196354932097723, "loss": 3.1152, "step": 70 }, { "epoch": 0.038222646918299095, "grad_norm": 17.666555404663086, "learning_rate": 0.0002106727194781503, "loss": 2.3276, "step": 80 }, { "epoch": 0.04300047778308648, "grad_norm": 21.751888275146484, "learning_rate": 0.00020887437061743096, "loss": 2.3262, "step": 90 }, { "epoch": 0.047778308647873864, "grad_norm": 14.296065330505371, "learning_rate": 0.00020657726411369925, "loss": 2.4236, "step": 100 }, { "epoch": 0.047778308647873864, "eval_loss": 1.1035537719726562, "eval_runtime": 12.2578, "eval_samples_per_second": 71.954, "eval_steps_per_second": 18.029, "step": 100 }, { "epoch": 0.05255613951266125, "grad_norm": 31.222625732421875, "learning_rate": 0.000203792591238937, "loss": 1.611, "step": 110 }, { "epoch": 0.05733397037744864, "grad_norm": 108.65278625488281, "learning_rate": 0.0002005339186394757, "loss": 2.0965, "step": 120 }, { "epoch": 0.062111801242236024, "grad_norm": 33.067230224609375, "learning_rate": 0.00019681712224065936, "loss": 2.2948, "step": 130 }, { "epoch": 0.06688963210702341, "grad_norm": 33.02948760986328, "learning_rate": 0.0001926603099009319, "loss": 2.8421, "step": 140 }, { "epoch": 0.0716674629718108, "grad_norm": 15.799519538879395, "learning_rate": 0.00018808373319217114, "loss": 2.8939, "step": 150 }, { "epoch": 0.0716674629718108, "eval_loss": 1.215044379234314, "eval_runtime": 12.0789, "eval_samples_per_second": 73.02, "eval_steps_per_second": 18.296, "step": 150 }, { "epoch": 0.07644529383659819, "grad_norm": 27.139089584350586, "learning_rate": 0.00018310968873606635, "loss": 1.9604, "step": 160 }, { "epoch": 0.08122312470138557, "grad_norm": 37.57059097290039, "learning_rate": 0.0001777624095772184, "loss": 1.692, "step": 170 }, { "epoch": 0.08600095556617296, "grad_norm": 101.7945556640625, "learning_rate": 0.0001720679471221826, "loss": 3.0551, "step": 180 }, { "epoch": 0.09077878643096035, "grad_norm": 65.89530181884766, "learning_rate": 0.00016605404421963453, "loss": 2.2188, "step": 190 }, { "epoch": 0.09555661729574773, "grad_norm": 30.504783630371094, "learning_rate": 0.00015975, "loss": 2.4033, "step": 200 }, { "epoch": 0.09555661729574773, "eval_loss": 1.0835368633270264, "eval_runtime": 12.393, "eval_samples_per_second": 71.169, "eval_steps_per_second": 17.833, "step": 200 }, { "epoch": 0.10033444816053512, "grad_norm": 52.85816192626953, "learning_rate": 0.00015318652713303674, "loss": 1.599, "step": 210 }, { "epoch": 0.1051122790253225, "grad_norm": 19.591501235961914, "learning_rate": 0.00014639560219879464, "loss": 1.7043, "step": 220 }, { "epoch": 0.10989010989010989, "grad_norm": 81.96508026123047, "learning_rate": 0.0001394103099009319, "loss": 2.1468, "step": 230 }, { "epoch": 0.11466794075489728, "grad_norm": 30.87833595275879, "learning_rate": 0.0001322646818813646, "loss": 2.161, "step": 240 }, { "epoch": 0.11944577161968466, "grad_norm": 42.904701232910156, "learning_rate": 0.0001249935309215281, "loss": 2.5615, "step": 250 }, { "epoch": 0.11944577161968466, "eval_loss": 1.1374989748001099, "eval_runtime": 12.4118, "eval_samples_per_second": 71.061, "eval_steps_per_second": 17.806, "step": 250 }, { "epoch": 0.12422360248447205, "grad_norm": 26.0584716796875, "learning_rate": 0.0001176322813380051, "loss": 1.8027, "step": 260 }, { "epoch": 0.12900143334925943, "grad_norm": 19.178482055664062, "learning_rate": 0.00011021679639881638, "loss": 1.6998, "step": 270 }, { "epoch": 0.13377926421404682, "grad_norm": 24.075414657592773, "learning_rate": 0.00010278320360118368, "loss": 2.3066, "step": 280 }, { "epoch": 0.1385570950788342, "grad_norm": 19.714353561401367, "learning_rate": 9.536771866199493e-05, "loss": 2.4782, "step": 290 }, { "epoch": 0.1433349259436216, "grad_norm": 13.794386863708496, "learning_rate": 8.800646907847192e-05, "loss": 2.4282, "step": 300 }, { "epoch": 0.1433349259436216, "eval_loss": 1.0520588159561157, "eval_runtime": 12.5021, "eval_samples_per_second": 70.548, "eval_steps_per_second": 17.677, "step": 300 }, { "epoch": 0.148112756808409, "grad_norm": 67.06897735595703, "learning_rate": 8.07353181186354e-05, "loss": 1.7239, "step": 310 }, { "epoch": 0.15289058767319638, "grad_norm": 51.10677719116211, "learning_rate": 7.35896900990681e-05, "loss": 1.6737, "step": 320 }, { "epoch": 0.15766841853798375, "grad_norm": 54.609580993652344, "learning_rate": 6.660439780120536e-05, "loss": 2.0017, "step": 330 }, { "epoch": 0.16244624940277114, "grad_norm": 39.43727111816406, "learning_rate": 5.981347286696324e-05, "loss": 2.1488, "step": 340 }, { "epoch": 0.16722408026755853, "grad_norm": 30.269664764404297, "learning_rate": 5.325000000000002e-05, "loss": 2.3871, "step": 350 }, { "epoch": 0.16722408026755853, "eval_loss": 0.9656849503517151, "eval_runtime": 12.1772, "eval_samples_per_second": 72.43, "eval_steps_per_second": 18.149, "step": 350 }, { "epoch": 0.17200191113234592, "grad_norm": 30.008037567138672, "learning_rate": 4.6945955780365475e-05, "loss": 1.4687, "step": 360 }, { "epoch": 0.1767797419971333, "grad_norm": 30.746850967407227, "learning_rate": 4.0932052877817393e-05, "loss": 1.5686, "step": 370 }, { "epoch": 0.1815575728619207, "grad_norm": 29.426549911499023, "learning_rate": 3.523759042278163e-05, "loss": 1.6911, "step": 380 }, { "epoch": 0.18633540372670807, "grad_norm": 33.3541145324707, "learning_rate": 2.989031126393367e-05, "loss": 2.0111, "step": 390 }, { "epoch": 0.19111323459149546, "grad_norm": 15.817224502563477, "learning_rate": 2.4916266807828855e-05, "loss": 2.2564, "step": 400 }, { "epoch": 0.19111323459149546, "eval_loss": 0.9130141735076904, "eval_runtime": 12.0386, "eval_samples_per_second": 73.264, "eval_steps_per_second": 18.358, "step": 400 }, { "epoch": 0.19589106545628285, "grad_norm": 40.40257263183594, "learning_rate": 2.033969009906811e-05, "loss": 1.4211, "step": 410 }, { "epoch": 0.20066889632107024, "grad_norm": 28.919811248779297, "learning_rate": 1.6182877759340637e-05, "loss": 1.5233, "step": 420 }, { "epoch": 0.20544672718585763, "grad_norm": 18.431303024291992, "learning_rate": 1.2466081360524275e-05, "loss": 1.7242, "step": 430 }, { "epoch": 0.210224558050645, "grad_norm": 15.968626022338867, "learning_rate": 9.207408761062996e-06, "loss": 1.9384, "step": 440 }, { "epoch": 0.21500238891543239, "grad_norm": 35.02082061767578, "learning_rate": 6.422735886300764e-06, "loss": 2.1484, "step": 450 }, { "epoch": 0.21500238891543239, "eval_loss": 0.8792083859443665, "eval_runtime": 12.2699, "eval_samples_per_second": 71.883, "eval_steps_per_second": 18.011, "step": 450 }, { "epoch": 0.21978021978021978, "grad_norm": 28.86961555480957, "learning_rate": 4.125629382569038e-06, "loss": 1.3139, "step": 460 }, { "epoch": 0.22455805064500717, "grad_norm": 38.519046783447266, "learning_rate": 2.327280521849694e-06, "loss": 1.4227, "step": 470 }, { "epoch": 0.22933588150979456, "grad_norm": 20.339561462402344, "learning_rate": 1.0364506790227565e-06, "loss": 1.6595, "step": 480 }, { "epoch": 0.23411371237458195, "grad_norm": 55.318058013916016, "learning_rate": 2.5942864732872295e-07, "loss": 1.9445, "step": 490 }, { "epoch": 0.23889154323936931, "grad_norm": 15.225406646728516, "learning_rate": 0.0, "loss": 2.0612, "step": 500 }, { "epoch": 0.23889154323936931, "eval_loss": 0.8746353387832642, "eval_runtime": 12.2579, "eval_samples_per_second": 71.953, "eval_steps_per_second": 18.029, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4657011228672000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }