{ "best_metric": 0.832525908946991, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.23889154323936931, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047778308647873863, "eval_loss": 4.254161834716797, "eval_runtime": 11.5108, "eval_samples_per_second": 76.624, "eval_steps_per_second": 19.199, "step": 1 }, { "epoch": 0.004777830864787387, "grad_norm": 86.97021484375, "learning_rate": 4.34e-05, "loss": 6.7643, "step": 10 }, { "epoch": 0.009555661729574774, "grad_norm": 114.20069885253906, "learning_rate": 8.68e-05, "loss": 4.2519, "step": 20 }, { "epoch": 0.01433349259436216, "grad_norm": 87.37146759033203, "learning_rate": 0.0001302, "loss": 3.5983, "step": 30 }, { "epoch": 0.019111323459149548, "grad_norm": 114.74700927734375, "learning_rate": 0.0001736, "loss": 3.2439, "step": 40 }, { "epoch": 0.023889154323936932, "grad_norm": 97.87152099609375, "learning_rate": 0.000217, "loss": 2.683, "step": 50 }, { "epoch": 0.023889154323936932, "eval_loss": 1.3938980102539062, "eval_runtime": 11.5211, "eval_samples_per_second": 76.555, "eval_steps_per_second": 19.182, "step": 50 }, { "epoch": 0.02866698518872432, "grad_norm": 68.76221466064453, "learning_rate": 0.00021673569945319091, "loss": 2.163, "step": 60 }, { "epoch": 0.033444816053511704, "grad_norm": 85.8083267211914, "learning_rate": 0.00021594408545846038, "loss": 1.941, "step": 70 }, { "epoch": 0.038222646918299095, "grad_norm": 49.4107780456543, "learning_rate": 0.0002146290146796179, "loss": 2.4934, "step": 80 }, { "epoch": 0.04300047778308648, "grad_norm": 45.38887405395508, "learning_rate": 0.0002127968940093076, "loss": 2.7001, "step": 90 }, { "epoch": 0.047778308647873864, "grad_norm": 114.47595977783203, "learning_rate": 0.00021045664935527106, "loss": 2.8237, "step": 100 }, { "epoch": 0.047778308647873864, "eval_loss": 1.3519835472106934, "eval_runtime": 11.475, "eval_samples_per_second": 76.863, "eval_steps_per_second": 19.259, "step": 100 }, { "epoch": 0.05255613951266125, "grad_norm": 163.51150512695312, "learning_rate": 0.00020761968215422217, "loss": 2.9201, "step": 110 }, { "epoch": 0.05733397037744864, "grad_norm": 42.164276123046875, "learning_rate": 0.00020429981382519356, "loss": 2.8884, "step": 120 }, { "epoch": 0.062111801242236024, "grad_norm": 30.880720138549805, "learning_rate": 0.00020051321843297219, "loss": 2.5363, "step": 130 }, { "epoch": 0.06688963210702341, "grad_norm": 61.331912994384766, "learning_rate": 0.0001962783438896818, "loss": 2.4063, "step": 140 }, { "epoch": 0.0716674629718108, "grad_norm": 70.56568145751953, "learning_rate": 0.0001916158220784091, "loss": 2.5402, "step": 150 }, { "epoch": 0.0716674629718108, "eval_loss": 1.248333215713501, "eval_runtime": 11.4863, "eval_samples_per_second": 76.787, "eval_steps_per_second": 19.24, "step": 150 }, { "epoch": 0.07644529383659819, "grad_norm": 33.299007415771484, "learning_rate": 0.00018654836833674362, "loss": 1.8457, "step": 160 }, { "epoch": 0.08122312470138557, "grad_norm": 48.84244918823242, "learning_rate": 0.0001811006707899361, "loss": 2.259, "step": 170 }, { "epoch": 0.08600095556617296, "grad_norm": 16.119049072265625, "learning_rate": 0.0001752992700728339, "loss": 2.4819, "step": 180 }, { "epoch": 0.09077878643096035, "grad_norm": 32.0130500793457, "learning_rate": 0.00016917243002657602, "loss": 2.4841, "step": 190 }, { "epoch": 0.09555661729574773, "grad_norm": 70.85590362548828, "learning_rate": 0.00016275, "loss": 2.8376, "step": 200 }, { "epoch": 0.09555661729574773, "eval_loss": 1.1818195581436157, "eval_runtime": 11.5022, "eval_samples_per_second": 76.681, "eval_steps_per_second": 19.214, "step": 200 }, { "epoch": 0.10033444816053512, "grad_norm": 50.72034454345703, "learning_rate": 0.0001560632694266149, "loss": 1.869, "step": 210 }, { "epoch": 0.1051122790253225, "grad_norm": 42.09200668334961, "learning_rate": 0.00014914481538562646, "loss": 1.7881, "step": 220 }, { "epoch": 0.10989010989010989, "grad_norm": 118.14753723144531, "learning_rate": 0.0001420283438896818, "loss": 2.0519, "step": 230 }, { "epoch": 0.11466794075489728, "grad_norm": 24.315568923950195, "learning_rate": 0.00013474852567256393, "loss": 2.1679, "step": 240 }, { "epoch": 0.11944577161968466, "grad_norm": 70.42790985107422, "learning_rate": 0.00012734082727686196, "loss": 2.4192, "step": 250 }, { "epoch": 0.11944577161968466, "eval_loss": 1.1315124034881592, "eval_runtime": 11.4436, "eval_samples_per_second": 77.074, "eval_steps_per_second": 19.312, "step": 250 }, { "epoch": 0.12422360248447205, "grad_norm": 41.314945220947266, "learning_rate": 0.0001198413382645404, "loss": 1.7405, "step": 260 }, { "epoch": 0.12900143334925943, "grad_norm": 21.883790969848633, "learning_rate": 0.00011228659539222137, "loss": 1.6183, "step": 270 }, { "epoch": 0.13377926421404682, "grad_norm": 14.944008827209473, "learning_rate": 0.00010471340460777866, "loss": 1.7443, "step": 280 }, { "epoch": 0.1385570950788342, "grad_norm": 38.64680099487305, "learning_rate": 9.715866173545961e-05, "loss": 2.1661, "step": 290 }, { "epoch": 0.1433349259436216, "grad_norm": 34.455081939697266, "learning_rate": 8.965917272313806e-05, "loss": 2.2251, "step": 300 }, { "epoch": 0.1433349259436216, "eval_loss": 0.9748860597610474, "eval_runtime": 11.386, "eval_samples_per_second": 77.464, "eval_steps_per_second": 19.41, "step": 300 }, { "epoch": 0.148112756808409, "grad_norm": 20.522775650024414, "learning_rate": 8.225147432743606e-05, "loss": 1.4992, "step": 310 }, { "epoch": 0.15289058767319638, "grad_norm": 75.11093139648438, "learning_rate": 7.497165611031821e-05, "loss": 1.4972, "step": 320 }, { "epoch": 0.15766841853798375, "grad_norm": 37.47844696044922, "learning_rate": 6.785518461437353e-05, "loss": 1.8537, "step": 330 }, { "epoch": 0.16244624940277114, "grad_norm": 23.639490127563477, "learning_rate": 6.093673057338509e-05, "loss": 2.1694, "step": 340 }, { "epoch": 0.16722408026755853, "grad_norm": 54.70100021362305, "learning_rate": 5.4250000000000024e-05, "loss": 2.3268, "step": 350 }, { "epoch": 0.16722408026755853, "eval_loss": 0.9435171484947205, "eval_runtime": 11.5124, "eval_samples_per_second": 76.613, "eval_steps_per_second": 19.197, "step": 350 }, { "epoch": 0.17200191113234592, "grad_norm": 34.21720886230469, "learning_rate": 4.782756997342398e-05, "loss": 1.3658, "step": 360 }, { "epoch": 0.1767797419971333, "grad_norm": 22.63213348388672, "learning_rate": 4.170072992716607e-05, "loss": 1.4596, "step": 370 }, { "epoch": 0.1815575728619207, "grad_norm": 13.130209922790527, "learning_rate": 3.5899329210063916e-05, "loss": 1.6314, "step": 380 }, { "epoch": 0.18633540372670807, "grad_norm": 13.130266189575195, "learning_rate": 3.045163166325637e-05, "loss": 1.9832, "step": 390 }, { "epoch": 0.19111323459149546, "grad_norm": 28.62923812866211, "learning_rate": 2.5384177921590895e-05, "loss": 2.1456, "step": 400 }, { "epoch": 0.19111323459149546, "eval_loss": 0.8851467370986938, "eval_runtime": 11.5188, "eval_samples_per_second": 76.57, "eval_steps_per_second": 19.186, "step": 400 }, { "epoch": 0.19589106545628285, "grad_norm": 12.487257957458496, "learning_rate": 2.0721656110318213e-05, "loss": 1.1234, "step": 410 }, { "epoch": 0.20066889632107024, "grad_norm": 12.503289222717285, "learning_rate": 1.6486781567027783e-05, "loss": 1.2566, "step": 420 }, { "epoch": 0.20544672718585763, "grad_norm": 9.571052551269531, "learning_rate": 1.2700186174806422e-05, "loss": 1.6166, "step": 430 }, { "epoch": 0.210224558050645, "grad_norm": 11.375801086425781, "learning_rate": 9.380317845777794e-06, "loss": 1.9494, "step": 440 }, { "epoch": 0.21500238891543239, "grad_norm": 11.005843162536621, "learning_rate": 6.543350644728947e-06, "loss": 2.0351, "step": 450 }, { "epoch": 0.21500238891543239, "eval_loss": 0.8364555239677429, "eval_runtime": 11.4227, "eval_samples_per_second": 77.215, "eval_steps_per_second": 19.347, "step": 450 }, { "epoch": 0.21978021978021978, "grad_norm": 26.848785400390625, "learning_rate": 4.2031059906924e-06, "loss": 1.0386, "step": 460 }, { "epoch": 0.22455805064500717, "grad_norm": 13.840141296386719, "learning_rate": 2.3709853203820825e-06, "loss": 1.2618, "step": 470 }, { "epoch": 0.22933588150979456, "grad_norm": 37.794471740722656, "learning_rate": 1.0559145415396157e-06, "loss": 1.6539, "step": 480 }, { "epoch": 0.23411371237458195, "grad_norm": 38.33562088012695, "learning_rate": 2.643005468090745e-07, "loss": 1.7703, "step": 490 }, { "epoch": 0.23889154323936931, "grad_norm": 19.850322723388672, "learning_rate": 0.0, "loss": 2.0542, "step": 500 }, { "epoch": 0.23889154323936931, "eval_loss": 0.832525908946991, "eval_runtime": 11.6134, "eval_samples_per_second": 75.947, "eval_steps_per_second": 19.03, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4657011228672000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }