{ "best_metric": 0.8320402503013611, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.23889154323936931, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047778308647873863, "eval_loss": 4.255329608917236, "eval_runtime": 12.7971, "eval_samples_per_second": 68.922, "eval_steps_per_second": 17.27, "step": 1 }, { "epoch": 0.004777830864787387, "grad_norm": 138.58535766601562, "learning_rate": 4.2800000000000004e-05, "loss": 6.9317, "step": 10 }, { "epoch": 0.009555661729574774, "grad_norm": 131.84646606445312, "learning_rate": 8.560000000000001e-05, "loss": 4.5285, "step": 20 }, { "epoch": 0.01433349259436216, "grad_norm": 46.312652587890625, "learning_rate": 0.0001284, "loss": 3.299, "step": 30 }, { "epoch": 0.019111323459149548, "grad_norm": 38.920799255371094, "learning_rate": 0.00017120000000000001, "loss": 2.7357, "step": 40 }, { "epoch": 0.023889154323936932, "grad_norm": 42.04233169555664, "learning_rate": 0.000214, "loss": 2.9014, "step": 50 }, { "epoch": 0.023889154323936932, "eval_loss": 1.3917949199676514, "eval_runtime": 12.3884, "eval_samples_per_second": 71.196, "eval_steps_per_second": 17.839, "step": 50 }, { "epoch": 0.02866698518872432, "grad_norm": 102.38275146484375, "learning_rate": 0.00021373935337780118, "loss": 2.403, "step": 60 }, { "epoch": 0.033444816053511704, "grad_norm": 527.3989868164062, "learning_rate": 0.00021295868335534802, "loss": 2.5263, "step": 70 }, { "epoch": 0.038222646918299095, "grad_norm": 188.73924255371094, "learning_rate": 0.0002116617932785172, "loss": 3.2151, "step": 80 }, { "epoch": 0.04300047778308648, "grad_norm": 119.0323486328125, "learning_rate": 0.00020985500146540012, "loss": 3.5938, "step": 90 }, { "epoch": 0.047778308647873864, "grad_norm": 170.87159729003906, "learning_rate": 0.0002075471104240922, "loss": 4.0067, "step": 100 }, { "epoch": 0.047778308647873864, "eval_loss": 2.311336040496826, "eval_runtime": 13.4445, "eval_samples_per_second": 65.603, "eval_steps_per_second": 16.438, "step": 100 }, { "epoch": 0.05255613951266125, "grad_norm": 32.53251647949219, "learning_rate": 0.00020474936396775828, "loss": 10.3052, "step": 110 }, { "epoch": 0.05733397037744864, "grad_norm": 65.40621185302734, "learning_rate": 0.00020147539243590517, "loss": 7.4235, "step": 120 }, { "epoch": 0.062111801242236024, "grad_norm": 20.591093063354492, "learning_rate": 0.00019774114628873756, "loss": 3.2373, "step": 130 }, { "epoch": 0.06688963210702341, "grad_norm": 31.703140258789062, "learning_rate": 0.00019356481839811937, "loss": 2.8812, "step": 140 }, { "epoch": 0.0716674629718108, "grad_norm": 34.521854400634766, "learning_rate": 0.00018896675541373064, "loss": 3.2874, "step": 150 }, { "epoch": 0.0716674629718108, "eval_loss": 1.4771889448165894, "eval_runtime": 12.9239, "eval_samples_per_second": 68.246, "eval_steps_per_second": 17.1, "step": 150 }, { "epoch": 0.07644529383659819, "grad_norm": 28.358295440673828, "learning_rate": 0.00018396935863623567, "loss": 2.4476, "step": 160 }, { "epoch": 0.08122312470138557, "grad_norm": 21.966550827026367, "learning_rate": 0.00017859697488039784, "loss": 2.4135, "step": 170 }, { "epoch": 0.08600095556617296, "grad_norm": 32.07853317260742, "learning_rate": 0.00017287577785984542, "loss": 2.2956, "step": 180 }, { "epoch": 0.09077878643096035, "grad_norm": 17.3485107421875, "learning_rate": 0.0001668336406713699, "loss": 2.2849, "step": 190 }, { "epoch": 0.09555661729574773, "grad_norm": 14.762402534484863, "learning_rate": 0.0001605, "loss": 2.4335, "step": 200 }, { "epoch": 0.09555661729574773, "eval_loss": 1.0888148546218872, "eval_runtime": 12.7061, "eval_samples_per_second": 69.415, "eval_steps_per_second": 17.393, "step": 200 }, { "epoch": 0.10033444816053512, "grad_norm": 140.28663635253906, "learning_rate": 0.00015390571270643128, "loss": 1.7442, "step": 210 }, { "epoch": 0.1051122790253225, "grad_norm": 134.5390167236328, "learning_rate": 0.0001470829054955026, "loss": 1.9716, "step": 220 }, { "epoch": 0.10989010989010989, "grad_norm": 25.001611709594727, "learning_rate": 0.00014006481839811937, "loss": 2.0693, "step": 230 }, { "epoch": 0.11466794075489728, "grad_norm": 33.87446594238281, "learning_rate": 0.00013288564282916442, "loss": 2.4731, "step": 240 }, { "epoch": 0.11944577161968466, "grad_norm": 56.331687927246094, "learning_rate": 0.00012558035501036158, "loss": 2.7371, "step": 250 }, { "epoch": 0.11944577161968466, "eval_loss": 1.124778389930725, "eval_runtime": 12.2143, "eval_samples_per_second": 72.21, "eval_steps_per_second": 18.094, "step": 250 }, { "epoch": 0.12422360248447205, "grad_norm": 20.330785751342773, "learning_rate": 0.00011818454556963892, "loss": 1.6651, "step": 260 }, { "epoch": 0.12900143334925943, "grad_norm": 14.653603553771973, "learning_rate": 0.00011073424614716762, "loss": 1.6375, "step": 270 }, { "epoch": 0.13377926421404682, "grad_norm": 9.812971115112305, "learning_rate": 0.00010326575385283242, "loss": 2.0153, "step": 280 }, { "epoch": 0.1385570950788342, "grad_norm": 15.295802116394043, "learning_rate": 9.58154544303611e-05, "loss": 2.2795, "step": 290 }, { "epoch": 0.1433349259436216, "grad_norm": 12.265714645385742, "learning_rate": 8.841964498963846e-05, "loss": 2.2539, "step": 300 }, { "epoch": 0.1433349259436216, "eval_loss": 0.9771943092346191, "eval_runtime": 12.2008, "eval_samples_per_second": 72.291, "eval_steps_per_second": 18.114, "step": 300 }, { "epoch": 0.148112756808409, "grad_norm": 14.376371383666992, "learning_rate": 8.111435717083556e-05, "loss": 1.5526, "step": 310 }, { "epoch": 0.15289058767319638, "grad_norm": 9.20234203338623, "learning_rate": 7.393518160188063e-05, "loss": 1.572, "step": 320 }, { "epoch": 0.15766841853798375, "grad_norm": 11.955562591552734, "learning_rate": 6.69170945044974e-05, "loss": 1.8717, "step": 330 }, { "epoch": 0.16244624940277114, "grad_norm": 8.291068077087402, "learning_rate": 6.009428729356871e-05, "loss": 1.9432, "step": 340 }, { "epoch": 0.16722408026755853, "grad_norm": 21.525508880615234, "learning_rate": 5.3500000000000026e-05, "loss": 2.3061, "step": 350 }, { "epoch": 0.16722408026755853, "eval_loss": 0.9378606677055359, "eval_runtime": 12.1993, "eval_samples_per_second": 72.3, "eval_steps_per_second": 18.116, "step": 350 }, { "epoch": 0.17200191113234592, "grad_norm": 11.051887512207031, "learning_rate": 4.7166359328630106e-05, "loss": 1.5252, "step": 360 }, { "epoch": 0.1767797419971333, "grad_norm": 16.316925048828125, "learning_rate": 4.112422214015456e-05, "loss": 1.4291, "step": 370 }, { "epoch": 0.1815575728619207, "grad_norm": 34.651214599609375, "learning_rate": 3.5403025119602206e-05, "loss": 1.7447, "step": 380 }, { "epoch": 0.18633540372670807, "grad_norm": 15.149808883666992, "learning_rate": 3.0030641363764346e-05, "loss": 1.9199, "step": 390 }, { "epoch": 0.19111323459149546, "grad_norm": 13.764430046081543, "learning_rate": 2.5033244586269365e-05, "loss": 2.0217, "step": 400 }, { "epoch": 0.19111323459149546, "eval_loss": 0.8595229387283325, "eval_runtime": 12.2031, "eval_samples_per_second": 72.277, "eval_steps_per_second": 18.11, "step": 400 }, { "epoch": 0.19589106545628285, "grad_norm": 24.64581871032715, "learning_rate": 2.0435181601880635e-05, "loss": 1.1874, "step": 410 }, { "epoch": 0.20066889632107024, "grad_norm": 17.642654418945312, "learning_rate": 1.625885371126242e-05, "loss": 1.5195, "step": 420 }, { "epoch": 0.20544672718585763, "grad_norm": 17.003334045410156, "learning_rate": 1.2524607564094813e-05, "loss": 1.8292, "step": 430 }, { "epoch": 0.210224558050645, "grad_norm": 13.311195373535156, "learning_rate": 9.250636032241695e-06, "loss": 1.8517, "step": 440 }, { "epoch": 0.21500238891543239, "grad_norm": 12.355936050415039, "learning_rate": 6.45288957590781e-06, "loss": 2.2005, "step": 450 }, { "epoch": 0.21500238891543239, "eval_loss": 0.837338387966156, "eval_runtime": 12.301, "eval_samples_per_second": 71.701, "eval_steps_per_second": 17.966, "step": 450 }, { "epoch": 0.21978021978021978, "grad_norm": 11.863606452941895, "learning_rate": 4.144998534599878e-06, "loss": 1.1789, "step": 460 }, { "epoch": 0.22455805064500717, "grad_norm": 20.274431228637695, "learning_rate": 2.3382067214827915e-06, "loss": 1.3979, "step": 470 }, { "epoch": 0.22933588150979456, "grad_norm": 10.406599998474121, "learning_rate": 1.0413166446519713e-06, "loss": 1.7026, "step": 480 }, { "epoch": 0.23411371237458195, "grad_norm": 9.674276351928711, "learning_rate": 2.6064662219881083e-07, "loss": 1.941, "step": 490 }, { "epoch": 0.23889154323936931, "grad_norm": 12.152741432189941, "learning_rate": 0.0, "loss": 2.1046, "step": 500 }, { "epoch": 0.23889154323936931, "eval_loss": 0.8320402503013611, "eval_runtime": 12.1319, "eval_samples_per_second": 72.701, "eval_steps_per_second": 18.216, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4657011228672000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }