{ "best_metric": 2.8458566665649414, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.004827326530021144, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.654653060042287e-06, "eval_loss": 3.049283742904663, "eval_runtime": 592.2246, "eval_samples_per_second": 73.641, "eval_steps_per_second": 18.41, "step": 1 }, { "epoch": 9.654653060042287e-05, "grad_norm": 9.588415145874023, "learning_rate": 4.34e-05, "loss": 5.3317, "step": 10 }, { "epoch": 0.00019309306120084574, "grad_norm": 12.097651481628418, "learning_rate": 8.68e-05, "loss": 5.3358, "step": 20 }, { "epoch": 0.00028963959180126864, "grad_norm": 16.885087966918945, "learning_rate": 0.0001302, "loss": 5.4894, "step": 30 }, { "epoch": 0.0003861861224016915, "grad_norm": 25.137943267822266, "learning_rate": 0.0001736, "loss": 6.421, "step": 40 }, { "epoch": 0.0004827326530021144, "grad_norm": 32.7490119934082, "learning_rate": 0.000217, "loss": 8.0108, "step": 50 }, { "epoch": 0.0004827326530021144, "eval_loss": 3.020226240158081, "eval_runtime": 594.5306, "eval_samples_per_second": 73.355, "eval_steps_per_second": 18.339, "step": 50 }, { "epoch": 0.0005792791836025373, "grad_norm": 11.534395217895508, "learning_rate": 0.00021673569945319091, "loss": 5.425, "step": 60 }, { "epoch": 0.0006758257142029601, "grad_norm": 13.132587432861328, "learning_rate": 0.00021594408545846038, "loss": 5.2352, "step": 70 }, { "epoch": 0.000772372244803383, "grad_norm": 17.452167510986328, "learning_rate": 0.0002146290146796179, "loss": 5.6235, "step": 80 }, { "epoch": 0.0008689187754038058, "grad_norm": 12.21745491027832, "learning_rate": 0.0002127968940093076, "loss": 6.2937, "step": 90 }, { "epoch": 0.0009654653060042288, "grad_norm": 23.016115188598633, "learning_rate": 0.00021045664935527106, "loss": 7.6852, "step": 100 }, { "epoch": 0.0009654653060042288, "eval_loss": 3.0362181663513184, "eval_runtime": 593.3315, "eval_samples_per_second": 73.504, "eval_steps_per_second": 18.376, "step": 100 }, { "epoch": 0.0010620118366046515, "grad_norm": 21.253969192504883, "learning_rate": 0.00020761968215422217, "loss": 5.5279, "step": 110 }, { "epoch": 0.0011585583672050746, "grad_norm": 10.032729148864746, "learning_rate": 0.00020429981382519356, "loss": 5.3829, "step": 120 }, { "epoch": 0.0012551048978054974, "grad_norm": 18.771055221557617, "learning_rate": 0.00020051321843297219, "loss": 5.7132, "step": 130 }, { "epoch": 0.0013516514284059203, "grad_norm": 16.89134407043457, "learning_rate": 0.0001962783438896818, "loss": 6.0961, "step": 140 }, { "epoch": 0.0014481979590063431, "grad_norm": 18.406845092773438, "learning_rate": 0.0001916158220784091, "loss": 7.3874, "step": 150 }, { "epoch": 0.0014481979590063431, "eval_loss": 2.9778358936309814, "eval_runtime": 608.5301, "eval_samples_per_second": 71.668, "eval_steps_per_second": 17.917, "step": 150 }, { "epoch": 0.001544744489606766, "grad_norm": 12.62802505493164, "learning_rate": 0.00018654836833674362, "loss": 5.17, "step": 160 }, { "epoch": 0.0016412910202071888, "grad_norm": 12.671950340270996, "learning_rate": 0.0001811006707899361, "loss": 5.1057, "step": 170 }, { "epoch": 0.0017378375508076116, "grad_norm": 9.260117530822754, "learning_rate": 0.0001752992700728339, "loss": 5.3075, "step": 180 }, { "epoch": 0.0018343840814080345, "grad_norm": 13.710508346557617, "learning_rate": 0.00016917243002657602, "loss": 6.106, "step": 190 }, { "epoch": 0.0019309306120084576, "grad_norm": 15.460708618164062, "learning_rate": 0.00016275, "loss": 7.2009, "step": 200 }, { "epoch": 0.0019309306120084576, "eval_loss": 2.9488205909729004, "eval_runtime": 600.8577, "eval_samples_per_second": 72.583, "eval_steps_per_second": 18.146, "step": 200 }, { "epoch": 0.00202747714260888, "grad_norm": 12.144033432006836, "learning_rate": 0.0001560632694266149, "loss": 5.208, "step": 210 }, { "epoch": 0.002124023673209303, "grad_norm": 12.754302978515625, "learning_rate": 0.00014914481538562646, "loss": 5.3033, "step": 220 }, { "epoch": 0.0022205702038097263, "grad_norm": 10.792929649353027, "learning_rate": 0.0001420283438896818, "loss": 5.5286, "step": 230 }, { "epoch": 0.002317116734410149, "grad_norm": 13.988052368164062, "learning_rate": 0.00013474852567256393, "loss": 6.0218, "step": 240 }, { "epoch": 0.002413663265010572, "grad_norm": 26.327573776245117, "learning_rate": 0.00012734082727686196, "loss": 6.9626, "step": 250 }, { "epoch": 0.002413663265010572, "eval_loss": 2.8977274894714355, "eval_runtime": 605.0961, "eval_samples_per_second": 72.075, "eval_steps_per_second": 18.019, "step": 250 }, { "epoch": 0.002510209795610995, "grad_norm": 11.27015209197998, "learning_rate": 0.0001198413382645404, "loss": 5.2217, "step": 260 }, { "epoch": 0.0026067563262114177, "grad_norm": 20.627559661865234, "learning_rate": 0.00011228659539222137, "loss": 5.4243, "step": 270 }, { "epoch": 0.0027033028568118405, "grad_norm": 23.235761642456055, "learning_rate": 0.00010471340460777866, "loss": 5.634, "step": 280 }, { "epoch": 0.0027998493874122634, "grad_norm": 26.621187210083008, "learning_rate": 9.715866173545961e-05, "loss": 5.9205, "step": 290 }, { "epoch": 0.0028963959180126862, "grad_norm": 38.239044189453125, "learning_rate": 8.965917272313806e-05, "loss": 7.0007, "step": 300 }, { "epoch": 0.0028963959180126862, "eval_loss": 2.8962948322296143, "eval_runtime": 602.2623, "eval_samples_per_second": 72.414, "eval_steps_per_second": 18.103, "step": 300 }, { "epoch": 0.002992942448613109, "grad_norm": 25.593109130859375, "learning_rate": 8.225147432743606e-05, "loss": 5.3537, "step": 310 }, { "epoch": 0.003089488979213532, "grad_norm": 20.555158615112305, "learning_rate": 7.497165611031821e-05, "loss": 5.2222, "step": 320 }, { "epoch": 0.0031860355098139548, "grad_norm": 17.040815353393555, "learning_rate": 6.785518461437353e-05, "loss": 5.4117, "step": 330 }, { "epoch": 0.0032825820404143776, "grad_norm": 44.51173400878906, "learning_rate": 6.093673057338509e-05, "loss": 5.9886, "step": 340 }, { "epoch": 0.0033791285710148005, "grad_norm": 21.535146713256836, "learning_rate": 5.4250000000000024e-05, "loss": 6.8052, "step": 350 }, { "epoch": 0.0033791285710148005, "eval_loss": 2.8616268634796143, "eval_runtime": 594.2182, "eval_samples_per_second": 73.394, "eval_steps_per_second": 18.348, "step": 350 }, { "epoch": 0.0034756751016152233, "grad_norm": 15.48707389831543, "learning_rate": 4.782756997342398e-05, "loss": 5.2369, "step": 360 }, { "epoch": 0.003572221632215646, "grad_norm": 12.415227890014648, "learning_rate": 4.170072992716607e-05, "loss": 5.0543, "step": 370 }, { "epoch": 0.003668768162816069, "grad_norm": 25.784727096557617, "learning_rate": 3.5899329210063916e-05, "loss": 5.3615, "step": 380 }, { "epoch": 0.0037653146934164923, "grad_norm": 17.225284576416016, "learning_rate": 3.045163166325637e-05, "loss": 6.0156, "step": 390 }, { "epoch": 0.003861861224016915, "grad_norm": 25.5921573638916, "learning_rate": 2.5384177921590895e-05, "loss": 7.0295, "step": 400 }, { "epoch": 0.003861861224016915, "eval_loss": 2.8515334129333496, "eval_runtime": 594.2913, "eval_samples_per_second": 73.385, "eval_steps_per_second": 18.346, "step": 400 }, { "epoch": 0.003958407754617338, "grad_norm": 17.807329177856445, "learning_rate": 2.0721656110318213e-05, "loss": 4.9179, "step": 410 }, { "epoch": 0.00405495428521776, "grad_norm": 14.401619911193848, "learning_rate": 1.6486781567027783e-05, "loss": 5.0845, "step": 420 }, { "epoch": 0.004151500815818184, "grad_norm": 12.0913667678833, "learning_rate": 1.2700186174806422e-05, "loss": 5.4731, "step": 430 }, { "epoch": 0.004248047346418606, "grad_norm": 20.124168395996094, "learning_rate": 9.380317845777794e-06, "loss": 5.8493, "step": 440 }, { "epoch": 0.004344593877019029, "grad_norm": 19.486072540283203, "learning_rate": 6.543350644728947e-06, "loss": 7.0532, "step": 450 }, { "epoch": 0.004344593877019029, "eval_loss": 2.8470160961151123, "eval_runtime": 631.7626, "eval_samples_per_second": 69.032, "eval_steps_per_second": 17.258, "step": 450 }, { "epoch": 0.004441140407619453, "grad_norm": 12.585321426391602, "learning_rate": 4.2031059906924e-06, "loss": 5.103, "step": 460 }, { "epoch": 0.004537686938219875, "grad_norm": 12.552547454833984, "learning_rate": 2.3709853203820825e-06, "loss": 5.1533, "step": 470 }, { "epoch": 0.004634233468820298, "grad_norm": 13.851476669311523, "learning_rate": 1.0559145415396157e-06, "loss": 5.6343, "step": 480 }, { "epoch": 0.004730779999420721, "grad_norm": 20.92866325378418, "learning_rate": 2.643005468090745e-07, "loss": 6.2419, "step": 490 }, { "epoch": 0.004827326530021144, "grad_norm": 93.0564956665039, "learning_rate": 0.0, "loss": 6.9058, "step": 500 }, { "epoch": 0.004827326530021144, "eval_loss": 2.8458566665649414, "eval_runtime": 599.551, "eval_samples_per_second": 72.741, "eval_steps_per_second": 18.185, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5203897601753088.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }