|
{ |
|
"best_metric": 2.8458566665649414, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.004827326530021144, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 9.654653060042287e-06, |
|
"eval_loss": 3.049283742904663, |
|
"eval_runtime": 592.2246, |
|
"eval_samples_per_second": 73.641, |
|
"eval_steps_per_second": 18.41, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 9.654653060042287e-05, |
|
"grad_norm": 9.588415145874023, |
|
"learning_rate": 4.34e-05, |
|
"loss": 5.3317, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00019309306120084574, |
|
"grad_norm": 12.097651481628418, |
|
"learning_rate": 8.68e-05, |
|
"loss": 5.3358, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.00028963959180126864, |
|
"grad_norm": 16.885087966918945, |
|
"learning_rate": 0.0001302, |
|
"loss": 5.4894, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0003861861224016915, |
|
"grad_norm": 25.137943267822266, |
|
"learning_rate": 0.0001736, |
|
"loss": 6.421, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0004827326530021144, |
|
"grad_norm": 32.7490119934082, |
|
"learning_rate": 0.000217, |
|
"loss": 8.0108, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0004827326530021144, |
|
"eval_loss": 3.020226240158081, |
|
"eval_runtime": 594.5306, |
|
"eval_samples_per_second": 73.355, |
|
"eval_steps_per_second": 18.339, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0005792791836025373, |
|
"grad_norm": 11.534395217895508, |
|
"learning_rate": 0.00021673569945319091, |
|
"loss": 5.425, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0006758257142029601, |
|
"grad_norm": 13.132587432861328, |
|
"learning_rate": 0.00021594408545846038, |
|
"loss": 5.2352, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.000772372244803383, |
|
"grad_norm": 17.452167510986328, |
|
"learning_rate": 0.0002146290146796179, |
|
"loss": 5.6235, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0008689187754038058, |
|
"grad_norm": 12.21745491027832, |
|
"learning_rate": 0.0002127968940093076, |
|
"loss": 6.2937, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0009654653060042288, |
|
"grad_norm": 23.016115188598633, |
|
"learning_rate": 0.00021045664935527106, |
|
"loss": 7.6852, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0009654653060042288, |
|
"eval_loss": 3.0362181663513184, |
|
"eval_runtime": 593.3315, |
|
"eval_samples_per_second": 73.504, |
|
"eval_steps_per_second": 18.376, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0010620118366046515, |
|
"grad_norm": 21.253969192504883, |
|
"learning_rate": 0.00020761968215422217, |
|
"loss": 5.5279, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0011585583672050746, |
|
"grad_norm": 10.032729148864746, |
|
"learning_rate": 0.00020429981382519356, |
|
"loss": 5.3829, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0012551048978054974, |
|
"grad_norm": 18.771055221557617, |
|
"learning_rate": 0.00020051321843297219, |
|
"loss": 5.7132, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0013516514284059203, |
|
"grad_norm": 16.89134407043457, |
|
"learning_rate": 0.0001962783438896818, |
|
"loss": 6.0961, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0014481979590063431, |
|
"grad_norm": 18.406845092773438, |
|
"learning_rate": 0.0001916158220784091, |
|
"loss": 7.3874, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0014481979590063431, |
|
"eval_loss": 2.9778358936309814, |
|
"eval_runtime": 608.5301, |
|
"eval_samples_per_second": 71.668, |
|
"eval_steps_per_second": 17.917, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.001544744489606766, |
|
"grad_norm": 12.62802505493164, |
|
"learning_rate": 0.00018654836833674362, |
|
"loss": 5.17, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0016412910202071888, |
|
"grad_norm": 12.671950340270996, |
|
"learning_rate": 0.0001811006707899361, |
|
"loss": 5.1057, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0017378375508076116, |
|
"grad_norm": 9.260117530822754, |
|
"learning_rate": 0.0001752992700728339, |
|
"loss": 5.3075, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0018343840814080345, |
|
"grad_norm": 13.710508346557617, |
|
"learning_rate": 0.00016917243002657602, |
|
"loss": 6.106, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.0019309306120084576, |
|
"grad_norm": 15.460708618164062, |
|
"learning_rate": 0.00016275, |
|
"loss": 7.2009, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0019309306120084576, |
|
"eval_loss": 2.9488205909729004, |
|
"eval_runtime": 600.8577, |
|
"eval_samples_per_second": 72.583, |
|
"eval_steps_per_second": 18.146, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.00202747714260888, |
|
"grad_norm": 12.144033432006836, |
|
"learning_rate": 0.0001560632694266149, |
|
"loss": 5.208, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.002124023673209303, |
|
"grad_norm": 12.754302978515625, |
|
"learning_rate": 0.00014914481538562646, |
|
"loss": 5.3033, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0022205702038097263, |
|
"grad_norm": 10.792929649353027, |
|
"learning_rate": 0.0001420283438896818, |
|
"loss": 5.5286, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.002317116734410149, |
|
"grad_norm": 13.988052368164062, |
|
"learning_rate": 0.00013474852567256393, |
|
"loss": 6.0218, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.002413663265010572, |
|
"grad_norm": 26.327573776245117, |
|
"learning_rate": 0.00012734082727686196, |
|
"loss": 6.9626, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.002413663265010572, |
|
"eval_loss": 2.8977274894714355, |
|
"eval_runtime": 605.0961, |
|
"eval_samples_per_second": 72.075, |
|
"eval_steps_per_second": 18.019, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.002510209795610995, |
|
"grad_norm": 11.27015209197998, |
|
"learning_rate": 0.0001198413382645404, |
|
"loss": 5.2217, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0026067563262114177, |
|
"grad_norm": 20.627559661865234, |
|
"learning_rate": 0.00011228659539222137, |
|
"loss": 5.4243, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.0027033028568118405, |
|
"grad_norm": 23.235761642456055, |
|
"learning_rate": 0.00010471340460777866, |
|
"loss": 5.634, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0027998493874122634, |
|
"grad_norm": 26.621187210083008, |
|
"learning_rate": 9.715866173545961e-05, |
|
"loss": 5.9205, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.0028963959180126862, |
|
"grad_norm": 38.239044189453125, |
|
"learning_rate": 8.965917272313806e-05, |
|
"loss": 7.0007, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0028963959180126862, |
|
"eval_loss": 2.8962948322296143, |
|
"eval_runtime": 602.2623, |
|
"eval_samples_per_second": 72.414, |
|
"eval_steps_per_second": 18.103, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.002992942448613109, |
|
"grad_norm": 25.593109130859375, |
|
"learning_rate": 8.225147432743606e-05, |
|
"loss": 5.3537, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.003089488979213532, |
|
"grad_norm": 20.555158615112305, |
|
"learning_rate": 7.497165611031821e-05, |
|
"loss": 5.2222, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0031860355098139548, |
|
"grad_norm": 17.040815353393555, |
|
"learning_rate": 6.785518461437353e-05, |
|
"loss": 5.4117, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.0032825820404143776, |
|
"grad_norm": 44.51173400878906, |
|
"learning_rate": 6.093673057338509e-05, |
|
"loss": 5.9886, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0033791285710148005, |
|
"grad_norm": 21.535146713256836, |
|
"learning_rate": 5.4250000000000024e-05, |
|
"loss": 6.8052, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0033791285710148005, |
|
"eval_loss": 2.8616268634796143, |
|
"eval_runtime": 594.2182, |
|
"eval_samples_per_second": 73.394, |
|
"eval_steps_per_second": 18.348, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0034756751016152233, |
|
"grad_norm": 15.48707389831543, |
|
"learning_rate": 4.782756997342398e-05, |
|
"loss": 5.2369, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.003572221632215646, |
|
"grad_norm": 12.415227890014648, |
|
"learning_rate": 4.170072992716607e-05, |
|
"loss": 5.0543, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.003668768162816069, |
|
"grad_norm": 25.784727096557617, |
|
"learning_rate": 3.5899329210063916e-05, |
|
"loss": 5.3615, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0037653146934164923, |
|
"grad_norm": 17.225284576416016, |
|
"learning_rate": 3.045163166325637e-05, |
|
"loss": 6.0156, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.003861861224016915, |
|
"grad_norm": 25.5921573638916, |
|
"learning_rate": 2.5384177921590895e-05, |
|
"loss": 7.0295, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.003861861224016915, |
|
"eval_loss": 2.8515334129333496, |
|
"eval_runtime": 594.2913, |
|
"eval_samples_per_second": 73.385, |
|
"eval_steps_per_second": 18.346, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.003958407754617338, |
|
"grad_norm": 17.807329177856445, |
|
"learning_rate": 2.0721656110318213e-05, |
|
"loss": 4.9179, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.00405495428521776, |
|
"grad_norm": 14.401619911193848, |
|
"learning_rate": 1.6486781567027783e-05, |
|
"loss": 5.0845, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.004151500815818184, |
|
"grad_norm": 12.0913667678833, |
|
"learning_rate": 1.2700186174806422e-05, |
|
"loss": 5.4731, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.004248047346418606, |
|
"grad_norm": 20.124168395996094, |
|
"learning_rate": 9.380317845777794e-06, |
|
"loss": 5.8493, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.004344593877019029, |
|
"grad_norm": 19.486072540283203, |
|
"learning_rate": 6.543350644728947e-06, |
|
"loss": 7.0532, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.004344593877019029, |
|
"eval_loss": 2.8470160961151123, |
|
"eval_runtime": 631.7626, |
|
"eval_samples_per_second": 69.032, |
|
"eval_steps_per_second": 17.258, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.004441140407619453, |
|
"grad_norm": 12.585321426391602, |
|
"learning_rate": 4.2031059906924e-06, |
|
"loss": 5.103, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.004537686938219875, |
|
"grad_norm": 12.552547454833984, |
|
"learning_rate": 2.3709853203820825e-06, |
|
"loss": 5.1533, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.004634233468820298, |
|
"grad_norm": 13.851476669311523, |
|
"learning_rate": 1.0559145415396157e-06, |
|
"loss": 5.6343, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.004730779999420721, |
|
"grad_norm": 20.92866325378418, |
|
"learning_rate": 2.643005468090745e-07, |
|
"loss": 6.2419, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.004827326530021144, |
|
"grad_norm": 93.0564956665039, |
|
"learning_rate": 0.0, |
|
"loss": 6.9058, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.004827326530021144, |
|
"eval_loss": 2.8458566665649414, |
|
"eval_runtime": 599.551, |
|
"eval_samples_per_second": 72.741, |
|
"eval_steps_per_second": 18.185, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5203897601753088.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|