{
"best_metric": 2.7592430114746094,
"best_model_checkpoint": "miner_id_24/checkpoint-1350",
"epoch": 0.028963959180126862,
"eval_steps": 150,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.9309306120084574e-05,
"eval_loss": 3.049531936645508,
"eval_runtime": 325.6693,
"eval_samples_per_second": 133.915,
"eval_steps_per_second": 33.479,
"step": 1
},
{
"epoch": 0.00019309306120084574,
"grad_norm": 17.593538284301758,
"learning_rate": 0.0001,
"loss": 10.3405,
"step": 10
},
{
"epoch": 0.0003861861224016915,
"grad_norm": 25.346281051635742,
"learning_rate": 0.0001,
"loss": 10.4921,
"step": 20
},
{
"epoch": 0.0005792791836025373,
"grad_norm": 29.28500747680664,
"learning_rate": 0.0001,
"loss": 11.2535,
"step": 30
},
{
"epoch": 0.000772372244803383,
"grad_norm": 25.79044532775879,
"learning_rate": 0.0001,
"loss": 12.771,
"step": 40
},
{
"epoch": 0.0009654653060042288,
"grad_norm": 29.85230827331543,
"learning_rate": 0.0001,
"loss": 15.3004,
"step": 50
},
{
"epoch": 0.0011585583672050746,
"grad_norm": 18.505237579345703,
"learning_rate": 0.0001,
"loss": 10.1685,
"step": 60
},
{
"epoch": 0.0013516514284059203,
"grad_norm": 20.607799530029297,
"learning_rate": 0.0001,
"loss": 10.0184,
"step": 70
},
{
"epoch": 0.001544744489606766,
"grad_norm": 28.849403381347656,
"learning_rate": 0.0001,
"loss": 10.9882,
"step": 80
},
{
"epoch": 0.0017378375508076116,
"grad_norm": 22.832693099975586,
"learning_rate": 0.0001,
"loss": 12.2328,
"step": 90
},
{
"epoch": 0.0019309306120084576,
"grad_norm": 36.24290084838867,
"learning_rate": 0.0001,
"loss": 15.052,
"step": 100
},
{
"epoch": 0.002124023673209303,
"grad_norm": 26.353515625,
"learning_rate": 0.0001,
"loss": 9.8579,
"step": 110
},
{
"epoch": 0.002317116734410149,
"grad_norm": 36.15186309814453,
"learning_rate": 0.0001,
"loss": 10.1559,
"step": 120
},
{
"epoch": 0.002510209795610995,
"grad_norm": 25.59376335144043,
"learning_rate": 0.0001,
"loss": 10.5048,
"step": 130
},
{
"epoch": 0.0027033028568118405,
"grad_norm": 45.6728630065918,
"learning_rate": 0.0001,
"loss": 11.7695,
"step": 140
},
{
"epoch": 0.0028963959180126862,
"grad_norm": 67.35182189941406,
"learning_rate": 0.0001,
"loss": 14.9163,
"step": 150
},
{
"epoch": 0.0028963959180126862,
"eval_loss": 2.915919542312622,
"eval_runtime": 325.4265,
"eval_samples_per_second": 134.015,
"eval_steps_per_second": 33.504,
"step": 150
},
{
"epoch": 0.003089488979213532,
"grad_norm": 23.811389923095703,
"learning_rate": 0.0001,
"loss": 10.1832,
"step": 160
},
{
"epoch": 0.0032825820404143776,
"grad_norm": 33.760498046875,
"learning_rate": 0.0001,
"loss": 10.6903,
"step": 170
},
{
"epoch": 0.0034756751016152233,
"grad_norm": 36.039459228515625,
"learning_rate": 0.0001,
"loss": 10.9393,
"step": 180
},
{
"epoch": 0.003668768162816069,
"grad_norm": 47.975990295410156,
"learning_rate": 0.0001,
"loss": 12.2215,
"step": 190
},
{
"epoch": 0.003861861224016915,
"grad_norm": 44.83605194091797,
"learning_rate": 0.0001,
"loss": 14.0036,
"step": 200
},
{
"epoch": 0.00405495428521776,
"grad_norm": 27.559823989868164,
"learning_rate": 0.0001,
"loss": 10.8778,
"step": 210
},
{
"epoch": 0.004248047346418606,
"grad_norm": 29.685842514038086,
"learning_rate": 0.0001,
"loss": 10.3398,
"step": 220
},
{
"epoch": 0.004441140407619453,
"grad_norm": 32.39301681518555,
"learning_rate": 0.0001,
"loss": 10.8218,
"step": 230
},
{
"epoch": 0.004634233468820298,
"grad_norm": 26.63850212097168,
"learning_rate": 0.0001,
"loss": 11.7435,
"step": 240
},
{
"epoch": 0.004827326530021144,
"grad_norm": 42.46340560913086,
"learning_rate": 0.0001,
"loss": 13.7549,
"step": 250
},
{
"epoch": 0.00502041959122199,
"grad_norm": 41.16633605957031,
"learning_rate": 0.0001,
"loss": 10.0357,
"step": 260
},
{
"epoch": 0.005213512652422835,
"grad_norm": 29.798715591430664,
"learning_rate": 0.0001,
"loss": 9.9566,
"step": 270
},
{
"epoch": 0.005406605713623681,
"grad_norm": 26.712194442749023,
"learning_rate": 0.0001,
"loss": 10.8858,
"step": 280
},
{
"epoch": 0.005599698774824527,
"grad_norm": 24.423017501831055,
"learning_rate": 0.0001,
"loss": 12.0604,
"step": 290
},
{
"epoch": 0.0057927918360253725,
"grad_norm": 32.55687713623047,
"learning_rate": 0.0001,
"loss": 13.7669,
"step": 300
},
{
"epoch": 0.0057927918360253725,
"eval_loss": 2.8850886821746826,
"eval_runtime": 326.0923,
"eval_samples_per_second": 133.741,
"eval_steps_per_second": 33.435,
"step": 300
},
{
"epoch": 0.005985884897226218,
"grad_norm": 17.198266983032227,
"learning_rate": 0.0001,
"loss": 10.4193,
"step": 310
},
{
"epoch": 0.006178977958427064,
"grad_norm": 18.960155487060547,
"learning_rate": 0.0001,
"loss": 10.4948,
"step": 320
},
{
"epoch": 0.0063720710196279095,
"grad_norm": 15.217443466186523,
"learning_rate": 0.0001,
"loss": 10.8575,
"step": 330
},
{
"epoch": 0.006565164080828755,
"grad_norm": 17.301410675048828,
"learning_rate": 0.0001,
"loss": 11.7949,
"step": 340
},
{
"epoch": 0.006758257142029601,
"grad_norm": 44.01465606689453,
"learning_rate": 0.0001,
"loss": 14.4235,
"step": 350
},
{
"epoch": 0.006951350203230447,
"grad_norm": 18.623449325561523,
"learning_rate": 0.0001,
"loss": 10.6294,
"step": 360
},
{
"epoch": 0.007144443264431292,
"grad_norm": 15.859786987304688,
"learning_rate": 0.0001,
"loss": 10.3981,
"step": 370
},
{
"epoch": 0.007337536325632138,
"grad_norm": 15.925846099853516,
"learning_rate": 0.0001,
"loss": 10.6106,
"step": 380
},
{
"epoch": 0.0075306293868329845,
"grad_norm": 23.462480545043945,
"learning_rate": 0.0001,
"loss": 11.4959,
"step": 390
},
{
"epoch": 0.00772372244803383,
"grad_norm": 34.727569580078125,
"learning_rate": 0.0001,
"loss": 13.6495,
"step": 400
},
{
"epoch": 0.007916815509234676,
"grad_norm": 13.482560157775879,
"learning_rate": 0.0001,
"loss": 9.9286,
"step": 410
},
{
"epoch": 0.00810990857043552,
"grad_norm": 19.61520004272461,
"learning_rate": 0.0001,
"loss": 10.4189,
"step": 420
},
{
"epoch": 0.008303001631636367,
"grad_norm": 20.92690658569336,
"learning_rate": 0.0001,
"loss": 10.7233,
"step": 430
},
{
"epoch": 0.008496094692837212,
"grad_norm": 40.37583541870117,
"learning_rate": 0.0001,
"loss": 11.906,
"step": 440
},
{
"epoch": 0.008689187754038059,
"grad_norm": 54.6865234375,
"learning_rate": 0.0001,
"loss": 14.2049,
"step": 450
},
{
"epoch": 0.008689187754038059,
"eval_loss": 2.856499433517456,
"eval_runtime": 326.449,
"eval_samples_per_second": 133.595,
"eval_steps_per_second": 33.399,
"step": 450
},
{
"epoch": 0.008882280815238905,
"grad_norm": 31.493087768554688,
"learning_rate": 0.0001,
"loss": 10.422,
"step": 460
},
{
"epoch": 0.00907537387643975,
"grad_norm": 25.92237091064453,
"learning_rate": 0.0001,
"loss": 10.5121,
"step": 470
},
{
"epoch": 0.009268466937640597,
"grad_norm": 23.767501831054688,
"learning_rate": 0.0001,
"loss": 10.6667,
"step": 480
},
{
"epoch": 0.009461559998841441,
"grad_norm": 29.84052848815918,
"learning_rate": 0.0001,
"loss": 11.8893,
"step": 490
},
{
"epoch": 0.009654653060042288,
"grad_norm": 42.1278190612793,
"learning_rate": 0.0001,
"loss": 13.8487,
"step": 500
},
{
"epoch": 0.009847746121243133,
"grad_norm": 18.233877182006836,
"learning_rate": 0.0001,
"loss": 10.4124,
"step": 510
},
{
"epoch": 0.01004083918244398,
"grad_norm": 24.33700180053711,
"learning_rate": 0.0001,
"loss": 10.3656,
"step": 520
},
{
"epoch": 0.010233932243644824,
"grad_norm": 21.17763900756836,
"learning_rate": 0.0001,
"loss": 10.4647,
"step": 530
},
{
"epoch": 0.01042702530484567,
"grad_norm": 47.72334289550781,
"learning_rate": 0.0001,
"loss": 11.6587,
"step": 540
},
{
"epoch": 0.010620118366046516,
"grad_norm": 32.690364837646484,
"learning_rate": 0.0001,
"loss": 13.7886,
"step": 550
},
{
"epoch": 0.010813211427247362,
"grad_norm": 14.042930603027344,
"learning_rate": 0.0001,
"loss": 10.1801,
"step": 560
},
{
"epoch": 0.011006304488448207,
"grad_norm": 13.466276168823242,
"learning_rate": 0.0001,
"loss": 9.7874,
"step": 570
},
{
"epoch": 0.011199397549649054,
"grad_norm": 13.901522636413574,
"learning_rate": 0.0001,
"loss": 10.7971,
"step": 580
},
{
"epoch": 0.011392490610849898,
"grad_norm": 15.882288932800293,
"learning_rate": 0.0001,
"loss": 11.6839,
"step": 590
},
{
"epoch": 0.011585583672050745,
"grad_norm": 31.395444869995117,
"learning_rate": 0.0001,
"loss": 13.7779,
"step": 600
},
{
"epoch": 0.011585583672050745,
"eval_loss": 2.848609447479248,
"eval_runtime": 328.4742,
"eval_samples_per_second": 132.771,
"eval_steps_per_second": 33.193,
"step": 600
},
{
"epoch": 0.011778676733251591,
"grad_norm": 16.243173599243164,
"learning_rate": 0.0001,
"loss": 10.3243,
"step": 610
},
{
"epoch": 0.011971769794452436,
"grad_norm": 10.895001411437988,
"learning_rate": 0.0001,
"loss": 10.1884,
"step": 620
},
{
"epoch": 0.012164862855653283,
"grad_norm": 12.364214897155762,
"learning_rate": 0.0001,
"loss": 10.5667,
"step": 630
},
{
"epoch": 0.012357955916854128,
"grad_norm": 14.893481254577637,
"learning_rate": 0.0001,
"loss": 11.7055,
"step": 640
},
{
"epoch": 0.012551048978054974,
"grad_norm": 31.42432403564453,
"learning_rate": 0.0001,
"loss": 13.6669,
"step": 650
},
{
"epoch": 0.012744142039255819,
"grad_norm": 11.641709327697754,
"learning_rate": 0.0001,
"loss": 10.4052,
"step": 660
},
{
"epoch": 0.012937235100456666,
"grad_norm": 12.934191703796387,
"learning_rate": 0.0001,
"loss": 10.1611,
"step": 670
},
{
"epoch": 0.01313032816165751,
"grad_norm": 28.218975067138672,
"learning_rate": 0.0001,
"loss": 10.7762,
"step": 680
},
{
"epoch": 0.013323421222858357,
"grad_norm": 16.593141555786133,
"learning_rate": 0.0001,
"loss": 11.8962,
"step": 690
},
{
"epoch": 0.013516514284059202,
"grad_norm": 23.61168098449707,
"learning_rate": 0.0001,
"loss": 13.6789,
"step": 700
},
{
"epoch": 0.013709607345260048,
"grad_norm": 11.673768043518066,
"learning_rate": 0.0001,
"loss": 10.1343,
"step": 710
},
{
"epoch": 0.013902700406460893,
"grad_norm": 13.041866302490234,
"learning_rate": 0.0001,
"loss": 10.4441,
"step": 720
},
{
"epoch": 0.01409579346766174,
"grad_norm": 15.724223136901855,
"learning_rate": 0.0001,
"loss": 10.759,
"step": 730
},
{
"epoch": 0.014288886528862585,
"grad_norm": 15.264642715454102,
"learning_rate": 0.0001,
"loss": 11.8888,
"step": 740
},
{
"epoch": 0.014481979590063431,
"grad_norm": 37.832698822021484,
"learning_rate": 0.0001,
"loss": 13.6694,
"step": 750
},
{
"epoch": 0.014481979590063431,
"eval_loss": 2.8133044242858887,
"eval_runtime": 330.5652,
"eval_samples_per_second": 131.932,
"eval_steps_per_second": 32.983,
"step": 750
},
{
"epoch": 0.014675072651264276,
"grad_norm": 13.753046035766602,
"learning_rate": 0.0001,
"loss": 10.2511,
"step": 760
},
{
"epoch": 0.014868165712465123,
"grad_norm": 12.730433464050293,
"learning_rate": 0.0001,
"loss": 10.2344,
"step": 770
},
{
"epoch": 0.015061258773665969,
"grad_norm": 18.035993576049805,
"learning_rate": 0.0001,
"loss": 10.7321,
"step": 780
},
{
"epoch": 0.015254351834866814,
"grad_norm": 20.784934997558594,
"learning_rate": 0.0001,
"loss": 11.6791,
"step": 790
},
{
"epoch": 0.01544744489606766,
"grad_norm": 29.870254516601562,
"learning_rate": 0.0001,
"loss": 13.6885,
"step": 800
},
{
"epoch": 0.015640537957268507,
"grad_norm": 16.11637306213379,
"learning_rate": 0.0001,
"loss": 10.3119,
"step": 810
},
{
"epoch": 0.015833631018469352,
"grad_norm": 12.562838554382324,
"learning_rate": 0.0001,
"loss": 10.0703,
"step": 820
},
{
"epoch": 0.016026724079670197,
"grad_norm": 11.796956062316895,
"learning_rate": 0.0001,
"loss": 10.7967,
"step": 830
},
{
"epoch": 0.01621981714087104,
"grad_norm": 18.159549713134766,
"learning_rate": 0.0001,
"loss": 11.2847,
"step": 840
},
{
"epoch": 0.01641291020207189,
"grad_norm": 22.098344802856445,
"learning_rate": 0.0001,
"loss": 13.9348,
"step": 850
},
{
"epoch": 0.016606003263272735,
"grad_norm": 10.987359046936035,
"learning_rate": 0.0001,
"loss": 10.1845,
"step": 860
},
{
"epoch": 0.01679909632447358,
"grad_norm": 22.796899795532227,
"learning_rate": 0.0001,
"loss": 10.2937,
"step": 870
},
{
"epoch": 0.016992189385674424,
"grad_norm": 14.24060344696045,
"learning_rate": 0.0001,
"loss": 10.6819,
"step": 880
},
{
"epoch": 0.017185282446875273,
"grad_norm": 14.327189445495605,
"learning_rate": 0.0001,
"loss": 11.411,
"step": 890
},
{
"epoch": 0.017378375508076117,
"grad_norm": 25.815399169921875,
"learning_rate": 0.0001,
"loss": 13.5363,
"step": 900
},
{
"epoch": 0.017378375508076117,
"eval_loss": 2.7856554985046387,
"eval_runtime": 328.0056,
"eval_samples_per_second": 132.961,
"eval_steps_per_second": 33.24,
"step": 900
},
{
"epoch": 0.017571468569276962,
"grad_norm": 14.01270580291748,
"learning_rate": 0.0001,
"loss": 10.0269,
"step": 910
},
{
"epoch": 0.01776456163047781,
"grad_norm": 12.662365913391113,
"learning_rate": 0.0001,
"loss": 10.0288,
"step": 920
},
{
"epoch": 0.017957654691678655,
"grad_norm": 11.21884822845459,
"learning_rate": 0.0001,
"loss": 10.5588,
"step": 930
},
{
"epoch": 0.0181507477528795,
"grad_norm": 14.193058013916016,
"learning_rate": 0.0001,
"loss": 11.3954,
"step": 940
},
{
"epoch": 0.018343840814080345,
"grad_norm": 33.61137008666992,
"learning_rate": 0.0001,
"loss": 13.412,
"step": 950
},
{
"epoch": 0.018536933875281193,
"grad_norm": 8.29185962677002,
"learning_rate": 0.0001,
"loss": 10.1405,
"step": 960
},
{
"epoch": 0.018730026936482038,
"grad_norm": 9.83952522277832,
"learning_rate": 0.0001,
"loss": 9.9334,
"step": 970
},
{
"epoch": 0.018923119997682883,
"grad_norm": 13.321242332458496,
"learning_rate": 0.0001,
"loss": 10.2569,
"step": 980
},
{
"epoch": 0.019116213058883728,
"grad_norm": 11.480195045471191,
"learning_rate": 0.0001,
"loss": 11.5664,
"step": 990
},
{
"epoch": 0.019309306120084576,
"grad_norm": 31.19932746887207,
"learning_rate": 0.0001,
"loss": 13.4732,
"step": 1000
},
{
"epoch": 0.01950239918128542,
"grad_norm": 8.18012809753418,
"learning_rate": 0.0001,
"loss": 9.6682,
"step": 1010
},
{
"epoch": 0.019695492242486266,
"grad_norm": 13.29211139678955,
"learning_rate": 0.0001,
"loss": 9.7695,
"step": 1020
},
{
"epoch": 0.01988858530368711,
"grad_norm": 10.953620910644531,
"learning_rate": 0.0001,
"loss": 10.6678,
"step": 1030
},
{
"epoch": 0.02008167836488796,
"grad_norm": 13.800012588500977,
"learning_rate": 0.0001,
"loss": 11.9463,
"step": 1040
},
{
"epoch": 0.020274771426088804,
"grad_norm": 38.13134765625,
"learning_rate": 0.0001,
"loss": 13.1925,
"step": 1050
},
{
"epoch": 0.020274771426088804,
"eval_loss": 2.779891014099121,
"eval_runtime": 331.3018,
"eval_samples_per_second": 131.638,
"eval_steps_per_second": 32.91,
"step": 1050
},
{
"epoch": 0.02046786448728965,
"grad_norm": 9.869668006896973,
"learning_rate": 0.0001,
"loss": 10.3068,
"step": 1060
},
{
"epoch": 0.020660957548490497,
"grad_norm": 16.215045928955078,
"learning_rate": 0.0001,
"loss": 10.0212,
"step": 1070
},
{
"epoch": 0.02085405060969134,
"grad_norm": 13.320286750793457,
"learning_rate": 0.0001,
"loss": 10.9438,
"step": 1080
},
{
"epoch": 0.021047143670892186,
"grad_norm": 17.76883888244629,
"learning_rate": 0.0001,
"loss": 11.2812,
"step": 1090
},
{
"epoch": 0.02124023673209303,
"grad_norm": 34.5752067565918,
"learning_rate": 0.0001,
"loss": 13.2006,
"step": 1100
},
{
"epoch": 0.02143332979329388,
"grad_norm": 13.33773422241211,
"learning_rate": 0.0001,
"loss": 10.145,
"step": 1110
},
{
"epoch": 0.021626422854494724,
"grad_norm": 13.584674835205078,
"learning_rate": 0.0001,
"loss": 9.9768,
"step": 1120
},
{
"epoch": 0.02181951591569557,
"grad_norm": 16.620107650756836,
"learning_rate": 0.0001,
"loss": 10.4809,
"step": 1130
},
{
"epoch": 0.022012608976896414,
"grad_norm": 20.761350631713867,
"learning_rate": 0.0001,
"loss": 11.4802,
"step": 1140
},
{
"epoch": 0.022205702038097262,
"grad_norm": 42.86295700073242,
"learning_rate": 0.0001,
"loss": 13.3743,
"step": 1150
},
{
"epoch": 0.022398795099298107,
"grad_norm": 14.914039611816406,
"learning_rate": 0.0001,
"loss": 9.9647,
"step": 1160
},
{
"epoch": 0.022591888160498952,
"grad_norm": 18.681671142578125,
"learning_rate": 0.0001,
"loss": 9.6733,
"step": 1170
},
{
"epoch": 0.022784981221699797,
"grad_norm": 16.26973533630371,
"learning_rate": 0.0001,
"loss": 10.5368,
"step": 1180
},
{
"epoch": 0.022978074282900645,
"grad_norm": 18.916370391845703,
"learning_rate": 0.0001,
"loss": 11.4679,
"step": 1190
},
{
"epoch": 0.02317116734410149,
"grad_norm": 43.991939544677734,
"learning_rate": 0.0001,
"loss": 13.6126,
"step": 1200
},
{
"epoch": 0.02317116734410149,
"eval_loss": 2.76770281791687,
"eval_runtime": 324.3376,
"eval_samples_per_second": 134.465,
"eval_steps_per_second": 33.616,
"step": 1200
},
{
"epoch": 0.023364260405302335,
"grad_norm": 19.019067764282227,
"learning_rate": 0.0001,
"loss": 9.9333,
"step": 1210
},
{
"epoch": 0.023557353466503183,
"grad_norm": 14.600743293762207,
"learning_rate": 0.0001,
"loss": 10.1374,
"step": 1220
},
{
"epoch": 0.023750446527704028,
"grad_norm": 16.908267974853516,
"learning_rate": 0.0001,
"loss": 10.7748,
"step": 1230
},
{
"epoch": 0.023943539588904873,
"grad_norm": 25.043933868408203,
"learning_rate": 0.0001,
"loss": 11.5941,
"step": 1240
},
{
"epoch": 0.024136632650105717,
"grad_norm": 31.190759658813477,
"learning_rate": 0.0001,
"loss": 13.5434,
"step": 1250
},
{
"epoch": 0.024329725711306566,
"grad_norm": 11.28165054321289,
"learning_rate": 0.0001,
"loss": 9.8626,
"step": 1260
},
{
"epoch": 0.02452281877250741,
"grad_norm": 12.803401947021484,
"learning_rate": 0.0001,
"loss": 9.9016,
"step": 1270
},
{
"epoch": 0.024715911833708255,
"grad_norm": 10.985638618469238,
"learning_rate": 0.0001,
"loss": 10.5182,
"step": 1280
},
{
"epoch": 0.0249090048949091,
"grad_norm": 14.558553695678711,
"learning_rate": 0.0001,
"loss": 11.4006,
"step": 1290
},
{
"epoch": 0.02510209795610995,
"grad_norm": 28.10079002380371,
"learning_rate": 0.0001,
"loss": 13.2948,
"step": 1300
},
{
"epoch": 0.025295191017310793,
"grad_norm": 10.139495849609375,
"learning_rate": 0.0001,
"loss": 9.7414,
"step": 1310
},
{
"epoch": 0.025488284078511638,
"grad_norm": 12.202920913696289,
"learning_rate": 0.0001,
"loss": 9.915,
"step": 1320
},
{
"epoch": 0.025681377139712483,
"grad_norm": 14.947649002075195,
"learning_rate": 0.0001,
"loss": 10.6419,
"step": 1330
},
{
"epoch": 0.02587447020091333,
"grad_norm": 16.732439041137695,
"learning_rate": 0.0001,
"loss": 11.9169,
"step": 1340
},
{
"epoch": 0.026067563262114176,
"grad_norm": 32.5897216796875,
"learning_rate": 0.0001,
"loss": 13.0356,
"step": 1350
},
{
"epoch": 0.026067563262114176,
"eval_loss": 2.7592430114746094,
"eval_runtime": 326.8015,
"eval_samples_per_second": 133.451,
"eval_steps_per_second": 33.363,
"step": 1350
},
{
"epoch": 0.02626065632331502,
"grad_norm": 10.175602912902832,
"learning_rate": 0.0001,
"loss": 10.0645,
"step": 1360
},
{
"epoch": 0.026453749384515866,
"grad_norm": 13.354422569274902,
"learning_rate": 0.0001,
"loss": 9.9626,
"step": 1370
},
{
"epoch": 0.026646842445716714,
"grad_norm": 14.017874717712402,
"learning_rate": 0.0001,
"loss": 10.6526,
"step": 1380
},
{
"epoch": 0.02683993550691756,
"grad_norm": 15.533336639404297,
"learning_rate": 0.0001,
"loss": 11.3396,
"step": 1390
},
{
"epoch": 0.027033028568118404,
"grad_norm": 21.672401428222656,
"learning_rate": 0.0001,
"loss": 13.0943,
"step": 1400
},
{
"epoch": 0.027226121629319252,
"grad_norm": 11.563956260681152,
"learning_rate": 0.0001,
"loss": 10.1492,
"step": 1410
},
{
"epoch": 0.027419214690520097,
"grad_norm": 9.616212844848633,
"learning_rate": 0.0001,
"loss": 10.1169,
"step": 1420
},
{
"epoch": 0.02761230775172094,
"grad_norm": 14.188048362731934,
"learning_rate": 0.0001,
"loss": 10.3995,
"step": 1430
},
{
"epoch": 0.027805400812921786,
"grad_norm": 13.804783821105957,
"learning_rate": 0.0001,
"loss": 11.1481,
"step": 1440
},
{
"epoch": 0.027998493874122635,
"grad_norm": 23.23021125793457,
"learning_rate": 0.0001,
"loss": 13.1522,
"step": 1450
},
{
"epoch": 0.02819158693532348,
"grad_norm": 7.562139511108398,
"learning_rate": 0.0001,
"loss": 10.1138,
"step": 1460
},
{
"epoch": 0.028384679996524324,
"grad_norm": 26.057680130004883,
"learning_rate": 0.0001,
"loss": 9.7856,
"step": 1470
},
{
"epoch": 0.02857777305772517,
"grad_norm": 12.248312950134277,
"learning_rate": 0.0001,
"loss": 10.4052,
"step": 1480
},
{
"epoch": 0.028770866118926017,
"grad_norm": 14.60325813293457,
"learning_rate": 0.0001,
"loss": 11.6516,
"step": 1490
},
{
"epoch": 0.028963959180126862,
"grad_norm": 52.02046585083008,
"learning_rate": 0.0001,
"loss": 13.2193,
"step": 1500
},
{
"epoch": 0.028963959180126862,
"eval_loss": 2.770695209503174,
"eval_runtime": 330.0972,
"eval_samples_per_second": 132.119,
"eval_steps_per_second": 33.03,
"step": 1500
}
],
"logging_steps": 10,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 1
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.0944605381853184e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}