{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 1350,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007407407407407408,
      "grad_norm": 1.5567783117294312,
      "learning_rate": 1.4814814814814815e-05,
      "loss": 1.5566,
      "step": 1
    },
    {
      "epoch": 0.037037037037037035,
      "grad_norm": 1.6362663507461548,
      "learning_rate": 7.407407407407407e-05,
      "loss": 1.5121,
      "step": 5
    },
    {
      "epoch": 0.07407407407407407,
      "grad_norm": 0.9970850944519043,
      "learning_rate": 0.00014814814814814815,
      "loss": 1.2449,
      "step": 10
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 0.688450813293457,
      "learning_rate": 0.0002222222222222222,
      "loss": 0.8674,
      "step": 15
    },
    {
      "epoch": 0.14814814814814814,
      "grad_norm": 0.4350931942462921,
      "learning_rate": 0.0002962962962962963,
      "loss": 0.7491,
      "step": 20
    },
    {
      "epoch": 0.18518518518518517,
      "grad_norm": 0.377754271030426,
      "learning_rate": 0.00037037037037037035,
      "loss": 0.6649,
      "step": 25
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 0.19140222668647766,
      "learning_rate": 0.0004444444444444444,
      "loss": 0.6335,
      "step": 30
    },
    {
      "epoch": 0.25925925925925924,
      "grad_norm": 0.16032004356384277,
      "learning_rate": 0.0005185185185185185,
      "loss": 0.6068,
      "step": 35
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.15596596896648407,
      "learning_rate": 0.0005925925925925926,
      "loss": 0.5878,
      "step": 40
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.12440178543329239,
      "learning_rate": 0.0006666666666666666,
      "loss": 0.564,
      "step": 45
    },
    {
      "epoch": 0.37037037037037035,
      "grad_norm": 0.13573868572711945,
      "learning_rate": 0.0007407407407407407,
      "loss": 0.56,
      "step": 50
    },
    {
      "epoch": 0.4074074074074074,
      "grad_norm": 0.10565729439258575,
      "learning_rate": 0.0008148148148148148,
      "loss": 0.5418,
      "step": 55
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 0.11647824943065643,
      "learning_rate": 0.0008888888888888888,
      "loss": 0.5356,
      "step": 60
    },
    {
      "epoch": 0.48148148148148145,
      "grad_norm": 0.10235206037759781,
      "learning_rate": 0.0009629629629629629,
      "loss": 0.5294,
      "step": 65
    },
    {
      "epoch": 0.5185185185185185,
      "grad_norm": 0.17172235250473022,
      "learning_rate": 0.001037037037037037,
      "loss": 0.5216,
      "step": 70
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 0.12478260695934296,
      "learning_rate": 0.0011111111111111111,
      "loss": 0.5218,
      "step": 75
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.11295782774686813,
      "learning_rate": 0.0011851851851851852,
      "loss": 0.5144,
      "step": 80
    },
    {
      "epoch": 0.6296296296296297,
      "grad_norm": 0.1066703274846077,
      "learning_rate": 0.0012592592592592592,
      "loss": 0.5154,
      "step": 85
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.0921861082315445,
      "learning_rate": 0.0013333333333333333,
      "loss": 0.5003,
      "step": 90
    },
    {
      "epoch": 0.7037037037037037,
      "grad_norm": 0.09984983503818512,
      "learning_rate": 0.0014074074074074076,
      "loss": 0.5042,
      "step": 95
    },
    {
      "epoch": 0.7407407407407407,
      "grad_norm": 0.13562288880348206,
      "learning_rate": 0.0014814814814814814,
      "loss": 0.5003,
      "step": 100
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 0.09528646618127823,
      "learning_rate": 0.0015555555555555557,
      "loss": 0.4996,
      "step": 105
    },
    {
      "epoch": 0.8148148148148148,
      "grad_norm": 0.14511393010616302,
      "learning_rate": 0.0016296296296296295,
      "loss": 0.4929,
      "step": 110
    },
    {
      "epoch": 0.8518518518518519,
      "grad_norm": 0.0997464582324028,
      "learning_rate": 0.0017037037037037038,
      "loss": 0.4906,
      "step": 115
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.14288371801376343,
      "learning_rate": 0.0017777777777777776,
      "loss": 0.4924,
      "step": 120
    },
    {
      "epoch": 0.9259259259259259,
      "grad_norm": 0.12957949936389923,
      "learning_rate": 0.001851851851851852,
      "loss": 0.4879,
      "step": 125
    },
    {
      "epoch": 0.9629629629629629,
      "grad_norm": 0.10427533835172653,
      "learning_rate": 0.0019259259259259258,
      "loss": 0.4856,
      "step": 130
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.11492197960615158,
      "learning_rate": 0.002,
      "loss": 0.4861,
      "step": 135
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.2494666576385498,
      "eval_runtime": 1.452,
      "eval_samples_per_second": 2.755,
      "eval_steps_per_second": 0.689,
      "step": 135
    },
    {
      "epoch": 1.037037037037037,
      "grad_norm": 0.20287227630615234,
      "learning_rate": 0.0019999164298554373,
      "loss": 0.4759,
      "step": 140
    },
    {
      "epoch": 1.074074074074074,
      "grad_norm": 0.10903054475784302,
      "learning_rate": 0.0019996657333896874,
      "loss": 0.4752,
      "step": 145
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.09817971289157867,
      "learning_rate": 0.00199924795250423,
      "loss": 0.4771,
      "step": 150
    },
    {
      "epoch": 1.1481481481481481,
      "grad_norm": 0.10313747823238373,
      "learning_rate": 0.001998663157027083,
      "loss": 0.471,
      "step": 155
    },
    {
      "epoch": 1.1851851851851851,
      "grad_norm": 0.10586988180875778,
      "learning_rate": 0.001997911444701132,
      "loss": 0.4675,
      "step": 160
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.10676765441894531,
      "learning_rate": 0.001996992941167792,
      "loss": 0.4743,
      "step": 165
    },
    {
      "epoch": 1.2592592592592593,
      "grad_norm": 0.09956058114767075,
      "learning_rate": 0.0019959077999460095,
      "loss": 0.4682,
      "step": 170
    },
    {
      "epoch": 1.2962962962962963,
      "grad_norm": 0.08940426260232925,
      "learning_rate": 0.0019946562024066015,
      "loss": 0.4667,
      "step": 175
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.09569676965475082,
      "learning_rate": 0.001993238357741943,
      "loss": 0.4742,
      "step": 180
    },
    {
      "epoch": 1.3703703703703702,
      "grad_norm": 0.1325625479221344,
      "learning_rate": 0.0019916545029310014,
      "loss": 0.4735,
      "step": 185
    },
    {
      "epoch": 1.4074074074074074,
      "grad_norm": 0.09514327347278595,
      "learning_rate": 0.0019899049026997273,
      "loss": 0.473,
      "step": 190
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.10093410313129425,
      "learning_rate": 0.001987989849476809,
      "loss": 0.4693,
      "step": 195
    },
    {
      "epoch": 1.4814814814814814,
      "grad_norm": 0.10901673138141632,
      "learning_rate": 0.0019859096633447963,
      "loss": 0.4666,
      "step": 200
    },
    {
      "epoch": 1.5185185185185186,
      "grad_norm": 0.10339543968439102,
      "learning_rate": 0.001983664691986601,
      "loss": 0.4712,
      "step": 205
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 0.0933491587638855,
      "learning_rate": 0.001981255310627385,
      "loss": 0.4692,
      "step": 210
    },
    {
      "epoch": 1.5925925925925926,
      "grad_norm": 0.09102772176265717,
      "learning_rate": 0.0019786819219718443,
      "loss": 0.4666,
      "step": 215
    },
    {
      "epoch": 1.6296296296296298,
      "grad_norm": 0.08935283869504929,
      "learning_rate": 0.0019759449561369035,
      "loss": 0.4605,
      "step": 220
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.10174907743930817,
      "learning_rate": 0.0019730448705798237,
      "loss": 0.465,
      "step": 225
    },
    {
      "epoch": 1.7037037037037037,
      "grad_norm": 0.09616223722696304,
      "learning_rate": 0.0019699821500217436,
      "loss": 0.4684,
      "step": 230
    },
    {
      "epoch": 1.7407407407407407,
      "grad_norm": 0.09224896878004074,
      "learning_rate": 0.001966757306366662,
      "loss": 0.468,
      "step": 235
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 0.09403350204229355,
      "learning_rate": 0.0019633708786158804,
      "loss": 0.4658,
      "step": 240
    },
    {
      "epoch": 1.8148148148148149,
      "grad_norm": 0.09764115512371063,
      "learning_rate": 0.001959823432777912,
      "loss": 0.4682,
      "step": 245
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 0.08690910041332245,
      "learning_rate": 0.0019561155617738796,
      "loss": 0.4679,
      "step": 250
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 0.10882115364074707,
      "learning_rate": 0.0019522478853384153,
      "loss": 0.46,
      "step": 255
    },
    {
      "epoch": 1.925925925925926,
      "grad_norm": 0.10638313740491867,
      "learning_rate": 0.0019482210499160765,
      "loss": 0.4601,
      "step": 260
    },
    {
      "epoch": 1.9629629629629628,
      "grad_norm": 0.1394616961479187,
      "learning_rate": 0.0019440357285533,
      "loss": 0.4649,
      "step": 265
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.11232441663742065,
      "learning_rate": 0.0019396926207859084,
      "loss": 0.458,
      "step": 270
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.2390342950820923,
      "eval_runtime": 1.4571,
      "eval_samples_per_second": 2.745,
      "eval_steps_per_second": 0.686,
      "step": 270
    },
    {
      "epoch": 2.037037037037037,
      "grad_norm": 0.11058547347784042,
      "learning_rate": 0.0019351924525221897,
      "loss": 0.4336,
      "step": 275
    },
    {
      "epoch": 2.074074074074074,
      "grad_norm": 0.10051153600215912,
      "learning_rate": 0.0019305359759215685,
      "loss": 0.4345,
      "step": 280
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 0.09568148106336594,
      "learning_rate": 0.0019257239692688907,
      "loss": 0.4356,
      "step": 285
    },
    {
      "epoch": 2.148148148148148,
      "grad_norm": 0.09497468173503876,
      "learning_rate": 0.0019207572368443383,
      "loss": 0.44,
      "step": 290
    },
    {
      "epoch": 2.185185185185185,
      "grad_norm": 0.10157410800457001,
      "learning_rate": 0.001915636608789006,
      "loss": 0.439,
      "step": 295
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.1098947674036026,
      "learning_rate": 0.0019103629409661467,
      "loss": 0.44,
      "step": 300
    },
    {
      "epoch": 2.259259259259259,
      "grad_norm": 0.09858433902263641,
      "learning_rate": 0.0019049371148181253,
      "loss": 0.4466,
      "step": 305
    },
    {
      "epoch": 2.2962962962962963,
      "grad_norm": 0.09663781523704529,
      "learning_rate": 0.0018993600372190932,
      "loss": 0.4408,
      "step": 310
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.100925512611866,
      "learning_rate": 0.0018936326403234123,
      "loss": 0.4393,
      "step": 315
    },
    {
      "epoch": 2.3703703703703702,
      "grad_norm": 0.10806870460510254,
      "learning_rate": 0.0018877558814098562,
      "loss": 0.4393,
      "step": 320
    },
    {
      "epoch": 2.4074074074074074,
      "grad_norm": 0.10387130826711655,
      "learning_rate": 0.001881730742721608,
      "loss": 0.4445,
      "step": 325
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.09329431504011154,
      "learning_rate": 0.0018755582313020908,
      "loss": 0.439,
      "step": 330
    },
    {
      "epoch": 2.4814814814814814,
      "grad_norm": 0.09396202117204666,
      "learning_rate": 0.0018692393788266478,
      "loss": 0.4379,
      "step": 335
    },
    {
      "epoch": 2.5185185185185186,
      "grad_norm": 0.10305757075548172,
      "learning_rate": 0.0018627752414301084,
      "loss": 0.4412,
      "step": 340
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 0.1034080982208252,
      "learning_rate": 0.0018561668995302665,
      "loss": 0.4445,
      "step": 345
    },
    {
      "epoch": 2.5925925925925926,
      "grad_norm": 0.09808813035488129,
      "learning_rate": 0.0018494154576472975,
      "loss": 0.4454,
      "step": 350
    },
    {
      "epoch": 2.6296296296296298,
      "grad_norm": 0.0979294404387474,
      "learning_rate": 0.0018425220442191495,
      "loss": 0.4498,
      "step": 355
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.09154608845710754,
      "learning_rate": 0.0018354878114129364,
      "loss": 0.4416,
      "step": 360
    },
    {
      "epoch": 2.7037037037037037,
      "grad_norm": 0.0931951180100441,
      "learning_rate": 0.0018283139349323631,
      "loss": 0.4439,
      "step": 365
    },
    {
      "epoch": 2.7407407407407405,
      "grad_norm": 0.09334207326173782,
      "learning_rate": 0.0018210016138212187,
      "loss": 0.4443,
      "step": 370
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.092356376349926,
      "learning_rate": 0.0018135520702629675,
      "loss": 0.4429,
      "step": 375
    },
    {
      "epoch": 2.814814814814815,
      "grad_norm": 0.09269876778125763,
      "learning_rate": 0.0018059665493764742,
      "loss": 0.4443,
      "step": 380
    },
    {
      "epoch": 2.851851851851852,
      "grad_norm": 0.10383119434118271,
      "learning_rate": 0.0017982463190078929,
      "loss": 0.4436,
      "step": 385
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.0941496342420578,
      "learning_rate": 0.0017903926695187593,
      "loss": 0.4456,
      "step": 390
    },
    {
      "epoch": 2.925925925925926,
      "grad_norm": 0.09286876022815704,
      "learning_rate": 0.0017824069135703197,
      "loss": 0.445,
      "step": 395
    },
    {
      "epoch": 2.962962962962963,
      "grad_norm": 0.09152144938707352,
      "learning_rate": 0.0017742903859041324,
      "loss": 0.4499,
      "step": 400
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.09554123133420944,
      "learning_rate": 0.001766044443118978,
      "loss": 0.4423,
      "step": 405
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.2548832893371582,
      "eval_runtime": 1.4432,
      "eval_samples_per_second": 2.772,
      "eval_steps_per_second": 0.693,
      "step": 405
    },
    {
      "epoch": 3.037037037037037,
      "grad_norm": 0.0980718806385994,
      "learning_rate": 0.001757670463444118,
      "loss": 0.4156,
      "step": 410
    },
    {
      "epoch": 3.074074074074074,
      "grad_norm": 0.09404606372117996,
      "learning_rate": 0.0017491698465089362,
      "loss": 0.4121,
      "step": 415
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.11583345383405685,
      "learning_rate": 0.0017405440131090047,
      "loss": 0.4172,
      "step": 420
    },
    {
      "epoch": 3.148148148148148,
      "grad_norm": 0.09314367175102234,
      "learning_rate": 0.0017317944049686123,
      "loss": 0.4166,
      "step": 425
    },
    {
      "epoch": 3.185185185185185,
      "grad_norm": 0.1007940024137497,
      "learning_rate": 0.001722922484499793,
      "loss": 0.4123,
      "step": 430
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 0.0978422611951828,
      "learning_rate": 0.0017139297345578992,
      "loss": 0.4199,
      "step": 435
    },
    {
      "epoch": 3.259259259259259,
      "grad_norm": 0.09395431727170944,
      "learning_rate": 0.0017048176581937562,
      "loss": 0.4133,
      "step": 440
    },
    {
      "epoch": 3.2962962962962963,
      "grad_norm": 0.09783247113227844,
      "learning_rate": 0.0016955877784024418,
      "loss": 0.4253,
      "step": 445
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.0955362394452095,
      "learning_rate": 0.0016862416378687337,
      "loss": 0.4204,
      "step": 450
    },
    {
      "epoch": 3.3703703703703702,
      "grad_norm": 0.09792539477348328,
      "learning_rate": 0.001676780798709262,
      "loss": 0.4232,
      "step": 455
    },
    {
      "epoch": 3.4074074074074074,
      "grad_norm": 0.09116431325674057,
      "learning_rate": 0.0016672068422114196,
      "loss": 0.4211,
      "step": 460
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.1013198271393776,
      "learning_rate": 0.0016575213685690638,
      "loss": 0.4202,
      "step": 465
    },
    {
      "epoch": 3.4814814814814814,
      "grad_norm": 0.09579209238290787,
      "learning_rate": 0.0016477259966150588,
      "loss": 0.4268,
      "step": 470
    },
    {
      "epoch": 3.5185185185185186,
      "grad_norm": 0.09915994852781296,
      "learning_rate": 0.001637822363550706,
      "loss": 0.4274,
      "step": 475
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.09255506843328476,
      "learning_rate": 0.0016278121246720988,
      "loss": 0.4237,
      "step": 480
    },
    {
      "epoch": 3.5925925925925926,
      "grad_norm": 0.09609281271696091,
      "learning_rate": 0.001617696953093457,
      "loss": 0.4287,
      "step": 485
    },
    {
      "epoch": 3.6296296296296298,
      "grad_norm": 0.10273302346467972,
      "learning_rate": 0.0016074785394674836,
      "loss": 0.4301,
      "step": 490
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.09646812826395035,
      "learning_rate": 0.0015971585917027862,
      "loss": 0.4282,
      "step": 495
    },
    {
      "epoch": 3.7037037037037037,
      "grad_norm": 0.09089499711990356,
      "learning_rate": 0.001586738834678418,
      "loss": 0.425,
      "step": 500
    },
    {
      "epoch": 3.7407407407407405,
      "grad_norm": 0.08903223276138306,
      "learning_rate": 0.0015762210099555802,
      "loss": 0.4222,
      "step": 505
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.09616349637508392,
      "learning_rate": 0.0015656068754865387,
      "loss": 0.4243,
      "step": 510
    },
    {
      "epoch": 3.814814814814815,
      "grad_norm": 0.0883878841996193,
      "learning_rate": 0.001554898205320797,
      "loss": 0.4278,
      "step": 515
    },
    {
      "epoch": 3.851851851851852,
      "grad_norm": 0.0974721610546112,
      "learning_rate": 0.0015440967893085827,
      "loss": 0.4239,
      "step": 520
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 0.09724871814250946,
      "learning_rate": 0.0015332044328016914,
      "loss": 0.4269,
      "step": 525
    },
    {
      "epoch": 3.925925925925926,
      "grad_norm": 0.09783729910850525,
      "learning_rate": 0.0015222229563517384,
      "loss": 0.4289,
      "step": 530
    },
    {
      "epoch": 3.962962962962963,
      "grad_norm": 0.09811478108167648,
      "learning_rate": 0.0015111541954058731,
      "loss": 0.4265,
      "step": 535
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.09431273490190506,
      "learning_rate": 0.0015,
      "loss": 0.4244,
      "step": 540
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.266510009765625,
      "eval_runtime": 1.4433,
      "eval_samples_per_second": 2.771,
      "eval_steps_per_second": 0.693,
      "step": 540
    },
    {
      "epoch": 4.037037037037037,
      "grad_norm": 0.09484298527240753,
      "learning_rate": 0.0014887622344495642,
      "loss": 0.3886,
      "step": 545
    },
    {
      "epoch": 4.074074074074074,
      "grad_norm": 0.09763394296169281,
      "learning_rate": 0.001477442777037949,
      "loss": 0.3867,
      "step": 550
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 0.09587694704532623,
      "learning_rate": 0.001466043519702539,
      "loss": 0.3934,
      "step": 555
    },
    {
      "epoch": 4.148148148148148,
      "grad_norm": 0.09805364161729813,
      "learning_rate": 0.0014545663677185006,
      "loss": 0.3913,
      "step": 560
    },
    {
      "epoch": 4.185185185185185,
      "grad_norm": 0.09342525154352188,
      "learning_rate": 0.0014430132393803352,
      "loss": 0.3962,
      "step": 565
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.09320686757564545,
      "learning_rate": 0.0014313860656812536,
      "loss": 0.393,
      "step": 570
    },
    {
      "epoch": 4.2592592592592595,
      "grad_norm": 0.09312278032302856,
      "learning_rate": 0.001419686789990429,
      "loss": 0.3956,
      "step": 575
    },
    {
      "epoch": 4.296296296296296,
      "grad_norm": 0.09693987667560577,
      "learning_rate": 0.0014079173677281835,
      "loss": 0.4003,
      "step": 580
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.09647519141435623,
      "learning_rate": 0.001396079766039157,
      "loss": 0.4026,
      "step": 585
    },
    {
      "epoch": 4.37037037037037,
      "grad_norm": 0.09753508865833282,
      "learning_rate": 0.0013841759634635176,
      "loss": 0.4047,
      "step": 590
    },
    {
      "epoch": 4.407407407407407,
      "grad_norm": 0.09481139481067657,
      "learning_rate": 0.00137220794960627,
      "loss": 0.4028,
      "step": 595
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.0947929173707962,
      "learning_rate": 0.0013601777248047106,
      "loss": 0.4018,
      "step": 600
    },
    {
      "epoch": 4.481481481481482,
      "grad_norm": 0.10154122114181519,
      "learning_rate": 0.0013480872997940906,
      "loss": 0.4019,
      "step": 605
    },
    {
      "epoch": 4.518518518518518,
      "grad_norm": 0.09206040948629379,
      "learning_rate": 0.0013359386953715423,
      "loss": 0.401,
      "step": 610
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.0979878157377243,
      "learning_rate": 0.0013237339420583212,
      "loss": 0.4046,
      "step": 615
    },
    {
      "epoch": 4.592592592592593,
      "grad_norm": 0.09499992430210114,
      "learning_rate": 0.0013114750797604247,
      "loss": 0.4092,
      "step": 620
    },
    {
      "epoch": 4.62962962962963,
      "grad_norm": 0.096099354326725,
      "learning_rate": 0.0012991641574276419,
      "loss": 0.4066,
      "step": 625
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.09620559960603714,
      "learning_rate": 0.0012868032327110904,
      "loss": 0.4075,
      "step": 630
    },
    {
      "epoch": 4.703703703703704,
      "grad_norm": 0.09676729887723923,
      "learning_rate": 0.0012743943716193016,
      "loss": 0.4026,
      "step": 635
    },
    {
      "epoch": 4.7407407407407405,
      "grad_norm": 0.09284202009439468,
      "learning_rate": 0.0012619396481729059,
      "loss": 0.4055,
      "step": 640
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 0.09661979228258133,
      "learning_rate": 0.0012494411440579815,
      "loss": 0.4068,
      "step": 645
    },
    {
      "epoch": 4.814814814814815,
      "grad_norm": 0.09566741436719894,
      "learning_rate": 0.001236900948278119,
      "loss": 0.4021,
      "step": 650
    },
    {
      "epoch": 4.851851851851852,
      "grad_norm": 0.09348838031291962,
      "learning_rate": 0.0012243211568052678,
      "loss": 0.4024,
      "step": 655
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.09663164615631104,
      "learning_rate": 0.0012117038722294108,
      "loss": 0.4062,
      "step": 660
    },
    {
      "epoch": 4.925925925925926,
      "grad_norm": 0.09402230381965637,
      "learning_rate": 0.0011990512034071405,
      "loss": 0.4028,
      "step": 665
    },
    {
      "epoch": 4.962962962962963,
      "grad_norm": 0.10056190937757492,
      "learning_rate": 0.0011863652651091822,
      "loss": 0.4074,
      "step": 670
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.09207284450531006,
      "learning_rate": 0.0011736481776669307,
      "loss": 0.4051,
      "step": 675
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.271437644958496,
      "eval_runtime": 1.4527,
      "eval_samples_per_second": 2.753,
      "eval_steps_per_second": 0.688,
      "step": 675
    },
    {
      "epoch": 5.037037037037037,
      "grad_norm": 0.10166580975055695,
      "learning_rate": 0.0011609020666180573,
      "loss": 0.369,
      "step": 680
    },
    {
      "epoch": 5.074074074074074,
      "grad_norm": 0.10062967240810394,
      "learning_rate": 0.001148129062351249,
      "loss": 0.3679,
      "step": 685
    },
    {
      "epoch": 5.111111111111111,
      "grad_norm": 0.09751473367214203,
      "learning_rate": 0.0011353312997501312,
      "loss": 0.3679,
      "step": 690
    },
    {
      "epoch": 5.148148148148148,
      "grad_norm": 0.0990433469414711,
      "learning_rate": 0.0011225109178364455,
      "loss": 0.3685,
      "step": 695
    },
    {
      "epoch": 5.185185185185185,
      "grad_norm": 0.099190853536129,
      "learning_rate": 0.0011096700594125316,
      "loss": 0.3736,
      "step": 700
    },
    {
      "epoch": 5.222222222222222,
      "grad_norm": 0.1061362773180008,
      "learning_rate": 0.0010968108707031792,
      "loss": 0.3751,
      "step": 705
    },
    {
      "epoch": 5.2592592592592595,
      "grad_norm": 0.11798923462629318,
      "learning_rate": 0.0010839355009969068,
      "loss": 0.3757,
      "step": 710
    },
    {
      "epoch": 5.296296296296296,
      "grad_norm": 0.09905427694320679,
      "learning_rate": 0.00107104610228673,
      "loss": 0.375,
      "step": 715
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.1025015115737915,
      "learning_rate": 0.0010581448289104759,
      "loss": 0.3763,
      "step": 720
    },
    {
      "epoch": 5.37037037037037,
      "grad_norm": 0.1149529442191124,
      "learning_rate": 0.0010452338371907063,
      "loss": 0.3882,
      "step": 725
    },
    {
      "epoch": 5.407407407407407,
      "grad_norm": 0.12777069211006165,
      "learning_rate": 0.0010323152850743107,
      "loss": 0.3812,
      "step": 730
    },
    {
      "epoch": 5.444444444444445,
      "grad_norm": 0.10847526788711548,
      "learning_rate": 0.0010193913317718243,
      "loss": 0.3832,
      "step": 735
    },
    {
      "epoch": 5.481481481481482,
      "grad_norm": 0.11118727922439575,
      "learning_rate": 0.0010064641373965393,
      "loss": 0.384,
      "step": 740
    },
    {
      "epoch": 5.518518518518518,
      "grad_norm": 0.12322206795215607,
      "learning_rate": 0.0009935358626034607,
      "loss": 0.3826,
      "step": 745
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.1018897145986557,
      "learning_rate": 0.0009806086682281757,
      "loss": 0.3828,
      "step": 750
    },
    {
      "epoch": 5.592592592592593,
      "grad_norm": 0.10415022075176239,
      "learning_rate": 0.0009676847149256894,
      "loss": 0.3834,
      "step": 755
    },
    {
      "epoch": 5.62962962962963,
      "grad_norm": 0.10056640207767487,
      "learning_rate": 0.0009547661628092937,
      "loss": 0.379,
      "step": 760
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.13305579125881195,
      "learning_rate": 0.0009418551710895242,
      "loss": 0.3836,
      "step": 765
    },
    {
      "epoch": 5.703703703703704,
      "grad_norm": 0.1132049411535263,
      "learning_rate": 0.0009289538977132702,
      "loss": 0.3815,
      "step": 770
    },
    {
      "epoch": 5.7407407407407405,
      "grad_norm": 0.12073713541030884,
      "learning_rate": 0.000916064499003093,
      "loss": 0.384,
      "step": 775
    },
    {
      "epoch": 5.777777777777778,
      "grad_norm": 0.0997130498290062,
      "learning_rate": 0.000903189129296821,
      "loss": 0.3836,
      "step": 780
    },
    {
      "epoch": 5.814814814814815,
      "grad_norm": 0.1098228394985199,
      "learning_rate": 0.0008903299405874684,
      "loss": 0.3901,
      "step": 785
    },
    {
      "epoch": 5.851851851851852,
      "grad_norm": 0.10409754514694214,
      "learning_rate": 0.0008774890821635547,
      "loss": 0.3806,
      "step": 790
    },
    {
      "epoch": 5.888888888888889,
      "grad_norm": 0.09349874407052994,
      "learning_rate": 0.0008646687002498692,
      "loss": 0.3821,
      "step": 795
    },
    {
      "epoch": 5.925925925925926,
      "grad_norm": 0.0936858206987381,
      "learning_rate": 0.0008518709376487514,
      "loss": 0.3842,
      "step": 800
    },
    {
      "epoch": 5.962962962962963,
      "grad_norm": 0.09671808034181595,
      "learning_rate": 0.0008390979333819426,
      "loss": 0.3813,
      "step": 805
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.10419617593288422,
      "learning_rate": 0.0008263518223330697,
      "loss": 0.3815,
      "step": 810
    },
    {
      "epoch": 6.0,
      "eval_loss": 1.2959271669387817,
      "eval_runtime": 1.4543,
      "eval_samples_per_second": 2.75,
      "eval_steps_per_second": 0.688,
      "step": 810
    },
    {
      "epoch": 6.037037037037037,
      "grad_norm": 0.10344818234443665,
      "learning_rate": 0.0008136347348908179,
      "loss": 0.3456,
      "step": 815
    },
    {
      "epoch": 6.074074074074074,
      "grad_norm": 0.11311297118663788,
      "learning_rate": 0.0008009487965928596,
      "loss": 0.3457,
      "step": 820
    },
    {
      "epoch": 6.111111111111111,
      "grad_norm": 0.1126752719283104,
      "learning_rate": 0.0007882961277705895,
      "loss": 0.3446,
      "step": 825
    },
    {
      "epoch": 6.148148148148148,
      "grad_norm": 0.10734600573778152,
      "learning_rate": 0.0007756788431947326,
      "loss": 0.3441,
      "step": 830
    },
    {
      "epoch": 6.185185185185185,
      "grad_norm": 0.10664735734462738,
      "learning_rate": 0.0007630990517218807,
      "loss": 0.3464,
      "step": 835
    },
    {
      "epoch": 6.222222222222222,
      "grad_norm": 0.10784178227186203,
      "learning_rate": 0.0007505588559420188,
      "loss": 0.3484,
      "step": 840
    },
    {
      "epoch": 6.2592592592592595,
      "grad_norm": 0.09735342860221863,
      "learning_rate": 0.0007380603518270941,
      "loss": 0.3473,
      "step": 845
    },
    {
      "epoch": 6.296296296296296,
      "grad_norm": 0.10385815799236298,
      "learning_rate": 0.0007256056283806986,
      "loss": 0.3493,
      "step": 850
    },
    {
      "epoch": 6.333333333333333,
      "grad_norm": 0.10252544283866882,
      "learning_rate": 0.0007131967672889101,
      "loss": 0.3531,
      "step": 855
    },
    {
      "epoch": 6.37037037037037,
      "grad_norm": 0.10720902681350708,
      "learning_rate": 0.0007008358425723586,
      "loss": 0.3543,
      "step": 860
    },
    {
      "epoch": 6.407407407407407,
      "grad_norm": 0.1041482537984848,
      "learning_rate": 0.0006885249202395753,
      "loss": 0.3508,
      "step": 865
    },
    {
      "epoch": 6.444444444444445,
      "grad_norm": 0.10373668372631073,
      "learning_rate": 0.000676266057941679,
      "loss": 0.3534,
      "step": 870
    },
    {
      "epoch": 6.481481481481482,
      "grad_norm": 0.10245037823915482,
      "learning_rate": 0.0006640613046284581,
      "loss": 0.3521,
      "step": 875
    },
    {
      "epoch": 6.518518518518518,
      "grad_norm": 0.10254081338644028,
      "learning_rate": 0.0006519127002059096,
      "loss": 0.3538,
      "step": 880
    },
    {
      "epoch": 6.555555555555555,
      "grad_norm": 0.10260630398988724,
      "learning_rate": 0.0006398222751952898,
      "loss": 0.3537,
      "step": 885
    },
    {
      "epoch": 6.592592592592593,
      "grad_norm": 0.10130611807107925,
      "learning_rate": 0.0006277920503937303,
      "loss": 0.3515,
      "step": 890
    },
    {
      "epoch": 6.62962962962963,
      "grad_norm": 0.10895968973636627,
      "learning_rate": 0.0006158240365364823,
      "loss": 0.3578,
      "step": 895
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.09870075434446335,
      "learning_rate": 0.0006039202339608432,
      "loss": 0.3512,
      "step": 900
    },
    {
      "epoch": 6.703703703703704,
      "grad_norm": 0.10364022850990295,
      "learning_rate": 0.0005920826322718165,
      "loss": 0.3537,
      "step": 905
    },
    {
      "epoch": 6.7407407407407405,
      "grad_norm": 0.10551033914089203,
      "learning_rate": 0.000580313210009571,
      "loss": 0.3566,
      "step": 910
    },
    {
      "epoch": 6.777777777777778,
      "grad_norm": 0.10657215118408203,
      "learning_rate": 0.0005686139343187468,
      "loss": 0.3552,
      "step": 915
    },
    {
      "epoch": 6.814814814814815,
      "grad_norm": 0.10223071277141571,
      "learning_rate": 0.0005569867606196651,
      "loss": 0.3584,
      "step": 920
    },
    {
      "epoch": 6.851851851851852,
      "grad_norm": 0.10146836936473846,
      "learning_rate": 0.0005454336322814994,
      "loss": 0.3535,
      "step": 925
    },
    {
      "epoch": 6.888888888888889,
      "grad_norm": 0.1027546152472496,
      "learning_rate": 0.0005339564802974615,
      "loss": 0.3501,
      "step": 930
    },
    {
      "epoch": 6.925925925925926,
      "grad_norm": 0.10245343297719955,
      "learning_rate": 0.0005225572229620509,
      "loss": 0.3512,
      "step": 935
    },
    {
      "epoch": 6.962962962962963,
      "grad_norm": 0.10256995260715485,
      "learning_rate": 0.0005112377655504359,
      "loss": 0.3543,
      "step": 940
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.10365499556064606,
      "learning_rate": 0.0005000000000000002,
      "loss": 0.3546,
      "step": 945
    },
    {
      "epoch": 7.0,
      "eval_loss": 1.3559757471084595,
      "eval_runtime": 1.4533,
      "eval_samples_per_second": 2.752,
      "eval_steps_per_second": 0.688,
      "step": 945
    },
    {
      "epoch": 7.037037037037037,
      "grad_norm": 0.11704354733228683,
      "learning_rate": 0.0004888458045941269,
      "loss": 0.32,
      "step": 950
    },
    {
      "epoch": 7.074074074074074,
      "grad_norm": 0.11646736413240433,
      "learning_rate": 0.0004777770436482617,
      "loss": 0.3177,
      "step": 955
    },
    {
      "epoch": 7.111111111111111,
      "grad_norm": 0.11117232590913773,
      "learning_rate": 0.000466795567198309,
      "loss": 0.3144,
      "step": 960
    },
    {
      "epoch": 7.148148148148148,
      "grad_norm": 0.10937786102294922,
      "learning_rate": 0.0004559032106914173,
      "loss": 0.3212,
      "step": 965
    },
    {
      "epoch": 7.185185185185185,
      "grad_norm": 0.11477228999137878,
      "learning_rate": 0.0004451017946792032,
      "loss": 0.3219,
      "step": 970
    },
    {
      "epoch": 7.222222222222222,
      "grad_norm": 0.10524013638496399,
      "learning_rate": 0.0004343931245134616,
      "loss": 0.3167,
      "step": 975
    },
    {
      "epoch": 7.2592592592592595,
      "grad_norm": 0.10898813605308533,
      "learning_rate": 0.0004237789900444197,
      "loss": 0.3204,
      "step": 980
    },
    {
      "epoch": 7.296296296296296,
      "grad_norm": 0.11245737224817276,
      "learning_rate": 0.0004132611653215822,
      "loss": 0.3216,
      "step": 985
    },
    {
      "epoch": 7.333333333333333,
      "grad_norm": 0.1109575405716896,
      "learning_rate": 0.00040284140829721405,
      "loss": 0.3211,
      "step": 990
    },
    {
      "epoch": 7.37037037037037,
      "grad_norm": 0.10827223211526871,
      "learning_rate": 0.00039252146053251637,
      "loss": 0.3224,
      "step": 995
    },
    {
      "epoch": 7.407407407407407,
      "grad_norm": 0.11438044160604477,
      "learning_rate": 0.00038230304690654306,
      "loss": 0.3243,
      "step": 1000
    },
    {
      "epoch": 7.444444444444445,
      "grad_norm": 0.1066223755478859,
      "learning_rate": 0.00037218787532790164,
      "loss": 0.3253,
      "step": 1005
    },
    {
      "epoch": 7.481481481481482,
      "grad_norm": 0.11840826272964478,
      "learning_rate": 0.0003621776364492939,
      "loss": 0.327,
      "step": 1010
    },
    {
      "epoch": 7.518518518518518,
      "grad_norm": 0.1115390807390213,
      "learning_rate": 0.0003522740033849411,
      "loss": 0.3261,
      "step": 1015
    },
    {
      "epoch": 7.555555555555555,
      "grad_norm": 0.1104753240942955,
      "learning_rate": 0.0003424786314309365,
      "loss": 0.3225,
      "step": 1020
    },
    {
      "epoch": 7.592592592592593,
      "grad_norm": 0.10727293789386749,
      "learning_rate": 0.00033279315778858033,
      "loss": 0.3237,
      "step": 1025
    },
    {
      "epoch": 7.62962962962963,
      "grad_norm": 0.11141736060380936,
      "learning_rate": 0.0003232192012907381,
      "loss": 0.3237,
      "step": 1030
    },
    {
      "epoch": 7.666666666666667,
      "grad_norm": 0.10661887377500534,
      "learning_rate": 0.0003137583621312665,
      "loss": 0.3257,
      "step": 1035
    },
    {
      "epoch": 7.703703703703704,
      "grad_norm": 0.11351703852415085,
      "learning_rate": 0.000304412221597558,
      "loss": 0.3236,
      "step": 1040
    },
    {
      "epoch": 7.7407407407407405,
      "grad_norm": 0.10875287652015686,
      "learning_rate": 0.0002951823418062439,
      "loss": 0.3268,
      "step": 1045
    },
    {
      "epoch": 7.777777777777778,
      "grad_norm": 0.1116643026471138,
      "learning_rate": 0.0002860702654421011,
      "loss": 0.3239,
      "step": 1050
    },
    {
      "epoch": 7.814814814814815,
      "grad_norm": 0.10941953957080841,
      "learning_rate": 0.0002770775155002071,
      "loss": 0.3257,
      "step": 1055
    },
    {
      "epoch": 7.851851851851852,
      "grad_norm": 0.10806366801261902,
      "learning_rate": 0.00026820559503138797,
      "loss": 0.3235,
      "step": 1060
    },
    {
      "epoch": 7.888888888888889,
      "grad_norm": 0.1069023609161377,
      "learning_rate": 0.0002594559868909956,
      "loss": 0.3293,
      "step": 1065
    },
    {
      "epoch": 7.925925925925926,
      "grad_norm": 0.10821282863616943,
      "learning_rate": 0.000250830153491064,
      "loss": 0.3267,
      "step": 1070
    },
    {
      "epoch": 7.962962962962963,
      "grad_norm": 0.1114460751414299,
      "learning_rate": 0.00024232953655588209,
      "loss": 0.3225,
      "step": 1075
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.1069159284234047,
      "learning_rate": 0.0002339555568810221,
      "loss": 0.3233,
      "step": 1080
    },
    {
      "epoch": 8.0,
      "eval_loss": 1.4125341176986694,
      "eval_runtime": 1.4532,
      "eval_samples_per_second": 2.753,
      "eval_steps_per_second": 0.688,
      "step": 1080
    },
    {
      "epoch": 8.037037037037036,
      "grad_norm": 0.13299891352653503,
      "learning_rate": 0.00022570961409586754,
      "loss": 0.2956,
      "step": 1085
    },
    {
      "epoch": 8.074074074074074,
      "grad_norm": 0.11145825684070587,
      "learning_rate": 0.00021759308642968023,
      "loss": 0.2952,
      "step": 1090
    },
    {
      "epoch": 8.11111111111111,
      "grad_norm": 0.11993227154016495,
      "learning_rate": 0.00020960733048124082,
      "loss": 0.2951,
      "step": 1095
    },
    {
      "epoch": 8.148148148148149,
      "grad_norm": 0.11672375351190567,
      "learning_rate": 0.00020175368099210702,
      "loss": 0.2961,
      "step": 1100
    },
    {
      "epoch": 8.185185185185185,
      "grad_norm": 0.11903874576091766,
      "learning_rate": 0.00019403345062352572,
      "loss": 0.2981,
      "step": 1105
    },
    {
      "epoch": 8.222222222222221,
      "grad_norm": 0.11281085014343262,
      "learning_rate": 0.00018644792973703252,
      "loss": 0.2983,
      "step": 1110
    },
    {
      "epoch": 8.25925925925926,
      "grad_norm": 0.11996857821941376,
      "learning_rate": 0.00017899838617878162,
      "loss": 0.2993,
      "step": 1115
    },
    {
      "epoch": 8.296296296296296,
      "grad_norm": 0.11176000535488129,
      "learning_rate": 0.00017168606506763696,
      "loss": 0.2972,
      "step": 1120
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 0.11802016943693161,
      "learning_rate": 0.00016451218858706373,
      "loss": 0.2977,
      "step": 1125
    },
    {
      "epoch": 8.37037037037037,
      "grad_norm": 0.11810103803873062,
      "learning_rate": 0.00015747795578085046,
      "loss": 0.3002,
      "step": 1130
    },
    {
      "epoch": 8.407407407407407,
      "grad_norm": 0.12034923583269119,
      "learning_rate": 0.0001505845423527027,
      "loss": 0.2961,
      "step": 1135
    },
    {
      "epoch": 8.444444444444445,
      "grad_norm": 0.11696241796016693,
      "learning_rate": 0.00014383310046973364,
      "loss": 0.2977,
      "step": 1140
    },
    {
      "epoch": 8.481481481481481,
      "grad_norm": 0.11664384603500366,
      "learning_rate": 0.00013722475856989158,
      "loss": 0.2952,
      "step": 1145
    },
    {
      "epoch": 8.518518518518519,
      "grad_norm": 0.117145836353302,
      "learning_rate": 0.00013076062117335218,
      "loss": 0.3016,
      "step": 1150
    },
    {
      "epoch": 8.555555555555555,
      "grad_norm": 0.11361709982156754,
      "learning_rate": 0.00012444176869790924,
      "loss": 0.2988,
      "step": 1155
    },
    {
      "epoch": 8.592592592592592,
      "grad_norm": 0.1155712679028511,
      "learning_rate": 0.00011826925727839199,
      "loss": 0.2977,
      "step": 1160
    },
    {
      "epoch": 8.62962962962963,
      "grad_norm": 0.1212184950709343,
      "learning_rate": 0.00011224411859014417,
      "loss": 0.298,
      "step": 1165
    },
    {
      "epoch": 8.666666666666666,
      "grad_norm": 0.11797547340393066,
      "learning_rate": 0.00010636735967658784,
      "loss": 0.3007,
      "step": 1170
    },
    {
      "epoch": 8.703703703703704,
      "grad_norm": 0.11559978872537613,
      "learning_rate": 0.00010063996278090704,
      "loss": 0.2958,
      "step": 1175
    },
    {
      "epoch": 8.74074074074074,
      "grad_norm": 0.11640750616788864,
      "learning_rate": 9.506288518187466e-05,
      "loss": 0.2966,
      "step": 1180
    },
    {
      "epoch": 8.777777777777779,
      "grad_norm": 0.11927841603755951,
      "learning_rate": 8.963705903385344e-05,
      "loss": 0.2953,
      "step": 1185
    },
    {
      "epoch": 8.814814814814815,
      "grad_norm": 0.11425163596868515,
      "learning_rate": 8.436339121099412e-05,
      "loss": 0.2989,
      "step": 1190
    },
    {
      "epoch": 8.851851851851851,
      "grad_norm": 0.11373434215784073,
      "learning_rate": 7.92427631556617e-05,
      "loss": 0.2984,
      "step": 1195
    },
    {
      "epoch": 8.88888888888889,
      "grad_norm": 0.11790241301059723,
      "learning_rate": 7.427603073110967e-05,
      "loss": 0.2972,
      "step": 1200
    },
    {
      "epoch": 8.925925925925926,
      "grad_norm": 0.11773653328418732,
      "learning_rate": 6.946402407843155e-05,
      "loss": 0.3012,
      "step": 1205
    },
    {
      "epoch": 8.962962962962964,
      "grad_norm": 0.12333875894546509,
      "learning_rate": 6.480754747781037e-05,
      "loss": 0.2959,
      "step": 1210
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.11808749288320541,
      "learning_rate": 6.0307379214091684e-05,
      "loss": 0.2969,
      "step": 1215
    },
    {
      "epoch": 9.0,
      "eval_loss": 1.4809459447860718,
      "eval_runtime": 1.4606,
      "eval_samples_per_second": 2.739,
      "eval_steps_per_second": 0.685,
      "step": 1215
    },
    {
      "epoch": 9.037037037037036,
      "grad_norm": 0.1122315376996994,
      "learning_rate": 5.596427144670002e-05,
      "loss": 0.2797,
      "step": 1220
    },
    {
      "epoch": 9.074074074074074,
      "grad_norm": 0.12401442974805832,
      "learning_rate": 5.1778950083923526e-05,
      "loss": 0.2823,
      "step": 1225
    },
    {
      "epoch": 9.11111111111111,
      "grad_norm": 0.11285774409770966,
      "learning_rate": 4.775211466158469e-05,
      "loss": 0.2825,
      "step": 1230
    },
    {
      "epoch": 9.148148148148149,
      "grad_norm": 0.11427191644906998,
      "learning_rate": 4.3884438226120426e-05,
      "loss": 0.2815,
      "step": 1235
    },
    {
      "epoch": 9.185185185185185,
      "grad_norm": 0.11915598809719086,
      "learning_rate": 4.017656722208807e-05,
      "loss": 0.2806,
      "step": 1240
    },
    {
      "epoch": 9.222222222222221,
      "grad_norm": 0.11690084636211395,
      "learning_rate": 3.6629121384119666e-05,
      "loss": 0.2811,
      "step": 1245
    },
    {
      "epoch": 9.25925925925926,
      "grad_norm": 0.11301770061254501,
      "learning_rate": 3.324269363333799e-05,
      "loss": 0.2811,
      "step": 1250
    },
    {
      "epoch": 9.296296296296296,
      "grad_norm": 0.1201760470867157,
      "learning_rate": 3.0017849978256518e-05,
      "loss": 0.2826,
      "step": 1255
    },
    {
      "epoch": 9.333333333333334,
      "grad_norm": 0.11819873005151749,
      "learning_rate": 2.6955129420176194e-05,
      "loss": 0.2787,
      "step": 1260
    },
    {
      "epoch": 9.37037037037037,
      "grad_norm": 0.11501555889844894,
      "learning_rate": 2.4055043863096426e-05,
      "loss": 0.2803,
      "step": 1265
    },
    {
      "epoch": 9.407407407407407,
      "grad_norm": 0.11644010990858078,
      "learning_rate": 2.1318078028155885e-05,
      "loss": 0.2823,
      "step": 1270
    },
    {
      "epoch": 9.444444444444445,
      "grad_norm": 0.11667327582836151,
      "learning_rate": 1.874468937261531e-05,
      "loss": 0.2836,
      "step": 1275
    },
    {
      "epoch": 9.481481481481481,
      "grad_norm": 0.11722666025161743,
      "learning_rate": 1.6335308013398887e-05,
      "loss": 0.2837,
      "step": 1280
    },
    {
      "epoch": 9.518518518518519,
      "grad_norm": 0.11853759735822678,
      "learning_rate": 1.4090336655203539e-05,
      "loss": 0.2816,
      "step": 1285
    },
    {
      "epoch": 9.555555555555555,
      "grad_norm": 0.11931838095188141,
      "learning_rate": 1.2010150523190988e-05,
      "loss": 0.2831,
      "step": 1290
    },
    {
      "epoch": 9.592592592592592,
      "grad_norm": 0.11536389589309692,
      "learning_rate": 1.0095097300273027e-05,
      "loss": 0.2824,
      "step": 1295
    },
    {
      "epoch": 9.62962962962963,
      "grad_norm": 0.11747179180383682,
      "learning_rate": 8.345497068998897e-06,
      "loss": 0.2849,
      "step": 1300
    },
    {
      "epoch": 9.666666666666666,
      "grad_norm": 0.12529109418392181,
      "learning_rate": 6.761642258056977e-06,
      "loss": 0.2841,
      "step": 1305
    },
    {
      "epoch": 9.703703703703704,
      "grad_norm": 0.11867683380842209,
      "learning_rate": 5.343797593398536e-06,
      "loss": 0.2822,
      "step": 1310
    },
    {
      "epoch": 9.74074074074074,
      "grad_norm": 0.11928539723157883,
      "learning_rate": 4.092200053990691e-06,
      "loss": 0.2809,
      "step": 1315
    },
    {
      "epoch": 9.777777777777779,
      "grad_norm": 0.11743851751089096,
      "learning_rate": 3.007058832207976e-06,
      "loss": 0.2819,
      "step": 1320
    },
    {
      "epoch": 9.814814814814815,
      "grad_norm": 0.11580769717693329,
      "learning_rate": 2.088555298867978e-06,
      "loss": 0.2819,
      "step": 1325
    },
    {
      "epoch": 9.851851851851851,
      "grad_norm": 0.11537632346153259,
      "learning_rate": 1.3368429729168074e-06,
      "loss": 0.2814,
      "step": 1330
    },
    {
      "epoch": 9.88888888888889,
      "grad_norm": 0.1173846423625946,
      "learning_rate": 7.520474957699585e-07,
      "loss": 0.2832,
      "step": 1335
    },
    {
      "epoch": 9.925925925925926,
      "grad_norm": 0.11481310427188873,
      "learning_rate": 3.3426661031255026e-07,
      "loss": 0.285,
      "step": 1340
    },
    {
      "epoch": 9.962962962962964,
      "grad_norm": 0.11797128617763519,
      "learning_rate": 8.357014456272793e-08,
      "loss": 0.2824,
      "step": 1345
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.11149542033672333,
      "learning_rate": 0.0,
      "loss": 0.2818,
      "step": 1350
    },
    {
      "epoch": 10.0,
      "eval_loss": 1.517443060874939,
      "eval_runtime": 1.4076,
      "eval_samples_per_second": 2.842,
      "eval_steps_per_second": 0.71,
      "step": 1350
    },
    {
      "epoch": 10.0,
      "step": 1350,
      "total_flos": 7.982333171800736e+18,
      "train_loss": 0.3983345487382677,
      "train_runtime": 11606.8402,
      "train_samples_per_second": 14.865,
      "train_steps_per_second": 0.116
    }
  ],
  "logging_steps": 5,
  "max_steps": 1350,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.982333171800736e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}