{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9815303430079156,
  "eval_steps": 500,
  "global_step": 93,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0316622691292876,
      "grad_norm": 2.5990071296691895,
      "learning_rate": 7.000000000000001e-07,
      "loss": 1.176,
      "step": 1
    },
    {
      "epoch": 0.0633245382585752,
      "grad_norm": 2.6270251274108887,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.1885,
      "step": 2
    },
    {
      "epoch": 0.09498680738786279,
      "grad_norm": 2.6077260971069336,
      "learning_rate": 2.1e-06,
      "loss": 1.1853,
      "step": 3
    },
    {
      "epoch": 0.1266490765171504,
      "grad_norm": 2.4010000228881836,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 1.1441,
      "step": 4
    },
    {
      "epoch": 0.158311345646438,
      "grad_norm": 1.6012561321258545,
      "learning_rate": 3.5e-06,
      "loss": 1.1434,
      "step": 5
    },
    {
      "epoch": 0.18997361477572558,
      "grad_norm": 1.4890481233596802,
      "learning_rate": 4.2e-06,
      "loss": 1.1287,
      "step": 6
    },
    {
      "epoch": 0.22163588390501318,
      "grad_norm": 2.0511271953582764,
      "learning_rate": 4.9e-06,
      "loss": 1.1233,
      "step": 7
    },
    {
      "epoch": 0.2532981530343008,
      "grad_norm": 1.9991264343261719,
      "learning_rate": 5.600000000000001e-06,
      "loss": 1.1195,
      "step": 8
    },
    {
      "epoch": 0.2849604221635884,
      "grad_norm": 2.4904370307922363,
      "learning_rate": 6.3e-06,
      "loss": 1.0933,
      "step": 9
    },
    {
      "epoch": 0.316622691292876,
      "grad_norm": 2.41306471824646,
      "learning_rate": 7e-06,
      "loss": 1.1054,
      "step": 10
    },
    {
      "epoch": 0.3482849604221636,
      "grad_norm": 1.916558861732483,
      "learning_rate": 6.99749314185716e-06,
      "loss": 1.0968,
      "step": 11
    },
    {
      "epoch": 0.37994722955145116,
      "grad_norm": 1.6149336099624634,
      "learning_rate": 6.989976158478782e-06,
      "loss": 1.0736,
      "step": 12
    },
    {
      "epoch": 0.41160949868073876,
      "grad_norm": 1.5957881212234497,
      "learning_rate": 6.977459817871147e-06,
      "loss": 1.0864,
      "step": 13
    },
    {
      "epoch": 0.44327176781002636,
      "grad_norm": 1.3141591548919678,
      "learning_rate": 6.959962049571609e-06,
      "loss": 1.0507,
      "step": 14
    },
    {
      "epoch": 0.47493403693931396,
      "grad_norm": 1.0635184049606323,
      "learning_rate": 6.937507918964709e-06,
      "loss": 1.0483,
      "step": 15
    },
    {
      "epoch": 0.5065963060686016,
      "grad_norm": 0.8129880428314209,
      "learning_rate": 6.9101295913762465e-06,
      "loss": 1.0232,
      "step": 16
    },
    {
      "epoch": 0.5382585751978892,
      "grad_norm": 1.096314549446106,
      "learning_rate": 6.877866285996766e-06,
      "loss": 1.0304,
      "step": 17
    },
    {
      "epoch": 0.5699208443271768,
      "grad_norm": 0.8672236204147339,
      "learning_rate": 6.840764219700443e-06,
      "loss": 1.0308,
      "step": 18
    },
    {
      "epoch": 0.6015831134564644,
      "grad_norm": 0.9555258750915527,
      "learning_rate": 6.798876540839855e-06,
      "loss": 1.0285,
      "step": 19
    },
    {
      "epoch": 0.633245382585752,
      "grad_norm": 1.5928786993026733,
      "learning_rate": 6.752263253111479e-06,
      "loss": 1.0189,
      "step": 20
    },
    {
      "epoch": 0.6649076517150396,
      "grad_norm": 1.2837454080581665,
      "learning_rate": 6.700991129600976e-06,
      "loss": 1.021,
      "step": 21
    },
    {
      "epoch": 0.6965699208443272,
      "grad_norm": 0.7489935755729675,
      "learning_rate": 6.645133617131388e-06,
      "loss": 1.0013,
      "step": 22
    },
    {
      "epoch": 0.7282321899736148,
      "grad_norm": 0.729206383228302,
      "learning_rate": 6.584770731051271e-06,
      "loss": 0.986,
      "step": 23
    },
    {
      "epoch": 0.7598944591029023,
      "grad_norm": 0.8594318628311157,
      "learning_rate": 6.51998894061348e-06,
      "loss": 0.9853,
      "step": 24
    },
    {
      "epoch": 0.7915567282321899,
      "grad_norm": 0.6937134861946106,
      "learning_rate": 6.4508810451087956e-06,
      "loss": 0.9969,
      "step": 25
    },
    {
      "epoch": 0.8232189973614775,
      "grad_norm": 0.6618143916130066,
      "learning_rate": 6.377546040931835e-06,
      "loss": 0.9784,
      "step": 26
    },
    {
      "epoch": 0.8548812664907651,
      "grad_norm": 0.7150548100471497,
      "learning_rate": 6.300088979769671e-06,
      "loss": 0.9775,
      "step": 27
    },
    {
      "epoch": 0.8865435356200527,
      "grad_norm": 0.8135938048362732,
      "learning_rate": 6.218620818116299e-06,
      "loss": 0.9555,
      "step": 28
    },
    {
      "epoch": 0.9182058047493403,
      "grad_norm": 0.7386907339096069,
      "learning_rate": 6.133258258328535e-06,
      "loss": 0.9536,
      "step": 29
    },
    {
      "epoch": 0.9498680738786279,
      "grad_norm": 0.6299906969070435,
      "learning_rate": 6.044123581451003e-06,
      "loss": 0.9513,
      "step": 30
    },
    {
      "epoch": 0.9815303430079155,
      "grad_norm": 0.6849163174629211,
      "learning_rate": 5.951344472049728e-06,
      "loss": 0.9491,
      "step": 31
    },
    {
      "epoch": 1.0316622691292876,
      "grad_norm": 0.5353211760520935,
      "learning_rate": 5.855053835305216e-06,
      "loss": 1.2099,
      "step": 32
    },
    {
      "epoch": 1.0633245382585752,
      "grad_norm": 0.5629773736000061,
      "learning_rate": 5.755389606627069e-06,
      "loss": 0.8727,
      "step": 33
    },
    {
      "epoch": 1.0949868073878628,
      "grad_norm": 0.5262039303779602,
      "learning_rate": 5.652494554062838e-06,
      "loss": 0.8777,
      "step": 34
    },
    {
      "epoch": 1.1266490765171504,
      "grad_norm": 0.9279022216796875,
      "learning_rate": 5.546516073784165e-06,
      "loss": 0.8661,
      "step": 35
    },
    {
      "epoch": 1.158311345646438,
      "grad_norm": 1.0550135374069214,
      "learning_rate": 5.4376059789431955e-06,
      "loss": 0.8665,
      "step": 36
    },
    {
      "epoch": 1.1899736147757256,
      "grad_norm": 0.4825997054576874,
      "learning_rate": 5.325920282201696e-06,
      "loss": 0.8584,
      "step": 37
    },
    {
      "epoch": 1.2216358839050132,
      "grad_norm": 0.5052205920219421,
      "learning_rate": 5.2116189722444164e-06,
      "loss": 0.8677,
      "step": 38
    },
    {
      "epoch": 1.2532981530343008,
      "grad_norm": 0.4654822051525116,
      "learning_rate": 5.094865784596845e-06,
      "loss": 0.8515,
      "step": 39
    },
    {
      "epoch": 1.2849604221635884,
      "grad_norm": 0.45359933376312256,
      "learning_rate": 4.975827967075644e-06,
      "loss": 0.851,
      "step": 40
    },
    {
      "epoch": 1.316622691292876,
      "grad_norm": 0.968210756778717,
      "learning_rate": 4.854676040207761e-06,
      "loss": 0.8733,
      "step": 41
    },
    {
      "epoch": 1.3482849604221636,
      "grad_norm": 0.5169340372085571,
      "learning_rate": 4.731583552961416e-06,
      "loss": 0.8698,
      "step": 42
    },
    {
      "epoch": 1.3799472295514512,
      "grad_norm": 0.8193064332008362,
      "learning_rate": 4.606726834138884e-06,
      "loss": 0.8464,
      "step": 43
    },
    {
      "epoch": 1.4116094986807388,
      "grad_norm": 0.589012622833252,
      "learning_rate": 4.480284739787175e-06,
      "loss": 0.8465,
      "step": 44
    },
    {
      "epoch": 1.4432717678100264,
      "grad_norm": 0.5069411993026733,
      "learning_rate": 4.352438396988471e-06,
      "loss": 0.8415,
      "step": 45
    },
    {
      "epoch": 1.474934036939314,
      "grad_norm": 0.4803805947303772,
      "learning_rate": 4.223370944397335e-06,
      "loss": 0.8089,
      "step": 46
    },
    {
      "epoch": 1.5065963060686016,
      "grad_norm": 0.44742244482040405,
      "learning_rate": 4.093267269896339e-06,
      "loss": 0.8141,
      "step": 47
    },
    {
      "epoch": 1.5382585751978892,
      "grad_norm": 0.5128947496414185,
      "learning_rate": 3.9623137457459586e-06,
      "loss": 0.8334,
      "step": 48
    },
    {
      "epoch": 1.5699208443271768,
      "grad_norm": 0.43568509817123413,
      "learning_rate": 3.83069796160811e-06,
      "loss": 0.7996,
      "step": 49
    },
    {
      "epoch": 1.6015831134564644,
      "grad_norm": 0.5734230279922485,
      "learning_rate": 3.6986084558257596e-06,
      "loss": 0.8206,
      "step": 50
    },
    {
      "epoch": 1.633245382585752,
      "grad_norm": 0.4465356469154358,
      "learning_rate": 3.5662344453435665e-06,
      "loss": 0.8036,
      "step": 51
    },
    {
      "epoch": 1.6649076517150396,
      "grad_norm": 0.9534016251564026,
      "learning_rate": 3.4337655546564343e-06,
      "loss": 0.849,
      "step": 52
    },
    {
      "epoch": 1.6965699208443272,
      "grad_norm": 0.5999336838722229,
      "learning_rate": 3.301391544174241e-06,
      "loss": 0.7874,
      "step": 53
    },
    {
      "epoch": 1.7282321899736148,
      "grad_norm": 0.4919511079788208,
      "learning_rate": 3.1693020383918907e-06,
      "loss": 0.8012,
      "step": 54
    },
    {
      "epoch": 1.7598944591029024,
      "grad_norm": 0.4128033518791199,
      "learning_rate": 3.0376862542540426e-06,
      "loss": 0.8056,
      "step": 55
    },
    {
      "epoch": 1.79155672823219,
      "grad_norm": 0.4979248642921448,
      "learning_rate": 2.9067327301036616e-06,
      "loss": 0.7769,
      "step": 56
    },
    {
      "epoch": 1.8232189973614776,
      "grad_norm": 0.46139153838157654,
      "learning_rate": 2.7766290556026646e-06,
      "loss": 0.7984,
      "step": 57
    },
    {
      "epoch": 1.8548812664907652,
      "grad_norm": 0.46961304545402527,
      "learning_rate": 2.6475616030115286e-06,
      "loss": 0.7917,
      "step": 58
    },
    {
      "epoch": 1.8865435356200528,
      "grad_norm": 0.4321727752685547,
      "learning_rate": 2.5197152602128256e-06,
      "loss": 0.7852,
      "step": 59
    },
    {
      "epoch": 1.9182058047493404,
      "grad_norm": 0.4466552138328552,
      "learning_rate": 2.393273165861116e-06,
      "loss": 0.8023,
      "step": 60
    },
    {
      "epoch": 1.949868073878628,
      "grad_norm": 0.4448838233947754,
      "learning_rate": 2.2684164470385843e-06,
      "loss": 0.7616,
      "step": 61
    },
    {
      "epoch": 1.9815303430079156,
      "grad_norm": 0.434826523065567,
      "learning_rate": 2.14532395979224e-06,
      "loss": 0.8165,
      "step": 62
    },
    {
      "epoch": 2.0316622691292876,
      "grad_norm": 0.6250742673873901,
      "learning_rate": 2.0241720329243563e-06,
      "loss": 1.0035,
      "step": 63
    },
    {
      "epoch": 2.063324538258575,
      "grad_norm": 0.41192013025283813,
      "learning_rate": 1.905134215403155e-06,
      "loss": 0.7532,
      "step": 64
    },
    {
      "epoch": 2.094986807387863,
      "grad_norm": 0.4167821407318115,
      "learning_rate": 1.7883810277555837e-06,
      "loss": 0.7628,
      "step": 65
    },
    {
      "epoch": 2.1266490765171504,
      "grad_norm": 0.4210691452026367,
      "learning_rate": 1.6740797177983044e-06,
      "loss": 0.7511,
      "step": 66
    },
    {
      "epoch": 2.158311345646438,
      "grad_norm": 0.44451218843460083,
      "learning_rate": 1.5623940210568042e-06,
      "loss": 0.7302,
      "step": 67
    },
    {
      "epoch": 2.1899736147757256,
      "grad_norm": 0.40137404203414917,
      "learning_rate": 1.453483926215835e-06,
      "loss": 0.7315,
      "step": 68
    },
    {
      "epoch": 2.221635883905013,
      "grad_norm": 0.6961039900779724,
      "learning_rate": 1.3475054459371625e-06,
      "loss": 0.7478,
      "step": 69
    },
    {
      "epoch": 2.253298153034301,
      "grad_norm": 0.37633243203163147,
      "learning_rate": 1.2446103933729302e-06,
      "loss": 0.7288,
      "step": 70
    },
    {
      "epoch": 2.2849604221635884,
      "grad_norm": 0.43067774176597595,
      "learning_rate": 1.1449461646947839e-06,
      "loss": 0.7628,
      "step": 71
    },
    {
      "epoch": 2.316622691292876,
      "grad_norm": 0.3962654769420624,
      "learning_rate": 1.048655527950273e-06,
      "loss": 0.7178,
      "step": 72
    },
    {
      "epoch": 2.3482849604221636,
      "grad_norm": 0.9008962512016296,
      "learning_rate": 9.558764185489968e-07,
      "loss": 0.7287,
      "step": 73
    },
    {
      "epoch": 2.379947229551451,
      "grad_norm": 0.4029214084148407,
      "learning_rate": 8.667417416714656e-07,
      "loss": 0.7135,
      "step": 74
    },
    {
      "epoch": 2.411609498680739,
      "grad_norm": 0.37933027744293213,
      "learning_rate": 7.813791818837012e-07,
      "loss": 0.7242,
      "step": 75
    },
    {
      "epoch": 2.4432717678100264,
      "grad_norm": 0.4242548644542694,
      "learning_rate": 6.999110202303293e-07,
      "loss": 0.7201,
      "step": 76
    },
    {
      "epoch": 2.474934036939314,
      "grad_norm": 0.5838273167610168,
      "learning_rate": 6.22453959068165e-07,
      "loss": 0.6838,
      "step": 77
    },
    {
      "epoch": 2.5065963060686016,
      "grad_norm": 0.3837423026561737,
      "learning_rate": 5.491189548912051e-07,
      "loss": 0.717,
      "step": 78
    },
    {
      "epoch": 2.538258575197889,
      "grad_norm": 1.0475167036056519,
      "learning_rate": 4.800110593865199e-07,
      "loss": 0.7152,
      "step": 79
    },
    {
      "epoch": 2.569920844327177,
      "grad_norm": 0.4407210946083069,
      "learning_rate": 4.15229268948729e-07,
      "loss": 0.7368,
      "step": 80
    },
    {
      "epoch": 2.6015831134564644,
      "grad_norm": 0.39730748534202576,
      "learning_rate": 3.5486638286861297e-07,
      "loss": 0.715,
      "step": 81
    },
    {
      "epoch": 2.633245382585752,
      "grad_norm": 0.5303342938423157,
      "learning_rate": 2.990088703990245e-07,
      "loss": 0.74,
      "step": 82
    },
    {
      "epoch": 2.6649076517150396,
      "grad_norm": 0.6698248386383057,
      "learning_rate": 2.4773674688852197e-07,
      "loss": 0.7142,
      "step": 83
    },
    {
      "epoch": 2.6965699208443272,
      "grad_norm": 0.43907734751701355,
      "learning_rate": 2.0112345916014578e-07,
      "loss": 0.7372,
      "step": 84
    },
    {
      "epoch": 2.728232189973615,
      "grad_norm": 0.37293171882629395,
      "learning_rate": 1.592357802995572e-07,
      "loss": 0.7241,
      "step": 85
    },
    {
      "epoch": 2.7598944591029024,
      "grad_norm": 0.394404798746109,
      "learning_rate": 1.2213371400323352e-07,
      "loss": 0.7095,
      "step": 86
    },
    {
      "epoch": 2.79155672823219,
      "grad_norm": 0.7737839818000793,
      "learning_rate": 8.987040862375339e-08,
      "loss": 0.6919,
      "step": 87
    },
    {
      "epoch": 2.8232189973614776,
      "grad_norm": 0.3485806882381439,
      "learning_rate": 6.249208103529092e-08,
      "loss": 0.7263,
      "step": 88
    },
    {
      "epoch": 2.8548812664907652,
      "grad_norm": 0.39798641204833984,
      "learning_rate": 4.0037950428390144e-08,
      "loss": 0.7415,
      "step": 89
    },
    {
      "epoch": 2.886543535620053,
      "grad_norm": 0.38925713300704956,
      "learning_rate": 2.254018212885278e-08,
      "loss": 0.7267,
      "step": 90
    },
    {
      "epoch": 2.9182058047493404,
      "grad_norm": 0.361558198928833,
      "learning_rate": 1.0023841521217825e-08,
      "loss": 0.7175,
      "step": 91
    },
    {
      "epoch": 2.949868073878628,
      "grad_norm": 0.5748413801193237,
      "learning_rate": 2.506858142839852e-09,
      "loss": 0.7126,
      "step": 92
    },
    {
      "epoch": 2.9815303430079156,
      "grad_norm": 0.3813362717628479,
      "learning_rate": 0.0,
      "loss": 0.7082,
      "step": 93
    },
    {
      "epoch": 2.9815303430079156,
      "step": 93,
      "total_flos": 2930651586101248.0,
      "train_loss": 0.8763364265041966,
      "train_runtime": 71308.8652,
      "train_samples_per_second": 0.382,
      "train_steps_per_second": 0.001
    }
  ],
  "logging_steps": 1,
  "max_steps": 93,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2930651586101248.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}