{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 47226,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010587388303053403,
      "grad_norm": 1.734375,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 7.2559,
      "step": 500
    },
    {
      "epoch": 0.021174776606106806,
      "grad_norm": 0.875,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.5321,
      "step": 1000
    },
    {
      "epoch": 0.03176216490916021,
      "grad_norm": 0.9296875,
      "learning_rate": 6e-06,
      "loss": 0.4789,
      "step": 1500
    },
    {
      "epoch": 0.04234955321221361,
      "grad_norm": 2.5625,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.4361,
      "step": 2000
    },
    {
      "epoch": 0.05293694151526701,
      "grad_norm": 0.66796875,
      "learning_rate": 1e-05,
      "loss": 0.4095,
      "step": 2500
    },
    {
      "epoch": 0.06352432981832042,
      "grad_norm": 0.58984375,
      "learning_rate": 1.2e-05,
      "loss": 0.3739,
      "step": 3000
    },
    {
      "epoch": 0.07411171812137382,
      "grad_norm": 0.76171875,
      "learning_rate": 1.4e-05,
      "loss": 0.3593,
      "step": 3500
    },
    {
      "epoch": 0.08469910642442723,
      "grad_norm": 1.0703125,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.3277,
      "step": 4000
    },
    {
      "epoch": 0.09528649472748063,
      "grad_norm": 0.83984375,
      "learning_rate": 1.8e-05,
      "loss": 0.3148,
      "step": 4500
    },
    {
      "epoch": 0.10587388303053402,
      "grad_norm": 0.84765625,
      "learning_rate": 2e-05,
      "loss": 0.2913,
      "step": 5000
    },
    {
      "epoch": 0.11646127133358743,
      "grad_norm": 0.6328125,
      "learning_rate": 1.9763179083976696e-05,
      "loss": 0.2715,
      "step": 5500
    },
    {
      "epoch": 0.12704865963664083,
      "grad_norm": 0.71484375,
      "learning_rate": 1.9526358167953394e-05,
      "loss": 0.2621,
      "step": 6000
    },
    {
      "epoch": 0.13763604793969422,
      "grad_norm": 0.80859375,
      "learning_rate": 1.9289537251930092e-05,
      "loss": 0.2484,
      "step": 6500
    },
    {
      "epoch": 0.14822343624274764,
      "grad_norm": 0.57421875,
      "learning_rate": 1.905271633590679e-05,
      "loss": 0.2359,
      "step": 7000
    },
    {
      "epoch": 0.15881082454580103,
      "grad_norm": 0.90625,
      "learning_rate": 1.8815895419883485e-05,
      "loss": 0.2284,
      "step": 7500
    },
    {
      "epoch": 0.16939821284885445,
      "grad_norm": 0.9921875,
      "learning_rate": 1.8579074503860183e-05,
      "loss": 0.229,
      "step": 8000
    },
    {
      "epoch": 0.17998560115190784,
      "grad_norm": 0.875,
      "learning_rate": 1.834225358783688e-05,
      "loss": 0.2303,
      "step": 8500
    },
    {
      "epoch": 0.19057298945496126,
      "grad_norm": 1.15625,
      "learning_rate": 1.8105432671813576e-05,
      "loss": 0.2227,
      "step": 9000
    },
    {
      "epoch": 0.20116037775801465,
      "grad_norm": 0.515625,
      "learning_rate": 1.7868611755790274e-05,
      "loss": 0.216,
      "step": 9500
    },
    {
      "epoch": 0.21174776606106804,
      "grad_norm": 0.6875,
      "learning_rate": 1.763179083976697e-05,
      "loss": 0.2148,
      "step": 10000
    },
    {
      "epoch": 0.22233515436412146,
      "grad_norm": 0.703125,
      "learning_rate": 1.7394969923743667e-05,
      "loss": 0.2106,
      "step": 10500
    },
    {
      "epoch": 0.23292254266717485,
      "grad_norm": 0.59375,
      "learning_rate": 1.7158149007720365e-05,
      "loss": 0.205,
      "step": 11000
    },
    {
      "epoch": 0.24350993097022827,
      "grad_norm": 1.0703125,
      "learning_rate": 1.692132809169706e-05,
      "loss": 0.211,
      "step": 11500
    },
    {
      "epoch": 0.25409731927328166,
      "grad_norm": 1.015625,
      "learning_rate": 1.6684507175673758e-05,
      "loss": 0.2051,
      "step": 12000
    },
    {
      "epoch": 0.26468470757633505,
      "grad_norm": 1.015625,
      "learning_rate": 1.6447686259650452e-05,
      "loss": 0.1985,
      "step": 12500
    },
    {
      "epoch": 0.27527209587938845,
      "grad_norm": 0.734375,
      "learning_rate": 1.621086534362715e-05,
      "loss": 0.2035,
      "step": 13000
    },
    {
      "epoch": 0.2858594841824419,
      "grad_norm": 0.61328125,
      "learning_rate": 1.5974044427603848e-05,
      "loss": 0.2019,
      "step": 13500
    },
    {
      "epoch": 0.2964468724854953,
      "grad_norm": 0.953125,
      "learning_rate": 1.5737223511580543e-05,
      "loss": 0.1921,
      "step": 14000
    },
    {
      "epoch": 0.3070342607885487,
      "grad_norm": 0.83984375,
      "learning_rate": 1.550040259555724e-05,
      "loss": 0.1952,
      "step": 14500
    },
    {
      "epoch": 0.31762164909160207,
      "grad_norm": 1.03125,
      "learning_rate": 1.526358167953394e-05,
      "loss": 0.1988,
      "step": 15000
    },
    {
      "epoch": 0.3282090373946555,
      "grad_norm": 0.84375,
      "learning_rate": 1.5026760763510635e-05,
      "loss": 0.193,
      "step": 15500
    },
    {
      "epoch": 0.3387964256977089,
      "grad_norm": 0.60546875,
      "learning_rate": 1.4789939847487332e-05,
      "loss": 0.1946,
      "step": 16000
    },
    {
      "epoch": 0.3493838140007623,
      "grad_norm": 0.6484375,
      "learning_rate": 1.4553118931464028e-05,
      "loss": 0.1953,
      "step": 16500
    },
    {
      "epoch": 0.3599712023038157,
      "grad_norm": 1.09375,
      "learning_rate": 1.4316298015440725e-05,
      "loss": 0.1902,
      "step": 17000
    },
    {
      "epoch": 0.3705585906068691,
      "grad_norm": 0.578125,
      "learning_rate": 1.4079477099417423e-05,
      "loss": 0.1859,
      "step": 17500
    },
    {
      "epoch": 0.3811459789099225,
      "grad_norm": 1.046875,
      "learning_rate": 1.3842656183394119e-05,
      "loss": 0.1906,
      "step": 18000
    },
    {
      "epoch": 0.3917333672129759,
      "grad_norm": 0.51171875,
      "learning_rate": 1.3605835267370815e-05,
      "loss": 0.1908,
      "step": 18500
    },
    {
      "epoch": 0.4023207555160293,
      "grad_norm": 0.73046875,
      "learning_rate": 1.3369014351347512e-05,
      "loss": 0.1805,
      "step": 19000
    },
    {
      "epoch": 0.4129081438190827,
      "grad_norm": 0.7734375,
      "learning_rate": 1.313219343532421e-05,
      "loss": 0.1855,
      "step": 19500
    },
    {
      "epoch": 0.4234955321221361,
      "grad_norm": 0.7265625,
      "learning_rate": 1.2895372519300906e-05,
      "loss": 0.1906,
      "step": 20000
    },
    {
      "epoch": 0.43408292042518953,
      "grad_norm": 0.87890625,
      "learning_rate": 1.2658551603277602e-05,
      "loss": 0.1801,
      "step": 20500
    },
    {
      "epoch": 0.4446703087282429,
      "grad_norm": 0.5234375,
      "learning_rate": 1.2421730687254299e-05,
      "loss": 0.1827,
      "step": 21000
    },
    {
      "epoch": 0.4552576970312963,
      "grad_norm": 0.9921875,
      "learning_rate": 1.2184909771230997e-05,
      "loss": 0.1832,
      "step": 21500
    },
    {
      "epoch": 0.4658450853343497,
      "grad_norm": 0.66796875,
      "learning_rate": 1.1948088855207693e-05,
      "loss": 0.1875,
      "step": 22000
    },
    {
      "epoch": 0.4764324736374031,
      "grad_norm": 1.3203125,
      "learning_rate": 1.171126793918439e-05,
      "loss": 0.1794,
      "step": 22500
    },
    {
      "epoch": 0.48701986194045654,
      "grad_norm": 0.9765625,
      "learning_rate": 1.1474447023161086e-05,
      "loss": 0.1831,
      "step": 23000
    },
    {
      "epoch": 0.49760725024350994,
      "grad_norm": 0.59765625,
      "learning_rate": 1.1237626107137782e-05,
      "loss": 0.1841,
      "step": 23500
    },
    {
      "epoch": 0.5081946385465633,
      "grad_norm": 0.953125,
      "learning_rate": 1.100080519111448e-05,
      "loss": 0.186,
      "step": 24000
    },
    {
      "epoch": 0.5187820268496167,
      "grad_norm": 0.58984375,
      "learning_rate": 1.0763984275091177e-05,
      "loss": 0.1849,
      "step": 24500
    },
    {
      "epoch": 0.5293694151526701,
      "grad_norm": 1.328125,
      "learning_rate": 1.0527163359067873e-05,
      "loss": 0.1797,
      "step": 25000
    },
    {
      "epoch": 0.5399568034557235,
      "grad_norm": 0.734375,
      "learning_rate": 1.029034244304457e-05,
      "loss": 0.1867,
      "step": 25500
    },
    {
      "epoch": 0.5505441917587769,
      "grad_norm": 1.09375,
      "learning_rate": 1.0053521527021268e-05,
      "loss": 0.1792,
      "step": 26000
    },
    {
      "epoch": 0.5611315800618304,
      "grad_norm": 0.6875,
      "learning_rate": 9.816700610997964e-06,
      "loss": 0.1839,
      "step": 26500
    },
    {
      "epoch": 0.5717189683648838,
      "grad_norm": 0.83984375,
      "learning_rate": 9.57987969497466e-06,
      "loss": 0.1816,
      "step": 27000
    },
    {
      "epoch": 0.5823063566679372,
      "grad_norm": 0.92578125,
      "learning_rate": 9.343058778951358e-06,
      "loss": 0.1816,
      "step": 27500
    },
    {
      "epoch": 0.5928937449709906,
      "grad_norm": 0.58984375,
      "learning_rate": 9.106237862928055e-06,
      "loss": 0.1779,
      "step": 28000
    },
    {
      "epoch": 0.603481133274044,
      "grad_norm": 0.609375,
      "learning_rate": 8.869416946904751e-06,
      "loss": 0.1821,
      "step": 28500
    },
    {
      "epoch": 0.6140685215770973,
      "grad_norm": 1.2109375,
      "learning_rate": 8.632596030881449e-06,
      "loss": 0.1824,
      "step": 29000
    },
    {
      "epoch": 0.6246559098801507,
      "grad_norm": 0.49609375,
      "learning_rate": 8.395775114858144e-06,
      "loss": 0.179,
      "step": 29500
    },
    {
      "epoch": 0.6352432981832041,
      "grad_norm": 0.5625,
      "learning_rate": 8.158954198834842e-06,
      "loss": 0.1846,
      "step": 30000
    },
    {
      "epoch": 0.6458306864862575,
      "grad_norm": 0.67578125,
      "learning_rate": 7.922133282811538e-06,
      "loss": 0.1853,
      "step": 30500
    },
    {
      "epoch": 0.656418074789311,
      "grad_norm": 1.0,
      "learning_rate": 7.685312366788236e-06,
      "loss": 0.1766,
      "step": 31000
    },
    {
      "epoch": 0.6670054630923644,
      "grad_norm": 0.82421875,
      "learning_rate": 7.448491450764932e-06,
      "loss": 0.1819,
      "step": 31500
    },
    {
      "epoch": 0.6775928513954178,
      "grad_norm": 0.90234375,
      "learning_rate": 7.211670534741629e-06,
      "loss": 0.1807,
      "step": 32000
    },
    {
      "epoch": 0.6881802396984712,
      "grad_norm": 0.546875,
      "learning_rate": 6.974849618718325e-06,
      "loss": 0.1824,
      "step": 32500
    },
    {
      "epoch": 0.6987676280015246,
      "grad_norm": 0.52734375,
      "learning_rate": 6.738028702695023e-06,
      "loss": 0.1829,
      "step": 33000
    },
    {
      "epoch": 0.709355016304578,
      "grad_norm": 0.91015625,
      "learning_rate": 6.50120778667172e-06,
      "loss": 0.1812,
      "step": 33500
    },
    {
      "epoch": 0.7199424046076314,
      "grad_norm": 0.5703125,
      "learning_rate": 6.264386870648415e-06,
      "loss": 0.1778,
      "step": 34000
    },
    {
      "epoch": 0.7305297929106848,
      "grad_norm": 1.09375,
      "learning_rate": 6.027565954625113e-06,
      "loss": 0.1868,
      "step": 34500
    },
    {
      "epoch": 0.7411171812137382,
      "grad_norm": 0.91796875,
      "learning_rate": 5.79074503860181e-06,
      "loss": 0.1806,
      "step": 35000
    },
    {
      "epoch": 0.7517045695167915,
      "grad_norm": 0.69140625,
      "learning_rate": 5.553924122578507e-06,
      "loss": 0.1843,
      "step": 35500
    },
    {
      "epoch": 0.762291957819845,
      "grad_norm": 1.3046875,
      "learning_rate": 5.317103206555203e-06,
      "loss": 0.1778,
      "step": 36000
    },
    {
      "epoch": 0.7728793461228984,
      "grad_norm": 0.67578125,
      "learning_rate": 5.0802822905319005e-06,
      "loss": 0.1752,
      "step": 36500
    },
    {
      "epoch": 0.7834667344259518,
      "grad_norm": 1.1875,
      "learning_rate": 4.843461374508597e-06,
      "loss": 0.1829,
      "step": 37000
    },
    {
      "epoch": 0.7940541227290052,
      "grad_norm": 0.9921875,
      "learning_rate": 4.606640458485293e-06,
      "loss": 0.1758,
      "step": 37500
    },
    {
      "epoch": 0.8046415110320586,
      "grad_norm": 0.859375,
      "learning_rate": 4.3698195424619904e-06,
      "loss": 0.1817,
      "step": 38000
    },
    {
      "epoch": 0.815228899335112,
      "grad_norm": 1.0234375,
      "learning_rate": 4.132998626438688e-06,
      "loss": 0.1872,
      "step": 38500
    },
    {
      "epoch": 0.8258162876381654,
      "grad_norm": 0.6953125,
      "learning_rate": 3.896177710415384e-06,
      "loss": 0.1775,
      "step": 39000
    },
    {
      "epoch": 0.8364036759412188,
      "grad_norm": 1.5234375,
      "learning_rate": 3.659356794392081e-06,
      "loss": 0.1839,
      "step": 39500
    },
    {
      "epoch": 0.8469910642442722,
      "grad_norm": 0.5625,
      "learning_rate": 3.422535878368778e-06,
      "loss": 0.1766,
      "step": 40000
    },
    {
      "epoch": 0.8575784525473257,
      "grad_norm": 0.875,
      "learning_rate": 3.1857149623454748e-06,
      "loss": 0.1782,
      "step": 40500
    },
    {
      "epoch": 0.8681658408503791,
      "grad_norm": 1.359375,
      "learning_rate": 2.9488940463221716e-06,
      "loss": 0.1804,
      "step": 41000
    },
    {
      "epoch": 0.8787532291534325,
      "grad_norm": 0.60546875,
      "learning_rate": 2.7120731302988684e-06,
      "loss": 0.178,
      "step": 41500
    },
    {
      "epoch": 0.8893406174564858,
      "grad_norm": 1.1328125,
      "learning_rate": 2.475252214275565e-06,
      "loss": 0.1836,
      "step": 42000
    },
    {
      "epoch": 0.8999280057595392,
      "grad_norm": 0.5859375,
      "learning_rate": 2.238431298252262e-06,
      "loss": 0.176,
      "step": 42500
    },
    {
      "epoch": 0.9105153940625926,
      "grad_norm": 2.203125,
      "learning_rate": 2.0016103822289587e-06,
      "loss": 0.1826,
      "step": 43000
    },
    {
      "epoch": 0.921102782365646,
      "grad_norm": 0.466796875,
      "learning_rate": 1.7647894662056555e-06,
      "loss": 0.1837,
      "step": 43500
    },
    {
      "epoch": 0.9316901706686994,
      "grad_norm": 1.265625,
      "learning_rate": 1.527968550182352e-06,
      "loss": 0.1778,
      "step": 44000
    },
    {
      "epoch": 0.9422775589717528,
      "grad_norm": 0.6171875,
      "learning_rate": 1.291147634159049e-06,
      "loss": 0.1817,
      "step": 44500
    },
    {
      "epoch": 0.9528649472748062,
      "grad_norm": 0.60546875,
      "learning_rate": 1.0543267181357459e-06,
      "loss": 0.1775,
      "step": 45000
    },
    {
      "epoch": 0.9634523355778597,
      "grad_norm": 0.5703125,
      "learning_rate": 8.175058021124427e-07,
      "loss": 0.1759,
      "step": 45500
    },
    {
      "epoch": 0.9740397238809131,
      "grad_norm": 0.70703125,
      "learning_rate": 5.806848860891394e-07,
      "loss": 0.1723,
      "step": 46000
    },
    {
      "epoch": 0.9846271121839665,
      "grad_norm": 1.7265625,
      "learning_rate": 3.4386397006583623e-07,
      "loss": 0.1822,
      "step": 46500
    },
    {
      "epoch": 0.9952145004870199,
      "grad_norm": 1.0234375,
      "learning_rate": 1.0704305404253304e-07,
      "loss": 0.1825,
      "step": 47000
    }
  ],
  "logging_steps": 500,
  "max_steps": 47226,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.979597801902148e+18,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}