{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9940828402366864,
  "eval_steps": 100,
  "global_step": 84,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011834319526627219,
      "grad_norm": 6.360359661001761,
      "learning_rate": 2.222222222222222e-06,
      "loss": 1.0462,
      "mean_token_accuracy": 0.7292188576439624,
      "step": 1
    },
    {
      "epoch": 0.023668639053254437,
      "grad_norm": 6.106208033575247,
      "learning_rate": 4.444444444444444e-06,
      "loss": 1.0369,
      "mean_token_accuracy": 0.7320231497777155,
      "step": 2
    },
    {
      "epoch": 0.03550295857988166,
      "grad_norm": 5.760453817709745,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.0233,
      "mean_token_accuracy": 0.733533778553629,
      "step": 3
    },
    {
      "epoch": 0.047337278106508875,
      "grad_norm": 4.453204363610917,
      "learning_rate": 8.888888888888888e-06,
      "loss": 0.9786,
      "mean_token_accuracy": 0.7379425080663377,
      "step": 4
    },
    {
      "epoch": 0.05917159763313609,
      "grad_norm": 2.3743512771498363,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 0.9366,
      "mean_token_accuracy": 0.7400217617800807,
      "step": 5
    },
    {
      "epoch": 0.07100591715976332,
      "grad_norm": 4.008803407186776,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.9256,
      "mean_token_accuracy": 0.7412621950252716,
      "step": 6
    },
    {
      "epoch": 0.08284023668639054,
      "grad_norm": 5.140249938186884,
      "learning_rate": 1.555555555555556e-05,
      "loss": 0.9051,
      "mean_token_accuracy": 0.7427781690224119,
      "step": 7
    },
    {
      "epoch": 0.09467455621301775,
      "grad_norm": 6.570193629815716,
      "learning_rate": 1.7777777777777777e-05,
      "loss": 0.9228,
      "mean_token_accuracy": 0.7391856693362282,
      "step": 8
    },
    {
      "epoch": 0.10650887573964497,
      "grad_norm": 4.243747325903682,
      "learning_rate": 2e-05,
      "loss": 0.857,
      "mean_token_accuracy": 0.7546249767013169,
      "step": 9
    },
    {
      "epoch": 0.11834319526627218,
      "grad_norm": 3.132842540331099,
      "learning_rate": 1.9991228300988586e-05,
      "loss": 0.8044,
      "mean_token_accuracy": 0.7638333703029013,
      "step": 10
    },
    {
      "epoch": 0.1301775147928994,
      "grad_norm": 2.669470016163355,
      "learning_rate": 1.9964928592495046e-05,
      "loss": 0.7946,
      "mean_token_accuracy": 0.7652401107554927,
      "step": 11
    },
    {
      "epoch": 0.14201183431952663,
      "grad_norm": 1.69231830246758,
      "learning_rate": 1.9921147013144782e-05,
      "loss": 0.761,
      "mean_token_accuracy": 0.7729921399244823,
      "step": 12
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 1.4303040897938275,
      "learning_rate": 1.985996037070505e-05,
      "loss": 0.7593,
      "mean_token_accuracy": 0.7713452241883023,
      "step": 13
    },
    {
      "epoch": 0.16568047337278108,
      "grad_norm": 1.4438177567548685,
      "learning_rate": 1.9781476007338058e-05,
      "loss": 0.7405,
      "mean_token_accuracy": 0.7755470269351471,
      "step": 14
    },
    {
      "epoch": 0.17751479289940827,
      "grad_norm": 1.094141653060019,
      "learning_rate": 1.9685831611286312e-05,
      "loss": 0.7312,
      "mean_token_accuracy": 0.7782765091089967,
      "step": 15
    },
    {
      "epoch": 0.1893491124260355,
      "grad_norm": 0.9483896541702548,
      "learning_rate": 1.9573194975320672e-05,
      "loss": 0.7178,
      "mean_token_accuracy": 0.77972894381678,
      "step": 16
    },
    {
      "epoch": 0.20118343195266272,
      "grad_norm": 0.9514334021002452,
      "learning_rate": 1.944376370237481e-05,
      "loss": 0.697,
      "mean_token_accuracy": 0.785825163667149,
      "step": 17
    },
    {
      "epoch": 0.21301775147928995,
      "grad_norm": 0.805451100734681,
      "learning_rate": 1.9297764858882516e-05,
      "loss": 0.6985,
      "mean_token_accuracy": 0.7849165023868023,
      "step": 18
    },
    {
      "epoch": 0.22485207100591717,
      "grad_norm": 0.7137053533068485,
      "learning_rate": 1.913545457642601e-05,
      "loss": 0.6807,
      "mean_token_accuracy": 0.7889829364396231,
      "step": 19
    },
    {
      "epoch": 0.23668639053254437,
      "grad_norm": 0.6517336215066881,
      "learning_rate": 1.895711760239413e-05,
      "loss": 0.6919,
      "mean_token_accuracy": 0.7859928108628317,
      "step": 20
    },
    {
      "epoch": 0.2485207100591716,
      "grad_norm": 0.6542156278501408,
      "learning_rate": 1.8763066800438638e-05,
      "loss": 0.6853,
      "mean_token_accuracy": 0.7873905384764055,
      "step": 21
    },
    {
      "epoch": 0.2603550295857988,
      "grad_norm": 0.5930185894111272,
      "learning_rate": 1.855364260160507e-05,
      "loss": 0.6547,
      "mean_token_accuracy": 0.7962581703861765,
      "step": 22
    },
    {
      "epoch": 0.27218934911242604,
      "grad_norm": 0.5716003972347845,
      "learning_rate": 1.8329212407100996e-05,
      "loss": 0.687,
      "mean_token_accuracy": 0.78618707495751,
      "step": 23
    },
    {
      "epoch": 0.28402366863905326,
      "grad_norm": 0.6177392920654274,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.652,
      "mean_token_accuracy": 0.7952571353402204,
      "step": 24
    },
    {
      "epoch": 0.2958579881656805,
      "grad_norm": 0.5698986426257591,
      "learning_rate": 1.78369345732584e-05,
      "loss": 0.6677,
      "mean_token_accuracy": 0.7916133988159623,
      "step": 25
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.5701215768836807,
      "learning_rate": 1.7569950556517566e-05,
      "loss": 0.6681,
      "mean_token_accuracy": 0.7910464837092032,
      "step": 26
    },
    {
      "epoch": 0.31952662721893493,
      "grad_norm": 0.5080399194743719,
      "learning_rate": 1.7289686274214116e-05,
      "loss": 0.6515,
      "mean_token_accuracy": 0.7953877454559368,
      "step": 27
    },
    {
      "epoch": 0.33136094674556216,
      "grad_norm": 0.4717556592028091,
      "learning_rate": 1.6996633405133656e-05,
      "loss": 0.6596,
      "mean_token_accuracy": 0.7938733744518354,
      "step": 28
    },
    {
      "epoch": 0.3431952662721893,
      "grad_norm": 0.43712818505210993,
      "learning_rate": 1.6691306063588583e-05,
      "loss": 0.6515,
      "mean_token_accuracy": 0.795406673335384,
      "step": 29
    },
    {
      "epoch": 0.35502958579881655,
      "grad_norm": 0.5647585307286523,
      "learning_rate": 1.63742398974869e-05,
      "loss": 0.6765,
      "mean_token_accuracy": 0.7879625926981235,
      "step": 30
    },
    {
      "epoch": 0.3668639053254438,
      "grad_norm": 0.4753712696159348,
      "learning_rate": 1.6045991148623752e-05,
      "loss": 0.6521,
      "mean_token_accuracy": 0.7942288972770943,
      "step": 31
    },
    {
      "epoch": 0.378698224852071,
      "grad_norm": 0.4144279237112344,
      "learning_rate": 1.570713567684432e-05,
      "loss": 0.6496,
      "mean_token_accuracy": 0.7955444405180622,
      "step": 32
    },
    {
      "epoch": 0.3905325443786982,
      "grad_norm": 0.48091481539083,
      "learning_rate": 1.5358267949789968e-05,
      "loss": 0.6554,
      "mean_token_accuracy": 0.7931419956248978,
      "step": 33
    },
    {
      "epoch": 0.40236686390532544,
      "grad_norm": 0.43887737403140387,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.6429,
      "mean_token_accuracy": 0.7971955767104538,
      "step": 34
    },
    {
      "epoch": 0.41420118343195267,
      "grad_norm": 0.4589833492342518,
      "learning_rate": 1.463296035119862e-05,
      "loss": 0.6274,
      "mean_token_accuracy": 0.8013304246596665,
      "step": 35
    },
    {
      "epoch": 0.4260355029585799,
      "grad_norm": 0.4296018328726048,
      "learning_rate": 1.4257792915650728e-05,
      "loss": 0.6412,
      "mean_token_accuracy": 0.7966311405685548,
      "step": 36
    },
    {
      "epoch": 0.4378698224852071,
      "grad_norm": 0.43296198847979206,
      "learning_rate": 1.3875155864521031e-05,
      "loss": 0.6355,
      "mean_token_accuracy": 0.7992740737965425,
      "step": 37
    },
    {
      "epoch": 0.44970414201183434,
      "grad_norm": 0.38880487342980874,
      "learning_rate": 1.3485720473218153e-05,
      "loss": 0.626,
      "mean_token_accuracy": 0.8008996682978254,
      "step": 38
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 0.43999619314461746,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.6415,
      "mean_token_accuracy": 0.7971045284947518,
      "step": 39
    },
    {
      "epoch": 0.47337278106508873,
      "grad_norm": 0.4648200257156182,
      "learning_rate": 1.2689198206152657e-05,
      "loss": 0.6462,
      "mean_token_accuracy": 0.7952968697326067,
      "step": 40
    },
    {
      "epoch": 0.48520710059171596,
      "grad_norm": 0.3479967151805307,
      "learning_rate": 1.2283508701106559e-05,
      "loss": 0.6323,
      "mean_token_accuracy": 0.7998096660135896,
      "step": 41
    },
    {
      "epoch": 0.4970414201183432,
      "grad_norm": 0.48528533316713585,
      "learning_rate": 1.187381314585725e-05,
      "loss": 0.6386,
      "mean_token_accuracy": 0.7970335346930519,
      "step": 42
    },
    {
      "epoch": 0.5088757396449705,
      "grad_norm": 0.4070328582412056,
      "learning_rate": 1.1460830285624119e-05,
      "loss": 0.6428,
      "mean_token_accuracy": 0.7955530635882152,
      "step": 43
    },
    {
      "epoch": 0.5207100591715976,
      "grad_norm": 0.38515895719763865,
      "learning_rate": 1.1045284632676535e-05,
      "loss": 0.629,
      "mean_token_accuracy": 0.8002784669594674,
      "step": 44
    },
    {
      "epoch": 0.5325443786982249,
      "grad_norm": 0.3947135089059047,
      "learning_rate": 1.0627905195293135e-05,
      "loss": 0.6248,
      "mean_token_accuracy": 0.8020802191923001,
      "step": 45
    },
    {
      "epoch": 0.5443786982248521,
      "grad_norm": 0.32500740489262403,
      "learning_rate": 1.0209424198833571e-05,
      "loss": 0.6266,
      "mean_token_accuracy": 0.8013824664684448,
      "step": 46
    },
    {
      "epoch": 0.5562130177514792,
      "grad_norm": 0.4427008519391343,
      "learning_rate": 9.790575801166432e-06,
      "loss": 0.6299,
      "mean_token_accuracy": 0.7990073104772039,
      "step": 47
    },
    {
      "epoch": 0.5680473372781065,
      "grad_norm": 0.33671217055241054,
      "learning_rate": 9.372094804706867e-06,
      "loss": 0.6127,
      "mean_token_accuracy": 0.8042786979333378,
      "step": 48
    },
    {
      "epoch": 0.5798816568047337,
      "grad_norm": 0.3447024994665335,
      "learning_rate": 8.954715367323468e-06,
      "loss": 0.6155,
      "mean_token_accuracy": 0.8035300399264838,
      "step": 49
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 0.38535071393327097,
      "learning_rate": 8.539169714375885e-06,
      "loss": 0.6251,
      "mean_token_accuracy": 0.8009747733015874,
      "step": 50
    },
    {
      "epoch": 0.6035502958579881,
      "grad_norm": 0.36444496001026966,
      "learning_rate": 8.126186854142752e-06,
      "loss": 0.6151,
      "mean_token_accuracy": 0.803726615022824,
      "step": 51
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.33289854812401104,
      "learning_rate": 7.716491298893443e-06,
      "loss": 0.6237,
      "mean_token_accuracy": 0.8018573004824351,
      "step": 52
    },
    {
      "epoch": 0.6272189349112426,
      "grad_norm": 0.3779499676278362,
      "learning_rate": 7.310801793847344e-06,
      "loss": 0.6157,
      "mean_token_accuracy": 0.8030306721227509,
      "step": 53
    },
    {
      "epoch": 0.6390532544378699,
      "grad_norm": 0.3857760141933295,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.6446,
      "mean_token_accuracy": 0.7951614721610321,
      "step": 54
    },
    {
      "epoch": 0.650887573964497,
      "grad_norm": 0.3441596003200234,
      "learning_rate": 6.5142795267818505e-06,
      "loss": 0.6245,
      "mean_token_accuracy": 0.8010465961395079,
      "step": 55
    },
    {
      "epoch": 0.6627218934911243,
      "grad_norm": 0.3413004265386508,
      "learning_rate": 6.124844135478971e-06,
      "loss": 0.6346,
      "mean_token_accuracy": 0.797919315970412,
      "step": 56
    },
    {
      "epoch": 0.6745562130177515,
      "grad_norm": 0.3224227524329457,
      "learning_rate": 5.742207084349274e-06,
      "loss": 0.6142,
      "mean_token_accuracy": 0.8038281147126183,
      "step": 57
    },
    {
      "epoch": 0.6863905325443787,
      "grad_norm": 0.3552618452178376,
      "learning_rate": 5.367039648801386e-06,
      "loss": 0.6146,
      "mean_token_accuracy": 0.8046393162729427,
      "step": 58
    },
    {
      "epoch": 0.6982248520710059,
      "grad_norm": 0.291412169561303,
      "learning_rate": 5.000000000000003e-06,
      "loss": 0.6099,
      "mean_token_accuracy": 0.8053936401536371,
      "step": 59
    },
    {
      "epoch": 0.7100591715976331,
      "grad_norm": 0.31200803773040214,
      "learning_rate": 4.641732050210032e-06,
      "loss": 0.5991,
      "mean_token_accuracy": 0.8092009667951892,
      "step": 60
    },
    {
      "epoch": 0.7218934911242604,
      "grad_norm": 0.30861041917669746,
      "learning_rate": 4.292864323155684e-06,
      "loss": 0.6194,
      "mean_token_accuracy": 0.8023869323011936,
      "step": 61
    },
    {
      "epoch": 0.7337278106508875,
      "grad_norm": 0.3016074619440693,
      "learning_rate": 3.954008851376252e-06,
      "loss": 0.6074,
      "mean_token_accuracy": 0.8061990891659233,
      "step": 62
    },
    {
      "epoch": 0.7455621301775148,
      "grad_norm": 0.2780289952523753,
      "learning_rate": 3.625760102513103e-06,
      "loss": 0.627,
      "mean_token_accuracy": 0.7997388545626677,
      "step": 63
    },
    {
      "epoch": 0.757396449704142,
      "grad_norm": 0.27276931271932114,
      "learning_rate": 3.308693936411421e-06,
      "loss": 0.6157,
      "mean_token_accuracy": 0.804183757647147,
      "step": 64
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.29793300735816786,
      "learning_rate": 3.003366594866345e-06,
      "loss": 0.6173,
      "mean_token_accuracy": 0.8028972489414075,
      "step": 65
    },
    {
      "epoch": 0.7810650887573964,
      "grad_norm": 0.27768983157741034,
      "learning_rate": 2.7103137257858867e-06,
      "loss": 0.6351,
      "mean_token_accuracy": 0.797241759861545,
      "step": 66
    },
    {
      "epoch": 0.7928994082840237,
      "grad_norm": 0.2515256068430327,
      "learning_rate": 2.4300494434824373e-06,
      "loss": 0.6151,
      "mean_token_accuracy": 0.8040865582678152,
      "step": 67
    },
    {
      "epoch": 0.8047337278106509,
      "grad_norm": 0.2669457691795579,
      "learning_rate": 2.163065426741603e-06,
      "loss": 0.6136,
      "mean_token_accuracy": 0.803797621443186,
      "step": 68
    },
    {
      "epoch": 0.8165680473372781,
      "grad_norm": 0.30160149261118047,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.6275,
      "mean_token_accuracy": 0.8003163727232168,
      "step": 69
    },
    {
      "epoch": 0.8284023668639053,
      "grad_norm": 0.2751754259680236,
      "learning_rate": 1.6707875928990059e-06,
      "loss": 0.6133,
      "mean_token_accuracy": 0.8047071305832921,
      "step": 70
    },
    {
      "epoch": 0.8402366863905325,
      "grad_norm": 0.2706385532569295,
      "learning_rate": 1.446357398394934e-06,
      "loss": 0.6218,
      "mean_token_accuracy": 0.8022678442231608,
      "step": 71
    },
    {
      "epoch": 0.8520710059171598,
      "grad_norm": 0.28015697822064345,
      "learning_rate": 1.2369331995613664e-06,
      "loss": 0.617,
      "mean_token_accuracy": 0.8031474875457701,
      "step": 72
    },
    {
      "epoch": 0.863905325443787,
      "grad_norm": 0.2589662874319155,
      "learning_rate": 1.042882397605871e-06,
      "loss": 0.6115,
      "mean_token_accuracy": 0.8043582740503216,
      "step": 73
    },
    {
      "epoch": 0.8757396449704142,
      "grad_norm": 0.24850062323600258,
      "learning_rate": 8.645454235739903e-07,
      "loss": 0.601,
      "mean_token_accuracy": 0.8078272791878166,
      "step": 74
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 0.25618246366172615,
      "learning_rate": 7.022351411174866e-07,
      "loss": 0.6135,
      "mean_token_accuracy": 0.8042458110517825,
      "step": 75
    },
    {
      "epoch": 0.8994082840236687,
      "grad_norm": 0.25009614725017143,
      "learning_rate": 5.562362976251901e-07,
      "loss": 0.6122,
      "mean_token_accuracy": 0.8047269396914206,
      "step": 76
    },
    {
      "epoch": 0.9112426035502958,
      "grad_norm": 0.2385740957817605,
      "learning_rate": 4.268050246793276e-07,
      "loss": 0.6018,
      "mean_token_accuracy": 0.8070806317760979,
      "step": 77
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.25257325897328714,
      "learning_rate": 3.1416838871368925e-07,
      "loss": 0.5992,
      "mean_token_accuracy": 0.8088564726476459,
      "step": 78
    },
    {
      "epoch": 0.9349112426035503,
      "grad_norm": 0.2409513318669118,
      "learning_rate": 2.1852399266194312e-07,
      "loss": 0.6029,
      "mean_token_accuracy": 0.8077979969548987,
      "step": 79
    },
    {
      "epoch": 0.9467455621301775,
      "grad_norm": 0.24881717772888504,
      "learning_rate": 1.400396292949513e-07,
      "loss": 0.6132,
      "mean_token_accuracy": 0.8045102037481361,
      "step": 80
    },
    {
      "epoch": 0.9585798816568047,
      "grad_norm": 0.26121171944408894,
      "learning_rate": 7.885298685522235e-08,
      "loss": 0.604,
      "mean_token_accuracy": 0.8070156862648367,
      "step": 81
    },
    {
      "epoch": 0.9704142011834319,
      "grad_norm": 0.24329815513143002,
      "learning_rate": 3.50714075049563e-08,
      "loss": 0.6159,
      "mean_token_accuracy": 0.8033786700596692,
      "step": 82
    },
    {
      "epoch": 0.9822485207100592,
      "grad_norm": 0.2401566800321565,
      "learning_rate": 8.771699011416169e-09,
      "loss": 0.6017,
      "mean_token_accuracy": 0.8079396676985449,
      "step": 83
    },
    {
      "epoch": 0.9940828402366864,
      "grad_norm": 0.24933875598122412,
      "learning_rate": 0.0,
      "loss": 0.6188,
      "mean_token_accuracy": 0.8022081449189813,
      "step": 84
    },
    {
      "epoch": 0.9940828402366864,
      "step": 84,
      "total_flos": 176018430099456.0,
      "train_loss": 0.6783329638696852,
      "train_runtime": 1805.4988,
      "train_samples_per_second": 11.976,
      "train_steps_per_second": 0.047
    }
  ],
  "logging_steps": 1,
  "max_steps": 84,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 176018430099456.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}