{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9940828402366864, "eval_steps": 100, "global_step": 84, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011834319526627219, "grad_norm": 6.360359661001761, "learning_rate": 2.222222222222222e-06, "loss": 1.0462, "mean_token_accuracy": 0.7292188576439624, "step": 1 }, { "epoch": 0.023668639053254437, "grad_norm": 6.106208033575247, "learning_rate": 4.444444444444444e-06, "loss": 1.0369, "mean_token_accuracy": 0.7320231497777155, "step": 2 }, { "epoch": 0.03550295857988166, "grad_norm": 5.760453817709745, "learning_rate": 6.666666666666667e-06, "loss": 1.0233, "mean_token_accuracy": 0.733533778553629, "step": 3 }, { "epoch": 0.047337278106508875, "grad_norm": 4.453204363610917, "learning_rate": 8.888888888888888e-06, "loss": 0.9786, "mean_token_accuracy": 0.7379425080663377, "step": 4 }, { "epoch": 0.05917159763313609, "grad_norm": 2.3743512771498363, "learning_rate": 1.1111111111111113e-05, "loss": 0.9366, "mean_token_accuracy": 0.7400217617800807, "step": 5 }, { "epoch": 0.07100591715976332, "grad_norm": 4.008803407186776, "learning_rate": 1.3333333333333333e-05, "loss": 0.9256, "mean_token_accuracy": 0.7412621950252716, "step": 6 }, { "epoch": 0.08284023668639054, "grad_norm": 5.140249938186884, "learning_rate": 1.555555555555556e-05, "loss": 0.9051, "mean_token_accuracy": 0.7427781690224119, "step": 7 }, { "epoch": 0.09467455621301775, "grad_norm": 6.570193629815716, "learning_rate": 1.7777777777777777e-05, "loss": 0.9228, "mean_token_accuracy": 0.7391856693362282, "step": 8 }, { "epoch": 0.10650887573964497, "grad_norm": 4.243747325903682, "learning_rate": 2e-05, "loss": 0.857, "mean_token_accuracy": 0.7546249767013169, "step": 9 }, { "epoch": 0.11834319526627218, "grad_norm": 3.132842540331099, "learning_rate": 1.9991228300988586e-05, "loss": 0.8044, "mean_token_accuracy": 0.7638333703029013, "step": 10 }, { "epoch": 0.1301775147928994, "grad_norm": 2.669470016163355, "learning_rate": 1.9964928592495046e-05, "loss": 0.7946, "mean_token_accuracy": 0.7652401107554927, "step": 11 }, { "epoch": 0.14201183431952663, "grad_norm": 1.69231830246758, "learning_rate": 1.9921147013144782e-05, "loss": 0.761, "mean_token_accuracy": 0.7729921399244823, "step": 12 }, { "epoch": 0.15384615384615385, "grad_norm": 1.4303040897938275, "learning_rate": 1.985996037070505e-05, "loss": 0.7593, "mean_token_accuracy": 0.7713452241883023, "step": 13 }, { "epoch": 0.16568047337278108, "grad_norm": 1.4438177567548685, "learning_rate": 1.9781476007338058e-05, "loss": 0.7405, "mean_token_accuracy": 0.7755470269351471, "step": 14 }, { "epoch": 0.17751479289940827, "grad_norm": 1.094141653060019, "learning_rate": 1.9685831611286312e-05, "loss": 0.7312, "mean_token_accuracy": 0.7782765091089967, "step": 15 }, { "epoch": 0.1893491124260355, "grad_norm": 0.9483896541702548, "learning_rate": 1.9573194975320672e-05, "loss": 0.7178, "mean_token_accuracy": 0.77972894381678, "step": 16 }, { "epoch": 0.20118343195266272, "grad_norm": 0.9514334021002452, "learning_rate": 1.944376370237481e-05, "loss": 0.697, "mean_token_accuracy": 0.785825163667149, "step": 17 }, { "epoch": 0.21301775147928995, "grad_norm": 0.805451100734681, "learning_rate": 1.9297764858882516e-05, "loss": 0.6985, "mean_token_accuracy": 0.7849165023868023, "step": 18 }, { "epoch": 0.22485207100591717, "grad_norm": 0.7137053533068485, "learning_rate": 1.913545457642601e-05, "loss": 0.6807, "mean_token_accuracy": 0.7889829364396231, "step": 19 }, { "epoch": 0.23668639053254437, "grad_norm": 0.6517336215066881, "learning_rate": 1.895711760239413e-05, "loss": 0.6919, "mean_token_accuracy": 0.7859928108628317, "step": 20 }, { "epoch": 0.2485207100591716, "grad_norm": 0.6542156278501408, "learning_rate": 1.8763066800438638e-05, "loss": 0.6853, "mean_token_accuracy": 0.7873905384764055, "step": 21 }, { "epoch": 0.2603550295857988, "grad_norm": 0.5930185894111272, "learning_rate": 1.855364260160507e-05, "loss": 0.6547, "mean_token_accuracy": 0.7962581703861765, "step": 22 }, { "epoch": 0.27218934911242604, "grad_norm": 0.5716003972347845, "learning_rate": 1.8329212407100996e-05, "loss": 0.687, "mean_token_accuracy": 0.78618707495751, "step": 23 }, { "epoch": 0.28402366863905326, "grad_norm": 0.6177392920654274, "learning_rate": 1.8090169943749477e-05, "loss": 0.652, "mean_token_accuracy": 0.7952571353402204, "step": 24 }, { "epoch": 0.2958579881656805, "grad_norm": 0.5698986426257591, "learning_rate": 1.78369345732584e-05, "loss": 0.6677, "mean_token_accuracy": 0.7916133988159623, "step": 25 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5701215768836807, "learning_rate": 1.7569950556517566e-05, "loss": 0.6681, "mean_token_accuracy": 0.7910464837092032, "step": 26 }, { "epoch": 0.31952662721893493, "grad_norm": 0.5080399194743719, "learning_rate": 1.7289686274214116e-05, "loss": 0.6515, "mean_token_accuracy": 0.7953877454559368, "step": 27 }, { "epoch": 0.33136094674556216, "grad_norm": 0.4717556592028091, "learning_rate": 1.6996633405133656e-05, "loss": 0.6596, "mean_token_accuracy": 0.7938733744518354, "step": 28 }, { "epoch": 0.3431952662721893, "grad_norm": 0.43712818505210993, "learning_rate": 1.6691306063588583e-05, "loss": 0.6515, "mean_token_accuracy": 0.795406673335384, "step": 29 }, { "epoch": 0.35502958579881655, "grad_norm": 0.5647585307286523, "learning_rate": 1.63742398974869e-05, "loss": 0.6765, "mean_token_accuracy": 0.7879625926981235, "step": 30 }, { "epoch": 0.3668639053254438, "grad_norm": 0.4753712696159348, "learning_rate": 1.6045991148623752e-05, "loss": 0.6521, "mean_token_accuracy": 0.7942288972770943, "step": 31 }, { "epoch": 0.378698224852071, "grad_norm": 0.4144279237112344, "learning_rate": 1.570713567684432e-05, "loss": 0.6496, "mean_token_accuracy": 0.7955444405180622, "step": 32 }, { "epoch": 0.3905325443786982, "grad_norm": 0.48091481539083, "learning_rate": 1.5358267949789968e-05, "loss": 0.6554, "mean_token_accuracy": 0.7931419956248978, "step": 33 }, { "epoch": 0.40236686390532544, "grad_norm": 0.43887737403140387, "learning_rate": 1.5000000000000002e-05, "loss": 0.6429, "mean_token_accuracy": 0.7971955767104538, "step": 34 }, { "epoch": 0.41420118343195267, "grad_norm": 0.4589833492342518, "learning_rate": 1.463296035119862e-05, "loss": 0.6274, "mean_token_accuracy": 0.8013304246596665, "step": 35 }, { "epoch": 0.4260355029585799, "grad_norm": 0.4296018328726048, "learning_rate": 1.4257792915650728e-05, "loss": 0.6412, "mean_token_accuracy": 0.7966311405685548, "step": 36 }, { "epoch": 0.4378698224852071, "grad_norm": 0.43296198847979206, "learning_rate": 1.3875155864521031e-05, "loss": 0.6355, "mean_token_accuracy": 0.7992740737965425, "step": 37 }, { "epoch": 0.44970414201183434, "grad_norm": 0.38880487342980874, "learning_rate": 1.3485720473218153e-05, "loss": 0.626, "mean_token_accuracy": 0.8008996682978254, "step": 38 }, { "epoch": 0.46153846153846156, "grad_norm": 0.43999619314461746, "learning_rate": 1.3090169943749475e-05, "loss": 0.6415, "mean_token_accuracy": 0.7971045284947518, "step": 39 }, { "epoch": 0.47337278106508873, "grad_norm": 0.4648200257156182, "learning_rate": 1.2689198206152657e-05, "loss": 0.6462, "mean_token_accuracy": 0.7952968697326067, "step": 40 }, { "epoch": 0.48520710059171596, "grad_norm": 0.3479967151805307, "learning_rate": 1.2283508701106559e-05, "loss": 0.6323, "mean_token_accuracy": 0.7998096660135896, "step": 41 }, { "epoch": 0.4970414201183432, "grad_norm": 0.48528533316713585, "learning_rate": 1.187381314585725e-05, "loss": 0.6386, "mean_token_accuracy": 0.7970335346930519, "step": 42 }, { "epoch": 0.5088757396449705, "grad_norm": 0.4070328582412056, "learning_rate": 1.1460830285624119e-05, "loss": 0.6428, "mean_token_accuracy": 0.7955530635882152, "step": 43 }, { "epoch": 0.5207100591715976, "grad_norm": 0.38515895719763865, "learning_rate": 1.1045284632676535e-05, "loss": 0.629, "mean_token_accuracy": 0.8002784669594674, "step": 44 }, { "epoch": 0.5325443786982249, "grad_norm": 0.3947135089059047, "learning_rate": 1.0627905195293135e-05, "loss": 0.6248, "mean_token_accuracy": 0.8020802191923001, "step": 45 }, { "epoch": 0.5443786982248521, "grad_norm": 0.32500740489262403, "learning_rate": 1.0209424198833571e-05, "loss": 0.6266, "mean_token_accuracy": 0.8013824664684448, "step": 46 }, { "epoch": 0.5562130177514792, "grad_norm": 0.4427008519391343, "learning_rate": 9.790575801166432e-06, "loss": 0.6299, "mean_token_accuracy": 0.7990073104772039, "step": 47 }, { "epoch": 0.5680473372781065, "grad_norm": 0.33671217055241054, "learning_rate": 9.372094804706867e-06, "loss": 0.6127, "mean_token_accuracy": 0.8042786979333378, "step": 48 }, { "epoch": 0.5798816568047337, "grad_norm": 0.3447024994665335, "learning_rate": 8.954715367323468e-06, "loss": 0.6155, "mean_token_accuracy": 0.8035300399264838, "step": 49 }, { "epoch": 0.591715976331361, "grad_norm": 0.38535071393327097, "learning_rate": 8.539169714375885e-06, "loss": 0.6251, "mean_token_accuracy": 0.8009747733015874, "step": 50 }, { "epoch": 0.6035502958579881, "grad_norm": 0.36444496001026966, "learning_rate": 8.126186854142752e-06, "loss": 0.6151, "mean_token_accuracy": 0.803726615022824, "step": 51 }, { "epoch": 0.6153846153846154, "grad_norm": 0.33289854812401104, "learning_rate": 7.716491298893443e-06, "loss": 0.6237, "mean_token_accuracy": 0.8018573004824351, "step": 52 }, { "epoch": 0.6272189349112426, "grad_norm": 0.3779499676278362, "learning_rate": 7.310801793847344e-06, "loss": 0.6157, "mean_token_accuracy": 0.8030306721227509, "step": 53 }, { "epoch": 0.6390532544378699, "grad_norm": 0.3857760141933295, "learning_rate": 6.909830056250527e-06, "loss": 0.6446, "mean_token_accuracy": 0.7951614721610321, "step": 54 }, { "epoch": 0.650887573964497, "grad_norm": 0.3441596003200234, "learning_rate": 6.5142795267818505e-06, "loss": 0.6245, "mean_token_accuracy": 0.8010465961395079, "step": 55 }, { "epoch": 0.6627218934911243, "grad_norm": 0.3413004265386508, "learning_rate": 6.124844135478971e-06, "loss": 0.6346, "mean_token_accuracy": 0.797919315970412, "step": 56 }, { "epoch": 0.6745562130177515, "grad_norm": 0.3224227524329457, "learning_rate": 5.742207084349274e-06, "loss": 0.6142, "mean_token_accuracy": 0.8038281147126183, "step": 57 }, { "epoch": 0.6863905325443787, "grad_norm": 0.3552618452178376, "learning_rate": 5.367039648801386e-06, "loss": 0.6146, "mean_token_accuracy": 0.8046393162729427, "step": 58 }, { "epoch": 0.6982248520710059, "grad_norm": 0.291412169561303, "learning_rate": 5.000000000000003e-06, "loss": 0.6099, "mean_token_accuracy": 0.8053936401536371, "step": 59 }, { "epoch": 0.7100591715976331, "grad_norm": 0.31200803773040214, "learning_rate": 4.641732050210032e-06, "loss": 0.5991, "mean_token_accuracy": 0.8092009667951892, "step": 60 }, { "epoch": 0.7218934911242604, "grad_norm": 0.30861041917669746, "learning_rate": 4.292864323155684e-06, "loss": 0.6194, "mean_token_accuracy": 0.8023869323011936, "step": 61 }, { "epoch": 0.7337278106508875, "grad_norm": 0.3016074619440693, "learning_rate": 3.954008851376252e-06, "loss": 0.6074, "mean_token_accuracy": 0.8061990891659233, "step": 62 }, { "epoch": 0.7455621301775148, "grad_norm": 0.2780289952523753, "learning_rate": 3.625760102513103e-06, "loss": 0.627, "mean_token_accuracy": 0.7997388545626677, "step": 63 }, { "epoch": 0.757396449704142, "grad_norm": 0.27276931271932114, "learning_rate": 3.308693936411421e-06, "loss": 0.6157, "mean_token_accuracy": 0.804183757647147, "step": 64 }, { "epoch": 0.7692307692307693, "grad_norm": 0.29793300735816786, "learning_rate": 3.003366594866345e-06, "loss": 0.6173, "mean_token_accuracy": 0.8028972489414075, "step": 65 }, { "epoch": 0.7810650887573964, "grad_norm": 0.27768983157741034, "learning_rate": 2.7103137257858867e-06, "loss": 0.6351, "mean_token_accuracy": 0.797241759861545, "step": 66 }, { "epoch": 0.7928994082840237, "grad_norm": 0.2515256068430327, "learning_rate": 2.4300494434824373e-06, "loss": 0.6151, "mean_token_accuracy": 0.8040865582678152, "step": 67 }, { "epoch": 0.8047337278106509, "grad_norm": 0.2669457691795579, "learning_rate": 2.163065426741603e-06, "loss": 0.6136, "mean_token_accuracy": 0.803797621443186, "step": 68 }, { "epoch": 0.8165680473372781, "grad_norm": 0.30160149261118047, "learning_rate": 1.9098300562505266e-06, "loss": 0.6275, "mean_token_accuracy": 0.8003163727232168, "step": 69 }, { "epoch": 0.8284023668639053, "grad_norm": 0.2751754259680236, "learning_rate": 1.6707875928990059e-06, "loss": 0.6133, "mean_token_accuracy": 0.8047071305832921, "step": 70 }, { "epoch": 0.8402366863905325, "grad_norm": 0.2706385532569295, "learning_rate": 1.446357398394934e-06, "loss": 0.6218, "mean_token_accuracy": 0.8022678442231608, "step": 71 }, { "epoch": 0.8520710059171598, "grad_norm": 0.28015697822064345, "learning_rate": 1.2369331995613664e-06, "loss": 0.617, "mean_token_accuracy": 0.8031474875457701, "step": 72 }, { "epoch": 0.863905325443787, "grad_norm": 0.2589662874319155, "learning_rate": 1.042882397605871e-06, "loss": 0.6115, "mean_token_accuracy": 0.8043582740503216, "step": 73 }, { "epoch": 0.8757396449704142, "grad_norm": 0.24850062323600258, "learning_rate": 8.645454235739903e-07, "loss": 0.601, "mean_token_accuracy": 0.8078272791878166, "step": 74 }, { "epoch": 0.8875739644970414, "grad_norm": 0.25618246366172615, "learning_rate": 7.022351411174866e-07, "loss": 0.6135, "mean_token_accuracy": 0.8042458110517825, "step": 75 }, { "epoch": 0.8994082840236687, "grad_norm": 0.25009614725017143, "learning_rate": 5.562362976251901e-07, "loss": 0.6122, "mean_token_accuracy": 0.8047269396914206, "step": 76 }, { "epoch": 0.9112426035502958, "grad_norm": 0.2385740957817605, "learning_rate": 4.268050246793276e-07, "loss": 0.6018, "mean_token_accuracy": 0.8070806317760979, "step": 77 }, { "epoch": 0.9230769230769231, "grad_norm": 0.25257325897328714, "learning_rate": 3.1416838871368925e-07, "loss": 0.5992, "mean_token_accuracy": 0.8088564726476459, "step": 78 }, { "epoch": 0.9349112426035503, "grad_norm": 0.2409513318669118, "learning_rate": 2.1852399266194312e-07, "loss": 0.6029, "mean_token_accuracy": 0.8077979969548987, "step": 79 }, { "epoch": 0.9467455621301775, "grad_norm": 0.24881717772888504, "learning_rate": 1.400396292949513e-07, "loss": 0.6132, "mean_token_accuracy": 0.8045102037481361, "step": 80 }, { "epoch": 0.9585798816568047, "grad_norm": 0.26121171944408894, "learning_rate": 7.885298685522235e-08, "loss": 0.604, "mean_token_accuracy": 0.8070156862648367, "step": 81 }, { "epoch": 0.9704142011834319, "grad_norm": 0.24329815513143002, "learning_rate": 3.50714075049563e-08, "loss": 0.6159, "mean_token_accuracy": 0.8033786700596692, "step": 82 }, { "epoch": 0.9822485207100592, "grad_norm": 0.2401566800321565, "learning_rate": 8.771699011416169e-09, "loss": 0.6017, "mean_token_accuracy": 0.8079396676985449, "step": 83 }, { "epoch": 0.9940828402366864, "grad_norm": 0.24933875598122412, "learning_rate": 0.0, "loss": 0.6188, "mean_token_accuracy": 0.8022081449189813, "step": 84 }, { "epoch": 0.9940828402366864, "step": 84, "total_flos": 176018430099456.0, "train_loss": 0.6783329638696852, "train_runtime": 1805.4988, "train_samples_per_second": 11.976, "train_steps_per_second": 0.047 } ], "logging_steps": 1, "max_steps": 84, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 176018430099456.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }