{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9815303430079156, "eval_steps": 500, "global_step": 93, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0316622691292876, "grad_norm": 2.5990071296691895, "learning_rate": 7.000000000000001e-07, "loss": 1.176, "step": 1 }, { "epoch": 0.0633245382585752, "grad_norm": 2.6270251274108887, "learning_rate": 1.4000000000000001e-06, "loss": 1.1885, "step": 2 }, { "epoch": 0.09498680738786279, "grad_norm": 2.6077260971069336, "learning_rate": 2.1e-06, "loss": 1.1853, "step": 3 }, { "epoch": 0.1266490765171504, "grad_norm": 2.4010000228881836, "learning_rate": 2.8000000000000003e-06, "loss": 1.1441, "step": 4 }, { "epoch": 0.158311345646438, "grad_norm": 1.6012561321258545, "learning_rate": 3.5e-06, "loss": 1.1434, "step": 5 }, { "epoch": 0.18997361477572558, "grad_norm": 1.4890481233596802, "learning_rate": 4.2e-06, "loss": 1.1287, "step": 6 }, { "epoch": 0.22163588390501318, "grad_norm": 2.0511271953582764, "learning_rate": 4.9e-06, "loss": 1.1233, "step": 7 }, { "epoch": 0.2532981530343008, "grad_norm": 1.9991264343261719, "learning_rate": 5.600000000000001e-06, "loss": 1.1195, "step": 8 }, { "epoch": 0.2849604221635884, "grad_norm": 2.4904370307922363, "learning_rate": 6.3e-06, "loss": 1.0933, "step": 9 }, { "epoch": 0.316622691292876, "grad_norm": 2.41306471824646, "learning_rate": 7e-06, "loss": 1.1054, "step": 10 }, { "epoch": 0.3482849604221636, "grad_norm": 1.916558861732483, "learning_rate": 6.99749314185716e-06, "loss": 1.0968, "step": 11 }, { "epoch": 0.37994722955145116, "grad_norm": 1.6149336099624634, "learning_rate": 6.989976158478782e-06, "loss": 1.0736, "step": 12 }, { "epoch": 0.41160949868073876, "grad_norm": 1.5957881212234497, "learning_rate": 6.977459817871147e-06, "loss": 1.0864, "step": 13 }, { "epoch": 0.44327176781002636, "grad_norm": 1.3141591548919678, "learning_rate": 6.959962049571609e-06, "loss": 1.0507, "step": 14 }, { "epoch": 0.47493403693931396, "grad_norm": 1.0635184049606323, "learning_rate": 6.937507918964709e-06, "loss": 1.0483, "step": 15 }, { "epoch": 0.5065963060686016, "grad_norm": 0.8129880428314209, "learning_rate": 6.9101295913762465e-06, "loss": 1.0232, "step": 16 }, { "epoch": 0.5382585751978892, "grad_norm": 1.096314549446106, "learning_rate": 6.877866285996766e-06, "loss": 1.0304, "step": 17 }, { "epoch": 0.5699208443271768, "grad_norm": 0.8672236204147339, "learning_rate": 6.840764219700443e-06, "loss": 1.0308, "step": 18 }, { "epoch": 0.6015831134564644, "grad_norm": 0.9555258750915527, "learning_rate": 6.798876540839855e-06, "loss": 1.0285, "step": 19 }, { "epoch": 0.633245382585752, "grad_norm": 1.5928786993026733, "learning_rate": 6.752263253111479e-06, "loss": 1.0189, "step": 20 }, { "epoch": 0.6649076517150396, "grad_norm": 1.2837454080581665, "learning_rate": 6.700991129600976e-06, "loss": 1.021, "step": 21 }, { "epoch": 0.6965699208443272, "grad_norm": 0.7489935755729675, "learning_rate": 6.645133617131388e-06, "loss": 1.0013, "step": 22 }, { "epoch": 0.7282321899736148, "grad_norm": 0.729206383228302, "learning_rate": 6.584770731051271e-06, "loss": 0.986, "step": 23 }, { "epoch": 0.7598944591029023, "grad_norm": 0.8594318628311157, "learning_rate": 6.51998894061348e-06, "loss": 0.9853, "step": 24 }, { "epoch": 0.7915567282321899, "grad_norm": 0.6937134861946106, "learning_rate": 6.4508810451087956e-06, "loss": 0.9969, "step": 25 }, { "epoch": 0.8232189973614775, "grad_norm": 0.6618143916130066, "learning_rate": 6.377546040931835e-06, "loss": 0.9784, "step": 26 }, { "epoch": 0.8548812664907651, "grad_norm": 0.7150548100471497, "learning_rate": 6.300088979769671e-06, "loss": 0.9775, "step": 27 }, { "epoch": 0.8865435356200527, "grad_norm": 0.8135938048362732, "learning_rate": 6.218620818116299e-06, "loss": 0.9555, "step": 28 }, { "epoch": 0.9182058047493403, "grad_norm": 0.7386907339096069, "learning_rate": 6.133258258328535e-06, "loss": 0.9536, "step": 29 }, { "epoch": 0.9498680738786279, "grad_norm": 0.6299906969070435, "learning_rate": 6.044123581451003e-06, "loss": 0.9513, "step": 30 }, { "epoch": 0.9815303430079155, "grad_norm": 0.6849163174629211, "learning_rate": 5.951344472049728e-06, "loss": 0.9491, "step": 31 }, { "epoch": 1.0316622691292876, "grad_norm": 0.5353211760520935, "learning_rate": 5.855053835305216e-06, "loss": 1.2099, "step": 32 }, { "epoch": 1.0633245382585752, "grad_norm": 0.5629773736000061, "learning_rate": 5.755389606627069e-06, "loss": 0.8727, "step": 33 }, { "epoch": 1.0949868073878628, "grad_norm": 0.5262039303779602, "learning_rate": 5.652494554062838e-06, "loss": 0.8777, "step": 34 }, { "epoch": 1.1266490765171504, "grad_norm": 0.9279022216796875, "learning_rate": 5.546516073784165e-06, "loss": 0.8661, "step": 35 }, { "epoch": 1.158311345646438, "grad_norm": 1.0550135374069214, "learning_rate": 5.4376059789431955e-06, "loss": 0.8665, "step": 36 }, { "epoch": 1.1899736147757256, "grad_norm": 0.4825997054576874, "learning_rate": 5.325920282201696e-06, "loss": 0.8584, "step": 37 }, { "epoch": 1.2216358839050132, "grad_norm": 0.5052205920219421, "learning_rate": 5.2116189722444164e-06, "loss": 0.8677, "step": 38 }, { "epoch": 1.2532981530343008, "grad_norm": 0.4654822051525116, "learning_rate": 5.094865784596845e-06, "loss": 0.8515, "step": 39 }, { "epoch": 1.2849604221635884, "grad_norm": 0.45359933376312256, "learning_rate": 4.975827967075644e-06, "loss": 0.851, "step": 40 }, { "epoch": 1.316622691292876, "grad_norm": 0.968210756778717, "learning_rate": 4.854676040207761e-06, "loss": 0.8733, "step": 41 }, { "epoch": 1.3482849604221636, "grad_norm": 0.5169340372085571, "learning_rate": 4.731583552961416e-06, "loss": 0.8698, "step": 42 }, { "epoch": 1.3799472295514512, "grad_norm": 0.8193064332008362, "learning_rate": 4.606726834138884e-06, "loss": 0.8464, "step": 43 }, { "epoch": 1.4116094986807388, "grad_norm": 0.589012622833252, "learning_rate": 4.480284739787175e-06, "loss": 0.8465, "step": 44 }, { "epoch": 1.4432717678100264, "grad_norm": 0.5069411993026733, "learning_rate": 4.352438396988471e-06, "loss": 0.8415, "step": 45 }, { "epoch": 1.474934036939314, "grad_norm": 0.4803805947303772, "learning_rate": 4.223370944397335e-06, "loss": 0.8089, "step": 46 }, { "epoch": 1.5065963060686016, "grad_norm": 0.44742244482040405, "learning_rate": 4.093267269896339e-06, "loss": 0.8141, "step": 47 }, { "epoch": 1.5382585751978892, "grad_norm": 0.5128947496414185, "learning_rate": 3.9623137457459586e-06, "loss": 0.8334, "step": 48 }, { "epoch": 1.5699208443271768, "grad_norm": 0.43568509817123413, "learning_rate": 3.83069796160811e-06, "loss": 0.7996, "step": 49 }, { "epoch": 1.6015831134564644, "grad_norm": 0.5734230279922485, "learning_rate": 3.6986084558257596e-06, "loss": 0.8206, "step": 50 }, { "epoch": 1.633245382585752, "grad_norm": 0.4465356469154358, "learning_rate": 3.5662344453435665e-06, "loss": 0.8036, "step": 51 }, { "epoch": 1.6649076517150396, "grad_norm": 0.9534016251564026, "learning_rate": 3.4337655546564343e-06, "loss": 0.849, "step": 52 }, { "epoch": 1.6965699208443272, "grad_norm": 0.5999336838722229, "learning_rate": 3.301391544174241e-06, "loss": 0.7874, "step": 53 }, { "epoch": 1.7282321899736148, "grad_norm": 0.4919511079788208, "learning_rate": 3.1693020383918907e-06, "loss": 0.8012, "step": 54 }, { "epoch": 1.7598944591029024, "grad_norm": 0.4128033518791199, "learning_rate": 3.0376862542540426e-06, "loss": 0.8056, "step": 55 }, { "epoch": 1.79155672823219, "grad_norm": 0.4979248642921448, "learning_rate": 2.9067327301036616e-06, "loss": 0.7769, "step": 56 }, { "epoch": 1.8232189973614776, "grad_norm": 0.46139153838157654, "learning_rate": 2.7766290556026646e-06, "loss": 0.7984, "step": 57 }, { "epoch": 1.8548812664907652, "grad_norm": 0.46961304545402527, "learning_rate": 2.6475616030115286e-06, "loss": 0.7917, "step": 58 }, { "epoch": 1.8865435356200528, "grad_norm": 0.4321727752685547, "learning_rate": 2.5197152602128256e-06, "loss": 0.7852, "step": 59 }, { "epoch": 1.9182058047493404, "grad_norm": 0.4466552138328552, "learning_rate": 2.393273165861116e-06, "loss": 0.8023, "step": 60 }, { "epoch": 1.949868073878628, "grad_norm": 0.4448838233947754, "learning_rate": 2.2684164470385843e-06, "loss": 0.7616, "step": 61 }, { "epoch": 1.9815303430079156, "grad_norm": 0.434826523065567, "learning_rate": 2.14532395979224e-06, "loss": 0.8165, "step": 62 }, { "epoch": 2.0316622691292876, "grad_norm": 0.6250742673873901, "learning_rate": 2.0241720329243563e-06, "loss": 1.0035, "step": 63 }, { "epoch": 2.063324538258575, "grad_norm": 0.41192013025283813, "learning_rate": 1.905134215403155e-06, "loss": 0.7532, "step": 64 }, { "epoch": 2.094986807387863, "grad_norm": 0.4167821407318115, "learning_rate": 1.7883810277555837e-06, "loss": 0.7628, "step": 65 }, { "epoch": 2.1266490765171504, "grad_norm": 0.4210691452026367, "learning_rate": 1.6740797177983044e-06, "loss": 0.7511, "step": 66 }, { "epoch": 2.158311345646438, "grad_norm": 0.44451218843460083, "learning_rate": 1.5623940210568042e-06, "loss": 0.7302, "step": 67 }, { "epoch": 2.1899736147757256, "grad_norm": 0.40137404203414917, "learning_rate": 1.453483926215835e-06, "loss": 0.7315, "step": 68 }, { "epoch": 2.221635883905013, "grad_norm": 0.6961039900779724, "learning_rate": 1.3475054459371625e-06, "loss": 0.7478, "step": 69 }, { "epoch": 2.253298153034301, "grad_norm": 0.37633243203163147, "learning_rate": 1.2446103933729302e-06, "loss": 0.7288, "step": 70 }, { "epoch": 2.2849604221635884, "grad_norm": 0.43067774176597595, "learning_rate": 1.1449461646947839e-06, "loss": 0.7628, "step": 71 }, { "epoch": 2.316622691292876, "grad_norm": 0.3962654769420624, "learning_rate": 1.048655527950273e-06, "loss": 0.7178, "step": 72 }, { "epoch": 2.3482849604221636, "grad_norm": 0.9008962512016296, "learning_rate": 9.558764185489968e-07, "loss": 0.7287, "step": 73 }, { "epoch": 2.379947229551451, "grad_norm": 0.4029214084148407, "learning_rate": 8.667417416714656e-07, "loss": 0.7135, "step": 74 }, { "epoch": 2.411609498680739, "grad_norm": 0.37933027744293213, "learning_rate": 7.813791818837012e-07, "loss": 0.7242, "step": 75 }, { "epoch": 2.4432717678100264, "grad_norm": 0.4242548644542694, "learning_rate": 6.999110202303293e-07, "loss": 0.7201, "step": 76 }, { "epoch": 2.474934036939314, "grad_norm": 0.5838273167610168, "learning_rate": 6.22453959068165e-07, "loss": 0.6838, "step": 77 }, { "epoch": 2.5065963060686016, "grad_norm": 0.3837423026561737, "learning_rate": 5.491189548912051e-07, "loss": 0.717, "step": 78 }, { "epoch": 2.538258575197889, "grad_norm": 1.0475167036056519, "learning_rate": 4.800110593865199e-07, "loss": 0.7152, "step": 79 }, { "epoch": 2.569920844327177, "grad_norm": 0.4407210946083069, "learning_rate": 4.15229268948729e-07, "loss": 0.7368, "step": 80 }, { "epoch": 2.6015831134564644, "grad_norm": 0.39730748534202576, "learning_rate": 3.5486638286861297e-07, "loss": 0.715, "step": 81 }, { "epoch": 2.633245382585752, "grad_norm": 0.5303342938423157, "learning_rate": 2.990088703990245e-07, "loss": 0.74, "step": 82 }, { "epoch": 2.6649076517150396, "grad_norm": 0.6698248386383057, "learning_rate": 2.4773674688852197e-07, "loss": 0.7142, "step": 83 }, { "epoch": 2.6965699208443272, "grad_norm": 0.43907734751701355, "learning_rate": 2.0112345916014578e-07, "loss": 0.7372, "step": 84 }, { "epoch": 2.728232189973615, "grad_norm": 0.37293171882629395, "learning_rate": 1.592357802995572e-07, "loss": 0.7241, "step": 85 }, { "epoch": 2.7598944591029024, "grad_norm": 0.394404798746109, "learning_rate": 1.2213371400323352e-07, "loss": 0.7095, "step": 86 }, { "epoch": 2.79155672823219, "grad_norm": 0.7737839818000793, "learning_rate": 8.987040862375339e-08, "loss": 0.6919, "step": 87 }, { "epoch": 2.8232189973614776, "grad_norm": 0.3485806882381439, "learning_rate": 6.249208103529092e-08, "loss": 0.7263, "step": 88 }, { "epoch": 2.8548812664907652, "grad_norm": 0.39798641204833984, "learning_rate": 4.0037950428390144e-08, "loss": 0.7415, "step": 89 }, { "epoch": 2.886543535620053, "grad_norm": 0.38925713300704956, "learning_rate": 2.254018212885278e-08, "loss": 0.7267, "step": 90 }, { "epoch": 2.9182058047493404, "grad_norm": 0.361558198928833, "learning_rate": 1.0023841521217825e-08, "loss": 0.7175, "step": 91 }, { "epoch": 2.949868073878628, "grad_norm": 0.5748413801193237, "learning_rate": 2.506858142839852e-09, "loss": 0.7126, "step": 92 }, { "epoch": 2.9815303430079156, "grad_norm": 0.3813362717628479, "learning_rate": 0.0, "loss": 0.7082, "step": 93 }, { "epoch": 2.9815303430079156, "step": 93, "total_flos": 2930651586101248.0, "train_loss": 0.8763364265041966, "train_runtime": 71308.8652, "train_samples_per_second": 0.382, "train_steps_per_second": 0.001 } ], "logging_steps": 1, "max_steps": 93, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2930651586101248.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }