{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 169, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005917159763313609, "grad_norm": 5.048887351152406, "learning_rate": 1.1764705882352942e-06, "loss": 1.0795, "mean_token_accuracy": 0.7206979429209666, "step": 1 }, { "epoch": 0.011834319526627219, "grad_norm": 4.879597464054353, "learning_rate": 2.3529411764705885e-06, "loss": 1.1029, "mean_token_accuracy": 0.7117051311557141, "step": 2 }, { "epoch": 0.01775147928994083, "grad_norm": 4.657432447307512, "learning_rate": 3.529411764705883e-06, "loss": 1.0906, "mean_token_accuracy": 0.7171718488582962, "step": 3 }, { "epoch": 0.023668639053254437, "grad_norm": 4.467351868935966, "learning_rate": 4.705882352941177e-06, "loss": 1.0718, "mean_token_accuracy": 0.7218519479951946, "step": 4 }, { "epoch": 0.029585798816568046, "grad_norm": 3.7064015727740425, "learning_rate": 5.882352941176471e-06, "loss": 1.0701, "mean_token_accuracy": 0.7191323254401871, "step": 5 }, { "epoch": 0.03550295857988166, "grad_norm": 2.382423991923031, "learning_rate": 7.058823529411766e-06, "loss": 1.0118, "mean_token_accuracy": 0.7285496854387108, "step": 6 }, { "epoch": 0.04142011834319527, "grad_norm": 2.101237602674026, "learning_rate": 8.23529411764706e-06, "loss": 0.9682, "mean_token_accuracy": 0.7363681597344438, "step": 7 }, { "epoch": 0.047337278106508875, "grad_norm": 2.015056074152268, "learning_rate": 9.411764705882354e-06, "loss": 1.0044, "mean_token_accuracy": 0.7251941996166579, "step": 8 }, { "epoch": 0.05325443786982249, "grad_norm": 1.6355380376413269, "learning_rate": 1.0588235294117648e-05, "loss": 0.9629, "mean_token_accuracy": 0.7324857047569558, "step": 9 }, { "epoch": 0.05917159763313609, "grad_norm": 1.7763113613059038, "learning_rate": 1.1764705882352942e-05, "loss": 0.9515, "mean_token_accuracy": 0.7337997991969313, "step": 10 }, { "epoch": 0.0650887573964497, "grad_norm": 1.3969034500597064, "learning_rate": 1.2941176470588238e-05, "loss": 0.9259, "mean_token_accuracy": 0.7387967799773972, "step": 11 }, { "epoch": 0.07100591715976332, "grad_norm": 0.953979333766809, "learning_rate": 1.4117647058823532e-05, "loss": 0.873, "mean_token_accuracy": 0.7510249436735839, "step": 12 }, { "epoch": 0.07692307692307693, "grad_norm": 1.127863397786703, "learning_rate": 1.5294117647058822e-05, "loss": 0.8988, "mean_token_accuracy": 0.7415024955395976, "step": 13 }, { "epoch": 0.08284023668639054, "grad_norm": 0.9273891062266637, "learning_rate": 1.647058823529412e-05, "loss": 0.8642, "mean_token_accuracy": 0.750242238655049, "step": 14 }, { "epoch": 0.08875739644970414, "grad_norm": 0.8053975766192991, "learning_rate": 1.7647058823529414e-05, "loss": 0.8616, "mean_token_accuracy": 0.7512058479268567, "step": 15 }, { "epoch": 0.09467455621301775, "grad_norm": 0.7333036953961584, "learning_rate": 1.8823529411764708e-05, "loss": 0.8414, "mean_token_accuracy": 0.7541005429250093, "step": 16 }, { "epoch": 0.10059171597633136, "grad_norm": 0.6867367211047125, "learning_rate": 2e-05, "loss": 0.8218, "mean_token_accuracy": 0.7606669951255678, "step": 17 }, { "epoch": 0.10650887573964497, "grad_norm": 0.7209527833538277, "learning_rate": 1.9997864167879313e-05, "loss": 0.7897, "mean_token_accuracy": 0.7669743964292547, "step": 18 }, { "epoch": 0.11242603550295859, "grad_norm": 0.6903745592521859, "learning_rate": 1.999145758387301e-05, "loss": 0.7984, "mean_token_accuracy": 0.7647671686022387, "step": 19 }, { "epoch": 0.11834319526627218, "grad_norm": 0.6167859283406073, "learning_rate": 1.9980782984658682e-05, "loss": 0.7898, "mean_token_accuracy": 0.7661677740213927, "step": 20 }, { "epoch": 0.1242603550295858, "grad_norm": 0.6108563732156459, "learning_rate": 1.99658449300667e-05, "loss": 0.804, "mean_token_accuracy": 0.7611782591111235, "step": 21 }, { "epoch": 0.1301775147928994, "grad_norm": 0.5746908311619029, "learning_rate": 1.994664980113243e-05, "loss": 0.7744, "mean_token_accuracy": 0.7691396065844442, "step": 22 }, { "epoch": 0.13609467455621302, "grad_norm": 0.5345761647636947, "learning_rate": 1.992320579737045e-05, "loss": 0.7749, "mean_token_accuracy": 0.7685584775159877, "step": 23 }, { "epoch": 0.14201183431952663, "grad_norm": 0.5299656484934693, "learning_rate": 1.9895522933272028e-05, "loss": 0.7621, "mean_token_accuracy": 0.7722506630953709, "step": 24 }, { "epoch": 0.14792899408284024, "grad_norm": 0.63083763621396, "learning_rate": 1.9863613034027224e-05, "loss": 0.7868, "mean_token_accuracy": 0.7644846690458883, "step": 25 }, { "epoch": 0.15384615384615385, "grad_norm": 0.4511582029139769, "learning_rate": 1.9827489730473597e-05, "loss": 0.7752, "mean_token_accuracy": 0.7683559567285383, "step": 26 }, { "epoch": 0.15976331360946747, "grad_norm": 0.5080850847922778, "learning_rate": 1.9787168453273546e-05, "loss": 0.7903, "mean_token_accuracy": 0.7631548471555147, "step": 27 }, { "epoch": 0.16568047337278108, "grad_norm": 0.417117734277003, "learning_rate": 1.9742666426322877e-05, "loss": 0.743, "mean_token_accuracy": 0.7766788822768681, "step": 28 }, { "epoch": 0.17159763313609466, "grad_norm": 0.4254796366932393, "learning_rate": 1.9694002659393306e-05, "loss": 0.7702, "mean_token_accuracy": 0.7695522126025716, "step": 29 }, { "epoch": 0.17751479289940827, "grad_norm": 0.41986449808823745, "learning_rate": 1.9641197940012136e-05, "loss": 0.7586, "mean_token_accuracy": 0.7721289278577101, "step": 30 }, { "epoch": 0.1834319526627219, "grad_norm": 0.4695562943601462, "learning_rate": 1.958427482458253e-05, "loss": 0.7539, "mean_token_accuracy": 0.7716929613553958, "step": 31 }, { "epoch": 0.1893491124260355, "grad_norm": 0.3921409409899768, "learning_rate": 1.9523257628748148e-05, "loss": 0.7568, "mean_token_accuracy": 0.7717328096419696, "step": 32 }, { "epoch": 0.1952662721893491, "grad_norm": 0.4160967346901947, "learning_rate": 1.9458172417006347e-05, "loss": 0.7409, "mean_token_accuracy": 0.7751419508476135, "step": 33 }, { "epoch": 0.20118343195266272, "grad_norm": 0.43718343180505526, "learning_rate": 1.9389046991574298e-05, "loss": 0.7344, "mean_token_accuracy": 0.778526386835722, "step": 34 }, { "epoch": 0.20710059171597633, "grad_norm": 0.396451454218149, "learning_rate": 1.9315910880512792e-05, "loss": 0.7585, "mean_token_accuracy": 0.7704364168416742, "step": 35 }, { "epoch": 0.21301775147928995, "grad_norm": 0.4531112283882146, "learning_rate": 1.9238795325112867e-05, "loss": 0.7255, "mean_token_accuracy": 0.7800096489081426, "step": 36 }, { "epoch": 0.21893491124260356, "grad_norm": 0.4003679262358476, "learning_rate": 1.9157733266550577e-05, "loss": 0.7251, "mean_token_accuracy": 0.779636766062183, "step": 37 }, { "epoch": 0.22485207100591717, "grad_norm": 0.4019998494369348, "learning_rate": 1.9072759331815602e-05, "loss": 0.7325, "mean_token_accuracy": 0.7773937936524439, "step": 38 }, { "epoch": 0.23076923076923078, "grad_norm": 0.4247947978619282, "learning_rate": 1.898390981891979e-05, "loss": 0.7396, "mean_token_accuracy": 0.7757475726810331, "step": 39 }, { "epoch": 0.23668639053254437, "grad_norm": 0.3801393695192698, "learning_rate": 1.8891222681391853e-05, "loss": 0.7422, "mean_token_accuracy": 0.7750002370873404, "step": 40 }, { "epoch": 0.24260355029585798, "grad_norm": 0.37960651070834583, "learning_rate": 1.879473751206489e-05, "loss": 0.7159, "mean_token_accuracy": 0.7815920659071619, "step": 41 }, { "epoch": 0.2485207100591716, "grad_norm": 0.4177406473529506, "learning_rate": 1.869449552616367e-05, "loss": 0.7583, "mean_token_accuracy": 0.7707519109077099, "step": 42 }, { "epoch": 0.25443786982248523, "grad_norm": 0.36011706007184574, "learning_rate": 1.8590539543698852e-05, "loss": 0.7201, "mean_token_accuracy": 0.7804437519430654, "step": 43 }, { "epoch": 0.2603550295857988, "grad_norm": 0.37037523132050293, "learning_rate": 1.8482913971175737e-05, "loss": 0.6911, "mean_token_accuracy": 0.789738280938329, "step": 44 }, { "epoch": 0.26627218934911245, "grad_norm": 0.4354182756113773, "learning_rate": 1.8371664782625287e-05, "loss": 0.7426, "mean_token_accuracy": 0.7737875964292505, "step": 45 }, { "epoch": 0.27218934911242604, "grad_norm": 0.38980814844589723, "learning_rate": 1.825683949996556e-05, "loss": 0.7399, "mean_token_accuracy": 0.7748932065512412, "step": 46 }, { "epoch": 0.2781065088757396, "grad_norm": 0.39132094117790983, "learning_rate": 1.813848717270195e-05, "loss": 0.7008, "mean_token_accuracy": 0.7860000963532652, "step": 47 }, { "epoch": 0.28402366863905326, "grad_norm": 0.38848469099488886, "learning_rate": 1.8016658356974885e-05, "loss": 0.7164, "mean_token_accuracy": 0.7800342185812585, "step": 48 }, { "epoch": 0.28994082840236685, "grad_norm": 0.3636566412524122, "learning_rate": 1.789140509396394e-05, "loss": 0.7212, "mean_token_accuracy": 0.7796801488922436, "step": 49 }, { "epoch": 0.2958579881656805, "grad_norm": 0.4246224936998959, "learning_rate": 1.7762780887657576e-05, "loss": 0.7319, "mean_token_accuracy": 0.7770798628956391, "step": 50 }, { "epoch": 0.30177514792899407, "grad_norm": 0.35464674247563077, "learning_rate": 1.7630840681998068e-05, "loss": 0.7278, "mean_token_accuracy": 0.7782958063801806, "step": 51 }, { "epoch": 0.3076923076923077, "grad_norm": 0.3838835290476431, "learning_rate": 1.7495640837411265e-05, "loss": 0.7276, "mean_token_accuracy": 0.778110788518626, "step": 52 }, { "epoch": 0.3136094674556213, "grad_norm": 0.39823260845501735, "learning_rate": 1.735723910673132e-05, "loss": 0.6969, "mean_token_accuracy": 0.7861765198226371, "step": 53 }, { "epoch": 0.31952662721893493, "grad_norm": 0.3616285119296811, "learning_rate": 1.7215694610530624e-05, "loss": 0.7267, "mean_token_accuracy": 0.7780276231344528, "step": 54 }, { "epoch": 0.3254437869822485, "grad_norm": 0.3564476104670264, "learning_rate": 1.7071067811865477e-05, "loss": 0.7084, "mean_token_accuracy": 0.7837116023048635, "step": 55 }, { "epoch": 0.33136094674556216, "grad_norm": 0.44035089633256974, "learning_rate": 1.6923420490448298e-05, "loss": 0.7342, "mean_token_accuracy": 0.776648161332027, "step": 56 }, { "epoch": 0.33727810650887574, "grad_norm": 0.39248432764207875, "learning_rate": 1.6772815716257414e-05, "loss": 0.725, "mean_token_accuracy": 0.7786816515421023, "step": 57 }, { "epoch": 0.3431952662721893, "grad_norm": 0.36704706608942483, "learning_rate": 1.6619317822595666e-05, "loss": 0.7017, "mean_token_accuracy": 0.7841417427362292, "step": 58 }, { "epoch": 0.34911242603550297, "grad_norm": 0.40512498191045965, "learning_rate": 1.646299237860941e-05, "loss": 0.75, "mean_token_accuracy": 0.7708883209035493, "step": 59 }, { "epoch": 0.35502958579881655, "grad_norm": 0.39029160023670084, "learning_rate": 1.6303906161279554e-05, "loss": 0.7279, "mean_token_accuracy": 0.7775349237600122, "step": 60 }, { "epoch": 0.3609467455621302, "grad_norm": 0.3953993708954393, "learning_rate": 1.6142127126896682e-05, "loss": 0.7356, "mean_token_accuracy": 0.7745875530129677, "step": 61 }, { "epoch": 0.3668639053254438, "grad_norm": 0.36256710216714955, "learning_rate": 1.597772438203241e-05, "loss": 0.6997, "mean_token_accuracy": 0.7848896531840401, "step": 62 }, { "epoch": 0.3727810650887574, "grad_norm": 0.40962620829800644, "learning_rate": 1.5810768154019386e-05, "loss": 0.7205, "mean_token_accuracy": 0.7799613349100392, "step": 63 }, { "epoch": 0.378698224852071, "grad_norm": 0.3828488838066985, "learning_rate": 1.5641329760952514e-05, "loss": 0.708, "mean_token_accuracy": 0.782232443701145, "step": 64 }, { "epoch": 0.38461538461538464, "grad_norm": 0.39122763310454406, "learning_rate": 1.5469481581224274e-05, "loss": 0.7074, "mean_token_accuracy": 0.7819801124070774, "step": 65 }, { "epoch": 0.3905325443786982, "grad_norm": 0.3910386124912464, "learning_rate": 1.529529702260709e-05, "loss": 0.7318, "mean_token_accuracy": 0.7753433511222331, "step": 66 }, { "epoch": 0.39644970414201186, "grad_norm": 0.3880043071649847, "learning_rate": 1.5118850490896012e-05, "loss": 0.7191, "mean_token_accuracy": 0.7791978555083022, "step": 67 }, { "epoch": 0.40236686390532544, "grad_norm": 0.348781202358635, "learning_rate": 1.4940217358125042e-05, "loss": 0.6978, "mean_token_accuracy": 0.785919046763025, "step": 68 }, { "epoch": 0.40828402366863903, "grad_norm": 0.34785888550803656, "learning_rate": 1.4759473930370738e-05, "loss": 0.6923, "mean_token_accuracy": 0.787418621224268, "step": 69 }, { "epoch": 0.41420118343195267, "grad_norm": 0.3653232305381835, "learning_rate": 1.4576697415156818e-05, "loss": 0.6912, "mean_token_accuracy": 0.7869110504306804, "step": 70 }, { "epoch": 0.42011834319526625, "grad_norm": 0.36764315457311614, "learning_rate": 1.4391965888473705e-05, "loss": 0.6991, "mean_token_accuracy": 0.7839725057156516, "step": 71 }, { "epoch": 0.4260355029585799, "grad_norm": 0.3514136440565239, "learning_rate": 1.4205358261427076e-05, "loss": 0.7149, "mean_token_accuracy": 0.7796496659579735, "step": 72 }, { "epoch": 0.4319526627218935, "grad_norm": 0.3686893566061322, "learning_rate": 1.4016954246529697e-05, "loss": 0.7197, "mean_token_accuracy": 0.7794793514235452, "step": 73 }, { "epoch": 0.4378698224852071, "grad_norm": 0.34605681231940283, "learning_rate": 1.3826834323650899e-05, "loss": 0.687, "mean_token_accuracy": 0.7897891867600086, "step": 74 }, { "epoch": 0.4437869822485207, "grad_norm": 0.369433655454952, "learning_rate": 1.3635079705638298e-05, "loss": 0.7018, "mean_token_accuracy": 0.7843400450532293, "step": 75 }, { "epoch": 0.44970414201183434, "grad_norm": 0.3637775910740719, "learning_rate": 1.3441772303626387e-05, "loss": 0.6811, "mean_token_accuracy": 0.7883583477300129, "step": 76 }, { "epoch": 0.4556213017751479, "grad_norm": 0.3552805600165443, "learning_rate": 1.3246994692046837e-05, "loss": 0.6877, "mean_token_accuracy": 0.7883414815445715, "step": 77 }, { "epoch": 0.46153846153846156, "grad_norm": 0.42418323654238854, "learning_rate": 1.305083007335549e-05, "loss": 0.7307, "mean_token_accuracy": 0.7747076789831797, "step": 78 }, { "epoch": 0.46745562130177515, "grad_norm": 0.35584440430369396, "learning_rate": 1.2853362242491054e-05, "loss": 0.7317, "mean_token_accuracy": 0.7756405018053134, "step": 79 }, { "epoch": 0.47337278106508873, "grad_norm": 0.3233937276689462, "learning_rate": 1.2654675551080724e-05, "loss": 0.6992, "mean_token_accuracy": 0.7849270935923538, "step": 80 }, { "epoch": 0.47928994082840237, "grad_norm": 0.350970039014215, "learning_rate": 1.2454854871407993e-05, "loss": 0.7107, "mean_token_accuracy": 0.781704652022156, "step": 81 }, { "epoch": 0.48520710059171596, "grad_norm": 0.35166716550940014, "learning_rate": 1.2253985560158064e-05, "loss": 0.694, "mean_token_accuracy": 0.7867224592606594, "step": 82 }, { "epoch": 0.4911242603550296, "grad_norm": 0.3655542868183894, "learning_rate": 1.2052153421956343e-05, "loss": 0.6913, "mean_token_accuracy": 0.7868167246531167, "step": 83 }, { "epoch": 0.4970414201183432, "grad_norm": 0.37786152221461855, "learning_rate": 1.1849444672715587e-05, "loss": 0.7265, "mean_token_accuracy": 0.776165643284536, "step": 84 }, { "epoch": 0.5029585798816568, "grad_norm": 0.34932873765751665, "learning_rate": 1.164594590280734e-05, "loss": 0.711, "mean_token_accuracy": 0.7806680330763345, "step": 85 }, { "epoch": 0.5088757396449705, "grad_norm": 0.3539261852137339, "learning_rate": 1.1441744040073469e-05, "loss": 0.7165, "mean_token_accuracy": 0.7788759918792447, "step": 86 }, { "epoch": 0.514792899408284, "grad_norm": 0.3395798470618678, "learning_rate": 1.123692631269348e-05, "loss": 0.6997, "mean_token_accuracy": 0.7835249742198639, "step": 87 }, { "epoch": 0.5207100591715976, "grad_norm": 0.3411316208847806, "learning_rate": 1.103158021192357e-05, "loss": 0.6958, "mean_token_accuracy": 0.7860397200235923, "step": 88 }, { "epoch": 0.5266272189349113, "grad_norm": 0.3619282034457249, "learning_rate": 1.0825793454723325e-05, "loss": 0.7052, "mean_token_accuracy": 0.7834187886797611, "step": 89 }, { "epoch": 0.5325443786982249, "grad_norm": 0.34273872199347044, "learning_rate": 1.0619653946285948e-05, "loss": 0.685, "mean_token_accuracy": 0.7897067821918299, "step": 90 }, { "epoch": 0.5384615384615384, "grad_norm": 0.33199715753976633, "learning_rate": 1.0413249742488132e-05, "loss": 0.6917, "mean_token_accuracy": 0.7872031382647131, "step": 91 }, { "epoch": 0.5443786982248521, "grad_norm": 0.3342393332811213, "learning_rate": 1.0206669012275546e-05, "loss": 0.7011, "mean_token_accuracy": 0.784091056433467, "step": 92 }, { "epoch": 0.5502958579881657, "grad_norm": 0.33611685834665483, "learning_rate": 1e-05, "loss": 0.7156, "mean_token_accuracy": 0.7788634871573115, "step": 93 }, { "epoch": 0.5562130177514792, "grad_norm": 0.33795640856270726, "learning_rate": 9.79333098772446e-06, "loss": 0.6843, "mean_token_accuracy": 0.7874877928517997, "step": 94 }, { "epoch": 0.5621301775147929, "grad_norm": 0.3553103385556057, "learning_rate": 9.586750257511868e-06, "loss": 0.684, "mean_token_accuracy": 0.7891702815425776, "step": 95 }, { "epoch": 0.5680473372781065, "grad_norm": 0.3599899819769442, "learning_rate": 9.380346053714055e-06, "loss": 0.6809, "mean_token_accuracy": 0.7879558819894679, "step": 96 }, { "epoch": 0.5739644970414202, "grad_norm": 0.33562142706566866, "learning_rate": 9.174206545276678e-06, "loss": 0.6732, "mean_token_accuracy": 0.7917185320065199, "step": 97 }, { "epoch": 0.5798816568047337, "grad_norm": 0.33455633173601695, "learning_rate": 8.968419788076431e-06, "loss": 0.6957, "mean_token_accuracy": 0.7847382398115799, "step": 98 }, { "epoch": 0.5857988165680473, "grad_norm": 0.3490744063204216, "learning_rate": 8.763073687306523e-06, "loss": 0.6924, "mean_token_accuracy": 0.7862588216550412, "step": 99 }, { "epoch": 0.591715976331361, "grad_norm": 0.33348768096991704, "learning_rate": 8.558255959926533e-06, "loss": 0.6992, "mean_token_accuracy": 0.7847955930336572, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.7157539129257202, "eval_mean_token_accuracy": 0.7586642623023677, "eval_runtime": 4.8665, "eval_samples_per_second": 26.508, "eval_steps_per_second": 1.027, "step": 100 }, { "epoch": 0.5976331360946746, "grad_norm": 0.33075937472785844, "learning_rate": 8.35405409719266e-06, "loss": 0.6872, "mean_token_accuracy": 0.787283402091289, "step": 101 }, { "epoch": 0.6035502958579881, "grad_norm": 0.3590337795434198, "learning_rate": 8.150555327284417e-06, "loss": 0.6861, "mean_token_accuracy": 0.7881656395518974, "step": 102 }, { "epoch": 0.6094674556213018, "grad_norm": 0.3243096344713308, "learning_rate": 7.947846578043658e-06, "loss": 0.687, "mean_token_accuracy": 0.7868769866991424, "step": 103 }, { "epoch": 0.6153846153846154, "grad_norm": 0.31809647789793993, "learning_rate": 7.746014439841941e-06, "loss": 0.7024, "mean_token_accuracy": 0.7840864820127753, "step": 104 }, { "epoch": 0.621301775147929, "grad_norm": 0.3237845217614286, "learning_rate": 7.545145128592009e-06, "loss": 0.674, "mean_token_accuracy": 0.7901693771106895, "step": 105 }, { "epoch": 0.6272189349112426, "grad_norm": 0.31884492713723883, "learning_rate": 7.34532444891928e-06, "loss": 0.6972, "mean_token_accuracy": 0.7838730344185979, "step": 106 }, { "epoch": 0.6331360946745562, "grad_norm": 0.3250725158937223, "learning_rate": 7.14663775750895e-06, "loss": 0.7351, "mean_token_accuracy": 0.7743010028506793, "step": 107 }, { "epoch": 0.6390532544378699, "grad_norm": 0.34110974065150124, "learning_rate": 6.949169926644513e-06, "loss": 0.7011, "mean_token_accuracy": 0.783713761454518, "step": 108 }, { "epoch": 0.6449704142011834, "grad_norm": 0.31095600200622964, "learning_rate": 6.7530053079531664e-06, "loss": 0.7021, "mean_token_accuracy": 0.7831529561955156, "step": 109 }, { "epoch": 0.650887573964497, "grad_norm": 0.31355515565609077, "learning_rate": 6.558227696373617e-06, "loss": 0.6887, "mean_token_accuracy": 0.7874667036092751, "step": 110 }, { "epoch": 0.6568047337278107, "grad_norm": 0.329751885795174, "learning_rate": 6.364920294361701e-06, "loss": 0.709, "mean_token_accuracy": 0.7815413361996215, "step": 111 }, { "epoch": 0.6627218934911243, "grad_norm": 0.30882694735110866, "learning_rate": 6.173165676349103e-06, "loss": 0.7062, "mean_token_accuracy": 0.7818792436067044, "step": 112 }, { "epoch": 0.6686390532544378, "grad_norm": 0.3191082129763723, "learning_rate": 5.983045753470308e-06, "loss": 0.6882, "mean_token_accuracy": 0.7865235373585306, "step": 113 }, { "epoch": 0.6745562130177515, "grad_norm": 0.32856595537873384, "learning_rate": 5.794641738572925e-06, "loss": 0.6838, "mean_token_accuracy": 0.7889740840283941, "step": 114 }, { "epoch": 0.6804733727810651, "grad_norm": 0.3147396913003447, "learning_rate": 5.608034111526298e-06, "loss": 0.6694, "mean_token_accuracy": 0.7932459708127269, "step": 115 }, { "epoch": 0.6863905325443787, "grad_norm": 0.3428397687955701, "learning_rate": 5.423302584843186e-06, "loss": 0.7074, "mean_token_accuracy": 0.7825890227644128, "step": 116 }, { "epoch": 0.6923076923076923, "grad_norm": 0.30279217397028946, "learning_rate": 5.240526069629265e-06, "loss": 0.6784, "mean_token_accuracy": 0.7906955891603145, "step": 117 }, { "epoch": 0.6982248520710059, "grad_norm": 0.3003758212327991, "learning_rate": 5.059782641874962e-06, "loss": 0.6819, "mean_token_accuracy": 0.7889431193959375, "step": 118 }, { "epoch": 0.7041420118343196, "grad_norm": 0.31649740332350773, "learning_rate": 4.881149509103993e-06, "loss": 0.6816, "mean_token_accuracy": 0.7894825257236546, "step": 119 }, { "epoch": 0.7100591715976331, "grad_norm": 0.3141834954228162, "learning_rate": 4.704702977392914e-06, "loss": 0.6549, "mean_token_accuracy": 0.7973338512758128, "step": 120 }, { "epoch": 0.7159763313609467, "grad_norm": 0.3146743815009587, "learning_rate": 4.530518418775734e-06, "loss": 0.6995, "mean_token_accuracy": 0.7840122749260579, "step": 121 }, { "epoch": 0.7218934911242604, "grad_norm": 0.31485965487518236, "learning_rate": 4.35867023904749e-06, "loss": 0.683, "mean_token_accuracy": 0.7888863658938954, "step": 122 }, { "epoch": 0.727810650887574, "grad_norm": 0.31400988808245894, "learning_rate": 4.189231845980618e-06, "loss": 0.675, "mean_token_accuracy": 0.7907750008695696, "step": 123 }, { "epoch": 0.7337278106508875, "grad_norm": 0.31249917487430845, "learning_rate": 4.0222756179675915e-06, "loss": 0.6814, "mean_token_accuracy": 0.7889693618490632, "step": 124 }, { "epoch": 0.7396449704142012, "grad_norm": 0.3200490939966776, "learning_rate": 3.857872873103322e-06, "loss": 0.7015, "mean_token_accuracy": 0.7827075366904457, "step": 125 }, { "epoch": 0.7455621301775148, "grad_norm": 0.3355400247789273, "learning_rate": 3.69609383872045e-06, "loss": 0.7009, "mean_token_accuracy": 0.7832582153821204, "step": 126 }, { "epoch": 0.7514792899408284, "grad_norm": 0.305679884120639, "learning_rate": 3.5370076213905904e-06, "loss": 0.6787, "mean_token_accuracy": 0.790282171142539, "step": 127 }, { "epoch": 0.757396449704142, "grad_norm": 0.30051476204613337, "learning_rate": 3.380682177404335e-06, "loss": 0.6965, "mean_token_accuracy": 0.7849160254658745, "step": 128 }, { "epoch": 0.7633136094674556, "grad_norm": 0.31059288083076164, "learning_rate": 3.2271842837425917e-06, "loss": 0.7068, "mean_token_accuracy": 0.7814057339178149, "step": 129 }, { "epoch": 0.7692307692307693, "grad_norm": 0.30761030210291535, "learning_rate": 3.0765795095517026e-06, "loss": 0.671, "mean_token_accuracy": 0.7930299117720654, "step": 130 }, { "epoch": 0.7751479289940828, "grad_norm": 0.31294726987243277, "learning_rate": 2.9289321881345257e-06, "loss": 0.7006, "mean_token_accuracy": 0.7824703704600162, "step": 131 }, { "epoch": 0.7810650887573964, "grad_norm": 0.32317729195664163, "learning_rate": 2.7843053894693805e-06, "loss": 0.7152, "mean_token_accuracy": 0.7791240758955355, "step": 132 }, { "epoch": 0.7869822485207101, "grad_norm": 0.2961077123206801, "learning_rate": 2.642760893268684e-06, "loss": 0.6914, "mean_token_accuracy": 0.7868451333562997, "step": 133 }, { "epoch": 0.7928994082840237, "grad_norm": 0.3079418866041373, "learning_rate": 2.504359162588741e-06, "loss": 0.6841, "mean_token_accuracy": 0.7884792735031155, "step": 134 }, { "epoch": 0.7988165680473372, "grad_norm": 0.3037332779693865, "learning_rate": 2.369159318001937e-06, "loss": 0.6988, "mean_token_accuracy": 0.7842482711786377, "step": 135 }, { "epoch": 0.8047337278106509, "grad_norm": 0.3009627129738944, "learning_rate": 2.237219112342426e-06, "loss": 0.6717, "mean_token_accuracy": 0.7914490102362668, "step": 136 }, { "epoch": 0.8106508875739645, "grad_norm": 0.29623020549336865, "learning_rate": 2.1085949060360654e-06, "loss": 0.6955, "mean_token_accuracy": 0.7852500014511501, "step": 137 }, { "epoch": 0.8165680473372781, "grad_norm": 0.31868412326360324, "learning_rate": 1.983341643025117e-06, "loss": 0.7077, "mean_token_accuracy": 0.7818251421138749, "step": 138 }, { "epoch": 0.8224852071005917, "grad_norm": 0.28770049101277134, "learning_rate": 1.861512827298051e-06, "loss": 0.6702, "mean_token_accuracy": 0.7927647650840932, "step": 139 }, { "epoch": 0.8284023668639053, "grad_norm": 0.2990059242232171, "learning_rate": 1.743160500034443e-06, "loss": 0.698, "mean_token_accuracy": 0.7854336655349383, "step": 140 }, { "epoch": 0.834319526627219, "grad_norm": 0.31036052278345283, "learning_rate": 1.6283352173747148e-06, "loss": 0.6944, "mean_token_accuracy": 0.7851812952221086, "step": 141 }, { "epoch": 0.8402366863905325, "grad_norm": 0.31687773547502635, "learning_rate": 1.5170860288242638e-06, "loss": 0.6934, "mean_token_accuracy": 0.7867884869373907, "step": 142 }, { "epoch": 0.8461538461538461, "grad_norm": 0.30938603900072376, "learning_rate": 1.409460456301147e-06, "loss": 0.6795, "mean_token_accuracy": 0.7895160039493238, "step": 143 }, { "epoch": 0.8520710059171598, "grad_norm": 0.2952536141242446, "learning_rate": 1.305504473836331e-06, "loss": 0.6984, "mean_token_accuracy": 0.7844431538363902, "step": 144 }, { "epoch": 0.8579881656804734, "grad_norm": 0.3057613487662314, "learning_rate": 1.2052624879351105e-06, "loss": 0.6808, "mean_token_accuracy": 0.7889678951582548, "step": 145 }, { "epoch": 0.863905325443787, "grad_norm": 0.29097999865599966, "learning_rate": 1.1087773186081474e-06, "loss": 0.6841, "mean_token_accuracy": 0.7886474079309541, "step": 146 }, { "epoch": 0.8698224852071006, "grad_norm": 0.3142274595418203, "learning_rate": 1.0160901810802114e-06, "loss": 0.6738, "mean_token_accuracy": 0.7907368421042241, "step": 147 }, { "epoch": 0.8757396449704142, "grad_norm": 0.2925617205492779, "learning_rate": 9.272406681844015e-07, "loss": 0.6709, "mean_token_accuracy": 0.7922187969571426, "step": 148 }, { "epoch": 0.8816568047337278, "grad_norm": 0.27724813470631104, "learning_rate": 8.42266733449425e-07, "loss": 0.6858, "mean_token_accuracy": 0.7878970031830121, "step": 149 }, { "epoch": 0.8875739644970414, "grad_norm": 0.2969141966080097, "learning_rate": 7.612046748871327e-07, "loss": 0.6835, "mean_token_accuracy": 0.7885832175544185, "step": 150 }, { "epoch": 0.893491124260355, "grad_norm": 0.30371772219322385, "learning_rate": 6.840891194872112e-07, "loss": 0.6919, "mean_token_accuracy": 0.7857929535715237, "step": 151 }, { "epoch": 0.8994082840236687, "grad_norm": 0.291180674993794, "learning_rate": 6.109530084257043e-07, "loss": 0.6767, "mean_token_accuracy": 0.7914383926503787, "step": 152 }, { "epoch": 0.9053254437869822, "grad_norm": 0.29261209458574694, "learning_rate": 5.418275829936537e-07, "loss": 0.6726, "mean_token_accuracy": 0.7907148025796484, "step": 153 }, { "epoch": 0.9112426035502958, "grad_norm": 0.2776593810597509, "learning_rate": 4.7674237125185597e-07, "loss": 0.674, "mean_token_accuracy": 0.7914961569221735, "step": 154 }, { "epoch": 0.9171597633136095, "grad_norm": 0.28387582813592127, "learning_rate": 4.1572517541747294e-07, "loss": 0.6422, "mean_token_accuracy": 0.8009981820698834, "step": 155 }, { "epoch": 0.9230769230769231, "grad_norm": 0.2938250907370977, "learning_rate": 3.588020599878639e-07, "loss": 0.6962, "mean_token_accuracy": 0.7844788274775369, "step": 156 }, { "epoch": 0.9289940828402367, "grad_norm": 0.2804499286030571, "learning_rate": 3.059973406066963e-07, "loss": 0.6714, "mean_token_accuracy": 0.7933834235664327, "step": 157 }, { "epoch": 0.9349112426035503, "grad_norm": 0.29640364523794643, "learning_rate": 2.573335736771254e-07, "loss": 0.6786, "mean_token_accuracy": 0.7897013469630543, "step": 158 }, { "epoch": 0.9408284023668639, "grad_norm": 0.2783198872073197, "learning_rate": 2.1283154672645522e-07, "loss": 0.6731, "mean_token_accuracy": 0.7911190417303665, "step": 159 }, { "epoch": 0.9467455621301775, "grad_norm": 0.2839659205849638, "learning_rate": 1.7251026952640583e-07, "loss": 0.6969, "mean_token_accuracy": 0.7854169932946689, "step": 160 }, { "epoch": 0.9526627218934911, "grad_norm": 0.29734374928559365, "learning_rate": 1.3638696597277678e-07, "loss": 0.6764, "mean_token_accuracy": 0.790571362847956, "step": 161 }, { "epoch": 0.9585798816568047, "grad_norm": 0.30464858616674234, "learning_rate": 1.0447706672797264e-07, "loss": 0.6735, "mean_token_accuracy": 0.7917316028516589, "step": 162 }, { "epoch": 0.9644970414201184, "grad_norm": 0.29012852499944247, "learning_rate": 7.679420262954984e-08, "loss": 0.6699, "mean_token_accuracy": 0.7924379369006603, "step": 163 }, { "epoch": 0.9704142011834319, "grad_norm": 0.2781835486284672, "learning_rate": 5.3350198867574424e-08, "loss": 0.7051, "mean_token_accuracy": 0.7818380497316074, "step": 164 }, { "epoch": 0.9763313609467456, "grad_norm": 0.2722470698221784, "learning_rate": 3.4155069933301535e-08, "loss": 0.6566, "mean_token_accuracy": 0.7958629252087849, "step": 165 }, { "epoch": 0.9822485207100592, "grad_norm": 0.2853070020591702, "learning_rate": 1.9217015341318478e-08, "loss": 0.6902, "mean_token_accuracy": 0.7868536587227888, "step": 166 }, { "epoch": 0.9881656804733728, "grad_norm": 0.28783554756968166, "learning_rate": 8.542416126989805e-09, "loss": 0.6799, "mean_token_accuracy": 0.7887756232646665, "step": 167 }, { "epoch": 0.9940828402366864, "grad_norm": 0.2874058567649186, "learning_rate": 2.1358321206899067e-09, "loss": 0.703, "mean_token_accuracy": 0.7823018287164306, "step": 168 }, { "epoch": 1.0, "grad_norm": 0.2792632140682557, "learning_rate": 0.0, "loss": 0.6884, "mean_token_accuracy": 0.7866684149311904, "step": 169 }, { "epoch": 1.0, "step": 169, "total_flos": 128380330573824.0, "train_loss": 0.7332075143001489, "train_runtime": 1234.0886, "train_samples_per_second": 17.521, "train_steps_per_second": 0.137 } ], "logging_steps": 1, "max_steps": 169, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 128380330573824.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }