|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 169, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005917159763313609, |
|
"grad_norm": 5.048887351152406, |
|
"learning_rate": 1.1764705882352942e-06, |
|
"loss": 1.0795, |
|
"mean_token_accuracy": 0.7206979429209666, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011834319526627219, |
|
"grad_norm": 4.879597464054353, |
|
"learning_rate": 2.3529411764705885e-06, |
|
"loss": 1.1029, |
|
"mean_token_accuracy": 0.7117051311557141, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01775147928994083, |
|
"grad_norm": 4.657432447307512, |
|
"learning_rate": 3.529411764705883e-06, |
|
"loss": 1.0906, |
|
"mean_token_accuracy": 0.7171718488582962, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.023668639053254437, |
|
"grad_norm": 4.467351868935966, |
|
"learning_rate": 4.705882352941177e-06, |
|
"loss": 1.0718, |
|
"mean_token_accuracy": 0.7218519479951946, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.029585798816568046, |
|
"grad_norm": 3.7064015727740425, |
|
"learning_rate": 5.882352941176471e-06, |
|
"loss": 1.0701, |
|
"mean_token_accuracy": 0.7191323254401871, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03550295857988166, |
|
"grad_norm": 2.382423991923031, |
|
"learning_rate": 7.058823529411766e-06, |
|
"loss": 1.0118, |
|
"mean_token_accuracy": 0.7285496854387108, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04142011834319527, |
|
"grad_norm": 2.101237602674026, |
|
"learning_rate": 8.23529411764706e-06, |
|
"loss": 0.9682, |
|
"mean_token_accuracy": 0.7363681597344438, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.047337278106508875, |
|
"grad_norm": 2.015056074152268, |
|
"learning_rate": 9.411764705882354e-06, |
|
"loss": 1.0044, |
|
"mean_token_accuracy": 0.7251941996166579, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05325443786982249, |
|
"grad_norm": 1.6355380376413269, |
|
"learning_rate": 1.0588235294117648e-05, |
|
"loss": 0.9629, |
|
"mean_token_accuracy": 0.7324857047569558, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.05917159763313609, |
|
"grad_norm": 1.7763113613059038, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.9515, |
|
"mean_token_accuracy": 0.7337997991969313, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0650887573964497, |
|
"grad_norm": 1.3969034500597064, |
|
"learning_rate": 1.2941176470588238e-05, |
|
"loss": 0.9259, |
|
"mean_token_accuracy": 0.7387967799773972, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07100591715976332, |
|
"grad_norm": 0.953979333766809, |
|
"learning_rate": 1.4117647058823532e-05, |
|
"loss": 0.873, |
|
"mean_token_accuracy": 0.7510249436735839, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 1.127863397786703, |
|
"learning_rate": 1.5294117647058822e-05, |
|
"loss": 0.8988, |
|
"mean_token_accuracy": 0.7415024955395976, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08284023668639054, |
|
"grad_norm": 0.9273891062266637, |
|
"learning_rate": 1.647058823529412e-05, |
|
"loss": 0.8642, |
|
"mean_token_accuracy": 0.750242238655049, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.08875739644970414, |
|
"grad_norm": 0.8053975766192991, |
|
"learning_rate": 1.7647058823529414e-05, |
|
"loss": 0.8616, |
|
"mean_token_accuracy": 0.7512058479268567, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09467455621301775, |
|
"grad_norm": 0.7333036953961584, |
|
"learning_rate": 1.8823529411764708e-05, |
|
"loss": 0.8414, |
|
"mean_token_accuracy": 0.7541005429250093, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10059171597633136, |
|
"grad_norm": 0.6867367211047125, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8218, |
|
"mean_token_accuracy": 0.7606669951255678, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.10650887573964497, |
|
"grad_norm": 0.7209527833538277, |
|
"learning_rate": 1.9997864167879313e-05, |
|
"loss": 0.7897, |
|
"mean_token_accuracy": 0.7669743964292547, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.11242603550295859, |
|
"grad_norm": 0.6903745592521859, |
|
"learning_rate": 1.999145758387301e-05, |
|
"loss": 0.7984, |
|
"mean_token_accuracy": 0.7647671686022387, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.11834319526627218, |
|
"grad_norm": 0.6167859283406073, |
|
"learning_rate": 1.9980782984658682e-05, |
|
"loss": 0.7898, |
|
"mean_token_accuracy": 0.7661677740213927, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.1242603550295858, |
|
"grad_norm": 0.6108563732156459, |
|
"learning_rate": 1.99658449300667e-05, |
|
"loss": 0.804, |
|
"mean_token_accuracy": 0.7611782591111235, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.1301775147928994, |
|
"grad_norm": 0.5746908311619029, |
|
"learning_rate": 1.994664980113243e-05, |
|
"loss": 0.7744, |
|
"mean_token_accuracy": 0.7691396065844442, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.13609467455621302, |
|
"grad_norm": 0.5345761647636947, |
|
"learning_rate": 1.992320579737045e-05, |
|
"loss": 0.7749, |
|
"mean_token_accuracy": 0.7685584775159877, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.14201183431952663, |
|
"grad_norm": 0.5299656484934693, |
|
"learning_rate": 1.9895522933272028e-05, |
|
"loss": 0.7621, |
|
"mean_token_accuracy": 0.7722506630953709, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.14792899408284024, |
|
"grad_norm": 0.63083763621396, |
|
"learning_rate": 1.9863613034027224e-05, |
|
"loss": 0.7868, |
|
"mean_token_accuracy": 0.7644846690458883, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 0.4511582029139769, |
|
"learning_rate": 1.9827489730473597e-05, |
|
"loss": 0.7752, |
|
"mean_token_accuracy": 0.7683559567285383, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.15976331360946747, |
|
"grad_norm": 0.5080850847922778, |
|
"learning_rate": 1.9787168453273546e-05, |
|
"loss": 0.7903, |
|
"mean_token_accuracy": 0.7631548471555147, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.16568047337278108, |
|
"grad_norm": 0.417117734277003, |
|
"learning_rate": 1.9742666426322877e-05, |
|
"loss": 0.743, |
|
"mean_token_accuracy": 0.7766788822768681, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17159763313609466, |
|
"grad_norm": 0.4254796366932393, |
|
"learning_rate": 1.9694002659393306e-05, |
|
"loss": 0.7702, |
|
"mean_token_accuracy": 0.7695522126025716, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.17751479289940827, |
|
"grad_norm": 0.41986449808823745, |
|
"learning_rate": 1.9641197940012136e-05, |
|
"loss": 0.7586, |
|
"mean_token_accuracy": 0.7721289278577101, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1834319526627219, |
|
"grad_norm": 0.4695562943601462, |
|
"learning_rate": 1.958427482458253e-05, |
|
"loss": 0.7539, |
|
"mean_token_accuracy": 0.7716929613553958, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.1893491124260355, |
|
"grad_norm": 0.3921409409899768, |
|
"learning_rate": 1.9523257628748148e-05, |
|
"loss": 0.7568, |
|
"mean_token_accuracy": 0.7717328096419696, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.1952662721893491, |
|
"grad_norm": 0.4160967346901947, |
|
"learning_rate": 1.9458172417006347e-05, |
|
"loss": 0.7409, |
|
"mean_token_accuracy": 0.7751419508476135, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.20118343195266272, |
|
"grad_norm": 0.43718343180505526, |
|
"learning_rate": 1.9389046991574298e-05, |
|
"loss": 0.7344, |
|
"mean_token_accuracy": 0.778526386835722, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.20710059171597633, |
|
"grad_norm": 0.396451454218149, |
|
"learning_rate": 1.9315910880512792e-05, |
|
"loss": 0.7585, |
|
"mean_token_accuracy": 0.7704364168416742, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.21301775147928995, |
|
"grad_norm": 0.4531112283882146, |
|
"learning_rate": 1.9238795325112867e-05, |
|
"loss": 0.7255, |
|
"mean_token_accuracy": 0.7800096489081426, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.21893491124260356, |
|
"grad_norm": 0.4003679262358476, |
|
"learning_rate": 1.9157733266550577e-05, |
|
"loss": 0.7251, |
|
"mean_token_accuracy": 0.779636766062183, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.22485207100591717, |
|
"grad_norm": 0.4019998494369348, |
|
"learning_rate": 1.9072759331815602e-05, |
|
"loss": 0.7325, |
|
"mean_token_accuracy": 0.7773937936524439, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 0.4247947978619282, |
|
"learning_rate": 1.898390981891979e-05, |
|
"loss": 0.7396, |
|
"mean_token_accuracy": 0.7757475726810331, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.23668639053254437, |
|
"grad_norm": 0.3801393695192698, |
|
"learning_rate": 1.8891222681391853e-05, |
|
"loss": 0.7422, |
|
"mean_token_accuracy": 0.7750002370873404, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.24260355029585798, |
|
"grad_norm": 0.37960651070834583, |
|
"learning_rate": 1.879473751206489e-05, |
|
"loss": 0.7159, |
|
"mean_token_accuracy": 0.7815920659071619, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2485207100591716, |
|
"grad_norm": 0.4177406473529506, |
|
"learning_rate": 1.869449552616367e-05, |
|
"loss": 0.7583, |
|
"mean_token_accuracy": 0.7707519109077099, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.25443786982248523, |
|
"grad_norm": 0.36011706007184574, |
|
"learning_rate": 1.8590539543698852e-05, |
|
"loss": 0.7201, |
|
"mean_token_accuracy": 0.7804437519430654, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2603550295857988, |
|
"grad_norm": 0.37037523132050293, |
|
"learning_rate": 1.8482913971175737e-05, |
|
"loss": 0.6911, |
|
"mean_token_accuracy": 0.789738280938329, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.26627218934911245, |
|
"grad_norm": 0.4354182756113773, |
|
"learning_rate": 1.8371664782625287e-05, |
|
"loss": 0.7426, |
|
"mean_token_accuracy": 0.7737875964292505, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.27218934911242604, |
|
"grad_norm": 0.38980814844589723, |
|
"learning_rate": 1.825683949996556e-05, |
|
"loss": 0.7399, |
|
"mean_token_accuracy": 0.7748932065512412, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.2781065088757396, |
|
"grad_norm": 0.39132094117790983, |
|
"learning_rate": 1.813848717270195e-05, |
|
"loss": 0.7008, |
|
"mean_token_accuracy": 0.7860000963532652, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.28402366863905326, |
|
"grad_norm": 0.38848469099488886, |
|
"learning_rate": 1.8016658356974885e-05, |
|
"loss": 0.7164, |
|
"mean_token_accuracy": 0.7800342185812585, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.28994082840236685, |
|
"grad_norm": 0.3636566412524122, |
|
"learning_rate": 1.789140509396394e-05, |
|
"loss": 0.7212, |
|
"mean_token_accuracy": 0.7796801488922436, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.2958579881656805, |
|
"grad_norm": 0.4246224936998959, |
|
"learning_rate": 1.7762780887657576e-05, |
|
"loss": 0.7319, |
|
"mean_token_accuracy": 0.7770798628956391, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.30177514792899407, |
|
"grad_norm": 0.35464674247563077, |
|
"learning_rate": 1.7630840681998068e-05, |
|
"loss": 0.7278, |
|
"mean_token_accuracy": 0.7782958063801806, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.3838835290476431, |
|
"learning_rate": 1.7495640837411265e-05, |
|
"loss": 0.7276, |
|
"mean_token_accuracy": 0.778110788518626, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3136094674556213, |
|
"grad_norm": 0.39823260845501735, |
|
"learning_rate": 1.735723910673132e-05, |
|
"loss": 0.6969, |
|
"mean_token_accuracy": 0.7861765198226371, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.31952662721893493, |
|
"grad_norm": 0.3616285119296811, |
|
"learning_rate": 1.7215694610530624e-05, |
|
"loss": 0.7267, |
|
"mean_token_accuracy": 0.7780276231344528, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3254437869822485, |
|
"grad_norm": 0.3564476104670264, |
|
"learning_rate": 1.7071067811865477e-05, |
|
"loss": 0.7084, |
|
"mean_token_accuracy": 0.7837116023048635, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.33136094674556216, |
|
"grad_norm": 0.44035089633256974, |
|
"learning_rate": 1.6923420490448298e-05, |
|
"loss": 0.7342, |
|
"mean_token_accuracy": 0.776648161332027, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.33727810650887574, |
|
"grad_norm": 0.39248432764207875, |
|
"learning_rate": 1.6772815716257414e-05, |
|
"loss": 0.725, |
|
"mean_token_accuracy": 0.7786816515421023, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3431952662721893, |
|
"grad_norm": 0.36704706608942483, |
|
"learning_rate": 1.6619317822595666e-05, |
|
"loss": 0.7017, |
|
"mean_token_accuracy": 0.7841417427362292, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.34911242603550297, |
|
"grad_norm": 0.40512498191045965, |
|
"learning_rate": 1.646299237860941e-05, |
|
"loss": 0.75, |
|
"mean_token_accuracy": 0.7708883209035493, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.35502958579881655, |
|
"grad_norm": 0.39029160023670084, |
|
"learning_rate": 1.6303906161279554e-05, |
|
"loss": 0.7279, |
|
"mean_token_accuracy": 0.7775349237600122, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3609467455621302, |
|
"grad_norm": 0.3953993708954393, |
|
"learning_rate": 1.6142127126896682e-05, |
|
"loss": 0.7356, |
|
"mean_token_accuracy": 0.7745875530129677, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.3668639053254438, |
|
"grad_norm": 0.36256710216714955, |
|
"learning_rate": 1.597772438203241e-05, |
|
"loss": 0.6997, |
|
"mean_token_accuracy": 0.7848896531840401, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.3727810650887574, |
|
"grad_norm": 0.40962620829800644, |
|
"learning_rate": 1.5810768154019386e-05, |
|
"loss": 0.7205, |
|
"mean_token_accuracy": 0.7799613349100392, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.378698224852071, |
|
"grad_norm": 0.3828488838066985, |
|
"learning_rate": 1.5641329760952514e-05, |
|
"loss": 0.708, |
|
"mean_token_accuracy": 0.782232443701145, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 0.39122763310454406, |
|
"learning_rate": 1.5469481581224274e-05, |
|
"loss": 0.7074, |
|
"mean_token_accuracy": 0.7819801124070774, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3905325443786982, |
|
"grad_norm": 0.3910386124912464, |
|
"learning_rate": 1.529529702260709e-05, |
|
"loss": 0.7318, |
|
"mean_token_accuracy": 0.7753433511222331, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.39644970414201186, |
|
"grad_norm": 0.3880043071649847, |
|
"learning_rate": 1.5118850490896012e-05, |
|
"loss": 0.7191, |
|
"mean_token_accuracy": 0.7791978555083022, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.40236686390532544, |
|
"grad_norm": 0.348781202358635, |
|
"learning_rate": 1.4940217358125042e-05, |
|
"loss": 0.6978, |
|
"mean_token_accuracy": 0.785919046763025, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.40828402366863903, |
|
"grad_norm": 0.34785888550803656, |
|
"learning_rate": 1.4759473930370738e-05, |
|
"loss": 0.6923, |
|
"mean_token_accuracy": 0.787418621224268, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.41420118343195267, |
|
"grad_norm": 0.3653232305381835, |
|
"learning_rate": 1.4576697415156818e-05, |
|
"loss": 0.6912, |
|
"mean_token_accuracy": 0.7869110504306804, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.42011834319526625, |
|
"grad_norm": 0.36764315457311614, |
|
"learning_rate": 1.4391965888473705e-05, |
|
"loss": 0.6991, |
|
"mean_token_accuracy": 0.7839725057156516, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4260355029585799, |
|
"grad_norm": 0.3514136440565239, |
|
"learning_rate": 1.4205358261427076e-05, |
|
"loss": 0.7149, |
|
"mean_token_accuracy": 0.7796496659579735, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.4319526627218935, |
|
"grad_norm": 0.3686893566061322, |
|
"learning_rate": 1.4016954246529697e-05, |
|
"loss": 0.7197, |
|
"mean_token_accuracy": 0.7794793514235452, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4378698224852071, |
|
"grad_norm": 0.34605681231940283, |
|
"learning_rate": 1.3826834323650899e-05, |
|
"loss": 0.687, |
|
"mean_token_accuracy": 0.7897891867600086, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4437869822485207, |
|
"grad_norm": 0.369433655454952, |
|
"learning_rate": 1.3635079705638298e-05, |
|
"loss": 0.7018, |
|
"mean_token_accuracy": 0.7843400450532293, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.44970414201183434, |
|
"grad_norm": 0.3637775910740719, |
|
"learning_rate": 1.3441772303626387e-05, |
|
"loss": 0.6811, |
|
"mean_token_accuracy": 0.7883583477300129, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4556213017751479, |
|
"grad_norm": 0.3552805600165443, |
|
"learning_rate": 1.3246994692046837e-05, |
|
"loss": 0.6877, |
|
"mean_token_accuracy": 0.7883414815445715, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.42418323654238854, |
|
"learning_rate": 1.305083007335549e-05, |
|
"loss": 0.7307, |
|
"mean_token_accuracy": 0.7747076789831797, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.46745562130177515, |
|
"grad_norm": 0.35584440430369396, |
|
"learning_rate": 1.2853362242491054e-05, |
|
"loss": 0.7317, |
|
"mean_token_accuracy": 0.7756405018053134, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.47337278106508873, |
|
"grad_norm": 0.3233937276689462, |
|
"learning_rate": 1.2654675551080724e-05, |
|
"loss": 0.6992, |
|
"mean_token_accuracy": 0.7849270935923538, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.47928994082840237, |
|
"grad_norm": 0.350970039014215, |
|
"learning_rate": 1.2454854871407993e-05, |
|
"loss": 0.7107, |
|
"mean_token_accuracy": 0.781704652022156, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.48520710059171596, |
|
"grad_norm": 0.35166716550940014, |
|
"learning_rate": 1.2253985560158064e-05, |
|
"loss": 0.694, |
|
"mean_token_accuracy": 0.7867224592606594, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.4911242603550296, |
|
"grad_norm": 0.3655542868183894, |
|
"learning_rate": 1.2052153421956343e-05, |
|
"loss": 0.6913, |
|
"mean_token_accuracy": 0.7868167246531167, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.4970414201183432, |
|
"grad_norm": 0.37786152221461855, |
|
"learning_rate": 1.1849444672715587e-05, |
|
"loss": 0.7265, |
|
"mean_token_accuracy": 0.776165643284536, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5029585798816568, |
|
"grad_norm": 0.34932873765751665, |
|
"learning_rate": 1.164594590280734e-05, |
|
"loss": 0.711, |
|
"mean_token_accuracy": 0.7806680330763345, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5088757396449705, |
|
"grad_norm": 0.3539261852137339, |
|
"learning_rate": 1.1441744040073469e-05, |
|
"loss": 0.7165, |
|
"mean_token_accuracy": 0.7788759918792447, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.514792899408284, |
|
"grad_norm": 0.3395798470618678, |
|
"learning_rate": 1.123692631269348e-05, |
|
"loss": 0.6997, |
|
"mean_token_accuracy": 0.7835249742198639, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.5207100591715976, |
|
"grad_norm": 0.3411316208847806, |
|
"learning_rate": 1.103158021192357e-05, |
|
"loss": 0.6958, |
|
"mean_token_accuracy": 0.7860397200235923, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5266272189349113, |
|
"grad_norm": 0.3619282034457249, |
|
"learning_rate": 1.0825793454723325e-05, |
|
"loss": 0.7052, |
|
"mean_token_accuracy": 0.7834187886797611, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5325443786982249, |
|
"grad_norm": 0.34273872199347044, |
|
"learning_rate": 1.0619653946285948e-05, |
|
"loss": 0.685, |
|
"mean_token_accuracy": 0.7897067821918299, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 0.33199715753976633, |
|
"learning_rate": 1.0413249742488132e-05, |
|
"loss": 0.6917, |
|
"mean_token_accuracy": 0.7872031382647131, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5443786982248521, |
|
"grad_norm": 0.3342393332811213, |
|
"learning_rate": 1.0206669012275546e-05, |
|
"loss": 0.7011, |
|
"mean_token_accuracy": 0.784091056433467, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.5502958579881657, |
|
"grad_norm": 0.33611685834665483, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7156, |
|
"mean_token_accuracy": 0.7788634871573115, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.5562130177514792, |
|
"grad_norm": 0.33795640856270726, |
|
"learning_rate": 9.79333098772446e-06, |
|
"loss": 0.6843, |
|
"mean_token_accuracy": 0.7874877928517997, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.5621301775147929, |
|
"grad_norm": 0.3553103385556057, |
|
"learning_rate": 9.586750257511868e-06, |
|
"loss": 0.684, |
|
"mean_token_accuracy": 0.7891702815425776, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.5680473372781065, |
|
"grad_norm": 0.3599899819769442, |
|
"learning_rate": 9.380346053714055e-06, |
|
"loss": 0.6809, |
|
"mean_token_accuracy": 0.7879558819894679, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.5739644970414202, |
|
"grad_norm": 0.33562142706566866, |
|
"learning_rate": 9.174206545276678e-06, |
|
"loss": 0.6732, |
|
"mean_token_accuracy": 0.7917185320065199, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.5798816568047337, |
|
"grad_norm": 0.33455633173601695, |
|
"learning_rate": 8.968419788076431e-06, |
|
"loss": 0.6957, |
|
"mean_token_accuracy": 0.7847382398115799, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.5857988165680473, |
|
"grad_norm": 0.3490744063204216, |
|
"learning_rate": 8.763073687306523e-06, |
|
"loss": 0.6924, |
|
"mean_token_accuracy": 0.7862588216550412, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"grad_norm": 0.33348768096991704, |
|
"learning_rate": 8.558255959926533e-06, |
|
"loss": 0.6992, |
|
"mean_token_accuracy": 0.7847955930336572, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.591715976331361, |
|
"eval_loss": 0.7157539129257202, |
|
"eval_mean_token_accuracy": 0.7586642623023677, |
|
"eval_runtime": 4.8665, |
|
"eval_samples_per_second": 26.508, |
|
"eval_steps_per_second": 1.027, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5976331360946746, |
|
"grad_norm": 0.33075937472785844, |
|
"learning_rate": 8.35405409719266e-06, |
|
"loss": 0.6872, |
|
"mean_token_accuracy": 0.787283402091289, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6035502958579881, |
|
"grad_norm": 0.3590337795434198, |
|
"learning_rate": 8.150555327284417e-06, |
|
"loss": 0.6861, |
|
"mean_token_accuracy": 0.7881656395518974, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6094674556213018, |
|
"grad_norm": 0.3243096344713308, |
|
"learning_rate": 7.947846578043658e-06, |
|
"loss": 0.687, |
|
"mean_token_accuracy": 0.7868769866991424, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.31809647789793993, |
|
"learning_rate": 7.746014439841941e-06, |
|
"loss": 0.7024, |
|
"mean_token_accuracy": 0.7840864820127753, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.621301775147929, |
|
"grad_norm": 0.3237845217614286, |
|
"learning_rate": 7.545145128592009e-06, |
|
"loss": 0.674, |
|
"mean_token_accuracy": 0.7901693771106895, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6272189349112426, |
|
"grad_norm": 0.31884492713723883, |
|
"learning_rate": 7.34532444891928e-06, |
|
"loss": 0.6972, |
|
"mean_token_accuracy": 0.7838730344185979, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6331360946745562, |
|
"grad_norm": 0.3250725158937223, |
|
"learning_rate": 7.14663775750895e-06, |
|
"loss": 0.7351, |
|
"mean_token_accuracy": 0.7743010028506793, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6390532544378699, |
|
"grad_norm": 0.34110974065150124, |
|
"learning_rate": 6.949169926644513e-06, |
|
"loss": 0.7011, |
|
"mean_token_accuracy": 0.783713761454518, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.6449704142011834, |
|
"grad_norm": 0.31095600200622964, |
|
"learning_rate": 6.7530053079531664e-06, |
|
"loss": 0.7021, |
|
"mean_token_accuracy": 0.7831529561955156, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.650887573964497, |
|
"grad_norm": 0.31355515565609077, |
|
"learning_rate": 6.558227696373617e-06, |
|
"loss": 0.6887, |
|
"mean_token_accuracy": 0.7874667036092751, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.6568047337278107, |
|
"grad_norm": 0.329751885795174, |
|
"learning_rate": 6.364920294361701e-06, |
|
"loss": 0.709, |
|
"mean_token_accuracy": 0.7815413361996215, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.6627218934911243, |
|
"grad_norm": 0.30882694735110866, |
|
"learning_rate": 6.173165676349103e-06, |
|
"loss": 0.7062, |
|
"mean_token_accuracy": 0.7818792436067044, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.6686390532544378, |
|
"grad_norm": 0.3191082129763723, |
|
"learning_rate": 5.983045753470308e-06, |
|
"loss": 0.6882, |
|
"mean_token_accuracy": 0.7865235373585306, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.6745562130177515, |
|
"grad_norm": 0.32856595537873384, |
|
"learning_rate": 5.794641738572925e-06, |
|
"loss": 0.6838, |
|
"mean_token_accuracy": 0.7889740840283941, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.6804733727810651, |
|
"grad_norm": 0.3147396913003447, |
|
"learning_rate": 5.608034111526298e-06, |
|
"loss": 0.6694, |
|
"mean_token_accuracy": 0.7932459708127269, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.6863905325443787, |
|
"grad_norm": 0.3428397687955701, |
|
"learning_rate": 5.423302584843186e-06, |
|
"loss": 0.7074, |
|
"mean_token_accuracy": 0.7825890227644128, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 0.30279217397028946, |
|
"learning_rate": 5.240526069629265e-06, |
|
"loss": 0.6784, |
|
"mean_token_accuracy": 0.7906955891603145, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.6982248520710059, |
|
"grad_norm": 0.3003758212327991, |
|
"learning_rate": 5.059782641874962e-06, |
|
"loss": 0.6819, |
|
"mean_token_accuracy": 0.7889431193959375, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7041420118343196, |
|
"grad_norm": 0.31649740332350773, |
|
"learning_rate": 4.881149509103993e-06, |
|
"loss": 0.6816, |
|
"mean_token_accuracy": 0.7894825257236546, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7100591715976331, |
|
"grad_norm": 0.3141834954228162, |
|
"learning_rate": 4.704702977392914e-06, |
|
"loss": 0.6549, |
|
"mean_token_accuracy": 0.7973338512758128, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7159763313609467, |
|
"grad_norm": 0.3146743815009587, |
|
"learning_rate": 4.530518418775734e-06, |
|
"loss": 0.6995, |
|
"mean_token_accuracy": 0.7840122749260579, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7218934911242604, |
|
"grad_norm": 0.31485965487518236, |
|
"learning_rate": 4.35867023904749e-06, |
|
"loss": 0.683, |
|
"mean_token_accuracy": 0.7888863658938954, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.727810650887574, |
|
"grad_norm": 0.31400988808245894, |
|
"learning_rate": 4.189231845980618e-06, |
|
"loss": 0.675, |
|
"mean_token_accuracy": 0.7907750008695696, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.7337278106508875, |
|
"grad_norm": 0.31249917487430845, |
|
"learning_rate": 4.0222756179675915e-06, |
|
"loss": 0.6814, |
|
"mean_token_accuracy": 0.7889693618490632, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.7396449704142012, |
|
"grad_norm": 0.3200490939966776, |
|
"learning_rate": 3.857872873103322e-06, |
|
"loss": 0.7015, |
|
"mean_token_accuracy": 0.7827075366904457, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.7455621301775148, |
|
"grad_norm": 0.3355400247789273, |
|
"learning_rate": 3.69609383872045e-06, |
|
"loss": 0.7009, |
|
"mean_token_accuracy": 0.7832582153821204, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.7514792899408284, |
|
"grad_norm": 0.305679884120639, |
|
"learning_rate": 3.5370076213905904e-06, |
|
"loss": 0.6787, |
|
"mean_token_accuracy": 0.790282171142539, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.757396449704142, |
|
"grad_norm": 0.30051476204613337, |
|
"learning_rate": 3.380682177404335e-06, |
|
"loss": 0.6965, |
|
"mean_token_accuracy": 0.7849160254658745, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.7633136094674556, |
|
"grad_norm": 0.31059288083076164, |
|
"learning_rate": 3.2271842837425917e-06, |
|
"loss": 0.7068, |
|
"mean_token_accuracy": 0.7814057339178149, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.30761030210291535, |
|
"learning_rate": 3.0765795095517026e-06, |
|
"loss": 0.671, |
|
"mean_token_accuracy": 0.7930299117720654, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.7751479289940828, |
|
"grad_norm": 0.31294726987243277, |
|
"learning_rate": 2.9289321881345257e-06, |
|
"loss": 0.7006, |
|
"mean_token_accuracy": 0.7824703704600162, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.7810650887573964, |
|
"grad_norm": 0.32317729195664163, |
|
"learning_rate": 2.7843053894693805e-06, |
|
"loss": 0.7152, |
|
"mean_token_accuracy": 0.7791240758955355, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.7869822485207101, |
|
"grad_norm": 0.2961077123206801, |
|
"learning_rate": 2.642760893268684e-06, |
|
"loss": 0.6914, |
|
"mean_token_accuracy": 0.7868451333562997, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.7928994082840237, |
|
"grad_norm": 0.3079418866041373, |
|
"learning_rate": 2.504359162588741e-06, |
|
"loss": 0.6841, |
|
"mean_token_accuracy": 0.7884792735031155, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.7988165680473372, |
|
"grad_norm": 0.3037332779693865, |
|
"learning_rate": 2.369159318001937e-06, |
|
"loss": 0.6988, |
|
"mean_token_accuracy": 0.7842482711786377, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8047337278106509, |
|
"grad_norm": 0.3009627129738944, |
|
"learning_rate": 2.237219112342426e-06, |
|
"loss": 0.6717, |
|
"mean_token_accuracy": 0.7914490102362668, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8106508875739645, |
|
"grad_norm": 0.29623020549336865, |
|
"learning_rate": 2.1085949060360654e-06, |
|
"loss": 0.6955, |
|
"mean_token_accuracy": 0.7852500014511501, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8165680473372781, |
|
"grad_norm": 0.31868412326360324, |
|
"learning_rate": 1.983341643025117e-06, |
|
"loss": 0.7077, |
|
"mean_token_accuracy": 0.7818251421138749, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.8224852071005917, |
|
"grad_norm": 0.28770049101277134, |
|
"learning_rate": 1.861512827298051e-06, |
|
"loss": 0.6702, |
|
"mean_token_accuracy": 0.7927647650840932, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.8284023668639053, |
|
"grad_norm": 0.2990059242232171, |
|
"learning_rate": 1.743160500034443e-06, |
|
"loss": 0.698, |
|
"mean_token_accuracy": 0.7854336655349383, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.834319526627219, |
|
"grad_norm": 0.31036052278345283, |
|
"learning_rate": 1.6283352173747148e-06, |
|
"loss": 0.6944, |
|
"mean_token_accuracy": 0.7851812952221086, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.8402366863905325, |
|
"grad_norm": 0.31687773547502635, |
|
"learning_rate": 1.5170860288242638e-06, |
|
"loss": 0.6934, |
|
"mean_token_accuracy": 0.7867884869373907, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 0.30938603900072376, |
|
"learning_rate": 1.409460456301147e-06, |
|
"loss": 0.6795, |
|
"mean_token_accuracy": 0.7895160039493238, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.8520710059171598, |
|
"grad_norm": 0.2952536141242446, |
|
"learning_rate": 1.305504473836331e-06, |
|
"loss": 0.6984, |
|
"mean_token_accuracy": 0.7844431538363902, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.8579881656804734, |
|
"grad_norm": 0.3057613487662314, |
|
"learning_rate": 1.2052624879351105e-06, |
|
"loss": 0.6808, |
|
"mean_token_accuracy": 0.7889678951582548, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.863905325443787, |
|
"grad_norm": 0.29097999865599966, |
|
"learning_rate": 1.1087773186081474e-06, |
|
"loss": 0.6841, |
|
"mean_token_accuracy": 0.7886474079309541, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.8698224852071006, |
|
"grad_norm": 0.3142274595418203, |
|
"learning_rate": 1.0160901810802114e-06, |
|
"loss": 0.6738, |
|
"mean_token_accuracy": 0.7907368421042241, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.8757396449704142, |
|
"grad_norm": 0.2925617205492779, |
|
"learning_rate": 9.272406681844015e-07, |
|
"loss": 0.6709, |
|
"mean_token_accuracy": 0.7922187969571426, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.8816568047337278, |
|
"grad_norm": 0.27724813470631104, |
|
"learning_rate": 8.42266733449425e-07, |
|
"loss": 0.6858, |
|
"mean_token_accuracy": 0.7878970031830121, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.8875739644970414, |
|
"grad_norm": 0.2969141966080097, |
|
"learning_rate": 7.612046748871327e-07, |
|
"loss": 0.6835, |
|
"mean_token_accuracy": 0.7885832175544185, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.893491124260355, |
|
"grad_norm": 0.30371772219322385, |
|
"learning_rate": 6.840891194872112e-07, |
|
"loss": 0.6919, |
|
"mean_token_accuracy": 0.7857929535715237, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.8994082840236687, |
|
"grad_norm": 0.291180674993794, |
|
"learning_rate": 6.109530084257043e-07, |
|
"loss": 0.6767, |
|
"mean_token_accuracy": 0.7914383926503787, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9053254437869822, |
|
"grad_norm": 0.29261209458574694, |
|
"learning_rate": 5.418275829936537e-07, |
|
"loss": 0.6726, |
|
"mean_token_accuracy": 0.7907148025796484, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9112426035502958, |
|
"grad_norm": 0.2776593810597509, |
|
"learning_rate": 4.7674237125185597e-07, |
|
"loss": 0.674, |
|
"mean_token_accuracy": 0.7914961569221735, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.9171597633136095, |
|
"grad_norm": 0.28387582813592127, |
|
"learning_rate": 4.1572517541747294e-07, |
|
"loss": 0.6422, |
|
"mean_token_accuracy": 0.8009981820698834, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.2938250907370977, |
|
"learning_rate": 3.588020599878639e-07, |
|
"loss": 0.6962, |
|
"mean_token_accuracy": 0.7844788274775369, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.9289940828402367, |
|
"grad_norm": 0.2804499286030571, |
|
"learning_rate": 3.059973406066963e-07, |
|
"loss": 0.6714, |
|
"mean_token_accuracy": 0.7933834235664327, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.9349112426035503, |
|
"grad_norm": 0.29640364523794643, |
|
"learning_rate": 2.573335736771254e-07, |
|
"loss": 0.6786, |
|
"mean_token_accuracy": 0.7897013469630543, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.9408284023668639, |
|
"grad_norm": 0.2783198872073197, |
|
"learning_rate": 2.1283154672645522e-07, |
|
"loss": 0.6731, |
|
"mean_token_accuracy": 0.7911190417303665, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.9467455621301775, |
|
"grad_norm": 0.2839659205849638, |
|
"learning_rate": 1.7251026952640583e-07, |
|
"loss": 0.6969, |
|
"mean_token_accuracy": 0.7854169932946689, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.9526627218934911, |
|
"grad_norm": 0.29734374928559365, |
|
"learning_rate": 1.3638696597277678e-07, |
|
"loss": 0.6764, |
|
"mean_token_accuracy": 0.790571362847956, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.9585798816568047, |
|
"grad_norm": 0.30464858616674234, |
|
"learning_rate": 1.0447706672797264e-07, |
|
"loss": 0.6735, |
|
"mean_token_accuracy": 0.7917316028516589, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.9644970414201184, |
|
"grad_norm": 0.29012852499944247, |
|
"learning_rate": 7.679420262954984e-08, |
|
"loss": 0.6699, |
|
"mean_token_accuracy": 0.7924379369006603, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.9704142011834319, |
|
"grad_norm": 0.2781835486284672, |
|
"learning_rate": 5.3350198867574424e-08, |
|
"loss": 0.7051, |
|
"mean_token_accuracy": 0.7818380497316074, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.9763313609467456, |
|
"grad_norm": 0.2722470698221784, |
|
"learning_rate": 3.4155069933301535e-08, |
|
"loss": 0.6566, |
|
"mean_token_accuracy": 0.7958629252087849, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.9822485207100592, |
|
"grad_norm": 0.2853070020591702, |
|
"learning_rate": 1.9217015341318478e-08, |
|
"loss": 0.6902, |
|
"mean_token_accuracy": 0.7868536587227888, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.9881656804733728, |
|
"grad_norm": 0.28783554756968166, |
|
"learning_rate": 8.542416126989805e-09, |
|
"loss": 0.6799, |
|
"mean_token_accuracy": 0.7887756232646665, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.9940828402366864, |
|
"grad_norm": 0.2874058567649186, |
|
"learning_rate": 2.1358321206899067e-09, |
|
"loss": 0.703, |
|
"mean_token_accuracy": 0.7823018287164306, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2792632140682557, |
|
"learning_rate": 0.0, |
|
"loss": 0.6884, |
|
"mean_token_accuracy": 0.7866684149311904, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 169, |
|
"total_flos": 128380330573824.0, |
|
"train_loss": 0.7332075143001489, |
|
"train_runtime": 1234.0886, |
|
"train_samples_per_second": 17.521, |
|
"train_steps_per_second": 0.137 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 169, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 128380330573824.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|