|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 399,
  "global_step": 797,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012547051442910915,
      "grad_norm": 19.01241111755371,
      "learning_rate": 4.347826086956522e-07,
      "loss": 3.7662,
      "step": 1
    },
    {
      "epoch": 0.0012547051442910915,
      "eval_loss": 3.5013513565063477,
      "eval_runtime": 6.1042,
      "eval_samples_per_second": 109.924,
      "eval_steps_per_second": 6.881,
      "step": 1
    },
    {
      "epoch": 0.002509410288582183,
      "grad_norm": 16.68044090270996,
      "learning_rate": 8.695652173913044e-07,
      "loss": 4.9852,
      "step": 2
    },
    {
      "epoch": 0.0037641154328732747,
      "grad_norm": 20.054622650146484,
      "learning_rate": 1.3043478260869566e-06,
      "loss": 3.371,
      "step": 3
    },
    {
      "epoch": 0.005018820577164366,
      "grad_norm": 17.74432373046875,
      "learning_rate": 1.7391304347826088e-06,
      "loss": 3.2279,
      "step": 4
    },
    {
      "epoch": 0.006273525721455458,
      "grad_norm": 15.706853866577148,
      "learning_rate": 2.173913043478261e-06,
      "loss": 3.2849,
      "step": 5
    },
    {
      "epoch": 0.0075282308657465494,
      "grad_norm": 16.103614807128906,
      "learning_rate": 2.6086956521739132e-06,
      "loss": 3.3386,
      "step": 6
    },
    {
      "epoch": 0.00878293601003764,
      "grad_norm": 17.866008758544922,
      "learning_rate": 3.043478260869566e-06,
      "loss": 3.7729,
      "step": 7
    },
    {
      "epoch": 0.010037641154328732,
      "grad_norm": 15.147605895996094,
      "learning_rate": 3.4782608695652175e-06,
      "loss": 3.7953,
      "step": 8
    },
    {
      "epoch": 0.011292346298619825,
      "grad_norm": 18.901615142822266,
      "learning_rate": 3.91304347826087e-06,
      "loss": 3.5709,
      "step": 9
    },
    {
      "epoch": 0.012547051442910916,
      "grad_norm": 15.003190994262695,
      "learning_rate": 4.347826086956522e-06,
      "loss": 4.4118,
      "step": 10
    },
    {
      "epoch": 0.013801756587202008,
      "grad_norm": 16.06182098388672,
      "learning_rate": 4.782608695652174e-06,
      "loss": 3.6714,
      "step": 11
    },
    {
      "epoch": 0.015056461731493099,
      "grad_norm": 17.36846923828125,
      "learning_rate": 5.2173913043478265e-06,
      "loss": 3.8605,
      "step": 12
    },
    {
      "epoch": 0.01631116687578419,
      "grad_norm": 17.79241180419922,
      "learning_rate": 5.652173913043479e-06,
      "loss": 3.5272,
      "step": 13
    },
    {
      "epoch": 0.01756587202007528,
      "grad_norm": 14.266884803771973,
      "learning_rate": 6.086956521739132e-06,
      "loss": 3.6034,
      "step": 14
    },
    {
      "epoch": 0.018820577164366373,
      "grad_norm": 15.910148620605469,
      "learning_rate": 6.521739130434783e-06,
      "loss": 3.1557,
      "step": 15
    },
    {
      "epoch": 0.020075282308657464,
      "grad_norm": 16.65166473388672,
      "learning_rate": 6.956521739130435e-06,
      "loss": 3.1012,
      "step": 16
    },
    {
      "epoch": 0.02132998745294856,
      "grad_norm": 14.562183380126953,
      "learning_rate": 7.391304347826087e-06,
      "loss": 3.7211,
      "step": 17
    },
    {
      "epoch": 0.02258469259723965,
      "grad_norm": 12.386832237243652,
      "learning_rate": 7.82608695652174e-06,
      "loss": 3.4626,
      "step": 18
    },
    {
      "epoch": 0.02383939774153074,
      "grad_norm": 17.05539894104004,
      "learning_rate": 8.260869565217392e-06,
      "loss": 3.4974,
      "step": 19
    },
    {
      "epoch": 0.025094102885821833,
      "grad_norm": 20.27201271057129,
      "learning_rate": 8.695652173913044e-06,
      "loss": 3.6714,
      "step": 20
    },
    {
      "epoch": 0.026348808030112924,
      "grad_norm": 14.487604141235352,
      "learning_rate": 9.130434782608697e-06,
      "loss": 3.3083,
      "step": 21
    },
    {
      "epoch": 0.027603513174404015,
      "grad_norm": 16.102643966674805,
      "learning_rate": 9.565217391304349e-06,
      "loss": 3.4909,
      "step": 22
    },
    {
      "epoch": 0.028858218318695106,
      "grad_norm": 19.385705947875977,
      "learning_rate": 1e-05,
      "loss": 3.6031,
      "step": 23
    },
    {
      "epoch": 0.030112923462986198,
      "grad_norm": 16.015605926513672,
      "learning_rate": 9.999958813277235e-06,
      "loss": 3.2406,
      "step": 24
    },
    {
      "epoch": 0.03136762860727729,
      "grad_norm": 13.68839168548584,
      "learning_rate": 9.999835253787472e-06,
      "loss": 3.3278,
      "step": 25
    },
    {
      "epoch": 0.03262233375156838,
      "grad_norm": 14.528271675109863,
      "learning_rate": 9.999629323566323e-06,
      "loss": 2.9922,
      "step": 26
    },
    {
      "epoch": 0.033877038895859475,
      "grad_norm": 17.02483558654785,
      "learning_rate": 9.99934102600642e-06,
      "loss": 3.2691,
      "step": 27
    },
    {
      "epoch": 0.03513174404015056,
      "grad_norm": 16.797338485717773,
      "learning_rate": 9.998970365857374e-06,
      "loss": 3.3698,
      "step": 28
    },
    {
      "epoch": 0.03638644918444166,
      "grad_norm": 14.764263153076172,
      "learning_rate": 9.998517349225698e-06,
      "loss": 3.2543,
      "step": 29
    },
    {
      "epoch": 0.037641154328732745,
      "grad_norm": 14.670446395874023,
      "learning_rate": 9.9979819835747e-06,
      "loss": 3.2072,
      "step": 30
    },
    {
      "epoch": 0.03889585947302384,
      "grad_norm": 21.932998657226562,
      "learning_rate": 9.997364277724362e-06,
      "loss": 3.3685,
      "step": 31
    },
    {
      "epoch": 0.04015056461731493,
      "grad_norm": 18.700294494628906,
      "learning_rate": 9.996664241851197e-06,
      "loss": 2.9231,
      "step": 32
    },
    {
      "epoch": 0.04140526976160602,
      "grad_norm": 13.988219261169434,
      "learning_rate": 9.99588188748808e-06,
      "loss": 3.0859,
      "step": 33
    },
    {
      "epoch": 0.04265997490589712,
      "grad_norm": 15.462250709533691,
      "learning_rate": 9.995017227524049e-06,
      "loss": 3.5131,
      "step": 34
    },
    {
      "epoch": 0.043914680050188205,
      "grad_norm": 18.01273536682129,
      "learning_rate": 9.994070276204115e-06,
      "loss": 3.7054,
      "step": 35
    },
    {
      "epoch": 0.0451693851944793,
      "grad_norm": 21.7486515045166,
      "learning_rate": 9.993041049129005e-06,
      "loss": 3.2674,
      "step": 36
    },
    {
      "epoch": 0.04642409033877039,
      "grad_norm": 19.066349029541016,
      "learning_rate": 9.991929563254913e-06,
      "loss": 3.3495,
      "step": 37
    },
    {
      "epoch": 0.04767879548306148,
      "grad_norm": 20.077730178833008,
      "learning_rate": 9.990735836893226e-06,
      "loss": 3.396,
      "step": 38
    },
    {
      "epoch": 0.04893350062735257,
      "grad_norm": 14.028002738952637,
      "learning_rate": 9.989459889710214e-06,
      "loss": 2.9541,
      "step": 39
    },
    {
      "epoch": 0.050188205771643665,
      "grad_norm": 16.57832908630371,
      "learning_rate": 9.988101742726708e-06,
      "loss": 4.1811,
      "step": 40
    },
    {
      "epoch": 0.05144291091593475,
      "grad_norm": 16.29513931274414,
      "learning_rate": 9.986661418317759e-06,
      "loss": 3.282,
      "step": 41
    },
    {
      "epoch": 0.05269761606022585,
      "grad_norm": 19.623991012573242,
      "learning_rate": 9.985138940212264e-06,
      "loss": 2.7565,
      "step": 42
    },
    {
      "epoch": 0.053952321204516936,
      "grad_norm": 17.891677856445312,
      "learning_rate": 9.983534333492575e-06,
      "loss": 3.2153,
      "step": 43
    },
    {
      "epoch": 0.05520702634880803,
      "grad_norm": 15.788727760314941,
      "learning_rate": 9.981847624594093e-06,
      "loss": 3.2207,
      "step": 44
    },
    {
      "epoch": 0.056461731493099125,
      "grad_norm": 17.83070182800293,
      "learning_rate": 9.980078841304817e-06,
      "loss": 3.2656,
      "step": 45
    },
    {
      "epoch": 0.05771643663739021,
      "grad_norm": 15.635571479797363,
      "learning_rate": 9.978228012764904e-06,
      "loss": 3.0155,
      "step": 46
    },
    {
      "epoch": 0.05897114178168131,
      "grad_norm": 24.62743377685547,
      "learning_rate": 9.97629516946618e-06,
      "loss": 3.1795,
      "step": 47
    },
    {
      "epoch": 0.060225846925972396,
      "grad_norm": 16.98169708251953,
      "learning_rate": 9.974280343251637e-06,
      "loss": 3.3745,
      "step": 48
    },
    {
      "epoch": 0.06148055207026349,
      "grad_norm": 19.704118728637695,
      "learning_rate": 9.97218356731491e-06,
      "loss": 3.0855,
      "step": 49
    },
    {
      "epoch": 0.06273525721455459,
      "grad_norm": 15.05843734741211,
      "learning_rate": 9.970004876199731e-06,
      "loss": 2.9471,
      "step": 50
    },
    {
      "epoch": 0.06398996235884567,
      "grad_norm": 20.934194564819336,
      "learning_rate": 9.967744305799358e-06,
      "loss": 2.3807,
      "step": 51
    },
    {
      "epoch": 0.06524466750313676,
      "grad_norm": 19.842937469482422,
      "learning_rate": 9.965401893355985e-06,
      "loss": 3.0332,
      "step": 52
    },
    {
      "epoch": 0.06649937264742785,
      "grad_norm": 14.741073608398438,
      "learning_rate": 9.962977677460132e-06,
      "loss": 3.2762,
      "step": 53
    },
    {
      "epoch": 0.06775407779171895,
      "grad_norm": 16.709836959838867,
      "learning_rate": 9.96047169805e-06,
      "loss": 2.8045,
      "step": 54
    },
    {
      "epoch": 0.06900878293601004,
      "grad_norm": 22.069616317749023,
      "learning_rate": 9.957883996410821e-06,
      "loss": 2.9735,
      "step": 55
    },
    {
      "epoch": 0.07026348808030113,
      "grad_norm": 17.32145881652832,
      "learning_rate": 9.955214615174174e-06,
      "loss": 3.2817,
      "step": 56
    },
    {
      "epoch": 0.07151819322459223,
      "grad_norm": 16.994312286376953,
      "learning_rate": 9.952463598317286e-06,
      "loss": 2.9389,
      "step": 57
    },
    {
      "epoch": 0.07277289836888332,
      "grad_norm": 16.050113677978516,
      "learning_rate": 9.949630991162304e-06,
      "loss": 2.6915,
      "step": 58
    },
    {
      "epoch": 0.0740276035131744,
      "grad_norm": 12.047767639160156,
      "learning_rate": 9.946716840375552e-06,
      "loss": 3.1678,
      "step": 59
    },
    {
      "epoch": 0.07528230865746549,
      "grad_norm": 21.182559967041016,
      "learning_rate": 9.943721193966755e-06,
      "loss": 3.0534,
      "step": 60
    },
    {
      "epoch": 0.07653701380175659,
      "grad_norm": 15.370920181274414,
      "learning_rate": 9.940644101288259e-06,
      "loss": 2.9404,
      "step": 61
    },
    {
      "epoch": 0.07779171894604768,
      "grad_norm": 17.937530517578125,
      "learning_rate": 9.937485613034209e-06,
      "loss": 3.1182,
      "step": 62
    },
    {
      "epoch": 0.07904642409033877,
      "grad_norm": 15.242935180664062,
      "learning_rate": 9.934245781239714e-06,
      "loss": 3.2562,
      "step": 63
    },
    {
      "epoch": 0.08030112923462986,
      "grad_norm": 19.794172286987305,
      "learning_rate": 9.93092465928e-06,
      "loss": 2.9321,
      "step": 64
    },
    {
      "epoch": 0.08155583437892096,
      "grad_norm": 12.679776191711426,
      "learning_rate": 9.927522301869515e-06,
      "loss": 2.4835,
      "step": 65
    },
    {
      "epoch": 0.08281053952321205,
      "grad_norm": 17.018342971801758,
      "learning_rate": 9.924038765061042e-06,
      "loss": 2.7064,
      "step": 66
    },
    {
      "epoch": 0.08406524466750313,
      "grad_norm": 17.553468704223633,
      "learning_rate": 9.920474106244764e-06,
      "loss": 3.509,
      "step": 67
    },
    {
      "epoch": 0.08531994981179424,
      "grad_norm": 18.098421096801758,
      "learning_rate": 9.91682838414733e-06,
      "loss": 3.1203,
      "step": 68
    },
    {
      "epoch": 0.08657465495608532,
      "grad_norm": 20.72344398498535,
      "learning_rate": 9.913101658830879e-06,
      "loss": 3.1849,
      "step": 69
    },
    {
      "epoch": 0.08782936010037641,
      "grad_norm": 18.358638763427734,
      "learning_rate": 9.909293991692049e-06,
      "loss": 3.4645,
      "step": 70
    },
    {
      "epoch": 0.0890840652446675,
      "grad_norm": 19.074031829833984,
      "learning_rate": 9.905405445460972e-06,
      "loss": 2.9314,
      "step": 71
    },
    {
      "epoch": 0.0903387703889586,
      "grad_norm": 18.550411224365234,
      "learning_rate": 9.90143608420024e-06,
      "loss": 2.8456,
      "step": 72
    },
    {
      "epoch": 0.09159347553324969,
      "grad_norm": 23.823490142822266,
      "learning_rate": 9.897385973303845e-06,
      "loss": 3.3129,
      "step": 73
    },
    {
      "epoch": 0.09284818067754078,
      "grad_norm": 17.849550247192383,
      "learning_rate": 9.893255179496106e-06,
      "loss": 2.7676,
      "step": 74
    },
    {
      "epoch": 0.09410288582183186,
      "grad_norm": 14.884727478027344,
      "learning_rate": 9.889043770830566e-06,
      "loss": 2.9774,
      "step": 75
    },
    {
      "epoch": 0.09535759096612297,
      "grad_norm": 15.499114990234375,
      "learning_rate": 9.884751816688873e-06,
      "loss": 2.5129,
      "step": 76
    },
    {
      "epoch": 0.09661229611041405,
      "grad_norm": 17.986732482910156,
      "learning_rate": 9.880379387779637e-06,
      "loss": 3.5453,
      "step": 77
    },
    {
      "epoch": 0.09786700125470514,
      "grad_norm": 16.63545036315918,
      "learning_rate": 9.875926556137265e-06,
      "loss": 2.9293,
      "step": 78
    },
    {
      "epoch": 0.09912170639899624,
      "grad_norm": 17.408201217651367,
      "learning_rate": 9.871393395120774e-06,
      "loss": 3.1488,
      "step": 79
    },
    {
      "epoch": 0.10037641154328733,
      "grad_norm": 17.50285530090332,
      "learning_rate": 9.866779979412583e-06,
      "loss": 2.7078,
      "step": 80
    },
    {
      "epoch": 0.10163111668757842,
      "grad_norm": 16.590560913085938,
      "learning_rate": 9.862086385017283e-06,
      "loss": 2.8491,
      "step": 81
    },
    {
      "epoch": 0.1028858218318695,
      "grad_norm": 18.618976593017578,
      "learning_rate": 9.85731268926038e-06,
      "loss": 3.0485,
      "step": 82
    },
    {
      "epoch": 0.10414052697616061,
      "grad_norm": 17.413230895996094,
      "learning_rate": 9.852458970787027e-06,
      "loss": 3.0812,
      "step": 83
    },
    {
      "epoch": 0.1053952321204517,
      "grad_norm": 14.060961723327637,
      "learning_rate": 9.847525309560729e-06,
      "loss": 2.5551,
      "step": 84
    },
    {
      "epoch": 0.10664993726474278,
      "grad_norm": 14.511148452758789,
      "learning_rate": 9.842511786862018e-06,
      "loss": 2.8406,
      "step": 85
    },
    {
      "epoch": 0.10790464240903387,
      "grad_norm": 18.97178077697754,
      "learning_rate": 9.837418485287126e-06,
      "loss": 3.2963,
      "step": 86
    },
    {
      "epoch": 0.10915934755332497,
      "grad_norm": 13.818567276000977,
      "learning_rate": 9.832245488746612e-06,
      "loss": 2.6757,
      "step": 87
    },
    {
      "epoch": 0.11041405269761606,
      "grad_norm": 18.294200897216797,
      "learning_rate": 9.826992882463982e-06,
      "loss": 2.3428,
      "step": 88
    },
    {
      "epoch": 0.11166875784190715,
      "grad_norm": 17.605432510375977,
      "learning_rate": 9.821660752974294e-06,
      "loss": 2.8555,
      "step": 89
    },
    {
      "epoch": 0.11292346298619825,
      "grad_norm": 16.119766235351562,
      "learning_rate": 9.816249188122724e-06,
      "loss": 2.8055,
      "step": 90
    },
    {
      "epoch": 0.11417816813048934,
      "grad_norm": 16.537944793701172,
      "learning_rate": 9.81075827706312e-06,
      "loss": 2.7496,
      "step": 91
    },
    {
      "epoch": 0.11543287327478043,
      "grad_norm": 18.349796295166016,
      "learning_rate": 9.805188110256533e-06,
      "loss": 2.5472,
      "step": 92
    },
    {
      "epoch": 0.11668757841907151,
      "grad_norm": 21.679128646850586,
      "learning_rate": 9.799538779469734e-06,
      "loss": 2.9006,
      "step": 93
    },
    {
      "epoch": 0.11794228356336262,
      "grad_norm": 15.701348304748535,
      "learning_rate": 9.793810377773688e-06,
      "loss": 2.434,
      "step": 94
    },
    {
      "epoch": 0.1191969887076537,
      "grad_norm": 17.04868507385254,
      "learning_rate": 9.78800299954203e-06,
      "loss": 2.4092,
      "step": 95
    },
    {
      "epoch": 0.12045169385194479,
      "grad_norm": 17.143634796142578,
      "learning_rate": 9.782116740449515e-06,
      "loss": 2.979,
      "step": 96
    },
    {
      "epoch": 0.12170639899623588,
      "grad_norm": 16.7327880859375,
      "learning_rate": 9.776151697470431e-06,
      "loss": 2.9258,
      "step": 97
    },
    {
      "epoch": 0.12296110414052698,
      "grad_norm": 19.429100036621094,
      "learning_rate": 9.770107968877004e-06,
      "loss": 3.0748,
      "step": 98
    },
    {
      "epoch": 0.12421580928481807,
      "grad_norm": 15.504218101501465,
      "learning_rate": 9.763985654237785e-06,
      "loss": 3.0054,
      "step": 99
    },
    {
      "epoch": 0.12547051442910917,
      "grad_norm": 16.84503936767578,
      "learning_rate": 9.757784854416006e-06,
      "loss": 3.2136,
      "step": 100
    },
    {
      "epoch": 0.12672521957340024,
      "grad_norm": 16.334318161010742,
      "learning_rate": 9.751505671567914e-06,
      "loss": 2.5939,
      "step": 101
    },
    {
      "epoch": 0.12797992471769135,
      "grad_norm": 15.902310371398926,
      "learning_rate": 9.745148209141094e-06,
      "loss": 2.4743,
      "step": 102
    },
    {
      "epoch": 0.12923462986198245,
      "grad_norm": 13.628096580505371,
      "learning_rate": 9.738712571872765e-06,
      "loss": 2.2579,
      "step": 103
    },
    {
      "epoch": 0.13048933500627352,
      "grad_norm": 17.617816925048828,
      "learning_rate": 9.732198865788047e-06,
      "loss": 2.4754,
      "step": 104
    },
    {
      "epoch": 0.13174404015056462,
      "grad_norm": 18.667858123779297,
      "learning_rate": 9.725607198198227e-06,
      "loss": 2.6638,
      "step": 105
    },
    {
      "epoch": 0.1329987452948557,
      "grad_norm": 15.029777526855469,
      "learning_rate": 9.718937677698976e-06,
      "loss": 2.8075,
      "step": 106
    },
    {
      "epoch": 0.1342534504391468,
      "grad_norm": 18.5529727935791,
      "learning_rate": 9.712190414168573e-06,
      "loss": 2.627,
      "step": 107
    },
    {
      "epoch": 0.1355081555834379,
      "grad_norm": 17.021556854248047,
      "learning_rate": 9.705365518766085e-06,
      "loss": 2.2912,
      "step": 108
    },
    {
      "epoch": 0.13676286072772897,
      "grad_norm": 17.83435821533203,
      "learning_rate": 9.698463103929542e-06,
      "loss": 2.3247,
      "step": 109
    },
    {
      "epoch": 0.13801756587202008,
      "grad_norm": 17.74312400817871,
      "learning_rate": 9.691483283374085e-06,
      "loss": 2.5844,
      "step": 110
    },
    {
      "epoch": 0.13927227101631118,
      "grad_norm": 22.43841552734375,
      "learning_rate": 9.684426172090084e-06,
      "loss": 3.1616,
      "step": 111
    },
    {
      "epoch": 0.14052697616060225,
      "grad_norm": 16.035985946655273,
      "learning_rate": 9.677291886341256e-06,
      "loss": 2.5391,
      "step": 112
    },
    {
      "epoch": 0.14178168130489335,
      "grad_norm": 20.342103958129883,
      "learning_rate": 9.670080543662742e-06,
      "loss": 2.5258,
      "step": 113
    },
    {
      "epoch": 0.14303638644918445,
      "grad_norm": 20.725093841552734,
      "learning_rate": 9.662792262859167e-06,
      "loss": 2.5076,
      "step": 114
    },
    {
      "epoch": 0.14429109159347553,
      "grad_norm": 15.233530044555664,
      "learning_rate": 9.655427164002692e-06,
      "loss": 2.3355,
      "step": 115
    },
    {
      "epoch": 0.14554579673776663,
      "grad_norm": 15.496427536010742,
      "learning_rate": 9.647985368431031e-06,
      "loss": 2.5312,
      "step": 116
    },
    {
      "epoch": 0.1468005018820577,
      "grad_norm": 24.412311553955078,
      "learning_rate": 9.640466998745456e-06,
      "loss": 2.7875,
      "step": 117
    },
    {
      "epoch": 0.1480552070263488,
      "grad_norm": 15.683626174926758,
      "learning_rate": 9.632872178808766e-06,
      "loss": 2.2883,
      "step": 118
    },
    {
      "epoch": 0.1493099121706399,
      "grad_norm": 17.257770538330078,
      "learning_rate": 9.625201033743262e-06,
      "loss": 2.8936,
      "step": 119
    },
    {
      "epoch": 0.15056461731493098,
      "grad_norm": 19.208641052246094,
      "learning_rate": 9.617453689928668e-06,
      "loss": 2.7428,
      "step": 120
    },
    {
      "epoch": 0.15181932245922208,
      "grad_norm": 17.00638771057129,
      "learning_rate": 9.609630275000072e-06,
      "loss": 2.5065,
      "step": 121
    },
    {
      "epoch": 0.15307402760351319,
      "grad_norm": 17.896059036254883,
      "learning_rate": 9.601730917845798e-06,
      "loss": 2.4492,
      "step": 122
    },
    {
      "epoch": 0.15432873274780426,
      "grad_norm": 17.655044555664062,
      "learning_rate": 9.5937557486053e-06,
      "loss": 2.3202,
      "step": 123
    },
    {
      "epoch": 0.15558343789209536,
      "grad_norm": 19.35125732421875,
      "learning_rate": 9.585704898667015e-06,
      "loss": 2.5956,
      "step": 124
    },
    {
      "epoch": 0.15683814303638646,
      "grad_norm": 17.047664642333984,
      "learning_rate": 9.577578500666187e-06,
      "loss": 2.547,
      "step": 125
    },
    {
      "epoch": 0.15809284818067754,
      "grad_norm": 17.756309509277344,
      "learning_rate": 9.5693766884827e-06,
      "loss": 2.6131,
      "step": 126
    },
    {
      "epoch": 0.15934755332496864,
      "grad_norm": 18.9345760345459,
      "learning_rate": 9.561099597238862e-06,
      "loss": 2.4613,
      "step": 127
    },
    {
      "epoch": 0.1606022584692597,
      "grad_norm": 16.88786506652832,
      "learning_rate": 9.552747363297172e-06,
      "loss": 2.363,
      "step": 128
    },
    {
      "epoch": 0.1618569636135508,
      "grad_norm": 17.6533203125,
      "learning_rate": 9.544320124258093e-06,
      "loss": 2.453,
      "step": 129
    },
    {
      "epoch": 0.16311166875784192,
      "grad_norm": 19.48556137084961,
      "learning_rate": 9.535818018957768e-06,
      "loss": 2.2917,
      "step": 130
    },
    {
      "epoch": 0.164366373902133,
      "grad_norm": 17.511598587036133,
      "learning_rate": 9.527241187465735e-06,
      "loss": 2.2477,
      "step": 131
    },
    {
      "epoch": 0.1656210790464241,
      "grad_norm": 15.644845008850098,
      "learning_rate": 9.518589771082627e-06,
      "loss": 2.6145,
      "step": 132
    },
    {
      "epoch": 0.1668757841907152,
      "grad_norm": 13.586119651794434,
      "learning_rate": 9.509863912337843e-06,
      "loss": 2.3622,
      "step": 133
    },
    {
      "epoch": 0.16813048933500627,
      "grad_norm": 18.941696166992188,
      "learning_rate": 9.501063754987188e-06,
      "loss": 2.4396,
      "step": 134
    },
    {
      "epoch": 0.16938519447929737,
      "grad_norm": 19.57110023498535,
      "learning_rate": 9.492189444010522e-06,
      "loss": 2.082,
      "step": 135
    },
    {
      "epoch": 0.17063989962358847,
      "grad_norm": 16.997098922729492,
      "learning_rate": 9.483241125609358e-06,
      "loss": 2.1185,
      "step": 136
    },
    {
      "epoch": 0.17189460476787954,
      "grad_norm": 20.234926223754883,
      "learning_rate": 9.47421894720446e-06,
      "loss": 2.487,
      "step": 137
    },
    {
      "epoch": 0.17314930991217065,
      "grad_norm": 20.660642623901367,
      "learning_rate": 9.465123057433413e-06,
      "loss": 2.1378,
      "step": 138
    },
    {
      "epoch": 0.17440401505646172,
      "grad_norm": 21.305038452148438,
      "learning_rate": 9.455953606148172e-06,
      "loss": 2.7265,
      "step": 139
    },
    {
      "epoch": 0.17565872020075282,
      "grad_norm": 20.652212142944336,
      "learning_rate": 9.446710744412595e-06,
      "loss": 2.3179,
      "step": 140
    },
    {
      "epoch": 0.17691342534504392,
      "grad_norm": 22.552457809448242,
      "learning_rate": 9.437394624499957e-06,
      "loss": 2.2027,
      "step": 141
    },
    {
      "epoch": 0.178168130489335,
      "grad_norm": 18.889108657836914,
      "learning_rate": 9.428005399890442e-06,
      "loss": 2.3326,
      "step": 142
    },
    {
      "epoch": 0.1794228356336261,
      "grad_norm": 18.121183395385742,
      "learning_rate": 9.418543225268598e-06,
      "loss": 2.0905,
      "step": 143
    },
    {
      "epoch": 0.1806775407779172,
      "grad_norm": 28.54220199584961,
      "learning_rate": 9.409008256520814e-06,
      "loss": 2.1567,
      "step": 144
    },
    {
      "epoch": 0.18193224592220827,
      "grad_norm": 28.761722564697266,
      "learning_rate": 9.399400650732735e-06,
      "loss": 2.3487,
      "step": 145
    },
    {
      "epoch": 0.18318695106649938,
      "grad_norm": 20.803058624267578,
      "learning_rate": 9.38972056618668e-06,
      "loss": 2.4545,
      "step": 146
    },
    {
      "epoch": 0.18444165621079048,
      "grad_norm": 14.15235424041748,
      "learning_rate": 9.379968162359034e-06,
      "loss": 2.1002,
      "step": 147
    },
    {
      "epoch": 0.18569636135508155,
      "grad_norm": 18.501392364501953,
      "learning_rate": 9.370143599917617e-06,
      "loss": 2.1081,
      "step": 148
    },
    {
      "epoch": 0.18695106649937265,
      "grad_norm": 23.19183921813965,
      "learning_rate": 9.36024704071904e-06,
      "loss": 2.2682,
      "step": 149
    },
    {
      "epoch": 0.18820577164366373,
      "grad_norm": 21.424211502075195,
      "learning_rate": 9.350278647806037e-06,
      "loss": 2.3408,
      "step": 150
    },
    {
      "epoch": 0.18946047678795483,
      "grad_norm": 22.568864822387695,
      "learning_rate": 9.340238585404787e-06,
      "loss": 2.357,
      "step": 151
    },
    {
      "epoch": 0.19071518193224593,
      "grad_norm": 17.558080673217773,
      "learning_rate": 9.330127018922195e-06,
      "loss": 2.1341,
      "step": 152
    },
    {
      "epoch": 0.191969887076537,
      "grad_norm": 21.05203628540039,
      "learning_rate": 9.319944114943171e-06,
      "loss": 2.736,
      "step": 153
    },
    {
      "epoch": 0.1932245922208281,
      "grad_norm": 28.293092727661133,
      "learning_rate": 9.309690041227898e-06,
      "loss": 2.4961,
      "step": 154
    },
    {
      "epoch": 0.1944792973651192,
      "grad_norm": 21.68331527709961,
      "learning_rate": 9.299364966709051e-06,
      "loss": 2.2222,
      "step": 155
    },
    {
      "epoch": 0.19573400250941028,
      "grad_norm": 28.366355895996094,
      "learning_rate": 9.28896906148902e-06,
      "loss": 2.719,
      "step": 156
    },
    {
      "epoch": 0.19698870765370138,
      "grad_norm": 25.245935440063477,
      "learning_rate": 9.278502496837116e-06,
      "loss": 2.4558,
      "step": 157
    },
    {
      "epoch": 0.19824341279799249,
      "grad_norm": 34.29158020019531,
      "learning_rate": 9.267965445186733e-06,
      "loss": 2.1928,
      "step": 158
    },
    {
      "epoch": 0.19949811794228356,
      "grad_norm": 23.639026641845703,
      "learning_rate": 9.257358080132524e-06,
      "loss": 1.8916,
      "step": 159
    },
    {
      "epoch": 0.20075282308657466,
      "grad_norm": 17.318647384643555,
      "learning_rate": 9.24668057642753e-06,
      "loss": 2.2254,
      "step": 160
    },
    {
      "epoch": 0.20200752823086573,
      "grad_norm": 18.8333740234375,
      "learning_rate": 9.235933109980302e-06,
      "loss": 2.0529,
      "step": 161
    },
    {
      "epoch": 0.20326223337515684,
      "grad_norm": 20.41586685180664,
      "learning_rate": 9.225115857852015e-06,
      "loss": 2.0644,
      "step": 162
    },
    {
      "epoch": 0.20451693851944794,
      "grad_norm": 22.13117218017578,
      "learning_rate": 9.214228998253526e-06,
      "loss": 2.2199,
      "step": 163
    },
    {
      "epoch": 0.205771643663739,
      "grad_norm": 22.590608596801758,
      "learning_rate": 9.20327271054247e-06,
      "loss": 1.9851,
      "step": 164
    },
    {
      "epoch": 0.20702634880803011,
      "grad_norm": 19.450021743774414,
      "learning_rate": 9.192247175220276e-06,
      "loss": 2.1396,
      "step": 165
    },
    {
      "epoch": 0.20828105395232122,
      "grad_norm": 24.714031219482422,
      "learning_rate": 9.181152573929215e-06,
      "loss": 2.0162,
      "step": 166
    },
    {
      "epoch": 0.2095357590966123,
      "grad_norm": 25.66572380065918,
      "learning_rate": 9.16998908944939e-06,
      "loss": 2.1091,
      "step": 167
    },
    {
      "epoch": 0.2107904642409034,
      "grad_norm": 24.950700759887695,
      "learning_rate": 9.15875690569574e-06,
      "loss": 2.2533,
      "step": 168
    },
    {
      "epoch": 0.2120451693851945,
      "grad_norm": 23.020002365112305,
      "learning_rate": 9.147456207714998e-06,
      "loss": 2.3229,
      "step": 169
    },
    {
      "epoch": 0.21329987452948557,
      "grad_norm": 22.205028533935547,
      "learning_rate": 9.13608718168265e-06,
      "loss": 2.3614,
      "step": 170
    },
    {
      "epoch": 0.21455457967377667,
      "grad_norm": 19.170259475708008,
      "learning_rate": 9.124650014899868e-06,
      "loss": 2.1497,
      "step": 171
    },
    {
      "epoch": 0.21580928481806774,
      "grad_norm": 18.129199981689453,
      "learning_rate": 9.113144895790416e-06,
      "loss": 2.2325,
      "step": 172
    },
    {
      "epoch": 0.21706398996235884,
      "grad_norm": 18.413124084472656,
      "learning_rate": 9.101572013897555e-06,
      "loss": 1.8652,
      "step": 173
    },
    {
      "epoch": 0.21831869510664995,
      "grad_norm": 18.207448959350586,
      "learning_rate": 9.089931559880918e-06,
      "loss": 1.9094,
      "step": 174
    },
    {
      "epoch": 0.21957340025094102,
      "grad_norm": 26.02681541442871,
      "learning_rate": 9.078223725513366e-06,
      "loss": 2.2922,
      "step": 175
    },
    {
      "epoch": 0.22082810539523212,
      "grad_norm": 30.541122436523438,
      "learning_rate": 9.066448703677828e-06,
      "loss": 1.8914,
      "step": 176
    },
    {
      "epoch": 0.22208281053952322,
      "grad_norm": 19.35504722595215,
      "learning_rate": 9.05460668836413e-06,
      "loss": 2.0448,
      "step": 177
    },
    {
      "epoch": 0.2233375156838143,
      "grad_norm": 24.406612396240234,
      "learning_rate": 9.04269787466579e-06,
      "loss": 2.2088,
      "step": 178
    },
    {
      "epoch": 0.2245922208281054,
      "grad_norm": 28.934782028198242,
      "learning_rate": 9.030722458776815e-06,
      "loss": 2.0474,
      "step": 179
    },
    {
      "epoch": 0.2258469259723965,
      "grad_norm": 23.718971252441406,
      "learning_rate": 9.018680637988456e-06,
      "loss": 2.1075,
      "step": 180
    },
    {
      "epoch": 0.22710163111668757,
      "grad_norm": 19.34891700744629,
      "learning_rate": 9.006572610685969e-06,
      "loss": 2.0024,
      "step": 181
    },
    {
      "epoch": 0.22835633626097868,
      "grad_norm": 17.186641693115234,
      "learning_rate": 8.994398576345335e-06,
      "loss": 1.8304,
      "step": 182
    },
    {
      "epoch": 0.22961104140526975,
      "grad_norm": 23.781911849975586,
      "learning_rate": 8.982158735529991e-06,
      "loss": 1.8478,
      "step": 183
    },
    {
      "epoch": 0.23086574654956085,
      "grad_norm": 28.87154769897461,
      "learning_rate": 8.969853289887507e-06,
      "loss": 1.9214,
      "step": 184
    },
    {
      "epoch": 0.23212045169385195,
      "grad_norm": 24.24917221069336,
      "learning_rate": 8.957482442146271e-06,
      "loss": 1.8442,
      "step": 185
    },
    {
      "epoch": 0.23337515683814303,
      "grad_norm": 23.922151565551758,
      "learning_rate": 8.945046396112158e-06,
      "loss": 1.9284,
      "step": 186
    },
    {
      "epoch": 0.23462986198243413,
      "grad_norm": 22.065723419189453,
      "learning_rate": 8.932545356665157e-06,
      "loss": 1.8711,
      "step": 187
    },
    {
      "epoch": 0.23588456712672523,
      "grad_norm": 28.266712188720703,
      "learning_rate": 8.919979529756008e-06,
      "loss": 1.8295,
      "step": 188
    },
    {
      "epoch": 0.2371392722710163,
      "grad_norm": 22.024778366088867,
      "learning_rate": 8.907349122402803e-06,
      "loss": 1.9236,
      "step": 189
    },
    {
      "epoch": 0.2383939774153074,
      "grad_norm": 17.683101654052734,
      "learning_rate": 8.894654342687574e-06,
      "loss": 1.8348,
      "step": 190
    },
    {
      "epoch": 0.2396486825595985,
      "grad_norm": 26.601009368896484,
      "learning_rate": 8.881895399752873e-06,
      "loss": 1.7325,
      "step": 191
    },
    {
      "epoch": 0.24090338770388958,
      "grad_norm": 30.148361206054688,
      "learning_rate": 8.869072503798315e-06,
      "loss": 2.0121,
      "step": 192
    },
    {
      "epoch": 0.24215809284818068,
      "grad_norm": 23.811433792114258,
      "learning_rate": 8.85618586607713e-06,
      "loss": 1.7341,
      "step": 193
    },
    {
      "epoch": 0.24341279799247176,
      "grad_norm": 17.06600570678711,
      "learning_rate": 8.843235698892661e-06,
      "loss": 1.7895,
      "step": 194
    },
    {
      "epoch": 0.24466750313676286,
      "grad_norm": 21.146913528442383,
      "learning_rate": 8.83022221559489e-06,
      "loss": 1.8371,
      "step": 195
    },
    {
      "epoch": 0.24592220828105396,
      "grad_norm": 22.374889373779297,
      "learning_rate": 8.81714563057691e-06,
      "loss": 2.0259,
      "step": 196
    },
    {
      "epoch": 0.24717691342534504,
      "grad_norm": 23.482807159423828,
      "learning_rate": 8.80400615927139e-06,
      "loss": 2.126,
      "step": 197
    },
    {
      "epoch": 0.24843161856963614,
      "grad_norm": 20.430444717407227,
      "learning_rate": 8.790804018147039e-06,
      "loss": 1.5703,
      "step": 198
    },
    {
      "epoch": 0.24968632371392724,
      "grad_norm": 29.053224563598633,
      "learning_rate": 8.777539424705022e-06,
      "loss": 1.9014,
      "step": 199
    },
    {
      "epoch": 0.25094102885821834,
      "grad_norm": 22.412776947021484,
      "learning_rate": 8.764212597475397e-06,
      "loss": 1.9072,
      "step": 200
    },
    {
      "epoch": 0.2521957340025094,
      "grad_norm": 27.57085418701172,
      "learning_rate": 8.750823756013498e-06,
      "loss": 2.0304,
      "step": 201
    },
    {
      "epoch": 0.2534504391468005,
      "grad_norm": 21.350475311279297,
      "learning_rate": 8.737373120896325e-06,
      "loss": 1.797,
      "step": 202
    },
    {
      "epoch": 0.2547051442910916,
      "grad_norm": 25.71649169921875,
      "learning_rate": 8.72386091371891e-06,
      "loss": 1.9805,
      "step": 203
    },
    {
      "epoch": 0.2559598494353827,
      "grad_norm": 24.62053108215332,
      "learning_rate": 8.710287357090666e-06,
      "loss": 1.6377,
      "step": 204
    },
    {
      "epoch": 0.2572145545796738,
      "grad_norm": 26.515974044799805,
      "learning_rate": 8.696652674631716e-06,
      "loss": 2.2071,
      "step": 205
    },
    {
      "epoch": 0.2584692597239649,
      "grad_norm": 22.19689178466797,
      "learning_rate": 8.68295709096922e-06,
      "loss": 1.8681,
      "step": 206
    },
    {
      "epoch": 0.25972396486825594,
      "grad_norm": 22.31092643737793,
      "learning_rate": 8.669200831733655e-06,
      "loss": 1.643,
      "step": 207
    },
    {
      "epoch": 0.26097867001254704,
      "grad_norm": 18.85532569885254,
      "learning_rate": 8.655384123555117e-06,
      "loss": 1.669,
      "step": 208
    },
    {
      "epoch": 0.26223337515683814,
      "grad_norm": 24.516279220581055,
      "learning_rate": 8.64150719405958e-06,
      "loss": 1.8626,
      "step": 209
    },
    {
      "epoch": 0.26348808030112925,
      "grad_norm": 20.873056411743164,
      "learning_rate": 8.627570271865143e-06,
      "loss": 1.6009,
      "step": 210
    },
    {
      "epoch": 0.26474278544542035,
      "grad_norm": 26.961584091186523,
      "learning_rate": 8.613573586578262e-06,
      "loss": 1.8991,
      "step": 211
    },
    {
      "epoch": 0.2659974905897114,
      "grad_norm": 23.05677032470703,
      "learning_rate": 8.599517368789981e-06,
      "loss": 1.6264,
      "step": 212
    },
    {
      "epoch": 0.2672521957340025,
      "grad_norm": 23.3626766204834,
      "learning_rate": 8.585401850072114e-06,
      "loss": 1.763,
      "step": 213
    },
    {
      "epoch": 0.2685069008782936,
      "grad_norm": 22.876678466796875,
      "learning_rate": 8.571227262973444e-06,
      "loss": 1.8171,
      "step": 214
    },
    {
      "epoch": 0.2697616060225847,
      "grad_norm": 21.870689392089844,
      "learning_rate": 8.55699384101589e-06,
      "loss": 1.7618,
      "step": 215
    },
    {
      "epoch": 0.2710163111668758,
      "grad_norm": 23.80776023864746,
      "learning_rate": 8.54270181869065e-06,
      "loss": 1.7353,
      "step": 216
    },
    {
      "epoch": 0.2722710163111669,
      "grad_norm": 21.69217872619629,
      "learning_rate": 8.528351431454352e-06,
      "loss": 1.8667,
      "step": 217
    },
    {
      "epoch": 0.27352572145545795,
      "grad_norm": 22.88399887084961,
      "learning_rate": 8.513942915725159e-06,
      "loss": 1.7512,
      "step": 218
    },
    {
      "epoch": 0.27478042659974905,
      "grad_norm": 22.40818977355957,
      "learning_rate": 8.499476508878894e-06,
      "loss": 1.7168,
      "step": 219
    },
    {
      "epoch": 0.27603513174404015,
      "grad_norm": 25.04762840270996,
      "learning_rate": 8.484952449245107e-06,
      "loss": 1.6717,
      "step": 220
    },
    {
      "epoch": 0.27728983688833125,
      "grad_norm": 22.810468673706055,
      "learning_rate": 8.470370976103171e-06,
      "loss": 1.8007,
      "step": 221
    },
    {
      "epoch": 0.27854454203262236,
      "grad_norm": 24.604190826416016,
      "learning_rate": 8.455732329678317e-06,
      "loss": 1.9564,
      "step": 222
    },
    {
      "epoch": 0.2797992471769134,
      "grad_norm": 27.309738159179688,
      "learning_rate": 8.441036751137697e-06,
      "loss": 1.6334,
      "step": 223
    },
    {
      "epoch": 0.2810539523212045,
      "grad_norm": 29.318500518798828,
      "learning_rate": 8.426284482586397e-06,
      "loss": 1.6922,
      "step": 224
    },
    {
      "epoch": 0.2823086574654956,
      "grad_norm": 28.5482177734375,
      "learning_rate": 8.411475767063454e-06,
      "loss": 1.8862,
      "step": 225
    },
    {
      "epoch": 0.2835633626097867,
      "grad_norm": 25.247356414794922,
      "learning_rate": 8.396610848537858e-06,
      "loss": 1.7688,
      "step": 226
    },
    {
      "epoch": 0.2848180677540778,
      "grad_norm": 24.79906463623047,
      "learning_rate": 8.381689971904514e-06,
      "loss": 1.7844,
      "step": 227
    },
    {
      "epoch": 0.2860727728983689,
      "grad_norm": 28.987627029418945,
      "learning_rate": 8.36671338298023e-06,
      "loss": 1.7785,
      "step": 228
    },
    {
      "epoch": 0.28732747804265996,
      "grad_norm": 25.145153045654297,
      "learning_rate": 8.35168132849965e-06,
      "loss": 1.7741,
      "step": 229
    },
    {
      "epoch": 0.28858218318695106,
      "grad_norm": 22.089122772216797,
      "learning_rate": 8.336594056111197e-06,
      "loss": 1.5078,
      "step": 230
    },
    {
      "epoch": 0.28983688833124216,
      "grad_norm": 27.65213966369629,
      "learning_rate": 8.321451814372998e-06,
      "loss": 1.7603,
      "step": 231
    },
    {
      "epoch": 0.29109159347553326,
      "grad_norm": 33.60897445678711,
      "learning_rate": 8.306254852748773e-06,
      "loss": 1.7254,
      "step": 232
    },
    {
      "epoch": 0.29234629861982436,
      "grad_norm": 25.02092933654785,
      "learning_rate": 8.29100342160374e-06,
      "loss": 1.795,
      "step": 233
    },
    {
      "epoch": 0.2936010037641154,
      "grad_norm": 21.960206985473633,
      "learning_rate": 8.275697772200491e-06,
      "loss": 1.7087,
      "step": 234
    },
    {
      "epoch": 0.2948557089084065,
      "grad_norm": 29.953306198120117,
      "learning_rate": 8.260338156694836e-06,
      "loss": 1.4295,
      "step": 235
    },
    {
      "epoch": 0.2961104140526976,
      "grad_norm": 26.209787368774414,
      "learning_rate": 8.244924828131668e-06,
      "loss": 1.4427,
      "step": 236
    },
    {
      "epoch": 0.2973651191969887,
      "grad_norm": 23.775861740112305,
      "learning_rate": 8.229458040440783e-06,
      "loss": 1.7755,
      "step": 237
    },
    {
      "epoch": 0.2986198243412798,
      "grad_norm": 22.297338485717773,
      "learning_rate": 8.213938048432697e-06,
      "loss": 1.5213,
      "step": 238
    },
    {
      "epoch": 0.2998745294855709,
      "grad_norm": 24.113645553588867,
      "learning_rate": 8.198365107794457e-06,
      "loss": 1.5942,
      "step": 239
    },
    {
      "epoch": 0.30112923462986196,
      "grad_norm": 24.177122116088867,
      "learning_rate": 8.182739475085417e-06,
      "loss": 1.8395,
      "step": 240
    },
    {
      "epoch": 0.30238393977415307,
      "grad_norm": 28.40700912475586,
      "learning_rate": 8.167061407733018e-06,
      "loss": 1.6086,
      "step": 241
    },
    {
      "epoch": 0.30363864491844417,
      "grad_norm": 24.49298667907715,
      "learning_rate": 8.151331164028544e-06,
      "loss": 1.5645,
      "step": 242
    },
    {
      "epoch": 0.30489335006273527,
      "grad_norm": 33.37433624267578,
      "learning_rate": 8.135549003122871e-06,
      "loss": 1.698,
      "step": 243
    },
    {
      "epoch": 0.30614805520702637,
      "grad_norm": 24.059009552001953,
      "learning_rate": 8.119715185022195e-06,
      "loss": 1.5047,
      "step": 244
    },
    {
      "epoch": 0.3074027603513174,
      "grad_norm": 29.42665672302246,
      "learning_rate": 8.103829970583742e-06,
      "loss": 1.68,
      "step": 245
    },
    {
      "epoch": 0.3086574654956085,
      "grad_norm": 29.08376121520996,
      "learning_rate": 8.087893621511487e-06,
      "loss": 1.5872,
      "step": 246
    },
    {
      "epoch": 0.3099121706398996,
      "grad_norm": 28.20993995666504,
      "learning_rate": 8.071906400351823e-06,
      "loss": 1.6515,
      "step": 247
    },
    {
      "epoch": 0.3111668757841907,
      "grad_norm": 19.08958625793457,
      "learning_rate": 8.055868570489247e-06,
      "loss": 1.4665,
      "step": 248
    },
    {
      "epoch": 0.3124215809284818,
      "grad_norm": 20.03516960144043,
      "learning_rate": 8.039780396142023e-06,
      "loss": 1.6523,
      "step": 249
    },
    {
      "epoch": 0.3136762860727729,
      "grad_norm": 25.80693244934082,
      "learning_rate": 8.023642142357821e-06,
      "loss": 1.7412,
      "step": 250
    },
    {
      "epoch": 0.31493099121706397,
      "grad_norm": 24.467342376708984,
      "learning_rate": 8.007454075009352e-06,
      "loss": 1.5459,
      "step": 251
    },
    {
      "epoch": 0.3161856963613551,
      "grad_norm": 34.97882843017578,
      "learning_rate": 7.991216460789997e-06,
      "loss": 1.7311,
      "step": 252
    },
    {
      "epoch": 0.3174404015056462,
      "grad_norm": 29.624479293823242,
      "learning_rate": 7.974929567209399e-06,
      "loss": 1.7838,
      "step": 253
    },
    {
      "epoch": 0.3186951066499373,
      "grad_norm": 28.10247039794922,
      "learning_rate": 7.95859366258907e-06,
      "loss": 1.7842,
      "step": 254
    },
    {
      "epoch": 0.3199498117942284,
      "grad_norm": 25.512306213378906,
      "learning_rate": 7.942209016057954e-06,
      "loss": 1.6854,
      "step": 255
    },
    {
      "epoch": 0.3212045169385194,
      "grad_norm": 27.726490020751953,
      "learning_rate": 7.925775897548013e-06,
      "loss": 1.7176,
      "step": 256
    },
    {
      "epoch": 0.3224592220828105,
      "grad_norm": 29.725744247436523,
      "learning_rate": 7.909294577789765e-06,
      "loss": 1.6355,
      "step": 257
    },
    {
      "epoch": 0.3237139272271016,
      "grad_norm": 21.763940811157227,
      "learning_rate": 7.892765328307828e-06,
      "loss": 1.614,
      "step": 258
    },
    {
      "epoch": 0.32496863237139273,
      "grad_norm": 29.157032012939453,
      "learning_rate": 7.87618842141645e-06,
      "loss": 1.5684,
      "step": 259
    },
    {
      "epoch": 0.32622333751568383,
      "grad_norm": 29.150402069091797,
      "learning_rate": 7.859564130215015e-06,
      "loss": 1.5138,
      "step": 260
    },
    {
      "epoch": 0.32747804265997493,
      "grad_norm": 38.0162239074707,
      "learning_rate": 7.842892728583557e-06,
      "loss": 1.4729,
      "step": 261
    },
    {
      "epoch": 0.328732747804266,
      "grad_norm": 28.247106552124023,
      "learning_rate": 7.826174491178231e-06,
      "loss": 1.6418,
      "step": 262
    },
    {
      "epoch": 0.3299874529485571,
      "grad_norm": 28.189817428588867,
      "learning_rate": 7.809409693426803e-06,
      "loss": 1.5794,
      "step": 263
    },
    {
      "epoch": 0.3312421580928482,
      "grad_norm": 34.21451950073242,
      "learning_rate": 7.792598611524103e-06,
      "loss": 1.5883,
      "step": 264
    },
    {
      "epoch": 0.3324968632371393,
      "grad_norm": 27.97997283935547,
      "learning_rate": 7.775741522427477e-06,
      "loss": 1.4462,
      "step": 265
    },
    {
      "epoch": 0.3337515683814304,
      "grad_norm": 27.05823516845703,
      "learning_rate": 7.75883870385223e-06,
      "loss": 1.5044,
      "step": 266
    },
    {
      "epoch": 0.33500627352572143,
      "grad_norm": 29.075641632080078,
      "learning_rate": 7.741890434267043e-06,
      "loss": 1.5352,
      "step": 267
    },
    {
      "epoch": 0.33626097867001253,
      "grad_norm": 36.941951751708984,
      "learning_rate": 7.724896992889385e-06,
      "loss": 1.5779,
      "step": 268
    },
    {
      "epoch": 0.33751568381430364,
      "grad_norm": 28.30890655517578,
      "learning_rate": 7.707858659680924e-06,
      "loss": 1.8306,
      "step": 269
    },
    {
      "epoch": 0.33877038895859474,
      "grad_norm": 28.968425750732422,
      "learning_rate": 7.690775715342898e-06,
      "loss": 1.5735,
      "step": 270
    },
    {
      "epoch": 0.34002509410288584,
      "grad_norm": 23.6066951751709,
      "learning_rate": 7.67364844131151e-06,
      "loss": 1.6057,
      "step": 271
    },
    {
      "epoch": 0.34127979924717694,
      "grad_norm": 31.214929580688477,
      "learning_rate": 7.656477119753268e-06,
      "loss": 1.8741,
      "step": 272
    },
    {
      "epoch": 0.342534504391468,
      "grad_norm": 37.89013671875,
      "learning_rate": 7.63926203356036e-06,
      "loss": 1.7272,
      "step": 273
    },
    {
      "epoch": 0.3437892095357591,
      "grad_norm": 26.85829734802246,
      "learning_rate": 7.622003466345977e-06,
      "loss": 1.6312,
      "step": 274
    },
    {
      "epoch": 0.3450439146800502,
      "grad_norm": 25.076658248901367,
      "learning_rate": 7.604701702439652e-06,
      "loss": 1.5652,
      "step": 275
    },
    {
      "epoch": 0.3462986198243413,
      "grad_norm": 33.68350601196289,
      "learning_rate": 7.587357026882563e-06,
      "loss": 1.5935,
      "step": 276
    },
    {
      "epoch": 0.3475533249686324,
      "grad_norm": 26.654830932617188,
      "learning_rate": 7.5699697254228496e-06,
      "loss": 1.4547,
      "step": 277
    },
    {
      "epoch": 0.34880803011292344,
      "grad_norm": 25.102251052856445,
      "learning_rate": 7.552540084510896e-06,
      "loss": 1.6585,
      "step": 278
    },
    {
      "epoch": 0.35006273525721454,
      "grad_norm": 30.08404541015625,
      "learning_rate": 7.535068391294618e-06,
      "loss": 1.7801,
      "step": 279
    },
    {
      "epoch": 0.35131744040150564,
      "grad_norm": 23.15135955810547,
      "learning_rate": 7.517554933614729e-06,
      "loss": 1.4114,
      "step": 280
    },
    {
      "epoch": 0.35257214554579674,
      "grad_norm": 26.793306350708008,
      "learning_rate": 7.500000000000001e-06,
      "loss": 1.5748,
      "step": 281
    },
    {
      "epoch": 0.35382685069008785,
      "grad_norm": 26.644601821899414,
      "learning_rate": 7.482403879662505e-06,
      "loss": 1.7082,
      "step": 282
    },
    {
      "epoch": 0.35508155583437895,
      "grad_norm": 29.40913200378418,
      "learning_rate": 7.464766862492856e-06,
      "loss": 1.5906,
      "step": 283
    },
    {
      "epoch": 0.35633626097867,
      "grad_norm": 28.093795776367188,
      "learning_rate": 7.447089239055428e-06,
      "loss": 1.6122,
      "step": 284
    },
    {
      "epoch": 0.3575909661229611,
      "grad_norm": 23.78188133239746,
      "learning_rate": 7.42937130058357e-06,
      "loss": 1.4623,
      "step": 285
    },
    {
      "epoch": 0.3588456712672522,
      "grad_norm": 35.69364929199219,
      "learning_rate": 7.4116133389748115e-06,
      "loss": 1.6225,
      "step": 286
    },
    {
      "epoch": 0.3601003764115433,
      "grad_norm": 30.77789306640625,
      "learning_rate": 7.393815646786047e-06,
      "loss": 1.5917,
      "step": 287
    },
    {
      "epoch": 0.3613550815558344,
      "grad_norm": 41.9234619140625,
      "learning_rate": 7.3759785172287235e-06,
      "loss": 1.4922,
      "step": 288
    },
    {
      "epoch": 0.36260978670012545,
      "grad_norm": 26.941680908203125,
      "learning_rate": 7.358102244164003e-06,
      "loss": 1.8153,
      "step": 289
    },
    {
      "epoch": 0.36386449184441655,
      "grad_norm": 27.374059677124023,
      "learning_rate": 7.340187122097931e-06,
      "loss": 1.64,
      "step": 290
    },
    {
      "epoch": 0.36511919698870765,
      "grad_norm": 23.783817291259766,
      "learning_rate": 7.322233446176571e-06,
      "loss": 1.5758,
      "step": 291
    },
    {
      "epoch": 0.36637390213299875,
      "grad_norm": 23.492393493652344,
      "learning_rate": 7.304241512181152e-06,
      "loss": 1.479,
      "step": 292
    },
    {
      "epoch": 0.36762860727728985,
      "grad_norm": 27.81630516052246,
      "learning_rate": 7.286211616523193e-06,
      "loss": 1.5494,
      "step": 293
    },
    {
      "epoch": 0.36888331242158096,
      "grad_norm": 35.152557373046875,
      "learning_rate": 7.268144056239621e-06,
      "loss": 1.8003,
      "step": 294
    },
    {
      "epoch": 0.370138017565872,
      "grad_norm": 24.756799697875977,
      "learning_rate": 7.250039128987874e-06,
      "loss": 1.6751,
      "step": 295
    },
    {
      "epoch": 0.3713927227101631,
      "grad_norm": 30.238140106201172,
      "learning_rate": 7.231897133040997e-06,
      "loss": 1.4538,
      "step": 296
    },
    {
      "epoch": 0.3726474278544542,
      "grad_norm": 25.516706466674805,
      "learning_rate": 7.213718367282737e-06,
      "loss": 1.41,
      "step": 297
    },
    {
      "epoch": 0.3739021329987453,
      "grad_norm": 45.06476593017578,
      "learning_rate": 7.195503131202607e-06,
      "loss": 1.5351,
      "step": 298
    },
    {
      "epoch": 0.3751568381430364,
      "grad_norm": 30.282215118408203,
      "learning_rate": 7.177251724890957e-06,
      "loss": 1.6859,
      "step": 299
    },
    {
      "epoch": 0.37641154328732745,
      "grad_norm": 26.890932083129883,
      "learning_rate": 7.1589644490340334e-06,
      "loss": 1.5883,
      "step": 300
    },
    {
      "epoch": 0.37766624843161856,
      "grad_norm": 29.712207794189453,
      "learning_rate": 7.14064160490902e-06,
      "loss": 1.7468,
      "step": 301
    },
    {
      "epoch": 0.37892095357590966,
      "grad_norm": 23.99646759033203,
      "learning_rate": 7.122283494379076e-06,
      "loss": 1.3783,
      "step": 302
    },
    {
      "epoch": 0.38017565872020076,
      "grad_norm": 28.590595245361328,
      "learning_rate": 7.103890419888367e-06,
      "loss": 1.694,
      "step": 303
    },
    {
      "epoch": 0.38143036386449186,
      "grad_norm": 22.65292739868164,
      "learning_rate": 7.085462684457076e-06,
      "loss": 1.5418,
      "step": 304
    },
    {
      "epoch": 0.38268506900878296,
      "grad_norm": 27.158199310302734,
      "learning_rate": 7.067000591676416e-06,
      "loss": 1.6183,
      "step": 305
    },
    {
      "epoch": 0.383939774153074,
      "grad_norm": 29.83051872253418,
      "learning_rate": 7.048504445703623e-06,
      "loss": 1.5936,
      "step": 306
    },
    {
      "epoch": 0.3851944792973651,
      "grad_norm": 24.005414962768555,
      "learning_rate": 7.029974551256957e-06,
      "loss": 1.3992,
      "step": 307
    },
    {
      "epoch": 0.3864491844416562,
      "grad_norm": 34.38796615600586,
      "learning_rate": 7.011411213610663e-06,
      "loss": 1.6884,
      "step": 308
    },
    {
      "epoch": 0.3877038895859473,
      "grad_norm": 25.36124038696289,
      "learning_rate": 6.992814738589958e-06,
      "loss": 1.6561,
      "step": 309
    },
    {
      "epoch": 0.3889585947302384,
      "grad_norm": 21.46540641784668,
      "learning_rate": 6.97418543256599e-06,
      "loss": 1.3287,
      "step": 310
    },
    {
      "epoch": 0.39021329987452946,
      "grad_norm": 35.439361572265625,
      "learning_rate": 6.95552360245078e-06,
      "loss": 1.6699,
      "step": 311
    },
    {
      "epoch": 0.39146800501882056,
      "grad_norm": 32.73426055908203,
      "learning_rate": 6.936829555692182e-06,
      "loss": 1.3947,
      "step": 312
    },
    {
      "epoch": 0.39272271016311167,
      "grad_norm": 28.283676147460938,
      "learning_rate": 6.9181036002687985e-06,
      "loss": 1.4841,
      "step": 313
    },
    {
      "epoch": 0.39397741530740277,
      "grad_norm": 20.66922378540039,
      "learning_rate": 6.899346044684928e-06,
      "loss": 1.3804,
      "step": 314
    },
    {
      "epoch": 0.39523212045169387,
      "grad_norm": 31.596906661987305,
      "learning_rate": 6.880557197965465e-06,
      "loss": 1.467,
      "step": 315
    },
    {
      "epoch": 0.39648682559598497,
      "grad_norm": 22.125431060791016,
      "learning_rate": 6.861737369650818e-06,
      "loss": 1.4638,
      "step": 316
    },
    {
      "epoch": 0.397741530740276,
      "grad_norm": 26.49312400817871,
      "learning_rate": 6.84288686979181e-06,
      "loss": 1.2585,
      "step": 317
    },
    {
      "epoch": 0.3989962358845671,
      "grad_norm": 31.771793365478516,
      "learning_rate": 6.824006008944561e-06,
      "loss": 1.5593,
      "step": 318
    },
    {
      "epoch": 0.4002509410288582,
      "grad_norm": 33.718238830566406,
      "learning_rate": 6.805095098165388e-06,
      "loss": 1.5027,
      "step": 319
    },
    {
      "epoch": 0.4015056461731493,
      "grad_norm": 27.339921951293945,
      "learning_rate": 6.786154449005664e-06,
      "loss": 1.438,
      "step": 320
    },
    {
      "epoch": 0.4027603513174404,
      "grad_norm": 24.385299682617188,
      "learning_rate": 6.767184373506698e-06,
      "loss": 1.5481,
      "step": 321
    },
    {
      "epoch": 0.40401505646173147,
      "grad_norm": 38.833770751953125,
      "learning_rate": 6.7481851841945835e-06,
      "loss": 1.6319,
      "step": 322
    },
    {
      "epoch": 0.40526976160602257,
      "grad_norm": 27.79740333557129,
      "learning_rate": 6.7291571940750575e-06,
      "loss": 1.5855,
      "step": 323
    },
    {
      "epoch": 0.4065244667503137,
      "grad_norm": 30.081342697143555,
      "learning_rate": 6.710100716628345e-06,
      "loss": 1.3305,
      "step": 324
    },
    {
      "epoch": 0.4077791718946048,
      "grad_norm": 28.723339080810547,
      "learning_rate": 6.6910160658039835e-06,
      "loss": 1.5928,
      "step": 325
    },
    {
      "epoch": 0.4090338770388959,
      "grad_norm": 36.5059814453125,
      "learning_rate": 6.671903556015664e-06,
      "loss": 1.7107,
      "step": 326
    },
    {
      "epoch": 0.410288582183187,
      "grad_norm": 22.986221313476562,
      "learning_rate": 6.652763502136044e-06,
      "loss": 1.4106,
      "step": 327
    },
    {
      "epoch": 0.411543287327478,
      "grad_norm": 31.11964988708496,
      "learning_rate": 6.633596219491559e-06,
      "loss": 1.6816,
      "step": 328
    },
    {
      "epoch": 0.4127979924717691,
      "grad_norm": 25.74013900756836,
      "learning_rate": 6.614402023857231e-06,
      "loss": 1.5055,
      "step": 329
    },
    {
      "epoch": 0.41405269761606023,
      "grad_norm": 30.515594482421875,
      "learning_rate": 6.595181231451469e-06,
      "loss": 1.5854,
      "step": 330
    },
    {
      "epoch": 0.41530740276035133,
      "grad_norm": 37.943180084228516,
      "learning_rate": 6.57593415893085e-06,
      "loss": 1.4225,
      "step": 331
    },
    {
      "epoch": 0.41656210790464243,
      "grad_norm": 30.183914184570312,
      "learning_rate": 6.556661123384909e-06,
      "loss": 1.5019,
      "step": 332
    },
    {
      "epoch": 0.4178168130489335,
      "grad_norm": 35.5178337097168,
      "learning_rate": 6.5373624423309165e-06,
      "loss": 1.4571,
      "step": 333
    },
    {
      "epoch": 0.4190715181932246,
      "grad_norm": 30.98124885559082,
      "learning_rate": 6.518038433708643e-06,
      "loss": 1.381,
      "step": 334
    },
    {
      "epoch": 0.4203262233375157,
      "grad_norm": 31.475486755371094,
      "learning_rate": 6.498689415875121e-06,
      "loss": 1.607,
      "step": 335
    },
    {
      "epoch": 0.4215809284818068,
      "grad_norm": 29.79499053955078,
      "learning_rate": 6.479315707599407e-06,
      "loss": 1.3446,
      "step": 336
    },
    {
      "epoch": 0.4228356336260979,
      "grad_norm": 23.057994842529297,
      "learning_rate": 6.459917628057319e-06,
      "loss": 1.4102,
      "step": 337
    },
    {
      "epoch": 0.424090338770389,
      "grad_norm": 32.09408187866211,
      "learning_rate": 6.440495496826189e-06,
      "loss": 1.6248,
      "step": 338
    },
    {
      "epoch": 0.42534504391468003,
      "grad_norm": 30.396852493286133,
      "learning_rate": 6.421049633879588e-06,
      "loss": 1.5172,
      "step": 339
    },
    {
      "epoch": 0.42659974905897113,
      "grad_norm": 37.36663818359375,
      "learning_rate": 6.4015803595820635e-06,
      "loss": 1.6684,
      "step": 340
    },
    {
      "epoch": 0.42785445420326224,
      "grad_norm": 36.27682876586914,
      "learning_rate": 6.3820879946838585e-06,
      "loss": 1.43,
      "step": 341
    },
    {
      "epoch": 0.42910915934755334,
      "grad_norm": 38.0621223449707,
      "learning_rate": 6.3625728603156215e-06,
      "loss": 1.5009,
      "step": 342
    },
    {
      "epoch": 0.43036386449184444,
      "grad_norm": 30.142953872680664,
      "learning_rate": 6.3430352779831275e-06,
      "loss": 1.3865,
      "step": 343
    },
    {
      "epoch": 0.4316185696361355,
      "grad_norm": 31.03050994873047,
      "learning_rate": 6.323475569561968e-06,
      "loss": 1.5305,
      "step": 344
    },
    {
      "epoch": 0.4328732747804266,
      "grad_norm": 31.472867965698242,
      "learning_rate": 6.303894057292261e-06,
      "loss": 1.5711,
      "step": 345
    },
    {
      "epoch": 0.4341279799247177,
      "grad_norm": 34.335853576660156,
      "learning_rate": 6.284291063773331e-06,
      "loss": 1.5281,
      "step": 346
    },
    {
      "epoch": 0.4353826850690088,
      "grad_norm": 36.837493896484375,
      "learning_rate": 6.264666911958404e-06,
      "loss": 1.5468,
      "step": 347
    },
    {
      "epoch": 0.4366373902132999,
      "grad_norm": 33.03227996826172,
      "learning_rate": 6.2450219251492795e-06,
      "loss": 1.483,
      "step": 348
    },
    {
      "epoch": 0.437892095357591,
      "grad_norm": 28.33861541748047,
      "learning_rate": 6.225356426991007e-06,
      "loss": 1.2866,
      "step": 349
    },
    {
      "epoch": 0.43914680050188204,
      "grad_norm": 27.562910079956055,
      "learning_rate": 6.205670741466555e-06,
      "loss": 1.4045,
      "step": 350
    },
    {
      "epoch": 0.44040150564617314,
      "grad_norm": 31.761911392211914,
      "learning_rate": 6.185965192891472e-06,
      "loss": 1.337,
      "step": 351
    },
    {
      "epoch": 0.44165621079046424,
      "grad_norm": 35.49506378173828,
      "learning_rate": 6.166240105908547e-06,
      "loss": 1.6938,
      "step": 352
    },
    {
      "epoch": 0.44291091593475534,
      "grad_norm": 53.732215881347656,
      "learning_rate": 6.146495805482451e-06,
      "loss": 1.5635,
      "step": 353
    },
    {
      "epoch": 0.44416562107904645,
      "grad_norm": 29.330778121948242,
      "learning_rate": 6.126732616894397e-06,
      "loss": 1.5873,
      "step": 354
    },
    {
      "epoch": 0.4454203262233375,
      "grad_norm": 30.75185203552246,
      "learning_rate": 6.106950865736777e-06,
      "loss": 1.4611,
      "step": 355
    },
    {
      "epoch": 0.4466750313676286,
      "grad_norm": 34.61481857299805,
      "learning_rate": 6.087150877907786e-06,
      "loss": 1.5506,
      "step": 356
    },
    {
      "epoch": 0.4479297365119197,
      "grad_norm": 36.45780563354492,
      "learning_rate": 6.067332979606069e-06,
      "loss": 1.5333,
      "step": 357
    },
    {
      "epoch": 0.4491844416562108,
      "grad_norm": 43.751426696777344,
      "learning_rate": 6.047497497325341e-06,
      "loss": 1.5729,
      "step": 358
    },
    {
      "epoch": 0.4504391468005019,
      "grad_norm": 30.756084442138672,
      "learning_rate": 6.027644757849004e-06,
      "loss": 1.4557,
      "step": 359
    },
    {
      "epoch": 0.451693851944793,
      "grad_norm": 30.46338653564453,
      "learning_rate": 6.007775088244769e-06,
      "loss": 1.3311,
      "step": 360
    },
    {
      "epoch": 0.45294855708908405,
      "grad_norm": 29.494077682495117,
      "learning_rate": 5.987888815859266e-06,
      "loss": 1.3893,
      "step": 361
    },
    {
      "epoch": 0.45420326223337515,
      "grad_norm": 30.151817321777344,
      "learning_rate": 5.967986268312651e-06,
      "loss": 1.346,
      "step": 362
    },
    {
      "epoch": 0.45545796737766625,
      "grad_norm": 35.56706237792969,
      "learning_rate": 5.948067773493205e-06,
      "loss": 1.5986,
      "step": 363
    },
    {
      "epoch": 0.45671267252195735,
      "grad_norm": 26.097820281982422,
      "learning_rate": 5.928133659551939e-06,
      "loss": 1.3859,
      "step": 364
    },
    {
      "epoch": 0.45796737766624845,
      "grad_norm": 28.94278335571289,
      "learning_rate": 5.908184254897183e-06,
      "loss": 1.5139,
      "step": 365
    },
    {
      "epoch": 0.4592220828105395,
      "grad_norm": 36.553123474121094,
      "learning_rate": 5.888219888189176e-06,
      "loss": 1.4892,
      "step": 366
    },
    {
      "epoch": 0.4604767879548306,
      "grad_norm": 106.10436248779297,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 1.4375,
      "step": 367
    },
    {
      "epoch": 0.4617314930991217,
      "grad_norm": 42.712303161621094,
      "learning_rate": 5.848247584481424e-06,
      "loss": 1.431,
      "step": 368
    },
    {
      "epoch": 0.4629861982434128,
      "grad_norm": 37.82698059082031,
      "learning_rate": 5.828240306012957e-06,
|
"loss": 1.5441, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.4642409033877039, |
|
"grad_norm": 35.159000396728516, |
|
"learning_rate": 5.808219382542941e-06, |
|
"loss": 1.4638, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.465495608531995, |
|
"grad_norm": 28.512142181396484, |
|
"learning_rate": 5.788185143909868e-06, |
|
"loss": 1.4615, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.46675031367628605, |
|
"grad_norm": 32.28644943237305, |
|
"learning_rate": 5.768137920171593e-06, |
|
"loss": 1.4778, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.46800501882057716, |
|
"grad_norm": 30.508554458618164, |
|
"learning_rate": 5.74807804159989e-06, |
|
"loss": 1.656, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.46925972396486826, |
|
"grad_norm": 31.334104537963867, |
|
"learning_rate": 5.728005838675026e-06, |
|
"loss": 1.3335, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.47051442910915936, |
|
"grad_norm": 30.219167709350586, |
|
"learning_rate": 5.7079216420803e-06, |
|
"loss": 1.468, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.47176913425345046, |
|
"grad_norm": 40.787261962890625, |
|
"learning_rate": 5.68782578269661e-06, |
|
"loss": 1.5705, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.4730238393977415, |
|
"grad_norm": 36.666656494140625, |
|
"learning_rate": 5.66771859159699e-06, |
|
"loss": 1.5139, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.4742785445420326, |
|
"grad_norm": 33.556617736816406, |
|
"learning_rate": 5.647600400041163e-06, |
|
"loss": 1.3386, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.4755332496863237, |
|
"grad_norm": 28.310293197631836, |
|
"learning_rate": 5.6274715394700805e-06, |
|
"loss": 1.4892, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.4767879548306148, |
|
"grad_norm": 30.385696411132812, |
|
"learning_rate": 5.6073323415004635e-06, |
|
"loss": 1.4074, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.4780426599749059, |
|
"grad_norm": 30.94135856628418, |
|
"learning_rate": 5.587183137919332e-06, |
|
"loss": 1.3804, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.479297365119197, |
|
"grad_norm": 25.842451095581055, |
|
"learning_rate": 5.567024260678559e-06, |
|
"loss": 1.3756, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.48055207026348806, |
|
"grad_norm": 24.24115753173828, |
|
"learning_rate": 5.546856041889374e-06, |
|
"loss": 1.3217, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.48180677540777916, |
|
"grad_norm": 29.69972801208496, |
|
"learning_rate": 5.526678813816912e-06, |
|
"loss": 1.3114, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.48306148055207027, |
|
"grad_norm": 40.6950569152832, |
|
"learning_rate": 5.5064929088747324e-06, |
|
"loss": 1.6083, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.48431618569636137, |
|
"grad_norm": 37.67729949951172, |
|
"learning_rate": 5.486298659619346e-06, |
|
"loss": 1.5827, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.48557089084065247, |
|
"grad_norm": 38.3140754699707, |
|
"learning_rate": 5.46609639874473e-06, |
|
"loss": 1.3942, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.4868255959849435, |
|
"grad_norm": 33.37904739379883, |
|
"learning_rate": 5.445886459076848e-06, |
|
"loss": 1.5518, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.4880803011292346, |
|
"grad_norm": 30.683101654052734, |
|
"learning_rate": 5.425669173568179e-06, |
|
"loss": 1.3667, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.4893350062735257, |
|
"grad_norm": 38.90886306762695, |
|
"learning_rate": 5.405444875292213e-06, |
|
"loss": 1.6388, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4905897114178168, |
|
"grad_norm": 32.49534606933594, |
|
"learning_rate": 5.385213897437975e-06, |
|
"loss": 1.3725, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.4918444165621079, |
|
"grad_norm": 31.765207290649414, |
|
"learning_rate": 5.364976573304538e-06, |
|
"loss": 1.4513, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.493099121706399, |
|
"grad_norm": 34.01384735107422, |
|
"learning_rate": 5.344733236295525e-06, |
|
"loss": 1.3848, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.49435382685069007, |
|
"grad_norm": 36.31550216674805, |
|
"learning_rate": 5.324484219913621e-06, |
|
"loss": 1.3873, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.49560853199498117, |
|
"grad_norm": 30.318265914916992, |
|
"learning_rate": 5.30422985775507e-06, |
|
"loss": 1.5321, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.4968632371392723, |
|
"grad_norm": 30.169464111328125, |
|
"learning_rate": 5.283970483504198e-06, |
|
"loss": 1.3799, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.4981179422835634, |
|
"grad_norm": 31.82530975341797, |
|
"learning_rate": 5.263706430927895e-06, |
|
"loss": 1.5295, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.4993726474278545, |
|
"grad_norm": 36.714996337890625, |
|
"learning_rate": 5.243438033870126e-06, |
|
"loss": 1.4037, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.5006273525721455, |
|
"grad_norm": 33.54505157470703, |
|
"learning_rate": 5.223165626246432e-06, |
|
"loss": 1.521, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.5006273525721455, |
|
"eval_loss": 1.436629295349121, |
|
"eval_runtime": 6.0522, |
|
"eval_samples_per_second": 110.869, |
|
"eval_steps_per_second": 6.94, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.5018820577164367, |
|
"grad_norm": 30.569034576416016, |
|
"learning_rate": 5.202889542038428e-06, |
|
"loss": 1.3634, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5031367628607277, |
|
"grad_norm": 28.09290885925293, |
|
"learning_rate": 5.182610115288296e-06, |
|
"loss": 1.4243, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.5043914680050188, |
|
"grad_norm": 31.013883590698242, |
|
"learning_rate": 5.162327680093284e-06, |
|
"loss": 1.5255, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.5056461731493099, |
|
"grad_norm": 28.622833251953125, |
|
"learning_rate": 5.142042570600212e-06, |
|
"loss": 1.143, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.506900878293601, |
|
"grad_norm": 34.083290100097656, |
|
"learning_rate": 5.121755120999949e-06, |
|
"loss": 1.4854, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.5081555834378921, |
|
"grad_norm": 29.883394241333008, |
|
"learning_rate": 5.101465665521919e-06, |
|
"loss": 1.2494, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5094102885821832, |
|
"grad_norm": 36.8629035949707, |
|
"learning_rate": 5.081174538428596e-06, |
|
"loss": 1.5055, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.5106649937264742, |
|
"grad_norm": 39.23841094970703, |
|
"learning_rate": 5.060882074009988e-06, |
|
"loss": 1.41, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.5119196988707654, |
|
"grad_norm": 42.195274353027344, |
|
"learning_rate": 5.04058860657814e-06, |
|
"loss": 1.5589, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.5131744040150564, |
|
"grad_norm": 32.830596923828125, |
|
"learning_rate": 5.020294470461615e-06, |
|
"loss": 1.3412, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.5144291091593476, |
|
"grad_norm": 49.16096496582031, |
|
"learning_rate": 5e-06, |
|
"loss": 1.5255, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.5156838143036386, |
|
"grad_norm": 29.00592613220215, |
|
"learning_rate": 4.979705529538385e-06, |
|
"loss": 1.4311, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.5169385194479298, |
|
"grad_norm": 39.06101608276367, |
|
"learning_rate": 4.959411393421863e-06, |
|
"loss": 1.3708, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.5181932245922208, |
|
"grad_norm": 34.09449768066406, |
|
"learning_rate": 4.939117925990013e-06, |
|
"loss": 1.484, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.5194479297365119, |
|
"grad_norm": 35.57181167602539, |
|
"learning_rate": 4.918825461571405e-06, |
|
"loss": 1.3226, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.520702634880803, |
|
"grad_norm": 29.180233001708984, |
|
"learning_rate": 4.8985343344780815e-06, |
|
"loss": 1.6168, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5219573400250941, |
|
"grad_norm": 25.967992782592773, |
|
"learning_rate": 4.8782448790000525e-06, |
|
"loss": 1.4807, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.5232120451693852, |
|
"grad_norm": 31.979293823242188, |
|
"learning_rate": 4.857957429399788e-06, |
|
"loss": 1.4218, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.5244667503136763, |
|
"grad_norm": 30.151277542114258, |
|
"learning_rate": 4.837672319906717e-06, |
|
"loss": 1.4075, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.5257214554579673, |
|
"grad_norm": 40.19000244140625, |
|
"learning_rate": 4.817389884711706e-06, |
|
"loss": 1.6472, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.5269761606022585, |
|
"grad_norm": 28.63579559326172, |
|
"learning_rate": 4.797110457961575e-06, |
|
"loss": 1.1942, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5282308657465495, |
|
"grad_norm": 36.74559020996094, |
|
"learning_rate": 4.7768343737535694e-06, |
|
"loss": 1.5179, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.5294855708908407, |
|
"grad_norm": 30.191770553588867, |
|
"learning_rate": 4.756561966129875e-06, |
|
"loss": 1.2881, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.5307402760351317, |
|
"grad_norm": 31.707502365112305, |
|
"learning_rate": 4.736293569072108e-06, |
|
"loss": 1.3801, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.5319949811794228, |
|
"grad_norm": 25.902997970581055, |
|
"learning_rate": 4.716029516495803e-06, |
|
"loss": 1.3326, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.533249686323714, |
|
"grad_norm": 42.108238220214844, |
|
"learning_rate": 4.695770142244931e-06, |
|
"loss": 1.529, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.534504391468005, |
|
"grad_norm": 31.789140701293945, |
|
"learning_rate": 4.6755157800863826e-06, |
|
"loss": 1.3478, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.5357590966122961, |
|
"grad_norm": 27.96792984008789, |
|
"learning_rate": 4.655266763704476e-06, |
|
"loss": 1.397, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.5370138017565872, |
|
"grad_norm": 31.803890228271484, |
|
"learning_rate": 4.635023426695462e-06, |
|
"loss": 1.4011, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.5382685069008782, |
|
"grad_norm": 35.10597610473633, |
|
"learning_rate": 4.614786102562026e-06, |
|
"loss": 1.4848, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.5395232120451694, |
|
"grad_norm": 31.621994018554688, |
|
"learning_rate": 4.594555124707789e-06, |
|
"loss": 1.3346, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5407779171894604, |
|
"grad_norm": 33.457908630371094, |
|
"learning_rate": 4.574330826431822e-06, |
|
"loss": 1.3045, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.5420326223337516, |
|
"grad_norm": 31.1467342376709, |
|
"learning_rate": 4.554113540923153e-06, |
|
"loss": 1.4343, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.5432873274780426, |
|
"grad_norm": 31.287960052490234, |
|
"learning_rate": 4.533903601255272e-06, |
|
"loss": 1.3903, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.5445420326223338, |
|
"grad_norm": 26.70494842529297, |
|
"learning_rate": 4.513701340380655e-06, |
|
"loss": 1.3482, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.5457967377666249, |
|
"grad_norm": 44.05613327026367, |
|
"learning_rate": 4.493507091125269e-06, |
|
"loss": 1.5986, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5470514429109159, |
|
"grad_norm": 29.704072952270508, |
|
"learning_rate": 4.473321186183091e-06, |
|
"loss": 1.3137, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.548306148055207, |
|
"grad_norm": 29.141984939575195, |
|
"learning_rate": 4.4531439581106295e-06, |
|
"loss": 1.478, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.5495608531994981, |
|
"grad_norm": 34.73693084716797, |
|
"learning_rate": 4.432975739321444e-06, |
|
"loss": 1.5629, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.5508155583437893, |
|
"grad_norm": 33.1425666809082, |
|
"learning_rate": 4.412816862080668e-06, |
|
"loss": 1.3101, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.5520702634880803, |
|
"grad_norm": 31.933034896850586, |
|
"learning_rate": 4.392667658499539e-06, |
|
"loss": 1.3371, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5533249686323714, |
|
"grad_norm": 30.45763397216797, |
|
"learning_rate": 4.37252846052992e-06, |
|
"loss": 1.3671, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.5545796737766625, |
|
"grad_norm": 42.91053009033203, |
|
"learning_rate": 4.352399599958837e-06, |
|
"loss": 1.4992, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.5558343789209536, |
|
"grad_norm": 36.65143585205078, |
|
"learning_rate": 4.332281408403011e-06, |
|
"loss": 1.4589, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.5570890840652447, |
|
"grad_norm": 38.462398529052734, |
|
"learning_rate": 4.312174217303391e-06, |
|
"loss": 1.2266, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.5583437892095358, |
|
"grad_norm": 31.30473518371582, |
|
"learning_rate": 4.292078357919701e-06, |
|
"loss": 1.4476, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5595984943538268, |
|
"grad_norm": 35.10082244873047, |
|
"learning_rate": 4.271994161324977e-06, |
|
"loss": 1.4988, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.560853199498118, |
|
"grad_norm": 32.5116081237793, |
|
"learning_rate": 4.2519219584001106e-06, |
|
"loss": 1.4988, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.562107904642409, |
|
"grad_norm": 29.34661102294922, |
|
"learning_rate": 4.231862079828408e-06, |
|
"loss": 1.4725, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.5633626097867002, |
|
"grad_norm": 36.072879791259766, |
|
"learning_rate": 4.2118148560901325e-06, |
|
"loss": 1.4334, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.5646173149309912, |
|
"grad_norm": 30.869470596313477, |
|
"learning_rate": 4.19178061745706e-06, |
|
"loss": 1.3606, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5658720200752823, |
|
"grad_norm": 29.298429489135742, |
|
"learning_rate": 4.171759693987046e-06, |
|
"loss": 1.2983, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.5671267252195734, |
|
"grad_norm": 24.67900276184082, |
|
"learning_rate": 4.151752415518577e-06, |
|
"loss": 1.2631, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.5683814303638645, |
|
"grad_norm": 33.28513717651367, |
|
"learning_rate": 4.131759111665349e-06, |
|
"loss": 1.3843, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.5696361355081556, |
|
"grad_norm": 34.13528823852539, |
|
"learning_rate": 4.111780111810826e-06, |
|
"loss": 1.4529, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.5708908406524467, |
|
"grad_norm": 28.38991355895996, |
|
"learning_rate": 4.091815745102818e-06, |
|
"loss": 1.5154, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5721455457967378, |
|
"grad_norm": 26.64844512939453, |
|
"learning_rate": 4.071866340448062e-06, |
|
"loss": 1.3302, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.5734002509410289, |
|
"grad_norm": 37.00432205200195, |
|
"learning_rate": 4.051932226506797e-06, |
|
"loss": 1.3327, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.5746549560853199, |
|
"grad_norm": 27.36146354675293, |
|
"learning_rate": 4.032013731687351e-06, |
|
"loss": 1.361, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.5759096612296111, |
|
"grad_norm": 32.78675842285156, |
|
"learning_rate": 4.0121111841407345e-06, |
|
"loss": 1.4741, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.5771643663739021, |
|
"grad_norm": 37.97308349609375, |
|
"learning_rate": 3.992224911755234e-06, |
|
"loss": 1.5363, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5784190715181933, |
|
"grad_norm": 31.34197235107422, |
|
"learning_rate": 3.9723552421509975e-06, |
|
"loss": 1.2434, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.5796737766624843, |
|
"grad_norm": 36.909828186035156, |
|
"learning_rate": 3.95250250267466e-06, |
|
"loss": 1.3956, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.5809284818067754, |
|
"grad_norm": 47.24994659423828, |
|
"learning_rate": 3.932667020393933e-06, |
|
"loss": 1.3312, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.5821831869510665, |
|
"grad_norm": 35.684608459472656, |
|
"learning_rate": 3.912849122092216e-06, |
|
"loss": 1.4447, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.5834378920953576, |
|
"grad_norm": 36.601715087890625, |
|
"learning_rate": 3.8930491342632235e-06, |
|
"loss": 1.4177, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5846925972396487, |
|
"grad_norm": 28.328744888305664, |
|
"learning_rate": 3.873267383105604e-06, |
|
"loss": 1.3929, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.5859473023839398, |
|
"grad_norm": 32.12102127075195, |
|
"learning_rate": 3.853504194517551e-06, |
|
"loss": 1.4941, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.5872020075282308, |
|
"grad_norm": 32.12097930908203, |
|
"learning_rate": 3.833759894091456e-06, |
|
"loss": 1.3292, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.588456712672522, |
|
"grad_norm": 26.0775146484375, |
|
"learning_rate": 3.814034807108529e-06, |
|
"loss": 1.3233, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.589711417816813, |
|
"grad_norm": 26.92903709411621, |
|
"learning_rate": 3.7943292585334464e-06, |
|
"loss": 1.3575, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5909661229611042, |
|
"grad_norm": 35.65913772583008, |
|
"learning_rate": 3.774643573008995e-06, |
|
"loss": 1.3416, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.5922208281053952, |
|
"grad_norm": 44.53237533569336, |
|
"learning_rate": 3.754978074850722e-06, |
|
"loss": 1.6346, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.5934755332496863, |
|
"grad_norm": 33.18136978149414, |
|
"learning_rate": 3.7353330880415963e-06, |
|
"loss": 1.5085, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.5947302383939774, |
|
"grad_norm": 31.07672882080078, |
|
"learning_rate": 3.7157089362266695e-06, |
|
"loss": 1.3839, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.5959849435382685, |
|
"grad_norm": 29.932600021362305, |
|
"learning_rate": 3.6961059427077407e-06, |
|
"loss": 1.4774, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5972396486825596, |
|
"grad_norm": 27.480052947998047, |
|
"learning_rate": 3.6765244304380323e-06, |
|
"loss": 1.2551, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.5984943538268507, |
|
"grad_norm": 39.4902458190918, |
|
"learning_rate": 3.656964722016875e-06, |
|
"loss": 1.3972, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.5997490589711418, |
|
"grad_norm": 36.17951583862305, |
|
"learning_rate": 3.6374271396843797e-06, |
|
"loss": 1.2946, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.6010037641154329, |
|
"grad_norm": 30.92720603942871, |
|
"learning_rate": 3.617912005316142e-06, |
|
"loss": 1.2169, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.6022584692597239, |
|
"grad_norm": 34.092063903808594, |
|
"learning_rate": 3.598419640417938e-06, |
|
"loss": 1.3757, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.6035131744040151, |
|
"grad_norm": 27.944690704345703, |
|
"learning_rate": 3.578950366120414e-06, |
|
"loss": 1.2427, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.6047678795483061, |
|
"grad_norm": 36.29844665527344, |
|
"learning_rate": 3.5595045031738123e-06, |
|
"loss": 1.3915, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.6060225846925973, |
|
"grad_norm": 36.75183868408203, |
|
"learning_rate": 3.540082371942682e-06, |
|
"loss": 1.4398, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.6072772898368883, |
|
"grad_norm": 28.854524612426758, |
|
"learning_rate": 3.5206842924005934e-06, |
|
"loss": 1.3392, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.6085319949811794, |
|
"grad_norm": 32.42161560058594, |
|
"learning_rate": 3.5013105841248794e-06, |
|
"loss": 1.5482, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6097867001254705, |
|
"grad_norm": 38.66543960571289, |
|
"learning_rate": 3.481961566291358e-06, |
|
"loss": 1.4572, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.6110414052697616, |
|
"grad_norm": 37.27582550048828, |
|
"learning_rate": 3.462637557669084e-06, |
|
"loss": 1.3017, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.6122961104140527, |
|
"grad_norm": 28.435178756713867, |
|
"learning_rate": 3.443338876615092e-06, |
|
"loss": 1.3203, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.6135508155583438, |
|
"grad_norm": 33.752044677734375, |
|
"learning_rate": 3.424065841069152e-06, |
|
"loss": 1.5739, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.6148055207026348, |
|
"grad_norm": 34.22273635864258, |
|
"learning_rate": 3.4048187685485312e-06, |
|
"loss": 1.4068, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.616060225846926, |
|
"grad_norm": 54.36898422241211, |
|
"learning_rate": 3.3855979761427705e-06, |
|
"loss": 1.3019, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.617314930991217, |
|
"grad_norm": 32.61660385131836, |
|
"learning_rate": 3.3664037805084428e-06, |
|
"loss": 1.2823, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.6185696361355082, |
|
"grad_norm": 34.06522750854492, |
|
"learning_rate": 3.347236497863957e-06, |
|
"loss": 1.3678, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.6198243412797992, |
|
"grad_norm": 29.604419708251953, |
|
"learning_rate": 3.3280964439843377e-06, |
|
"loss": 1.3285, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.6210790464240903, |
|
"grad_norm": 33.45100021362305, |
|
"learning_rate": 3.308983934196018e-06, |
|
"loss": 1.422, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6223337515683814, |
|
"grad_norm": 33.3889274597168, |
|
"learning_rate": 3.289899283371657e-06, |
|
"loss": 1.3114, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.6235884567126725, |
|
"grad_norm": 30.00410270690918, |
|
"learning_rate": 3.2708428059249437e-06, |
|
"loss": 1.3216, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.6248431618569636, |
|
"grad_norm": 41.03053283691406, |
|
"learning_rate": 3.2518148158054186e-06, |
|
"loss": 1.4942, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.6260978670012547, |
|
"grad_norm": 46.363258361816406, |
|
"learning_rate": 3.2328156264933043e-06, |
|
"loss": 1.6328, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.6273525721455459, |
|
"grad_norm": 37.64637756347656, |
|
"learning_rate": 3.2138455509943365e-06, |
|
"loss": 1.3816, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6286072772898369, |
|
"grad_norm": 46.19404602050781, |
|
"learning_rate": 3.194904901834613e-06, |
|
"loss": 1.5756, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.6298619824341279, |
|
"grad_norm": 26.028804779052734, |
|
"learning_rate": 3.17599399105544e-06, |
|
"loss": 1.314, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.6311166875784191, |
|
"grad_norm": 31.624303817749023, |
|
"learning_rate": 3.1571131302081916e-06, |
|
"loss": 1.3178, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.6323713927227101, |
|
"grad_norm": 35.267478942871094, |
|
"learning_rate": 3.138262630349182e-06, |
|
"loss": 1.5758, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.6336260978670013, |
|
"grad_norm": 30.934772491455078, |
|
"learning_rate": 3.1194428020345375e-06, |
|
"loss": 1.4725, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.6348808030112923, |
|
"grad_norm": 28.47898292541504, |
|
"learning_rate": 3.1006539553150727e-06, |
|
"loss": 1.3188, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.6361355081555834, |
|
"grad_norm": 38.18532943725586, |
|
"learning_rate": 3.081896399731202e-06, |
|
"loss": 1.2228, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.6373902132998746, |
|
"grad_norm": 35.62003707885742, |
|
"learning_rate": 3.063170444307821e-06, |
|
"loss": 1.6133, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.6386449184441656, |
|
"grad_norm": 58.091861724853516, |
|
"learning_rate": 3.044476397549221e-06, |
|
"loss": 1.3338, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.6398996235884568, |
|
"grad_norm": 31.276124954223633, |
|
"learning_rate": 3.02581456743401e-06, |
|
"loss": 1.1924, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6411543287327478, |
|
"grad_norm": 36.98395538330078, |
|
"learning_rate": 3.0071852614100427e-06, |
|
"loss": 1.3475, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.6424090338770388, |
|
"grad_norm": 33.80880355834961, |
|
"learning_rate": 2.9885887863893394e-06, |
|
"loss": 1.2211, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.64366373902133, |
|
"grad_norm": 37.08169174194336, |
|
"learning_rate": 2.9700254487430448e-06, |
|
"loss": 1.3388, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.644918444165621, |
|
"grad_norm": 30.51959228515625, |
|
"learning_rate": 2.9514955542963775e-06, |
|
"loss": 1.4277, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.6461731493099122, |
|
"grad_norm": 31.10744285583496, |
|
"learning_rate": 2.9329994083235857e-06, |
|
"loss": 1.2503, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.6474278544542033, |
|
"grad_norm": 32.857383728027344, |
|
"learning_rate": 2.9145373155429263e-06, |
|
"loss": 1.4776, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.6486825595984943, |
|
"grad_norm": 36.374961853027344, |
|
"learning_rate": 2.896109580111634e-06, |
|
"loss": 1.2288, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.6499372647427855, |
|
"grad_norm": 26.020505905151367, |
|
"learning_rate": 2.8777165056209256e-06, |
|
"loss": 1.2806, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.6511919698870765, |
|
"grad_norm": 31.82769775390625, |
|
"learning_rate": 2.8593583950909833e-06, |
|
"loss": 1.3725, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.6524466750313677, |
|
"grad_norm": 36.6817741394043, |
|
"learning_rate": 2.8410355509659682e-06, |
|
"loss": 1.2934, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6537013801756587, |
|
"grad_norm": 46.93891525268555, |
|
"learning_rate": 2.8227482751090445e-06, |
|
"loss": 1.4673, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.6549560853199499, |
|
"grad_norm": 41.38336181640625, |
|
"learning_rate": 2.8044968687973956e-06, |
|
"loss": 1.4611, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.6562107904642409, |
|
"grad_norm": 37.399681091308594, |
|
"learning_rate": 2.786281632717264e-06, |
|
"loss": 1.2811, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.657465495608532, |
|
"grad_norm": 44.295719146728516, |
|
"learning_rate": 2.7681028669590038e-06, |
|
"loss": 1.3587, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.6587202007528231, |
|
"grad_norm": 33.356292724609375, |
|
"learning_rate": 2.749960871012129e-06, |
|
"loss": 1.4634, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.6599749058971142, |
|
"grad_norm": 38.98143005371094, |
|
"learning_rate": 2.73185594376038e-06, |
|
"loss": 1.4382, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.6612296110414053, |
|
"grad_norm": 30.759475708007812, |
|
"learning_rate": 2.7137883834768076e-06, |
|
"loss": 1.3081, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.6624843161856964, |
|
"grad_norm": 37.871238708496094, |
|
"learning_rate": 2.6957584878188496e-06, |
|
"loss": 1.3886, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.6637390213299874, |
|
"grad_norm": 49.197872161865234, |
|
"learning_rate": 2.6777665538234292e-06, |
|
"loss": 1.5503, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.6649937264742786, |
|
"grad_norm": 37.15614700317383, |
|
"learning_rate": 2.6598128779020693e-06, |
|
"loss": 1.3044, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6662484316185696, |
|
"grad_norm": 31.275415420532227, |
|
"learning_rate": 2.641897755835997e-06, |
|
"loss": 1.397, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.6675031367628608, |
|
"grad_norm": 41.38181686401367, |
|
"learning_rate": 2.6240214827712794e-06, |
|
"loss": 1.4281, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.6687578419071518, |
|
"grad_norm": 39.80350875854492, |
|
"learning_rate": 2.6061843532139563e-06, |
|
"loss": 1.4107, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.6700125470514429, |
|
"grad_norm": 40.21477508544922, |
|
"learning_rate": 2.5883866610251906e-06, |
|
"loss": 1.4339, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.671267252195734, |
|
"grad_norm": 43.72838592529297, |
|
"learning_rate": 2.5706286994164315e-06, |
|
"loss": 1.5603, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.6725219573400251, |
|
"grad_norm": 27.070802688598633, |
|
"learning_rate": 2.5529107609445737e-06, |
|
"loss": 1.4321, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.6737766624843162, |
|
"grad_norm": 41.055633544921875, |
|
"learning_rate": 2.5352331375071437e-06, |
|
"loss": 1.4914, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.6750313676286073, |
|
"grad_norm": 39.451602935791016, |
|
"learning_rate": 2.5175961203374954e-06, |
|
"loss": 1.4453, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.6762860727728983, |
|
"grad_norm": 38.11553955078125, |
|
"learning_rate": 2.5000000000000015e-06, |
|
"loss": 1.3918, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.6775407779171895, |
|
"grad_norm": 30.756338119506836, |
|
"learning_rate": 2.4824450663852716e-06, |
|
"loss": 1.1408, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6787954830614805, |
|
"grad_norm": 31.51823616027832, |
|
"learning_rate": 2.464931608705384e-06, |
|
"loss": 1.5483, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.6800501882057717, |
|
"grad_norm": 28.151769638061523, |
|
"learning_rate": 2.447459915489106e-06, |
|
"loss": 1.2619, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.6813048933500627, |
|
"grad_norm": 34.87588119506836, |
|
"learning_rate": 2.430030274577151e-06, |
|
"loss": 1.3653, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.6825595984943539, |
|
"grad_norm": 44.73030090332031, |
|
"learning_rate": 2.4126429731174372e-06, |
|
"loss": 1.4503, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.6838143036386449, |
|
"grad_norm": 35.88227462768555, |
|
"learning_rate": 2.3952982975603494e-06, |
|
"loss": 1.3246, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.685069008782936, |
|
"grad_norm": 27.695951461791992, |
|
"learning_rate": 2.3779965336540237e-06, |
|
"loss": 1.3869, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.6863237139272271, |
|
"grad_norm": 37.88958740234375, |
|
"learning_rate": 2.3607379664396414e-06, |
|
"loss": 1.4772, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.6875784190715182, |
|
"grad_norm": 30.21925926208496, |
|
"learning_rate": 2.343522880246734e-06, |
|
"loss": 1.3563, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.6888331242158093, |
|
"grad_norm": 41.6002197265625, |
|
"learning_rate": 2.3263515586884935e-06, |
|
"loss": 1.3695, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.6900878293601004, |
|
"grad_norm": 29.012378692626953, |
|
"learning_rate": 2.3092242846571034e-06, |
|
"loss": 1.3925, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6913425345043914, |
|
"grad_norm": 28.30169105529785, |
|
"learning_rate": 2.2921413403190774e-06, |
|
"loss": 1.3324, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.6925972396486826, |
|
"grad_norm": 30.30564308166504, |
|
"learning_rate": 2.275103007110616e-06, |
|
"loss": 1.3319, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.6938519447929736, |
|
"grad_norm": 32.01078796386719, |
|
"learning_rate": 2.25810956573296e-06, |
|
"loss": 1.2561, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.6951066499372648, |
|
"grad_norm": 45.61001205444336, |
|
"learning_rate": 2.2411612961477704e-06, |
|
"loss": 1.4322, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.6963613550815558, |
|
"grad_norm": 39.38789749145508, |
|
"learning_rate": 2.224258477572524e-06, |
|
"loss": 1.2698, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.6976160602258469, |
|
"grad_norm": 41.91701126098633, |
|
"learning_rate": 2.2074013884758993e-06, |
|
"loss": 1.4422, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.698870765370138, |
|
"grad_norm": 32.67595291137695, |
|
"learning_rate": 2.190590306573198e-06, |
|
"loss": 1.2315, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.7001254705144291, |
|
"grad_norm": 33.57855224609375, |
|
"learning_rate": 2.17382550882177e-06, |
|
"loss": 1.2939, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.7013801756587202, |
|
"grad_norm": 30.53522491455078, |
|
"learning_rate": 2.1571072714164445e-06, |
|
"loss": 1.3556, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.7026348808030113, |
|
"grad_norm": 33.44630432128906, |
|
"learning_rate": 2.140435869784986e-06, |
|
"loss": 1.3701, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.7038895859473023, |
|
"grad_norm": 34.59889221191406, |
|
"learning_rate": 2.1238115785835512e-06, |
|
"loss": 1.5211, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.7051442910915935, |
|
"grad_norm": 42.23357009887695, |
|
"learning_rate": 2.1072346716921733e-06, |
|
"loss": 1.2913, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.7063989962358845, |
|
"grad_norm": 32.22030258178711, |
|
"learning_rate": 2.0907054222102367e-06, |
|
"loss": 1.3462, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.7076537013801757, |
|
"grad_norm": 39.91384506225586, |
|
"learning_rate": 2.0742241024519886e-06, |
|
"loss": 1.3211, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.7089084065244667, |
|
"grad_norm": 41.389461517333984, |
|
"learning_rate": 2.0577909839420468e-06, |
|
"loss": 1.3882, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.7101631116687579, |
|
"grad_norm": 25.932300567626953, |
|
"learning_rate": 2.0414063374109326e-06, |
|
"loss": 1.2911, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.7114178168130489, |
|
"grad_norm": 40.37273025512695, |
|
"learning_rate": 2.0250704327906025e-06, |
|
"loss": 1.3346, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.71267252195734, |
|
"grad_norm": 33.203975677490234, |
|
"learning_rate": 2.0087835392100034e-06, |
|
"loss": 1.3206, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.7139272271016311, |
|
"grad_norm": 25.78790283203125, |
|
"learning_rate": 1.9925459249906488e-06, |
|
"loss": 1.2016, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.7151819322459222, |
|
"grad_norm": 26.151403427124023, |
|
"learning_rate": 1.9763578576421816e-06, |
|
"loss": 1.3088, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7164366373902133, |
|
"grad_norm": 40.70786666870117, |
|
"learning_rate": 1.9602196038579774e-06, |
|
"loss": 1.2366, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.7176913425345044, |
|
"grad_norm": 32.47188949584961, |
|
"learning_rate": 1.944131429510754e-06, |
|
"loss": 1.3264, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.7189460476787954, |
|
"grad_norm": 44.57042694091797, |
|
"learning_rate": 1.9280935996481792e-06, |
|
"loss": 1.3883, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.7202007528230866, |
|
"grad_norm": 37.86323165893555, |
|
"learning_rate": 1.9121063784885135e-06, |
|
"loss": 1.2686, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.7214554579673776, |
|
"grad_norm": 28.20488739013672, |
|
"learning_rate": 1.8961700294162578e-06, |
|
"loss": 1.3424, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.7227101631116688, |
|
"grad_norm": 80.7864761352539, |
|
"learning_rate": 1.880284814977807e-06, |
|
"loss": 1.4263, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.7239648682559598, |
|
"grad_norm": 47.082122802734375, |
|
"learning_rate": 1.8644509968771302e-06, |
|
"loss": 1.3611, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.7252195734002509, |
|
"grad_norm": 27.525779724121094, |
|
"learning_rate": 1.8486688359714567e-06, |
|
"loss": 1.1818, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.726474278544542, |
|
"grad_norm": 26.097383499145508, |
|
"learning_rate": 1.832938592266984e-06, |
|
"loss": 1.4285, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.7277289836888331, |
|
"grad_norm": 27.29695701599121, |
|
"learning_rate": 1.8172605249145848e-06, |
|
"loss": 1.2213, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7289836888331243, |
|
"grad_norm": 43.18733215332031, |
|
"learning_rate": 1.8016348922055448e-06, |
|
"loss": 1.3866, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.7302383939774153, |
|
"grad_norm": 30.83635139465332, |
|
"learning_rate": 1.7860619515673034e-06, |
|
"loss": 1.2583, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.7314930991217063, |
|
"grad_norm": 38.65605163574219, |
|
"learning_rate": 1.7705419595592193e-06, |
|
"loss": 1.4949, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.7327478042659975, |
|
"grad_norm": 33.9451789855957, |
|
"learning_rate": 1.7550751718683339e-06, |
|
"loss": 1.4502, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.7340025094102886, |
|
"grad_norm": 32.3410530090332, |
|
"learning_rate": 1.7396618433051648e-06, |
|
"loss": 1.3073, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.7352572145545797, |
|
"grad_norm": 31.831172943115234, |
|
"learning_rate": 1.7243022277995109e-06, |
|
"loss": 1.1989, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.7365119196988708, |
|
"grad_norm": 36.86290740966797, |
|
"learning_rate": 1.7089965783962608e-06, |
|
"loss": 1.4668, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.7377666248431619, |
|
"grad_norm": 34.344600677490234, |
|
"learning_rate": 1.6937451472512284e-06, |
|
"loss": 1.3803, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.739021329987453, |
|
"grad_norm": 27.322994232177734, |
|
"learning_rate": 1.6785481856270042e-06, |
|
"loss": 1.2354, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.740276035131744, |
|
"grad_norm": 44.57414245605469, |
|
"learning_rate": 1.6634059438888034e-06, |
|
"loss": 1.5863, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7415307402760352, |
|
"grad_norm": 33.31477737426758, |
|
"learning_rate": 1.6483186715003523e-06, |
|
"loss": 1.4086, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.7427854454203262, |
|
"grad_norm": 33.885536193847656, |
|
"learning_rate": 1.633286617019771e-06, |
|
"loss": 1.4022, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.7440401505646174, |
|
"grad_norm": 43.636802673339844, |
|
"learning_rate": 1.618310028095486e-06, |
|
"loss": 1.403, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.7452948557089084, |
|
"grad_norm": 38.1976432800293, |
|
"learning_rate": 1.6033891514621436e-06, |
|
"loss": 1.375, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.7465495608531995, |
|
"grad_norm": 27.386051177978516, |
|
"learning_rate": 1.5885242329365448e-06, |
|
"loss": 1.2411, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.7478042659974906, |
|
"grad_norm": 32.94865036010742, |
|
"learning_rate": 1.5737155174136042e-06, |
|
"loss": 1.3973, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.7490589711417817, |
|
"grad_norm": 52.85768127441406, |
|
"learning_rate": 1.5589632488623053e-06, |
|
"loss": 1.3857, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.7503136762860728, |
|
"grad_norm": 30.37677001953125, |
|
"learning_rate": 1.5442676703216851e-06, |
|
"loss": 1.2986, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.7515683814303639, |
|
"grad_norm": 50.629112243652344, |
|
"learning_rate": 1.5296290238968303e-06, |
|
"loss": 1.4606, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.7528230865746549, |
|
"grad_norm": 75.81658172607422, |
|
"learning_rate": 1.5150475507548933e-06, |
|
"loss": 1.4354, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7540777917189461, |
|
"grad_norm": 32.35127639770508, |
|
"learning_rate": 1.500523491121108e-06, |
|
"loss": 1.4572, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.7553324968632371, |
|
"grad_norm": 36.757484436035156, |
|
"learning_rate": 1.4860570842748412e-06, |
|
"loss": 1.3798, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.7565872020075283, |
|
"grad_norm": 39.54582977294922, |
|
"learning_rate": 1.47164856854565e-06, |
|
"loss": 1.4334, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.7578419071518193, |
|
"grad_norm": 30.180776596069336, |
|
"learning_rate": 1.4572981813093507e-06, |
|
"loss": 1.4914, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.7590966122961104, |
|
"grad_norm": 55.5819091796875, |
|
"learning_rate": 1.4430061589841122e-06, |
|
"loss": 1.3051, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.7603513174404015, |
|
"grad_norm": 41.72428894042969, |
|
"learning_rate": 1.4287727370265558e-06, |
|
"loss": 1.5724, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.7616060225846926, |
|
"grad_norm": 30.067726135253906, |
|
"learning_rate": 1.4145981499278877e-06, |
|
"loss": 1.2012, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.7628607277289837, |
|
"grad_norm": 35.68577194213867, |
|
"learning_rate": 1.4004826312100218e-06, |
|
"loss": 1.375, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.7641154328732748, |
|
"grad_norm": 34.37779998779297, |
|
"learning_rate": 1.386426413421738e-06, |
|
"loss": 1.4803, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.7653701380175659, |
|
"grad_norm": 28.35356330871582, |
|
"learning_rate": 1.3724297281348591e-06, |
|
"loss": 1.0709, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.766624843161857, |
|
"grad_norm": 63.945228576660156, |
|
"learning_rate": 1.3584928059404207e-06, |
|
"loss": 1.3223, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.767879548306148, |
|
"grad_norm": 37.977333068847656, |
|
"learning_rate": 1.3446158764448842e-06, |
|
"loss": 1.3541, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.7691342534504392, |
|
"grad_norm": 33.97459411621094, |
|
"learning_rate": 1.3307991682663463e-06, |
|
"loss": 1.2762, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.7703889585947302, |
|
"grad_norm": 52.56448745727539, |
|
"learning_rate": 1.3170429090307824e-06, |
|
"loss": 1.4249, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.7716436637390214, |
|
"grad_norm": 29.552059173583984, |
|
"learning_rate": 1.303347325368285e-06, |
|
"loss": 1.3487, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.7728983688833124, |
|
"grad_norm": 52.34573745727539, |
|
"learning_rate": 1.2897126429093354e-06, |
|
"loss": 1.29, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.7741530740276035, |
|
"grad_norm": 38.19261932373047, |
|
"learning_rate": 1.2761390862810907e-06, |
|
"loss": 1.4146, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.7754077791718946, |
|
"grad_norm": 36.244651794433594, |
|
"learning_rate": 1.2626268791036766e-06, |
|
"loss": 1.4714, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.7766624843161857, |
|
"grad_norm": 41.59754180908203, |
|
"learning_rate": 1.2491762439865034e-06, |
|
"loss": 1.2052, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.7779171894604768, |
|
"grad_norm": 32.61091232299805, |
|
"learning_rate": 1.235787402524603e-06, |
|
"loss": 1.2954, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7791718946047679, |
|
"grad_norm": 30.722808837890625, |
|
"learning_rate": 1.2224605752949786e-06, |
|
"loss": 1.2545, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.7804265997490589, |
|
"grad_norm": 36.57342529296875, |
|
"learning_rate": 1.2091959818529636e-06, |
|
"loss": 1.2536, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.7816813048933501, |
|
"grad_norm": 45.92577362060547, |
|
"learning_rate": 1.1959938407286099e-06, |
|
"loss": 1.3089, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.7829360100376411, |
|
"grad_norm": 31.191242218017578, |
|
"learning_rate": 1.182854369423091e-06, |
|
"loss": 1.2477, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.7841907151819323, |
|
"grad_norm": 31.34370231628418, |
|
"learning_rate": 1.1697777844051105e-06, |
|
"loss": 1.3789, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.7854454203262233, |
|
"grad_norm": 27.42989730834961, |
|
"learning_rate": 1.1567643011073393e-06, |
|
"loss": 1.2446, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.7867001254705144, |
|
"grad_norm": 31.601276397705078, |
|
"learning_rate": 1.143814133922872e-06, |
|
"loss": 1.453, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.7879548306148055, |
|
"grad_norm": 42.06584548950195, |
|
"learning_rate": 1.1309274962016854e-06, |
|
"loss": 1.2825, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.7892095357590966, |
|
"grad_norm": 36.16788864135742, |
|
"learning_rate": 1.1181046002471292e-06, |
|
"loss": 1.3807, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.7904642409033877, |
|
"grad_norm": 35.88719177246094, |
|
"learning_rate": 1.1053456573124272e-06, |
|
"loss": 1.1951, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7917189460476788, |
|
"grad_norm": 43.55876541137695, |
|
"learning_rate": 1.0926508775971995e-06, |
|
"loss": 1.3084, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.7929736511919699, |
|
"grad_norm": 38.98108673095703, |
|
"learning_rate": 1.0800204702439937e-06, |
|
"loss": 1.336, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.794228356336261, |
|
"grad_norm": 34.15788650512695, |
|
"learning_rate": 1.0674546433348453e-06, |
|
"loss": 1.4309, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.795483061480552, |
|
"grad_norm": 42.34593963623047, |
|
"learning_rate": 1.0549536038878432e-06, |
|
"loss": 1.3815, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.7967377666248432, |
|
"grad_norm": 33.58256530761719, |
|
"learning_rate": 1.04251755785373e-06, |
|
"loss": 1.2034, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.7979924717691342, |
|
"grad_norm": 41.538753509521484, |
|
"learning_rate": 1.0301467101124956e-06, |
|
"loss": 1.3423, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.7992471769134254, |
|
"grad_norm": 42.10636901855469, |
|
"learning_rate": 1.0178412644700093e-06, |
|
"loss": 1.3916, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.8005018820577164, |
|
"grad_norm": 31.18490219116211, |
|
"learning_rate": 1.0056014236546647e-06, |
|
"loss": 1.1455, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.8017565872020075, |
|
"grad_norm": 32.616031646728516, |
|
"learning_rate": 9.934273893140335e-07, |
|
"loss": 1.3136, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.8030112923462986, |
|
"grad_norm": 41.29079818725586, |
|
"learning_rate": 9.813193620115446e-07, |
|
"loss": 1.2788, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.8042659974905897, |
|
"grad_norm": 39.024993896484375, |
|
"learning_rate": 9.692775412231863e-07, |
|
"loss": 1.3029, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.8055207026348808, |
|
"grad_norm": 40.532737731933594, |
|
"learning_rate": 9.573021253342114e-07, |
|
"loss": 1.3518, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.8067754077791719, |
|
"grad_norm": 42.95549011230469, |
|
"learning_rate": 9.453933116358715e-07, |
|
"loss": 1.4456, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.8080301129234629, |
|
"grad_norm": 30.134597778320312, |
|
"learning_rate": 9.335512963221732e-07, |
|
"loss": 1.2561, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.8092848180677541, |
|
"grad_norm": 42.78569412231445, |
|
"learning_rate": 9.21776274486636e-07, |
|
"loss": 1.3378, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.8105395232120451, |
|
"grad_norm": 54.95227813720703, |
|
"learning_rate": 9.100684401190829e-07, |
|
"loss": 1.3858, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.8117942283563363, |
|
"grad_norm": 42.90878677368164, |
|
"learning_rate": 8.984279861024453e-07, |
|
"loss": 1.2899, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.8130489335006273, |
|
"grad_norm": 53.56229019165039, |
|
"learning_rate": 8.868551042095852e-07, |
|
"loss": 1.468, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.8143036386449184, |
|
"grad_norm": 31.682039260864258, |
|
"learning_rate": 8.753499851001341e-07, |
|
"loss": 1.1707, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.8155583437892095, |
|
"grad_norm": 31.241701126098633, |
|
"learning_rate": 8.639128183173517e-07, |
|
"loss": 1.1829, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8168130489335006, |
|
"grad_norm": 33.625938415527344, |
|
"learning_rate": 8.525437922850033e-07, |
|
"loss": 1.3418, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.8180677540777918, |
|
"grad_norm": 30.763322830200195, |
|
"learning_rate": 8.412430943042616e-07, |
|
"loss": 1.3651, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.8193224592220828, |
|
"grad_norm": 48.34621810913086, |
|
"learning_rate": 8.30010910550611e-07, |
|
"loss": 1.3246, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.820577164366374, |
|
"grad_norm": 35.97224426269531, |
|
"learning_rate": 8.188474260707857e-07, |
|
"loss": 1.422, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.821831869510665, |
|
"grad_norm": 31.350204467773438, |
|
"learning_rate": 8.077528247797234e-07, |
|
"loss": 1.3197, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.823086574654956, |
|
"grad_norm": 39.3220329284668, |
|
"learning_rate": 7.967272894575312e-07, |
|
"loss": 1.3164, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.8243412797992472, |
|
"grad_norm": 34.87789535522461, |
|
"learning_rate": 7.857710017464737e-07, |
|
"loss": 1.3422, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.8255959849435383, |
|
"grad_norm": 39.69428634643555, |
|
"learning_rate": 7.748841421479875e-07, |
|
"loss": 1.2374, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.8268506900878294, |
|
"grad_norm": 40.43376541137695, |
|
"learning_rate": 7.640668900196985e-07, |
|
"loss": 1.3143, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.8281053952321205, |
|
"grad_norm": 28.951221466064453, |
|
"learning_rate": 7.533194235724728e-07, |
|
"loss": 1.315, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8293601003764115, |
|
"grad_norm": 56.01127243041992, |
|
"learning_rate": 7.426419198674773e-07, |
|
"loss": 1.3279, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.8306148055207027, |
|
"grad_norm": 36.56144332885742, |
|
"learning_rate": 7.320345548132679e-07, |
|
"loss": 1.2427, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.8318695106649937, |
|
"grad_norm": 34.64320373535156, |
|
"learning_rate": 7.214975031628856e-07, |
|
"loss": 1.3805, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.8331242158092849, |
|
"grad_norm": 42.90142059326172, |
|
"learning_rate": 7.110309385109804e-07, |
|
"loss": 1.3778, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.8343789209535759, |
|
"grad_norm": 33.45329284667969, |
|
"learning_rate": 7.006350332909495e-07, |
|
"loss": 1.3461, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.835633626097867, |
|
"grad_norm": 39.53373718261719, |
|
"learning_rate": 6.903099587721024e-07, |
|
"loss": 1.372, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.8368883312421581, |
|
"grad_norm": 26.866334915161133, |
|
"learning_rate": 6.800558850568295e-07, |
|
"loss": 1.1701, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.8381430363864492, |
|
"grad_norm": 35.01183319091797, |
|
"learning_rate": 6.698729810778065e-07, |
|
"loss": 1.2913, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.8393977415307403, |
|
"grad_norm": 26.15965461730957, |
|
"learning_rate": 6.597614145952136e-07, |
|
"loss": 1.1659, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.8406524466750314, |
|
"grad_norm": 27.10162925720215, |
|
"learning_rate": 6.497213521939638e-07, |
|
"loss": 1.176, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8419071518193224, |
|
"grad_norm": 39.48128128051758, |
|
"learning_rate": 6.397529592809615e-07, |
|
"loss": 1.4855, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.8431618569636136, |
|
"grad_norm": 45.1597785949707, |
|
"learning_rate": 6.298564000823848e-07, |
|
"loss": 1.2702, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.8444165621079046, |
|
"grad_norm": 59.02643585205078, |
|
"learning_rate": 6.20031837640967e-07, |
|
"loss": 1.3335, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.8456712672521958, |
|
"grad_norm": 33.48893737792969, |
|
"learning_rate": 6.102794338133195e-07, |
|
"loss": 1.1215, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.8469259723964868, |
|
"grad_norm": 33.40549850463867, |
|
"learning_rate": 6.005993492672657e-07, |
|
"loss": 1.3049, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.848180677540778, |
|
"grad_norm": 28.336149215698242, |
|
"learning_rate": 5.909917434791884e-07, |
|
"loss": 1.2866, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.849435382685069, |
|
"grad_norm": 31.5575008392334, |
|
"learning_rate": 5.814567747314049e-07, |
|
"loss": 1.1839, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.8506900878293601, |
|
"grad_norm": 30.665040969848633, |
|
"learning_rate": 5.719946001095617e-07, |
|
"loss": 1.3647, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.8519447929736512, |
|
"grad_norm": 38.09904098510742, |
|
"learning_rate": 5.626053755000421e-07, |
|
"loss": 1.3963, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.8531994981179423, |
|
"grad_norm": 62.874881744384766, |
|
"learning_rate": 5.532892555874059e-07, |
|
"loss": 1.2852, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8544542032622334, |
|
"grad_norm": 31.233694076538086, |
|
"learning_rate": 5.440463938518304e-07, |
|
"loss": 1.487, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.8557089084065245, |
|
"grad_norm": 34.371585845947266, |
|
"learning_rate": 5.348769425665884e-07, |
|
"loss": 1.3499, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.8569636135508155, |
|
"grad_norm": 40.928802490234375, |
|
"learning_rate": 5.25781052795541e-07, |
|
"loss": 1.494, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.8582183186951067, |
|
"grad_norm": 47.68248748779297, |
|
"learning_rate": 5.167588743906432e-07, |
|
"loss": 1.2565, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.8594730238393977, |
|
"grad_norm": 31.525768280029297, |
|
"learning_rate": 5.078105559894791e-07, |
|
"loss": 1.2186, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.8607277289836889, |
|
"grad_norm": 41.63323211669922, |
|
"learning_rate": 4.989362450128133e-07, |
|
"loss": 1.3934, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.8619824341279799, |
|
"grad_norm": 29.7374324798584, |
|
"learning_rate": 4.901360876621597e-07, |
|
"loss": 1.2498, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.863237139272271, |
|
"grad_norm": 38.2042350769043, |
|
"learning_rate": 4.814102289173733e-07, |
|
"loss": 1.1372, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.8644918444165621, |
|
"grad_norm": 33.84709930419922, |
|
"learning_rate": 4.727588125342669e-07, |
|
"loss": 1.218, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.8657465495608532, |
|
"grad_norm": 39.36479568481445, |
|
"learning_rate": 4.6418198104223434e-07, |
|
"loss": 1.3434, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8670012547051443, |
|
"grad_norm": 45.70726776123047, |
|
"learning_rate": 4.5567987574190677e-07, |
|
"loss": 1.3344, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.8682559598494354, |
|
"grad_norm": 42.92964172363281, |
|
"learning_rate": 4.4725263670282905e-07, |
|
"loss": 1.3247, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.8695106649937264, |
|
"grad_norm": 33.368629455566406, |
|
"learning_rate": 4.3890040276114044e-07, |
|
"loss": 1.3195, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.8707653701380176, |
|
"grad_norm": 43.9223518371582, |
|
"learning_rate": 4.306233115173009e-07, |
|
"loss": 1.3844, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.8720200752823086, |
|
"grad_norm": 40.18341064453125, |
|
"learning_rate": 4.224214993338149e-07, |
|
"loss": 1.3651, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.8732747804265998, |
|
"grad_norm": 38.75429916381836, |
|
"learning_rate": 4.1429510133298714e-07, |
|
"loss": 1.3685, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.8745294855708908, |
|
"grad_norm": 41.714378356933594, |
|
"learning_rate": 4.062442513947007e-07, |
|
"loss": 1.4269, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.875784190715182, |
|
"grad_norm": 29.522842407226562, |
|
"learning_rate": 3.9826908215420344e-07, |
|
"loss": 1.1375, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.877038895859473, |
|
"grad_norm": 28.621906280517578, |
|
"learning_rate": 3.903697249999289e-07, |
|
"loss": 1.3684, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.8782936010037641, |
|
"grad_norm": 78.60023498535156, |
|
"learning_rate": 3.825463100713317e-07, |
|
"loss": 1.3113, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8795483061480552, |
|
"grad_norm": 34.123355865478516, |
|
"learning_rate": 3.747989662567403e-07, |
|
"loss": 1.4122, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.8808030112923463, |
|
"grad_norm": 34.434959411621094, |
|
"learning_rate": 3.671278211912338e-07, |
|
"loss": 1.4044, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.8820577164366374, |
|
"grad_norm": 43.3989372253418, |
|
"learning_rate": 3.595330012545445e-07, |
|
"loss": 1.3849, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.8833124215809285, |
|
"grad_norm": 51.71344757080078, |
|
"learning_rate": 3.520146315689693e-07, |
|
"loss": 1.4736, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.8845671267252195, |
|
"grad_norm": 37.74956130981445, |
|
"learning_rate": 3.445728359973094e-07, |
|
"loss": 1.5021, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.8858218318695107, |
|
"grad_norm": 38.12771224975586, |
|
"learning_rate": 3.372077371408361e-07, |
|
"loss": 1.3782, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.8870765370138017, |
|
"grad_norm": 45.82014465332031, |
|
"learning_rate": 3.299194563372604e-07, |
|
"loss": 1.4072, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.8883312421580929, |
|
"grad_norm": 41.57502746582031, |
|
"learning_rate": 3.22708113658744e-07, |
|
"loss": 1.1852, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.8895859473023839, |
|
"grad_norm": 40.33243179321289, |
|
"learning_rate": 3.1557382790991686e-07, |
|
"loss": 1.2315, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.890840652446675, |
|
"grad_norm": 50.13658142089844, |
|
"learning_rate": 3.085167166259162e-07, |
|
"loss": 1.5278, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8920953575909661, |
|
"grad_norm": 43.55479431152344, |
|
"learning_rate": 3.015368960704584e-07, |
|
"loss": 1.214, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.8933500627352572, |
|
"grad_norm": 40.6564826965332, |
|
"learning_rate": 2.9463448123391634e-07, |
|
"loss": 1.2893, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.8946047678795483, |
|
"grad_norm": 36.575809478759766, |
|
"learning_rate": 2.878095858314278e-07, |
|
"loss": 1.2348, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.8958594730238394, |
|
"grad_norm": 43.1509895324707, |
|
"learning_rate": 2.810623223010245e-07, |
|
"loss": 1.2692, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.8971141781681304, |
|
"grad_norm": 30.058103561401367, |
|
"learning_rate": 2.743928018017744e-07, |
|
"loss": 1.2322, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.8983688833124216, |
|
"grad_norm": 29.974342346191406, |
|
"learning_rate": 2.67801134211953e-07, |
|
"loss": 1.1901, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.8996235884567126, |
|
"grad_norm": 35.170406341552734, |
|
"learning_rate": 2.612874281272371e-07, |
|
"loss": 1.2897, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.9008782936010038, |
|
"grad_norm": 36.794464111328125, |
|
"learning_rate": 2.548517908589077e-07, |
|
"loss": 1.4094, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.9021329987452948, |
|
"grad_norm": 34.6309700012207, |
|
"learning_rate": 2.4849432843208786e-07, |
|
"loss": 1.2453, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.903387703889586, |
|
"grad_norm": 41.007938385009766, |
|
"learning_rate": 2.422151455839955e-07, |
|
"loss": 1.479, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.904642409033877, |
|
"grad_norm": 35.56821823120117, |
|
"learning_rate": 2.3601434576221548e-07, |
|
"loss": 1.2376, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.9058971141781681, |
|
"grad_norm": 40.16046905517578, |
|
"learning_rate": 2.2989203112299685e-07, |
|
"loss": 1.3773, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.9071518193224593, |
|
"grad_norm": 31.831424713134766, |
|
"learning_rate": 2.2384830252957068e-07, |
|
"loss": 1.2387, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.9084065244667503, |
|
"grad_norm": 102.80229187011719, |
|
"learning_rate": 2.178832595504854e-07, |
|
"loss": 1.3606, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.9096612296110415, |
|
"grad_norm": 37.90886688232422, |
|
"learning_rate": 2.1199700045797077e-07, |
|
"loss": 1.4478, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.9109159347553325, |
|
"grad_norm": 36.04559326171875, |
|
"learning_rate": 2.0618962222631434e-07, |
|
"loss": 1.4465, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.9121706398996235, |
|
"grad_norm": 45.584922790527344, |
|
"learning_rate": 2.0046122053026697e-07, |
|
"loss": 1.3702, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.9134253450439147, |
|
"grad_norm": 39.56161117553711, |
|
"learning_rate": 1.9481188974346698e-07, |
|
"loss": 1.2966, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.9146800501882058, |
|
"grad_norm": 48.845314025878906, |
|
"learning_rate": 1.8924172293688148e-07, |
|
"loss": 1.3017, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.9159347553324969, |
|
"grad_norm": 35.209503173828125, |
|
"learning_rate": 1.8375081187727683e-07, |
|
"loss": 1.2948, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.917189460476788, |
|
"grad_norm": 35.617698669433594, |
|
"learning_rate": 1.7833924702570725e-07, |
|
"loss": 1.1993, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.918444165621079, |
|
"grad_norm": 29.306623458862305, |
|
"learning_rate": 1.7300711753601985e-07, |
|
"loss": 1.2074, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.9196988707653702, |
|
"grad_norm": 34.39566421508789, |
|
"learning_rate": 1.677545112533896e-07, |
|
"loss": 1.3316, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.9209535759096612, |
|
"grad_norm": 36.989356994628906, |
|
"learning_rate": 1.6258151471287397e-07, |
|
"loss": 1.3134, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.9222082810539524, |
|
"grad_norm": 48.13298034667969, |
|
"learning_rate": 1.5748821313798124e-07, |
|
"loss": 1.3963, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.9234629861982434, |
|
"grad_norm": 39.777278900146484, |
|
"learning_rate": 1.5247469043927153e-07, |
|
"loss": 1.3866, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.9247176913425345, |
|
"grad_norm": 31.973005294799805, |
|
"learning_rate": 1.4754102921297363e-07, |
|
"loss": 1.2392, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.9259723964868256, |
|
"grad_norm": 31.995790481567383, |
|
"learning_rate": 1.4268731073962094e-07, |
|
"loss": 1.2198, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.9272271016311167, |
|
"grad_norm": 33.672569274902344, |
|
"learning_rate": 1.3791361498271704e-07, |
|
"loss": 1.3004, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.9284818067754078, |
|
"grad_norm": 31.81163787841797, |
|
"learning_rate": 1.3322002058741678e-07, |
|
"loss": 1.3826, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.9297365119196989, |
|
"grad_norm": 32.36835479736328, |
|
"learning_rate": 1.2860660487922616e-07, |
|
"loss": 1.4068, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.93099121706399, |
|
"grad_norm": 43.015193939208984, |
|
"learning_rate": 1.240734438627361e-07, |
|
"loss": 1.381, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.9322459222082811, |
|
"grad_norm": 44.727230072021484, |
|
"learning_rate": 1.196206122203647e-07, |
|
"loss": 1.3348, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.9335006273525721, |
|
"grad_norm": 29.804079055786133, |
|
"learning_rate": 1.1524818331112853e-07, |
|
"loss": 1.2291, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.9347553324968633, |
|
"grad_norm": 46.379451751708984, |
|
"learning_rate": 1.1095622916943494e-07, |
|
"loss": 1.4644, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.9360100376411543, |
|
"grad_norm": 31.480005264282227, |
|
"learning_rate": 1.0674482050389457e-07, |
|
"loss": 1.2402, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.9372647427854455, |
|
"grad_norm": 25.78557586669922, |
|
"learning_rate": 1.0261402669615505e-07, |
|
"loss": 1.3798, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.9385194479297365, |
|
"grad_norm": 41.28335189819336, |
|
"learning_rate": 9.856391579976032e-08, |
|
"loss": 1.3066, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.9397741530740276, |
|
"grad_norm": 42.25539779663086, |
|
"learning_rate": 9.459455453902866e-08, |
|
"loss": 1.3258, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.9410288582183187, |
|
"grad_norm": 39.32608413696289, |
|
"learning_rate": 9.070600830795251e-08, |
|
"loss": 1.3086, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9422835633626098, |
|
"grad_norm": 30.92926025390625, |
|
"learning_rate": 8.68983411691221e-08, |
|
"loss": 1.1993, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.9435382685069009, |
|
"grad_norm": 33.10255813598633, |
|
"learning_rate": 8.317161585266964e-08, |
|
"loss": 1.1975, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.944792973651192, |
|
"grad_norm": 42.903900146484375, |
|
"learning_rate": 7.952589375523567e-08, |
|
"loss": 1.1978, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.946047678795483, |
|
"grad_norm": 28.121461868286133, |
|
"learning_rate": 7.59612349389599e-08, |
|
"loss": 1.222, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.9473023839397742, |
|
"grad_norm": 35.83945083618164, |
|
"learning_rate": 7.247769813048644e-08, |
|
"loss": 1.2472, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.9485570890840652, |
|
"grad_norm": 37.46073532104492, |
|
"learning_rate": 6.907534072000177e-08, |
|
"loss": 1.5055, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.9498117942283564, |
|
"grad_norm": 27.392000198364258, |
|
"learning_rate": 6.575421876028721e-08, |
|
"loss": 1.1948, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.9510664993726474, |
|
"grad_norm": 51.311744689941406, |
|
"learning_rate": 6.251438696579293e-08, |
|
"loss": 1.3754, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.9523212045169385, |
|
"grad_norm": 36.097373962402344, |
|
"learning_rate": 5.935589871174208e-08, |
|
"loss": 1.1822, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.9535759096612296, |
|
"grad_norm": 32.62606430053711, |
|
"learning_rate": 5.627880603324532e-08, |
|
"loss": 1.271, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9548306148055207, |
|
"grad_norm": 37.016719818115234, |
|
"learning_rate": 5.3283159624448745e-08, |
|
"loss": 1.1878, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.9560853199498118, |
|
"grad_norm": 37.270118713378906, |
|
"learning_rate": 5.0369008837696244e-08, |
|
"loss": 1.2704, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.9573400250941029, |
|
"grad_norm": 37.098854064941406, |
|
"learning_rate": 4.753640168271456e-08, |
|
"loss": 1.2812, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.958594730238394, |
|
"grad_norm": 44.55942916870117, |
|
"learning_rate": 4.478538482582617e-08, |
|
"loss": 1.2366, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.9598494353826851, |
|
"grad_norm": 30.313488006591797, |
|
"learning_rate": 4.211600358917989e-08, |
|
"loss": 1.2731, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.9611041405269761, |
|
"grad_norm": 52.65010452270508, |
|
"learning_rate": 3.9528301950000345e-08, |
|
"loss": 1.4277, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.9623588456712673, |
|
"grad_norm": 34.024227142333984, |
|
"learning_rate": 3.702232253986804e-08, |
|
"loss": 1.3047, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.9636135508155583, |
|
"grad_norm": 49.82564163208008, |
|
"learning_rate": 3.4598106644014863e-08, |
|
"loss": 1.2943, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.9648682559598495, |
|
"grad_norm": 42.5301513671875, |
|
"learning_rate": 3.2255694200643003e-08, |
|
"loss": 1.3643, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.9661229611041405, |
|
"grad_norm": 36.81052017211914, |
|
"learning_rate": 2.9995123800270476e-08, |
|
"loss": 1.4252, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9673776662484316, |
|
"grad_norm": 35.52188491821289, |
|
"learning_rate": 2.7816432685091598e-08, |
|
"loss": 1.35, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.9686323713927227, |
|
"grad_norm": 30.83523941040039, |
|
"learning_rate": 2.5719656748364184e-08, |
|
"loss": 1.2627, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.9698870765370138, |
|
"grad_norm": 29.04794692993164, |
|
"learning_rate": 2.370483053382111e-08, |
|
"loss": 1.2903, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.9711417816813049, |
|
"grad_norm": 36.21467208862305, |
|
"learning_rate": 2.177198723509688e-08, |
|
"loss": 1.3589, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.972396486825596, |
|
"grad_norm": 30.13644790649414, |
|
"learning_rate": 1.992115869518474e-08, |
|
"loss": 1.2922, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.973651191969887, |
|
"grad_norm": 50.431663513183594, |
|
"learning_rate": 1.8152375405909305e-08, |
|
"loss": 1.2573, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.9749058971141782, |
|
"grad_norm": 50.13302230834961, |
|
"learning_rate": 1.6465666507425314e-08, |
|
"loss": 1.4401, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.9761606022584692, |
|
"grad_norm": 46.383636474609375, |
|
"learning_rate": 1.4861059787736886e-08, |
|
"loss": 1.424, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.9774153074027604, |
|
"grad_norm": 34.33049011230469, |
|
"learning_rate": 1.333858168224178e-08, |
|
"loss": 1.2715, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.9786700125470514, |
|
"grad_norm": 42.03940963745117, |
|
"learning_rate": 1.1898257273292857e-08, |
|
"loss": 1.2918, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9799247176913425, |
|
"grad_norm": 42.43777847290039, |
|
"learning_rate": 1.0540110289786742e-08, |
|
"loss": 1.5214, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.9811794228356336, |
|
"grad_norm": 31.801700592041016, |
|
"learning_rate": 9.264163106774138e-09, |
|
"loss": 1.2777, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.9824341279799247, |
|
"grad_norm": 49.655391693115234, |
|
"learning_rate": 8.07043674508623e-09, |
|
"loss": 1.2324, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.9836888331242158, |
|
"grad_norm": 37.17424011230469, |
|
"learning_rate": 6.958950870994963e-09, |
|
"loss": 1.2559, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.9849435382685069, |
|
"grad_norm": 33.83037567138672, |
|
"learning_rate": 5.929723795884967e-09, |
|
"loss": 1.2658, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.986198243412798, |
|
"grad_norm": 49.56622314453125, |
|
"learning_rate": 4.982772475951026e-09, |
|
"loss": 1.2301, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.9874529485570891, |
|
"grad_norm": 71.51993560791016, |
|
"learning_rate": 4.1181125119221785e-09, |
|
"loss": 1.4287, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.9887076537013801, |
|
"grad_norm": 37.62562942504883, |
|
"learning_rate": 3.3357581488030476e-09, |
|
"loss": 1.4585, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.9899623588456713, |
|
"grad_norm": 44.091552734375, |
|
"learning_rate": 2.635722275638464e-09, |
|
"loss": 1.5654, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.9912170639899623, |
|
"grad_norm": 67.96106719970703, |
|
"learning_rate": 2.0180164253008614e-09, |
|
"loss": 1.3665, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9924717691342535, |
|
"grad_norm": 38.18610763549805, |
|
"learning_rate": 1.4826507743032071e-09, |
|
"loss": 1.1607, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.9937264742785445, |
|
"grad_norm": 36.40510940551758, |
|
"learning_rate": 1.029634142627467e-09, |
|
"loss": 1.2769, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.9949811794228356, |
|
"grad_norm": 34.85893630981445, |
|
"learning_rate": 6.589739935819461e-10, |
|
"loss": 1.3029, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.9962358845671268, |
|
"grad_norm": 36.350643157958984, |
|
"learning_rate": 3.7067643367749707e-10, |
|
"loss": 1.2861, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.9974905897114178, |
|
"grad_norm": 38.7654914855957, |
|
"learning_rate": 1.6474621252704494e-10, |
|
"loss": 1.1653, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.998745294855709, |
|
"grad_norm": 72.9517822265625, |
|
"learning_rate": 4.118672276620661e-11, |
|
"loss": 1.3579, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 33.991390228271484, |
|
"learning_rate": 0.0, |
|
"loss": 1.346, |
|
"step": 797 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 797, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4921722755088384.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |