{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 4.823320729485639, "learning_rate": 8.333333333333334e-06, "loss": 1.7211, "step": 1 }, { "epoch": 0.125, "grad_norm": 4.751505642660104, "learning_rate": 1.6666666666666667e-05, "loss": 1.7397, "step": 2 }, { "epoch": 0.1875, "grad_norm": 4.487357353479602, "learning_rate": 2.5e-05, "loss": 1.5714, "step": 3 }, { "epoch": 0.25, "grad_norm": 4.036784685130891, "learning_rate": 3.3333333333333335e-05, "loss": 1.4755, "step": 4 }, { "epoch": 0.3125, "grad_norm": 2.7951107515008022, "learning_rate": 4.166666666666667e-05, "loss": 1.0114, "step": 5 }, { "epoch": 0.375, "grad_norm": 3.3636166511290284, "learning_rate": 5e-05, "loss": 0.866, "step": 6 }, { "epoch": 0.4375, "grad_norm": 2.077830674223075, "learning_rate": 5.833333333333334e-05, "loss": 0.5604, "step": 7 }, { "epoch": 0.5, "grad_norm": 1.1666324696634576, "learning_rate": 6.666666666666667e-05, "loss": 0.339, "step": 8 }, { "epoch": 0.5625, "grad_norm": 0.8956349805282244, "learning_rate": 7.500000000000001e-05, "loss": 0.2396, "step": 9 }, { "epoch": 0.625, "grad_norm": 1.6517703519389952, "learning_rate": 8.333333333333334e-05, "loss": 0.2824, "step": 10 }, { "epoch": 0.6875, "grad_norm": 1.4629478660244202, "learning_rate": 9.166666666666667e-05, "loss": 0.247, "step": 11 }, { "epoch": 0.75, "grad_norm": 0.8768061602490982, "learning_rate": 0.0001, "loss": 0.1893, "step": 12 }, { "epoch": 0.8125, "grad_norm": 0.8390983969336053, "learning_rate": 0.00010833333333333333, "loss": 0.2698, "step": 13 }, { "epoch": 0.875, "grad_norm": 0.9480328834691413, "learning_rate": 0.00011666666666666668, "loss": 0.1832, "step": 14 }, { "epoch": 0.9375, "grad_norm": 0.9607185561225253, "learning_rate": 0.000125, "loss": 0.1928, "step": 15 }, { "epoch": 1.0, "grad_norm": 0.4976206492811293, "learning_rate": 0.00013333333333333334, "loss": 0.1599, "step": 16 }, { "epoch": 1.0625, "grad_norm": 0.6230231499462036, "learning_rate": 0.00014166666666666668, "loss": 0.2803, "step": 17 }, { "epoch": 1.125, "grad_norm": 0.4748261671538313, "learning_rate": 0.00015000000000000001, "loss": 0.1435, "step": 18 }, { "epoch": 1.1875, "grad_norm": 0.41444527359590055, "learning_rate": 0.00015833333333333332, "loss": 0.1798, "step": 19 }, { "epoch": 1.25, "grad_norm": 0.458788407239296, "learning_rate": 0.0001666666666666667, "loss": 0.1989, "step": 20 }, { "epoch": 1.3125, "grad_norm": 0.4563665244618309, "learning_rate": 0.000175, "loss": 0.1647, "step": 21 }, { "epoch": 1.375, "grad_norm": 0.4105054285151717, "learning_rate": 0.00018333333333333334, "loss": 0.1164, "step": 22 }, { "epoch": 1.4375, "grad_norm": 0.3562784794883969, "learning_rate": 0.00019166666666666667, "loss": 0.1172, "step": 23 }, { "epoch": 1.5, "grad_norm": 0.38610424041542285, "learning_rate": 0.0002, "loss": 0.1056, "step": 24 }, { "epoch": 1.5625, "grad_norm": 0.35527586407983786, "learning_rate": 0.00019999918050612108, "loss": 0.1217, "step": 25 }, { "epoch": 1.625, "grad_norm": 0.3633401519435876, "learning_rate": 0.00019999672203791565, "loss": 0.1458, "step": 26 }, { "epoch": 1.6875, "grad_norm": 0.3603552448206864, "learning_rate": 0.00019999262463567773, "loss": 0.1261, "step": 27 }, { "epoch": 1.75, "grad_norm": 0.3388971002892635, "learning_rate": 0.00019998688836656323, "loss": 0.1309, "step": 28 }, { "epoch": 1.8125, "grad_norm": 0.4061212900739158, "learning_rate": 0.0001999795133245889, "loss": 0.1091, "step": 29 }, { "epoch": 1.875, "grad_norm": 0.39345006056668225, "learning_rate": 0.0001999704996306308, "loss": 0.1627, "step": 30 }, { "epoch": 1.9375, "grad_norm": 0.38142281255979, "learning_rate": 0.00019995984743242226, "loss": 0.1327, "step": 31 }, { "epoch": 2.0, "grad_norm": 0.5305995721603599, "learning_rate": 0.00019994755690455152, "loss": 0.156, "step": 32 }, { "epoch": 2.0625, "grad_norm": 0.2546721306857309, "learning_rate": 0.00019993362824845875, "loss": 0.0744, "step": 33 }, { "epoch": 2.125, "grad_norm": 0.3997709948362388, "learning_rate": 0.000199918061692433, "loss": 0.1605, "step": 34 }, { "epoch": 2.1875, "grad_norm": 0.39616508988629523, "learning_rate": 0.00019990085749160822, "loss": 0.1306, "step": 35 }, { "epoch": 2.25, "grad_norm": 0.4619577933372822, "learning_rate": 0.0001998820159279591, "loss": 0.1318, "step": 36 }, { "epoch": 2.3125, "grad_norm": 0.2968290985862986, "learning_rate": 0.00019986153731029656, "loss": 0.0788, "step": 37 }, { "epoch": 2.375, "grad_norm": 0.4062269625672033, "learning_rate": 0.0001998394219742627, "loss": 0.1207, "step": 38 }, { "epoch": 2.4375, "grad_norm": 0.36908209897646804, "learning_rate": 0.00019981567028232514, "loss": 0.1006, "step": 39 }, { "epoch": 2.5, "grad_norm": 0.23625961054729414, "learning_rate": 0.00019979028262377118, "loss": 0.1083, "step": 40 }, { "epoch": 2.5625, "grad_norm": 0.3069294649987226, "learning_rate": 0.00019976325941470146, "loss": 0.0962, "step": 41 }, { "epoch": 2.625, "grad_norm": 0.23363462936052565, "learning_rate": 0.00019973460109802305, "loss": 0.096, "step": 42 }, { "epoch": 2.6875, "grad_norm": 0.29106062929023385, "learning_rate": 0.0001997043081434423, "loss": 0.134, "step": 43 }, { "epoch": 2.75, "grad_norm": 0.18087001511209616, "learning_rate": 0.00019967238104745696, "loss": 0.1031, "step": 44 }, { "epoch": 2.8125, "grad_norm": 0.24123960365724528, "learning_rate": 0.00019963882033334826, "loss": 0.1082, "step": 45 }, { "epoch": 2.875, "grad_norm": 0.25054721428161647, "learning_rate": 0.00019960362655117218, "loss": 0.0798, "step": 46 }, { "epoch": 2.9375, "grad_norm": 0.5422539067423491, "learning_rate": 0.00019956680027775051, "loss": 0.1254, "step": 47 }, { "epoch": 3.0, "grad_norm": 0.2565364589386019, "learning_rate": 0.0001995283421166614, "loss": 0.0979, "step": 48 }, { "epoch": 3.0625, "grad_norm": 0.215553765023481, "learning_rate": 0.00019948825269822934, "loss": 0.0656, "step": 49 }, { "epoch": 3.125, "grad_norm": 0.20810732914954436, "learning_rate": 0.00019944653267951504, "loss": 0.0793, "step": 50 }, { "epoch": 3.1875, "grad_norm": 0.28614832461281026, "learning_rate": 0.00019940318274430449, "loss": 0.099, "step": 51 }, { "epoch": 3.25, "grad_norm": 0.32282441297094233, "learning_rate": 0.00019935820360309777, "loss": 0.0958, "step": 52 }, { "epoch": 3.3125, "grad_norm": 0.1946365159456168, "learning_rate": 0.00019931159599309757, "loss": 0.0808, "step": 53 }, { "epoch": 3.375, "grad_norm": 0.327313439667935, "learning_rate": 0.00019926336067819684, "loss": 0.081, "step": 54 }, { "epoch": 3.4375, "grad_norm": 0.25559584990032447, "learning_rate": 0.00019921349844896654, "loss": 0.0855, "step": 55 }, { "epoch": 3.5, "grad_norm": 0.42835063024229314, "learning_rate": 0.00019916201012264254, "loss": 0.0863, "step": 56 }, { "epoch": 3.5625, "grad_norm": 0.4079608215806712, "learning_rate": 0.00019910889654311208, "loss": 0.0749, "step": 57 }, { "epoch": 3.625, "grad_norm": 0.2658074683405381, "learning_rate": 0.00019905415858090036, "loss": 0.0829, "step": 58 }, { "epoch": 3.6875, "grad_norm": 0.26176221704098207, "learning_rate": 0.00019899779713315575, "loss": 0.0711, "step": 59 }, { "epoch": 3.75, "grad_norm": 0.4107999604774971, "learning_rate": 0.00019893981312363562, "loss": 0.0925, "step": 60 }, { "epoch": 3.8125, "grad_norm": 0.28972083293965484, "learning_rate": 0.00019888020750269067, "loss": 0.1033, "step": 61 }, { "epoch": 3.875, "grad_norm": 0.22772110434236376, "learning_rate": 0.00019881898124724981, "loss": 0.0858, "step": 62 }, { "epoch": 3.9375, "grad_norm": 0.27554810738804436, "learning_rate": 0.0001987561353608038, "loss": 0.0988, "step": 63 }, { "epoch": 4.0, "grad_norm": 0.33177229863886515, "learning_rate": 0.00019869167087338907, "loss": 0.0558, "step": 64 }, { "epoch": 4.0625, "grad_norm": 0.307474933124812, "learning_rate": 0.00019862558884157068, "loss": 0.099, "step": 65 }, { "epoch": 4.125, "grad_norm": 0.2608601940862889, "learning_rate": 0.00019855789034842504, "loss": 0.0633, "step": 66 }, { "epoch": 4.1875, "grad_norm": 0.17638905908873861, "learning_rate": 0.00019848857650352214, "loss": 0.0646, "step": 67 }, { "epoch": 4.25, "grad_norm": 0.29040712597623347, "learning_rate": 0.00019841764844290744, "loss": 0.1065, "step": 68 }, { "epoch": 4.3125, "grad_norm": 0.2712830927822363, "learning_rate": 0.00019834510732908315, "loss": 0.0829, "step": 69 }, { "epoch": 4.375, "grad_norm": 0.23863314085657264, "learning_rate": 0.00019827095435098925, "loss": 0.0745, "step": 70 }, { "epoch": 4.4375, "grad_norm": 0.1793388758656021, "learning_rate": 0.000198195190723984, "loss": 0.072, "step": 71 }, { "epoch": 4.5, "grad_norm": 0.23980978867337854, "learning_rate": 0.0001981178176898239, "loss": 0.0838, "step": 72 }, { "epoch": 4.5625, "grad_norm": 0.2043201014604402, "learning_rate": 0.0001980388365166436, "loss": 0.0632, "step": 73 }, { "epoch": 4.625, "grad_norm": 0.15180530742317308, "learning_rate": 0.0001979582484989348, "loss": 0.0424, "step": 74 }, { "epoch": 4.6875, "grad_norm": 0.2547988574530317, "learning_rate": 0.00019787605495752528, "loss": 0.0851, "step": 75 }, { "epoch": 4.75, "grad_norm": 0.1869380541972453, "learning_rate": 0.00019779225723955707, "loss": 0.0565, "step": 76 }, { "epoch": 4.8125, "grad_norm": 0.2716796793411313, "learning_rate": 0.00019770685671846456, "loss": 0.0772, "step": 77 }, { "epoch": 4.875, "grad_norm": 0.17433079754490768, "learning_rate": 0.0001976198547939518, "loss": 0.0482, "step": 78 }, { "epoch": 4.9375, "grad_norm": 0.18011278074792128, "learning_rate": 0.0001975312528919697, "loss": 0.0518, "step": 79 }, { "epoch": 5.0, "grad_norm": 0.31968800498614336, "learning_rate": 0.00019744105246469263, "loss": 0.083, "step": 80 }, { "epoch": 5.0625, "grad_norm": 0.2357655152058512, "learning_rate": 0.00019734925499049447, "loss": 0.069, "step": 81 }, { "epoch": 5.125, "grad_norm": 0.34723380394306, "learning_rate": 0.0001972558619739246, "loss": 0.0514, "step": 82 }, { "epoch": 5.1875, "grad_norm": 0.21060705394218132, "learning_rate": 0.00019716087494568317, "loss": 0.0587, "step": 83 }, { "epoch": 5.25, "grad_norm": 0.24633955772510746, "learning_rate": 0.00019706429546259593, "loss": 0.0534, "step": 84 }, { "epoch": 5.3125, "grad_norm": 0.2306490127122376, "learning_rate": 0.00019696612510758876, "loss": 0.0585, "step": 85 }, { "epoch": 5.375, "grad_norm": 0.21690646083214937, "learning_rate": 0.00019686636548966178, "loss": 0.0545, "step": 86 }, { "epoch": 5.4375, "grad_norm": 0.25251567374450357, "learning_rate": 0.00019676501824386294, "loss": 0.049, "step": 87 }, { "epoch": 5.5, "grad_norm": 0.19640965019118795, "learning_rate": 0.00019666208503126112, "loss": 0.0419, "step": 88 }, { "epoch": 5.5625, "grad_norm": 0.268693556790637, "learning_rate": 0.00019655756753891916, "loss": 0.0619, "step": 89 }, { "epoch": 5.625, "grad_norm": 0.18291834239390523, "learning_rate": 0.0001964514674798659, "loss": 0.0295, "step": 90 }, { "epoch": 5.6875, "grad_norm": 0.24981548084974023, "learning_rate": 0.00019634378659306832, "loss": 0.0699, "step": 91 }, { "epoch": 5.75, "grad_norm": 0.2086985689956262, "learning_rate": 0.00019623452664340306, "loss": 0.0496, "step": 92 }, { "epoch": 5.8125, "grad_norm": 0.2043098976035677, "learning_rate": 0.0001961236894216272, "loss": 0.0407, "step": 93 }, { "epoch": 5.875, "grad_norm": 0.2730615592143012, "learning_rate": 0.00019601127674434928, "loss": 0.0631, "step": 94 }, { "epoch": 5.9375, "grad_norm": 0.3501805095305464, "learning_rate": 0.00019589729045399934, "loss": 0.071, "step": 95 }, { "epoch": 6.0, "grad_norm": 0.20729413014232384, "learning_rate": 0.00019578173241879872, "loss": 0.045, "step": 96 }, { "epoch": 6.0625, "grad_norm": 0.15750376698735663, "learning_rate": 0.00019566460453272945, "loss": 0.0346, "step": 97 }, { "epoch": 6.125, "grad_norm": 0.3137680018534623, "learning_rate": 0.0001955459087155033, "loss": 0.0442, "step": 98 }, { "epoch": 6.1875, "grad_norm": 0.24917821149174516, "learning_rate": 0.0001954256469125301, "loss": 0.0425, "step": 99 }, { "epoch": 6.25, "grad_norm": 0.15986754420947197, "learning_rate": 0.0001953038210948861, "loss": 0.0468, "step": 100 }, { "epoch": 6.3125, "grad_norm": 0.22286136536824913, "learning_rate": 0.00019518043325928157, "loss": 0.0506, "step": 101 }, { "epoch": 6.375, "grad_norm": 0.24914639465088984, "learning_rate": 0.00019505548542802804, "loss": 0.0706, "step": 102 }, { "epoch": 6.4375, "grad_norm": 0.1752134277500243, "learning_rate": 0.00019492897964900512, "loss": 0.0468, "step": 103 }, { "epoch": 6.5, "grad_norm": 0.1927795196511069, "learning_rate": 0.00019480091799562704, "loss": 0.0561, "step": 104 }, { "epoch": 6.5625, "grad_norm": 0.1700835087697781, "learning_rate": 0.00019467130256680868, "loss": 0.0381, "step": 105 }, { "epoch": 6.625, "grad_norm": 0.2497630067036982, "learning_rate": 0.00019454013548693102, "loss": 0.0437, "step": 106 }, { "epoch": 6.6875, "grad_norm": 0.18837543811238555, "learning_rate": 0.00019440741890580643, "loss": 0.0526, "step": 107 }, { "epoch": 6.75, "grad_norm": 0.22518955186910106, "learning_rate": 0.00019427315499864344, "loss": 0.0424, "step": 108 }, { "epoch": 6.8125, "grad_norm": 0.20500175172697058, "learning_rate": 0.00019413734596601104, "loss": 0.052, "step": 109 }, { "epoch": 6.875, "grad_norm": 0.12045773353666347, "learning_rate": 0.00019399999403380266, "loss": 0.0189, "step": 110 }, { "epoch": 6.9375, "grad_norm": 0.2148224667509155, "learning_rate": 0.00019386110145319963, "loss": 0.0461, "step": 111 }, { "epoch": 7.0, "grad_norm": 0.23216240017196932, "learning_rate": 0.00019372067050063438, "loss": 0.0628, "step": 112 }, { "epoch": 7.0625, "grad_norm": 0.27496419377253983, "learning_rate": 0.000193578703477753, "loss": 0.0504, "step": 113 }, { "epoch": 7.125, "grad_norm": 0.18897397234008034, "learning_rate": 0.00019343520271137763, "loss": 0.0418, "step": 114 }, { "epoch": 7.1875, "grad_norm": 0.17029862867910886, "learning_rate": 0.0001932901705534683, "loss": 0.0364, "step": 115 }, { "epoch": 7.25, "grad_norm": 0.1963658278569445, "learning_rate": 0.00019314360938108425, "loss": 0.0495, "step": 116 }, { "epoch": 7.3125, "grad_norm": 0.24460981520111755, "learning_rate": 0.00019299552159634517, "loss": 0.052, "step": 117 }, { "epoch": 7.375, "grad_norm": 0.2841837300999824, "learning_rate": 0.00019284590962639176, "loss": 0.0251, "step": 118 }, { "epoch": 7.4375, "grad_norm": 0.28136757069268153, "learning_rate": 0.0001926947759233459, "loss": 0.0583, "step": 119 }, { "epoch": 7.5, "grad_norm": 0.14123250014906416, "learning_rate": 0.00019254212296427044, "loss": 0.0247, "step": 120 }, { "epoch": 7.5625, "grad_norm": 0.24291209764375074, "learning_rate": 0.0001923879532511287, "loss": 0.041, "step": 121 }, { "epoch": 7.625, "grad_norm": 0.16313867008770622, "learning_rate": 0.0001922322693107434, "loss": 0.0267, "step": 122 }, { "epoch": 7.6875, "grad_norm": 0.20009496777109764, "learning_rate": 0.0001920750736947553, "loss": 0.0451, "step": 123 }, { "epoch": 7.75, "grad_norm": 0.33391229054639826, "learning_rate": 0.00019191636897958122, "loss": 0.0659, "step": 124 }, { "epoch": 7.8125, "grad_norm": 0.20026703724460174, "learning_rate": 0.0001917561577663721, "loss": 0.0309, "step": 125 }, { "epoch": 7.875, "grad_norm": 0.22455588585805783, "learning_rate": 0.00019159444268097012, "loss": 0.0396, "step": 126 }, { "epoch": 7.9375, "grad_norm": 0.26799981180801946, "learning_rate": 0.00019143122637386566, "loss": 0.0541, "step": 127 }, { "epoch": 8.0, "grad_norm": 0.19547469523965652, "learning_rate": 0.00019126651152015403, "loss": 0.0551, "step": 128 }, { "epoch": 8.0625, "grad_norm": 0.1544644919932735, "learning_rate": 0.00019110030081949156, "loss": 0.025, "step": 129 }, { "epoch": 8.125, "grad_norm": 0.19630976778261258, "learning_rate": 0.00019093259699605125, "loss": 0.0301, "step": 130 }, { "epoch": 8.1875, "grad_norm": 0.17570512618519246, "learning_rate": 0.0001907634027984782, "loss": 0.0284, "step": 131 }, { "epoch": 8.25, "grad_norm": 0.15656419342775668, "learning_rate": 0.0001905927209998447, "loss": 0.0316, "step": 132 }, { "epoch": 8.3125, "grad_norm": 0.20086086826122584, "learning_rate": 0.00019042055439760444, "loss": 0.0406, "step": 133 }, { "epoch": 8.375, "grad_norm": 0.17929729489809332, "learning_rate": 0.000190246905813547, "loss": 0.0306, "step": 134 }, { "epoch": 8.4375, "grad_norm": 0.17408633598619777, "learning_rate": 0.0001900717780937514, "loss": 0.0331, "step": 135 }, { "epoch": 8.5, "grad_norm": 0.22245736981289757, "learning_rate": 0.00018989517410853955, "loss": 0.0296, "step": 136 }, { "epoch": 8.5625, "grad_norm": 0.15106374249798415, "learning_rate": 0.0001897170967524291, "loss": 0.0195, "step": 137 }, { "epoch": 8.625, "grad_norm": 0.19583907014296853, "learning_rate": 0.00018953754894408616, "loss": 0.034, "step": 138 }, { "epoch": 8.6875, "grad_norm": 0.2516341266285177, "learning_rate": 0.0001893565336262773, "loss": 0.0397, "step": 139 }, { "epoch": 8.75, "grad_norm": 0.21858233256386844, "learning_rate": 0.00018917405376582145, "loss": 0.0413, "step": 140 }, { "epoch": 8.8125, "grad_norm": 0.2198644417350149, "learning_rate": 0.00018899011235354115, "loss": 0.037, "step": 141 }, { "epoch": 8.875, "grad_norm": 0.14391410038600197, "learning_rate": 0.00018880471240421365, "loss": 0.0243, "step": 142 }, { "epoch": 8.9375, "grad_norm": 0.19142537938933432, "learning_rate": 0.00018861785695652142, "loss": 0.0378, "step": 143 }, { "epoch": 9.0, "grad_norm": 0.273105976253114, "learning_rate": 0.00018842954907300236, "loss": 0.0335, "step": 144 }, { "epoch": 9.0625, "grad_norm": 0.1764202905014519, "learning_rate": 0.00018823979183999964, "loss": 0.0269, "step": 145 }, { "epoch": 9.125, "grad_norm": 0.16768199512274265, "learning_rate": 0.00018804858836761107, "loss": 0.0274, "step": 146 }, { "epoch": 9.1875, "grad_norm": 0.11993797625263519, "learning_rate": 0.0001878559417896382, "loss": 0.0197, "step": 147 }, { "epoch": 9.25, "grad_norm": 0.1113638157849448, "learning_rate": 0.0001876618552635348, "loss": 0.0144, "step": 148 }, { "epoch": 9.3125, "grad_norm": 0.404272192796613, "learning_rate": 0.00018746633197035527, "loss": 0.0623, "step": 149 }, { "epoch": 9.375, "grad_norm": 0.21941870781755787, "learning_rate": 0.00018726937511470246, "loss": 0.0353, "step": 150 }, { "epoch": 9.4375, "grad_norm": 0.15013935972184475, "learning_rate": 0.00018707098792467515, "loss": 0.0212, "step": 151 }, { "epoch": 9.5, "grad_norm": 0.15110805919842035, "learning_rate": 0.00018687117365181512, "loss": 0.0218, "step": 152 }, { "epoch": 9.5625, "grad_norm": 0.2279637543691053, "learning_rate": 0.00018666993557105377, "loss": 0.0384, "step": 153 }, { "epoch": 9.625, "grad_norm": 0.22049558442795594, "learning_rate": 0.00018646727698065865, "loss": 0.0386, "step": 154 }, { "epoch": 9.6875, "grad_norm": 0.19273043393336428, "learning_rate": 0.00018626320120217923, "loss": 0.0261, "step": 155 }, { "epoch": 9.75, "grad_norm": 0.28617400835044193, "learning_rate": 0.00018605771158039253, "loss": 0.0366, "step": 156 }, { "epoch": 9.8125, "grad_norm": 0.18709984153605747, "learning_rate": 0.00018585081148324832, "loss": 0.0291, "step": 157 }, { "epoch": 9.875, "grad_norm": 0.1292782602493134, "learning_rate": 0.00018564250430181387, "loss": 0.0199, "step": 158 }, { "epoch": 9.9375, "grad_norm": 0.13979049475637031, "learning_rate": 0.00018543279345021834, "loss": 0.0157, "step": 159 }, { "epoch": 10.0, "grad_norm": 0.19965128862936724, "learning_rate": 0.00018522168236559695, "loss": 0.0323, "step": 160 }, { "epoch": 10.0625, "grad_norm": 0.15854406411462987, "learning_rate": 0.0001850091745080345, "loss": 0.029, "step": 161 }, { "epoch": 10.125, "grad_norm": 0.21210803120758442, "learning_rate": 0.00018479527336050878, "loss": 0.0275, "step": 162 }, { "epoch": 10.1875, "grad_norm": 0.14570318547786973, "learning_rate": 0.00018457998242883344, "loss": 0.0198, "step": 163 }, { "epoch": 10.25, "grad_norm": 0.1365650856121692, "learning_rate": 0.00018436330524160047, "loss": 0.0187, "step": 164 }, { "epoch": 10.3125, "grad_norm": 0.14366514107884812, "learning_rate": 0.00018414524535012244, "loss": 0.0201, "step": 165 }, { "epoch": 10.375, "grad_norm": 0.11689977724032004, "learning_rate": 0.00018392580632837423, "loss": 0.0127, "step": 166 }, { "epoch": 10.4375, "grad_norm": 0.14547947591736377, "learning_rate": 0.00018370499177293464, "loss": 0.021, "step": 167 }, { "epoch": 10.5, "grad_norm": 0.13962090636283936, "learning_rate": 0.00018348280530292713, "loss": 0.0198, "step": 168 }, { "epoch": 10.5625, "grad_norm": 0.16513279241014592, "learning_rate": 0.00018325925055996076, "loss": 0.0292, "step": 169 }, { "epoch": 10.625, "grad_norm": 0.09476674044103778, "learning_rate": 0.0001830343312080704, "loss": 0.0163, "step": 170 }, { "epoch": 10.6875, "grad_norm": 0.2189319278786043, "learning_rate": 0.00018280805093365672, "loss": 0.0267, "step": 171 }, { "epoch": 10.75, "grad_norm": 0.11950416301901717, "learning_rate": 0.00018258041344542566, "loss": 0.0162, "step": 172 }, { "epoch": 10.8125, "grad_norm": 0.21271468962636456, "learning_rate": 0.00018235142247432782, "loss": 0.0341, "step": 173 }, { "epoch": 10.875, "grad_norm": 0.26291334746224676, "learning_rate": 0.0001821210817734972, "loss": 0.0183, "step": 174 }, { "epoch": 10.9375, "grad_norm": 0.7541232207929679, "learning_rate": 0.00018188939511818965, "loss": 0.0341, "step": 175 }, { "epoch": 11.0, "grad_norm": 0.2668657582109521, "learning_rate": 0.0001816563663057211, "loss": 0.0248, "step": 176 }, { "epoch": 11.0625, "grad_norm": 0.188760844502015, "learning_rate": 0.00018142199915540527, "loss": 0.0167, "step": 177 }, { "epoch": 11.125, "grad_norm": 0.13049188810156132, "learning_rate": 0.00018118629750849105, "loss": 0.0111, "step": 178 }, { "epoch": 11.1875, "grad_norm": 0.12147377234191868, "learning_rate": 0.0001809492652280996, "loss": 0.0132, "step": 179 }, { "epoch": 11.25, "grad_norm": 0.1561909420624479, "learning_rate": 0.00018071090619916093, "loss": 0.017, "step": 180 }, { "epoch": 11.3125, "grad_norm": 0.2761203872951007, "learning_rate": 0.00018047122432835038, "loss": 0.0242, "step": 181 }, { "epoch": 11.375, "grad_norm": 0.13770345739451892, "learning_rate": 0.0001802302235440245, "loss": 0.0168, "step": 182 }, { "epoch": 11.4375, "grad_norm": 0.18722163720923593, "learning_rate": 0.0001799879077961566, "loss": 0.0301, "step": 183 }, { "epoch": 11.5, "grad_norm": 0.16157498504579615, "learning_rate": 0.00017974428105627208, "loss": 0.0188, "step": 184 }, { "epoch": 11.5625, "grad_norm": 0.11914193260548754, "learning_rate": 0.00017949934731738347, "loss": 0.0167, "step": 185 }, { "epoch": 11.625, "grad_norm": 0.21366231459710572, "learning_rate": 0.0001792531105939247, "loss": 0.0307, "step": 186 }, { "epoch": 11.6875, "grad_norm": 0.12591549609993827, "learning_rate": 0.0001790055749216856, "loss": 0.0156, "step": 187 }, { "epoch": 11.75, "grad_norm": 0.17099000320728966, "learning_rate": 0.00017875674435774547, "loss": 0.0213, "step": 188 }, { "epoch": 11.8125, "grad_norm": 0.21223314589021008, "learning_rate": 0.00017850662298040678, "loss": 0.0303, "step": 189 }, { "epoch": 11.875, "grad_norm": 0.18967657804115237, "learning_rate": 0.0001782552148891283, "loss": 0.0147, "step": 190 }, { "epoch": 11.9375, "grad_norm": 0.2411319114734683, "learning_rate": 0.00017800252420445788, "loss": 0.0356, "step": 191 }, { "epoch": 12.0, "grad_norm": 0.19592901070523805, "learning_rate": 0.00017774855506796496, "loss": 0.0361, "step": 192 }, { "epoch": 12.0625, "grad_norm": 0.1922774431645892, "learning_rate": 0.0001774933116421725, "loss": 0.0163, "step": 193 }, { "epoch": 12.125, "grad_norm": 0.1336099627438813, "learning_rate": 0.00017723679811048904, "loss": 0.016, "step": 194 }, { "epoch": 12.1875, "grad_norm": 0.11902506990463049, "learning_rate": 0.00017697901867713995, "loss": 0.0128, "step": 195 }, { "epoch": 12.25, "grad_norm": 0.15265650075862036, "learning_rate": 0.00017671997756709863, "loss": 0.0158, "step": 196 }, { "epoch": 12.3125, "grad_norm": 0.10409493403901945, "learning_rate": 0.0001764596790260171, "loss": 0.01, "step": 197 }, { "epoch": 12.375, "grad_norm": 0.12592944148803245, "learning_rate": 0.00017619812732015664, "loss": 0.0081, "step": 198 }, { "epoch": 12.4375, "grad_norm": 0.15892735182879292, "learning_rate": 0.00017593532673631766, "loss": 0.0226, "step": 199 }, { "epoch": 12.5, "grad_norm": 0.16370083252883327, "learning_rate": 0.00017567128158176953, "loss": 0.0156, "step": 200 }, { "epoch": 12.5625, "grad_norm": 0.1926319403468188, "learning_rate": 0.00017540599618418007, "loss": 0.0198, "step": 201 }, { "epoch": 12.625, "grad_norm": 0.1361218264727559, "learning_rate": 0.00017513947489154443, "loss": 0.014, "step": 202 }, { "epoch": 12.6875, "grad_norm": 0.15346622202020466, "learning_rate": 0.00017487172207211396, "loss": 0.0149, "step": 203 }, { "epoch": 12.75, "grad_norm": 0.1582659958129119, "learning_rate": 0.0001746027421143246, "loss": 0.0209, "step": 204 }, { "epoch": 12.8125, "grad_norm": 0.08422642994909509, "learning_rate": 0.00017433253942672496, "loss": 0.0107, "step": 205 }, { "epoch": 12.875, "grad_norm": 0.14151191719126865, "learning_rate": 0.000174061118437904, "loss": 0.0163, "step": 206 }, { "epoch": 12.9375, "grad_norm": 0.21170441141264745, "learning_rate": 0.00017378848359641847, "loss": 0.0248, "step": 207 }, { "epoch": 13.0, "grad_norm": 0.21854641357858184, "learning_rate": 0.00017351463937072004, "loss": 0.0314, "step": 208 }, { "epoch": 13.0625, "grad_norm": 0.1460880269124164, "learning_rate": 0.00017323959024908209, "loss": 0.01, "step": 209 }, { "epoch": 13.125, "grad_norm": 0.2314775735978235, "learning_rate": 0.00017296334073952605, "loss": 0.0158, "step": 210 }, { "epoch": 13.1875, "grad_norm": 0.25609276829659133, "learning_rate": 0.0001726858953697475, "loss": 0.0228, "step": 211 }, { "epoch": 13.25, "grad_norm": 0.05892147625450794, "learning_rate": 0.00017240725868704218, "loss": 0.0054, "step": 212 }, { "epoch": 13.3125, "grad_norm": 0.09265829411728602, "learning_rate": 0.00017212743525823112, "loss": 0.0121, "step": 213 }, { "epoch": 13.375, "grad_norm": 0.28766545739529126, "learning_rate": 0.0001718464296695861, "loss": 0.0166, "step": 214 }, { "epoch": 13.4375, "grad_norm": 0.22056507132944547, "learning_rate": 0.0001715642465267543, "loss": 0.0189, "step": 215 }, { "epoch": 13.5, "grad_norm": 0.20425585831942672, "learning_rate": 0.00017128089045468294, "loss": 0.0226, "step": 216 }, { "epoch": 13.5625, "grad_norm": 0.08920463828133202, "learning_rate": 0.00017099636609754329, "loss": 0.0111, "step": 217 }, { "epoch": 13.625, "grad_norm": 0.07587770729105793, "learning_rate": 0.00017071067811865476, "loss": 0.0107, "step": 218 }, { "epoch": 13.6875, "grad_norm": 0.2436581509018354, "learning_rate": 0.00017042383120040834, "loss": 0.0122, "step": 219 }, { "epoch": 13.75, "grad_norm": 0.2776351805516073, "learning_rate": 0.00017013583004418993, "loss": 0.0338, "step": 220 }, { "epoch": 13.8125, "grad_norm": 0.1500695552486999, "learning_rate": 0.00016984667937030318, "loss": 0.0128, "step": 221 }, { "epoch": 13.875, "grad_norm": 0.19022226617986523, "learning_rate": 0.00016955638391789228, "loss": 0.0182, "step": 222 }, { "epoch": 13.9375, "grad_norm": 0.07668703300214032, "learning_rate": 0.00016926494844486412, "loss": 0.0081, "step": 223 }, { "epoch": 14.0, "grad_norm": 0.09791844205901491, "learning_rate": 0.00016897237772781044, "loss": 0.0102, "step": 224 }, { "epoch": 14.0625, "grad_norm": 0.13334638868212356, "learning_rate": 0.00016867867656192946, "loss": 0.0154, "step": 225 }, { "epoch": 14.125, "grad_norm": 0.0443847074815828, "learning_rate": 0.00016838384976094738, "loss": 0.0038, "step": 226 }, { "epoch": 14.1875, "grad_norm": 0.22900928937804826, "learning_rate": 0.00016808790215703935, "loss": 0.0146, "step": 227 }, { "epoch": 14.25, "grad_norm": 0.09820050963868261, "learning_rate": 0.00016779083860075033, "loss": 0.0139, "step": 228 }, { "epoch": 14.3125, "grad_norm": 0.12261110863547245, "learning_rate": 0.0001674926639609157, "loss": 0.0081, "step": 229 }, { "epoch": 14.375, "grad_norm": 0.4737769261848427, "learning_rate": 0.00016719338312458124, "loss": 0.0196, "step": 230 }, { "epoch": 14.4375, "grad_norm": 0.0719988039842083, "learning_rate": 0.00016689300099692332, "loss": 0.0075, "step": 231 }, { "epoch": 14.5, "grad_norm": 0.24824987963252254, "learning_rate": 0.00016659152250116812, "loss": 0.0095, "step": 232 }, { "epoch": 14.5625, "grad_norm": 0.1806409419450783, "learning_rate": 0.00016628895257851135, "loss": 0.0177, "step": 233 }, { "epoch": 14.625, "grad_norm": 0.12022842892878163, "learning_rate": 0.000165985296188037, "loss": 0.0108, "step": 234 }, { "epoch": 14.6875, "grad_norm": 0.27573651025583323, "learning_rate": 0.0001656805583066361, "loss": 0.0408, "step": 235 }, { "epoch": 14.75, "grad_norm": 0.13027830763516066, "learning_rate": 0.00016537474392892528, "loss": 0.0164, "step": 236 }, { "epoch": 14.8125, "grad_norm": 0.2190711849455461, "learning_rate": 0.00016506785806716465, "loss": 0.0381, "step": 237 }, { "epoch": 14.875, "grad_norm": 0.18889153886713622, "learning_rate": 0.00016475990575117605, "loss": 0.0137, "step": 238 }, { "epoch": 14.9375, "grad_norm": 0.17427492795979294, "learning_rate": 0.0001644508920282601, "loss": 0.0259, "step": 239 }, { "epoch": 15.0, "grad_norm": 0.13217744356726124, "learning_rate": 0.000164140821963114, "loss": 0.0099, "step": 240 }, { "epoch": 15.0625, "grad_norm": 0.08062959617570911, "learning_rate": 0.0001638297006377481, "loss": 0.0065, "step": 241 }, { "epoch": 15.125, "grad_norm": 0.11874477325134665, "learning_rate": 0.00016351753315140287, "loss": 0.0132, "step": 242 }, { "epoch": 15.1875, "grad_norm": 0.08293002909973335, "learning_rate": 0.00016320432462046516, "loss": 0.0093, "step": 243 }, { "epoch": 15.25, "grad_norm": 0.08700808300439221, "learning_rate": 0.00016289008017838445, "loss": 0.0077, "step": 244 }, { "epoch": 15.3125, "grad_norm": 0.12073296585647869, "learning_rate": 0.00016257480497558873, "loss": 0.0096, "step": 245 }, { "epoch": 15.375, "grad_norm": 0.07654303999452532, "learning_rate": 0.0001622585041793999, "loss": 0.0059, "step": 246 }, { "epoch": 15.4375, "grad_norm": 0.2562520960634689, "learning_rate": 0.00016194118297394936, "loss": 0.0263, "step": 247 }, { "epoch": 15.5, "grad_norm": 0.08068310167444095, "learning_rate": 0.00016162284656009274, "loss": 0.0062, "step": 248 }, { "epoch": 15.5625, "grad_norm": 0.2090301776269612, "learning_rate": 0.00016130350015532496, "loss": 0.0201, "step": 249 }, { "epoch": 15.625, "grad_norm": 0.18851005491544473, "learning_rate": 0.00016098314899369446, "loss": 0.0129, "step": 250 }, { "epoch": 15.6875, "grad_norm": 0.10484133416084232, "learning_rate": 0.0001606617983257176, "loss": 0.0058, "step": 251 }, { "epoch": 15.75, "grad_norm": 0.17883267452407117, "learning_rate": 0.00016033945341829248, "loss": 0.0194, "step": 252 }, { "epoch": 15.8125, "grad_norm": 0.15051127763163427, "learning_rate": 0.00016001611955461265, "loss": 0.011, "step": 253 }, { "epoch": 15.875, "grad_norm": 0.09161429352244004, "learning_rate": 0.0001596918020340805, "loss": 0.0045, "step": 254 }, { "epoch": 15.9375, "grad_norm": 0.11734353010884248, "learning_rate": 0.00015936650617222063, "loss": 0.007, "step": 255 }, { "epoch": 16.0, "grad_norm": 0.1956323556889479, "learning_rate": 0.00015904023730059228, "loss": 0.0165, "step": 256 }, { "epoch": 16.0625, "grad_norm": 0.14747898179025032, "learning_rate": 0.00015871300076670234, "loss": 0.0146, "step": 257 }, { "epoch": 16.125, "grad_norm": 0.18498598991778836, "learning_rate": 0.00015838480193391754, "loss": 0.0102, "step": 258 }, { "epoch": 16.1875, "grad_norm": 0.2419734729440462, "learning_rate": 0.0001580556461813766, "loss": 0.02, "step": 259 }, { "epoch": 16.25, "grad_norm": 0.13549389704091608, "learning_rate": 0.00015772553890390197, "loss": 0.0096, "step": 260 }, { "epoch": 16.3125, "grad_norm": 0.09488023406511628, "learning_rate": 0.0001573944855119115, "loss": 0.0142, "step": 261 }, { "epoch": 16.375, "grad_norm": 0.08589054588641899, "learning_rate": 0.00015706249143132982, "loss": 0.0086, "step": 262 }, { "epoch": 16.4375, "grad_norm": 0.18599116361768675, "learning_rate": 0.00015672956210349923, "loss": 0.0158, "step": 263 }, { "epoch": 16.5, "grad_norm": 0.13790931505797482, "learning_rate": 0.00015639570298509064, "loss": 0.0076, "step": 264 }, { "epoch": 16.5625, "grad_norm": 0.14746745744271494, "learning_rate": 0.0001560609195480142, "loss": 0.0144, "step": 265 }, { "epoch": 16.625, "grad_norm": 0.1747110630319595, "learning_rate": 0.00015572521727932935, "loss": 0.0209, "step": 266 }, { "epoch": 16.6875, "grad_norm": 0.12294368607669189, "learning_rate": 0.00015538860168115527, "loss": 0.0076, "step": 267 }, { "epoch": 16.75, "grad_norm": 0.20591709989710746, "learning_rate": 0.00015505107827058036, "loss": 0.0109, "step": 268 }, { "epoch": 16.8125, "grad_norm": 0.12788905057548938, "learning_rate": 0.00015471265257957202, "loss": 0.0137, "step": 269 }, { "epoch": 16.875, "grad_norm": 0.0907643332122881, "learning_rate": 0.00015437333015488587, "loss": 0.004, "step": 270 }, { "epoch": 16.9375, "grad_norm": 0.11598695941727767, "learning_rate": 0.00015403311655797492, "loss": 0.0173, "step": 271 }, { "epoch": 17.0, "grad_norm": 0.1274596211332466, "learning_rate": 0.0001536920173648984, "loss": 0.0137, "step": 272 }, { "epoch": 17.0625, "grad_norm": 0.11710054449245828, "learning_rate": 0.00015335003816623028, "loss": 0.0114, "step": 273 }, { "epoch": 17.125, "grad_norm": 0.10810173543943251, "learning_rate": 0.00015300718456696778, "loss": 0.0083, "step": 274 }, { "epoch": 17.1875, "grad_norm": 0.11490194551290545, "learning_rate": 0.00015266346218643947, "loss": 0.0104, "step": 275 }, { "epoch": 17.25, "grad_norm": 0.10496790427956344, "learning_rate": 0.000152318876658213, "loss": 0.0117, "step": 276 }, { "epoch": 17.3125, "grad_norm": 0.13303819260520472, "learning_rate": 0.00015197343363000307, "loss": 0.0172, "step": 277 }, { "epoch": 17.375, "grad_norm": 0.07976187691489159, "learning_rate": 0.00015162713876357858, "loss": 0.0118, "step": 278 }, { "epoch": 17.4375, "grad_norm": 0.020200494852139077, "learning_rate": 0.00015127999773467002, "loss": 0.0018, "step": 279 }, { "epoch": 17.5, "grad_norm": 0.08302979036485983, "learning_rate": 0.00015093201623287631, "loss": 0.0074, "step": 280 }, { "epoch": 17.5625, "grad_norm": 0.09709353670094575, "learning_rate": 0.00015058319996157172, "loss": 0.0141, "step": 281 }, { "epoch": 17.625, "grad_norm": 0.10500205069763988, "learning_rate": 0.0001502335546378122, "loss": 0.0082, "step": 282 }, { "epoch": 17.6875, "grad_norm": 0.045653846262314286, "learning_rate": 0.00014988308599224183, "loss": 0.0037, "step": 283 }, { "epoch": 17.75, "grad_norm": 0.07350542815715672, "learning_rate": 0.00014953179976899878, "loss": 0.007, "step": 284 }, { "epoch": 17.8125, "grad_norm": 0.0740842955766792, "learning_rate": 0.0001491797017256212, "loss": 0.0041, "step": 285 }, { "epoch": 17.875, "grad_norm": 0.11610913098575787, "learning_rate": 0.00014882679763295306, "loss": 0.0177, "step": 286 }, { "epoch": 17.9375, "grad_norm": 0.11006673147506568, "learning_rate": 0.0001484730932750491, "loss": 0.0124, "step": 287 }, { "epoch": 18.0, "grad_norm": 0.17310654116013702, "learning_rate": 0.00014811859444908052, "loss": 0.0174, "step": 288 }, { "epoch": 18.0625, "grad_norm": 0.05397642477813828, "learning_rate": 0.00014776330696523963, "loss": 0.0047, "step": 289 }, { "epoch": 18.125, "grad_norm": 0.07510434928834142, "learning_rate": 0.00014740723664664483, "loss": 0.0084, "step": 290 }, { "epoch": 18.1875, "grad_norm": 0.15571308024563857, "learning_rate": 0.00014705038932924503, "loss": 0.0061, "step": 291 }, { "epoch": 18.25, "grad_norm": 0.04276361043100037, "learning_rate": 0.00014669277086172406, "loss": 0.0052, "step": 292 }, { "epoch": 18.3125, "grad_norm": 0.10566723422506075, "learning_rate": 0.00014633438710540489, "loss": 0.0095, "step": 293 }, { "epoch": 18.375, "grad_norm": 0.12332021633992264, "learning_rate": 0.00014597524393415335, "loss": 0.0125, "step": 294 }, { "epoch": 18.4375, "grad_norm": 0.06085873636051086, "learning_rate": 0.00014561534723428205, "loss": 0.0036, "step": 295 }, { "epoch": 18.5, "grad_norm": 0.08889363982250575, "learning_rate": 0.00014525470290445392, "loss": 0.0062, "step": 296 }, { "epoch": 18.5625, "grad_norm": 0.03473335852451447, "learning_rate": 0.00014489331685558525, "loss": 0.0026, "step": 297 }, { "epoch": 18.625, "grad_norm": 0.08852650667058536, "learning_rate": 0.00014453119501074924, "loss": 0.011, "step": 298 }, { "epoch": 18.6875, "grad_norm": 0.03288743535270982, "learning_rate": 0.00014416834330507856, "loss": 0.0031, "step": 299 }, { "epoch": 18.75, "grad_norm": 0.16831306636699117, "learning_rate": 0.00014380476768566824, "loss": 0.0142, "step": 300 }, { "epoch": 18.8125, "grad_norm": 0.060653415545579646, "learning_rate": 0.00014344047411147818, "loss": 0.0084, "step": 301 }, { "epoch": 18.875, "grad_norm": 0.0379853861449074, "learning_rate": 0.00014307546855323549, "loss": 0.0022, "step": 302 }, { "epoch": 18.9375, "grad_norm": 0.09664333466138222, "learning_rate": 0.00014270975699333654, "loss": 0.0057, "step": 303 }, { "epoch": 19.0, "grad_norm": 0.09719048721891094, "learning_rate": 0.00014234334542574906, "loss": 0.0092, "step": 304 }, { "epoch": 19.0625, "grad_norm": 0.04572304214406104, "learning_rate": 0.00014197623985591373, "loss": 0.0018, "step": 305 }, { "epoch": 19.125, "grad_norm": 0.05097257049081179, "learning_rate": 0.00014160844630064595, "loss": 0.0057, "step": 306 }, { "epoch": 19.1875, "grad_norm": 0.030553512605281084, "learning_rate": 0.00014123997078803707, "loss": 0.0026, "step": 307 }, { "epoch": 19.25, "grad_norm": 0.05445407477912121, "learning_rate": 0.00014087081935735564, "loss": 0.0057, "step": 308 }, { "epoch": 19.3125, "grad_norm": 0.06371575221400023, "learning_rate": 0.00014050099805894837, "loss": 0.0084, "step": 309 }, { "epoch": 19.375, "grad_norm": 0.06261401628069665, "learning_rate": 0.00014013051295414108, "loss": 0.0081, "step": 310 }, { "epoch": 19.4375, "grad_norm": 0.035834190161274884, "learning_rate": 0.00013975937011513932, "loss": 0.0045, "step": 311 }, { "epoch": 19.5, "grad_norm": 0.0524760162484543, "learning_rate": 0.00013938757562492873, "loss": 0.0062, "step": 312 }, { "epoch": 19.5625, "grad_norm": 0.10667039377919106, "learning_rate": 0.00013901513557717553, "loss": 0.0041, "step": 313 }, { "epoch": 19.625, "grad_norm": 0.05041143161792446, "learning_rate": 0.00013864205607612648, "loss": 0.0052, "step": 314 }, { "epoch": 19.6875, "grad_norm": 0.0424817284436791, "learning_rate": 0.000138268343236509, "loss": 0.0055, "step": 315 }, { "epoch": 19.75, "grad_norm": 0.05668813950953166, "learning_rate": 0.00013789400318343068, "loss": 0.0071, "step": 316 }, { "epoch": 19.8125, "grad_norm": 0.028939867545089322, "learning_rate": 0.0001375190420522792, "loss": 0.003, "step": 317 }, { "epoch": 19.875, "grad_norm": 0.06412293116062714, "learning_rate": 0.00013714346598862166, "loss": 0.0067, "step": 318 }, { "epoch": 19.9375, "grad_norm": 0.06349552195339572, "learning_rate": 0.00013676728114810367, "loss": 0.0068, "step": 319 }, { "epoch": 20.0, "grad_norm": 0.09285652351669382, "learning_rate": 0.00013639049369634876, "loss": 0.0108, "step": 320 }, { "epoch": 20.0625, "grad_norm": 0.038426113694616765, "learning_rate": 0.00013601310980885714, "loss": 0.0039, "step": 321 }, { "epoch": 20.125, "grad_norm": 0.03317557018896503, "learning_rate": 0.0001356351356709045, "loss": 0.003, "step": 322 }, { "epoch": 20.1875, "grad_norm": 0.059248532620137406, "learning_rate": 0.00013525657747744072, "loss": 0.0059, "step": 323 }, { "epoch": 20.25, "grad_norm": 0.05514984219384242, "learning_rate": 0.00013487744143298822, "loss": 0.004, "step": 324 }, { "epoch": 20.3125, "grad_norm": 0.041791984793344325, "learning_rate": 0.0001344977337515404, "loss": 0.0039, "step": 325 }, { "epoch": 20.375, "grad_norm": 0.06346255904291057, "learning_rate": 0.0001341174606564596, "loss": 0.0078, "step": 326 }, { "epoch": 20.4375, "grad_norm": 0.029692961959192875, "learning_rate": 0.00013373662838037537, "loss": 0.0027, "step": 327 }, { "epoch": 20.5, "grad_norm": 0.0481477761395951, "learning_rate": 0.00013335524316508208, "loss": 0.0072, "step": 328 }, { "epoch": 20.5625, "grad_norm": 0.07552298978231338, "learning_rate": 0.00013297331126143667, "loss": 0.0042, "step": 329 }, { "epoch": 20.625, "grad_norm": 0.03033536526572307, "learning_rate": 0.00013259083892925633, "loss": 0.0018, "step": 330 }, { "epoch": 20.6875, "grad_norm": 0.020436451514801952, "learning_rate": 0.00013220783243721572, "loss": 0.0018, "step": 331 }, { "epoch": 20.75, "grad_norm": 0.05066849555968109, "learning_rate": 0.0001318242980627444, "loss": 0.0068, "step": 332 }, { "epoch": 20.8125, "grad_norm": 0.062213499192457326, "learning_rate": 0.0001314402420919238, "loss": 0.0071, "step": 333 }, { "epoch": 20.875, "grad_norm": 0.0602651350512265, "learning_rate": 0.00013105567081938424, "loss": 0.0057, "step": 334 }, { "epoch": 20.9375, "grad_norm": 0.07142806102643208, "learning_rate": 0.00013067059054820183, "loss": 0.011, "step": 335 }, { "epoch": 21.0, "grad_norm": 0.11306410649612093, "learning_rate": 0.00013028500758979506, "loss": 0.0061, "step": 336 }, { "epoch": 21.0625, "grad_norm": 0.04077734941537789, "learning_rate": 0.00012989892826382145, "loss": 0.0047, "step": 337 }, { "epoch": 21.125, "grad_norm": 0.025438845626494507, "learning_rate": 0.00012951235889807386, "loss": 0.0024, "step": 338 }, { "epoch": 21.1875, "grad_norm": 0.04068069801179701, "learning_rate": 0.00012912530582837682, "loss": 0.0057, "step": 339 }, { "epoch": 21.25, "grad_norm": 0.06229646373645838, "learning_rate": 0.00012873777539848283, "loss": 0.0058, "step": 340 }, { "epoch": 21.3125, "grad_norm": 0.05047183477805069, "learning_rate": 0.00012834977395996818, "loss": 0.0073, "step": 341 }, { "epoch": 21.375, "grad_norm": 0.03503202770379194, "learning_rate": 0.0001279613078721289, "loss": 0.0035, "step": 342 }, { "epoch": 21.4375, "grad_norm": 0.02025118966232832, "learning_rate": 0.0001275723835018767, "loss": 0.0011, "step": 343 }, { "epoch": 21.5, "grad_norm": 0.04264241354679371, "learning_rate": 0.0001271830072236343, "loss": 0.0048, "step": 344 }, { "epoch": 21.5625, "grad_norm": 0.059381097002962034, "learning_rate": 0.0001267931854192313, "loss": 0.0065, "step": 345 }, { "epoch": 21.625, "grad_norm": 0.03970878770263772, "learning_rate": 0.0001264029244777993, "loss": 0.0052, "step": 346 }, { "epoch": 21.6875, "grad_norm": 0.04854204024259956, "learning_rate": 0.00012601223079566743, "loss": 0.0036, "step": 347 }, { "epoch": 21.75, "grad_norm": 0.06683236405906955, "learning_rate": 0.00012562111077625722, "loss": 0.0078, "step": 348 }, { "epoch": 21.8125, "grad_norm": 0.030148690304593224, "learning_rate": 0.000125229570829978, "loss": 0.0044, "step": 349 }, { "epoch": 21.875, "grad_norm": 0.043501391695934324, "learning_rate": 0.0001248376173741215, "loss": 0.0044, "step": 350 }, { "epoch": 21.9375, "grad_norm": 0.03558605264264846, "learning_rate": 0.00012444525683275688, "loss": 0.0045, "step": 351 }, { "epoch": 22.0, "grad_norm": 0.04991180242909837, "learning_rate": 0.00012405249563662537, "loss": 0.0067, "step": 352 }, { "epoch": 22.0625, "grad_norm": 0.037891384595762335, "learning_rate": 0.00012365934022303491, "loss": 0.006, "step": 353 }, { "epoch": 22.125, "grad_norm": 0.029393462316367327, "learning_rate": 0.00012326579703575462, "loss": 0.0059, "step": 354 }, { "epoch": 22.1875, "grad_norm": 0.02302636254784467, "learning_rate": 0.00012287187252490913, "loss": 0.002, "step": 355 }, { "epoch": 22.25, "grad_norm": 0.027267824953848163, "learning_rate": 0.00012247757314687297, "loss": 0.004, "step": 356 }, { "epoch": 22.3125, "grad_norm": 0.018193429790619855, "learning_rate": 0.00012208290536416463, "loss": 0.0016, "step": 357 }, { "epoch": 22.375, "grad_norm": 0.03843454617816289, "learning_rate": 0.00012168787564534078, "loss": 0.0044, "step": 358 }, { "epoch": 22.4375, "grad_norm": 0.033400388055823266, "learning_rate": 0.0001212924904648902, "loss": 0.0047, "step": 359 }, { "epoch": 22.5, "grad_norm": 0.026909120960616175, "learning_rate": 0.00012089675630312754, "loss": 0.0039, "step": 360 }, { "epoch": 22.5625, "grad_norm": 0.023045776166397722, "learning_rate": 0.00012050067964608724, "loss": 0.0022, "step": 361 }, { "epoch": 22.625, "grad_norm": 0.03806155810130501, "learning_rate": 0.00012010426698541728, "loss": 0.0052, "step": 362 }, { "epoch": 22.6875, "grad_norm": 0.018336649301123106, "learning_rate": 0.0001197075248182726, "loss": 0.0015, "step": 363 }, { "epoch": 22.75, "grad_norm": 0.04461178752361949, "learning_rate": 0.00011931045964720881, "loss": 0.0049, "step": 364 }, { "epoch": 22.8125, "grad_norm": 0.03716629926558807, "learning_rate": 0.00011891307798007536, "loss": 0.0051, "step": 365 }, { "epoch": 22.875, "grad_norm": 0.10941698413421153, "learning_rate": 0.00011851538632990921, "loss": 0.0061, "step": 366 }, { "epoch": 22.9375, "grad_norm": 0.031543122025977796, "learning_rate": 0.00011811739121482777, "loss": 0.0032, "step": 367 }, { "epoch": 23.0, "grad_norm": 0.03807850455755241, "learning_rate": 0.0001177190991579223, "loss": 0.0054, "step": 368 }, { "epoch": 23.0625, "grad_norm": 0.0691977966708894, "learning_rate": 0.00011732051668715081, "loss": 0.0077, "step": 369 }, { "epoch": 23.125, "grad_norm": 0.04515689836142195, "learning_rate": 0.00011692165033523117, "loss": 0.0057, "step": 370 }, { "epoch": 23.1875, "grad_norm": 0.03928887922524319, "learning_rate": 0.00011652250663953415, "loss": 0.0055, "step": 371 }, { "epoch": 23.25, "grad_norm": 0.02118235648867606, "learning_rate": 0.00011612309214197599, "loss": 0.0019, "step": 372 }, { "epoch": 23.3125, "grad_norm": 0.014975510606827147, "learning_rate": 0.00011572341338891144, "loss": 0.0013, "step": 373 }, { "epoch": 23.375, "grad_norm": 0.026742366673437934, "learning_rate": 0.00011532347693102632, "loss": 0.002, "step": 374 }, { "epoch": 23.4375, "grad_norm": 0.13222981379526266, "learning_rate": 0.00011492328932323022, "loss": 0.0065, "step": 375 }, { "epoch": 23.5, "grad_norm": 0.030786902568145764, "learning_rate": 0.00011452285712454904, "loss": 0.0032, "step": 376 }, { "epoch": 23.5625, "grad_norm": 0.04300178598048733, "learning_rate": 0.00011412218689801748, "loss": 0.0061, "step": 377 }, { "epoch": 23.625, "grad_norm": 0.031601651854756344, "learning_rate": 0.00011372128521057155, "loss": 0.0037, "step": 378 }, { "epoch": 23.6875, "grad_norm": 0.07785976030991328, "learning_rate": 0.00011332015863294076, "loss": 0.0051, "step": 379 }, { "epoch": 23.75, "grad_norm": 0.02462026239901167, "learning_rate": 0.00011291881373954065, "loss": 0.0024, "step": 380 }, { "epoch": 23.8125, "grad_norm": 0.015040874194919356, "learning_rate": 0.00011251725710836489, "loss": 0.0011, "step": 381 }, { "epoch": 23.875, "grad_norm": 0.013797157919588528, "learning_rate": 0.00011211549532087749, "loss": 0.0012, "step": 382 }, { "epoch": 23.9375, "grad_norm": 0.028167968760020725, "learning_rate": 0.00011171353496190498, "loss": 0.0032, "step": 383 }, { "epoch": 24.0, "grad_norm": 0.05318582105382964, "learning_rate": 0.00011131138261952845, "loss": 0.0093, "step": 384 }, { "epoch": 24.0625, "grad_norm": 0.027391464358218053, "learning_rate": 0.00011090904488497549, "loss": 0.0031, "step": 385 }, { "epoch": 24.125, "grad_norm": 0.03229113172070876, "learning_rate": 0.0001105065283525124, "loss": 0.0037, "step": 386 }, { "epoch": 24.1875, "grad_norm": 0.030127780485989034, "learning_rate": 0.00011010383961933581, "loss": 0.0049, "step": 387 }, { "epoch": 24.25, "grad_norm": 0.013416894665755037, "learning_rate": 0.00010970098528546481, "loss": 0.0002, "step": 388 }, { "epoch": 24.3125, "grad_norm": 0.013218671166503056, "learning_rate": 0.00010929797195363259, "loss": 0.0012, "step": 389 }, { "epoch": 24.375, "grad_norm": 0.03798660609085839, "learning_rate": 0.0001088948062291783, "loss": 0.0066, "step": 390 }, { "epoch": 24.4375, "grad_norm": 0.01875425458258293, "learning_rate": 0.00010849149471993882, "loss": 0.0014, "step": 391 }, { "epoch": 24.5, "grad_norm": 0.03284107465970819, "learning_rate": 0.00010808804403614043, "loss": 0.0051, "step": 392 }, { "epoch": 24.5625, "grad_norm": 0.0318856663182744, "learning_rate": 0.00010768446079029044, "loss": 0.0041, "step": 393 }, { "epoch": 24.625, "grad_norm": 0.03886319257158687, "learning_rate": 0.0001072807515970688, "loss": 0.0055, "step": 394 }, { "epoch": 24.6875, "grad_norm": 0.0240690438810162, "learning_rate": 0.00010687692307321984, "loss": 0.0026, "step": 395 }, { "epoch": 24.75, "grad_norm": 0.03153859131907879, "learning_rate": 0.00010647298183744359, "loss": 0.0037, "step": 396 }, { "epoch": 24.8125, "grad_norm": 0.028754869900780036, "learning_rate": 0.00010606893451028743, "loss": 0.0046, "step": 397 }, { "epoch": 24.875, "grad_norm": 0.036835349609799346, "learning_rate": 0.00010566478771403763, "loss": 0.0059, "step": 398 }, { "epoch": 24.9375, "grad_norm": 0.03742416571548899, "learning_rate": 0.00010526054807261067, "loss": 0.0057, "step": 399 }, { "epoch": 25.0, "grad_norm": 0.019217319407847776, "learning_rate": 0.00010485622221144484, "loss": 0.0019, "step": 400 }, { "epoch": 25.0625, "grad_norm": 0.0181018400795553, "learning_rate": 0.00010445181675739144, "loss": 0.0019, "step": 401 }, { "epoch": 25.125, "grad_norm": 0.015082303122275808, "learning_rate": 0.00010404733833860639, "loss": 0.001, "step": 402 }, { "epoch": 25.1875, "grad_norm": 0.024352538732525977, "learning_rate": 0.00010364279358444144, "loss": 0.0031, "step": 403 }, { "epoch": 25.25, "grad_norm": 0.03063940290250688, "learning_rate": 0.00010323818912533561, "loss": 0.0041, "step": 404 }, { "epoch": 25.3125, "grad_norm": 0.02851015433219716, "learning_rate": 0.00010283353159270643, "loss": 0.0046, "step": 405 }, { "epoch": 25.375, "grad_norm": 0.017078363597566818, "learning_rate": 0.00010242882761884131, "loss": 0.0017, "step": 406 }, { "epoch": 25.4375, "grad_norm": 0.015453958897260991, "learning_rate": 0.00010202408383678888, "loss": 0.0011, "step": 407 }, { "epoch": 25.5, "grad_norm": 0.028055877963511246, "learning_rate": 0.00010161930688025017, "loss": 0.0029, "step": 408 }, { "epoch": 25.5625, "grad_norm": 0.003095926902821122, "learning_rate": 0.0001012145033834699, "loss": 0.0001, "step": 409 }, { "epoch": 25.625, "grad_norm": 0.04032729610564608, "learning_rate": 0.00010080967998112787, "loss": 0.0084, "step": 410 }, { "epoch": 25.6875, "grad_norm": 0.03694941225075249, "learning_rate": 0.00010040484330823006, "loss": 0.0058, "step": 411 }, { "epoch": 25.75, "grad_norm": 0.040456452234492964, "learning_rate": 0.0001, "loss": 0.0067, "step": 412 }, { "epoch": 25.8125, "grad_norm": 0.02482728072796654, "learning_rate": 9.959515669176996e-05, "loss": 0.0027, "step": 413 }, { "epoch": 25.875, "grad_norm": 0.046729978524535094, "learning_rate": 9.919032001887215e-05, "loss": 0.0088, "step": 414 }, { "epoch": 25.9375, "grad_norm": 0.03304233160758997, "learning_rate": 9.878549661653012e-05, "loss": 0.0043, "step": 415 }, { "epoch": 26.0, "grad_norm": 0.03375904734753774, "learning_rate": 9.838069311974986e-05, "loss": 0.0044, "step": 416 }, { "epoch": 26.0625, "grad_norm": 0.01611151201817884, "learning_rate": 9.797591616321114e-05, "loss": 0.0016, "step": 417 }, { "epoch": 26.125, "grad_norm": 0.01947904547375526, "learning_rate": 9.757117238115871e-05, "loss": 0.0017, "step": 418 }, { "epoch": 26.1875, "grad_norm": 0.020613065991069244, "learning_rate": 9.716646840729361e-05, "loss": 0.0022, "step": 419 }, { "epoch": 26.25, "grad_norm": 0.028039730983769738, "learning_rate": 9.676181087466444e-05, "loss": 0.006, "step": 420 }, { "epoch": 26.3125, "grad_norm": 0.03218256334320815, "learning_rate": 9.635720641555858e-05, "loss": 0.0047, "step": 421 }, { "epoch": 26.375, "grad_norm": 0.017143091500272017, "learning_rate": 9.595266166139366e-05, "loss": 0.0016, "step": 422 }, { "epoch": 26.4375, "grad_norm": 0.024547894378123793, "learning_rate": 9.554818324260859e-05, "loss": 0.0024, "step": 423 }, { "epoch": 26.5, "grad_norm": 0.0017051489220323973, "learning_rate": 9.514377778855521e-05, "loss": 0.0001, "step": 424 }, { "epoch": 26.5625, "grad_norm": 0.03211276051099752, "learning_rate": 9.473945192738933e-05, "loss": 0.0043, "step": 425 }, { "epoch": 26.625, "grad_norm": 0.029672554162065486, "learning_rate": 9.433521228596237e-05, "loss": 0.0049, "step": 426 }, { "epoch": 26.6875, "grad_norm": 0.02974958108819204, "learning_rate": 9.393106548971256e-05, "loss": 0.0046, "step": 427 }, { "epoch": 26.75, "grad_norm": 0.02994562158825566, "learning_rate": 9.352701816255643e-05, "loss": 0.0032, "step": 428 }, { "epoch": 26.8125, "grad_norm": 0.021945113737497645, "learning_rate": 9.312307692678017e-05, "loss": 0.0025, "step": 429 }, { "epoch": 26.875, "grad_norm": 0.039440593653657555, "learning_rate": 9.27192484029312e-05, "loss": 0.0064, "step": 430 }, { "epoch": 26.9375, "grad_norm": 0.034294640588989456, "learning_rate": 9.231553920970958e-05, "loss": 0.0067, "step": 431 }, { "epoch": 27.0, "grad_norm": 0.045874360009063225, "learning_rate": 9.19119559638596e-05, "loss": 0.0091, "step": 432 }, { "epoch": 27.0625, "grad_norm": 0.029633783585542237, "learning_rate": 9.150850528006119e-05, "loss": 0.0052, "step": 433 }, { "epoch": 27.125, "grad_norm": 0.01916779629257807, "learning_rate": 9.110519377082172e-05, "loss": 0.0023, "step": 434 }, { "epoch": 27.1875, "grad_norm": 0.029144898881918332, "learning_rate": 9.070202804636745e-05, "loss": 0.0041, "step": 435 }, { "epoch": 27.25, "grad_norm": 0.02042965784384683, "learning_rate": 9.02990147145352e-05, "loss": 0.0036, "step": 436 }, { "epoch": 27.3125, "grad_norm": 0.03052607215886322, "learning_rate": 8.98961603806642e-05, "loss": 0.0041, "step": 437 }, { "epoch": 27.375, "grad_norm": 0.011632711843821695, "learning_rate": 8.949347164748762e-05, "loss": 0.0009, "step": 438 }, { "epoch": 27.4375, "grad_norm": 0.02552057287290442, "learning_rate": 8.909095511502452e-05, "loss": 0.0049, "step": 439 }, { "epoch": 27.5, "grad_norm": 0.022469807056043515, "learning_rate": 8.868861738047158e-05, "loss": 0.0031, "step": 440 }, { "epoch": 27.5625, "grad_norm": 0.023608406072661937, "learning_rate": 8.828646503809504e-05, "loss": 0.0022, "step": 441 }, { "epoch": 27.625, "grad_norm": 0.03344361764249166, "learning_rate": 8.788450467912255e-05, "loss": 0.006, "step": 442 }, { "epoch": 27.6875, "grad_norm": 0.021786698400709355, "learning_rate": 8.748274289163514e-05, "loss": 0.0024, "step": 443 }, { "epoch": 27.75, "grad_norm": 0.030927337267203, "learning_rate": 8.70811862604594e-05, "loss": 0.0044, "step": 444 }, { "epoch": 27.8125, "grad_norm": 0.022165776937843076, "learning_rate": 8.667984136705928e-05, "loss": 0.0046, "step": 445 }, { "epoch": 27.875, "grad_norm": 0.01725502880403943, "learning_rate": 8.627871478942851e-05, "loss": 0.0028, "step": 446 }, { "epoch": 27.9375, "grad_norm": 0.020259168330017313, "learning_rate": 8.587781310198255e-05, "loss": 0.0025, "step": 447 }, { "epoch": 28.0, "grad_norm": 0.022633252879775623, "learning_rate": 8.5477142875451e-05, "loss": 0.0027, "step": 448 }, { "epoch": 28.0625, "grad_norm": 0.019201479355893957, "learning_rate": 8.507671067676979e-05, "loss": 0.0031, "step": 449 }, { "epoch": 28.125, "grad_norm": 0.0009559951000656043, "learning_rate": 8.467652306897369e-05, "loss": 0.0, "step": 450 }, { "epoch": 28.1875, "grad_norm": 0.023486468299252494, "learning_rate": 8.427658661108857e-05, "loss": 0.0031, "step": 451 }, { "epoch": 28.25, "grad_norm": 0.015284012764583016, "learning_rate": 8.387690785802402e-05, "loss": 0.0013, "step": 452 }, { "epoch": 28.3125, "grad_norm": 0.021772765753157987, "learning_rate": 8.347749336046586e-05, "loss": 0.0035, "step": 453 }, { "epoch": 28.375, "grad_norm": 0.011499608138854372, "learning_rate": 8.307834966476884e-05, "loss": 0.0009, "step": 454 }, { "epoch": 28.4375, "grad_norm": 0.014877751008139171, "learning_rate": 8.267948331284923e-05, "loss": 0.0032, "step": 455 }, { "epoch": 28.5, "grad_norm": 0.020317448663989896, "learning_rate": 8.228090084207774e-05, "loss": 0.0051, "step": 456 }, { "epoch": 28.5625, "grad_norm": 0.011464514826041898, "learning_rate": 8.188260878517224e-05, "loss": 0.0027, "step": 457 }, { "epoch": 28.625, "grad_norm": 0.02729549690762567, "learning_rate": 8.14846136700908e-05, "loss": 0.0037, "step": 458 }, { "epoch": 28.6875, "grad_norm": 0.016332439503834673, "learning_rate": 8.108692201992465e-05, "loss": 0.0019, "step": 459 }, { "epoch": 28.75, "grad_norm": 0.020328088646763758, "learning_rate": 8.068954035279121e-05, "loss": 0.0045, "step": 460 }, { "epoch": 28.8125, "grad_norm": 0.04010266322110663, "learning_rate": 8.02924751817274e-05, "loss": 0.0085, "step": 461 }, { "epoch": 28.875, "grad_norm": 0.024097139912002265, "learning_rate": 7.989573301458273e-05, "loss": 0.0042, "step": 462 }, { "epoch": 28.9375, "grad_norm": 0.036490969223195995, "learning_rate": 7.949932035391278e-05, "loss": 0.0053, "step": 463 }, { "epoch": 29.0, "grad_norm": 0.03282570248481929, "learning_rate": 7.91032436968725e-05, "loss": 0.0045, "step": 464 }, { "epoch": 29.0625, "grad_norm": 0.01039285670497238, "learning_rate": 7.870750953510984e-05, "loss": 0.0006, "step": 465 }, { "epoch": 29.125, "grad_norm": 0.01787195379230809, "learning_rate": 7.831212435465924e-05, "loss": 0.0022, "step": 466 }, { "epoch": 29.1875, "grad_norm": 0.02385197046230436, "learning_rate": 7.79170946358354e-05, "loss": 0.0039, "step": 467 }, { "epoch": 29.25, "grad_norm": 0.018022930648381282, "learning_rate": 7.75224268531271e-05, "loss": 0.0017, "step": 468 }, { "epoch": 29.3125, "grad_norm": 0.027503021332335603, "learning_rate": 7.71281274750909e-05, "loss": 0.0036, "step": 469 }, { "epoch": 29.375, "grad_norm": 0.009996199602644211, "learning_rate": 7.673420296424541e-05, "loss": 0.0009, "step": 470 }, { "epoch": 29.4375, "grad_norm": 0.018797312474179668, "learning_rate": 7.634065977696511e-05, "loss": 0.0026, "step": 471 }, { "epoch": 29.5, "grad_norm": 0.028684614773920647, "learning_rate": 7.594750436337467e-05, "loss": 0.004, "step": 472 }, { "epoch": 29.5625, "grad_norm": 0.020479906972316966, "learning_rate": 7.555474316724313e-05, "loss": 0.0013, "step": 473 }, { "epoch": 29.625, "grad_norm": 0.025765673045083433, "learning_rate": 7.516238262587851e-05, "loss": 0.0033, "step": 474 }, { "epoch": 29.6875, "grad_norm": 0.045631738026105505, "learning_rate": 7.4770429170022e-05, "loss": 0.0109, "step": 475 }, { "epoch": 29.75, "grad_norm": 0.02215455016152165, "learning_rate": 7.437888922374276e-05, "loss": 0.003, "step": 476 }, { "epoch": 29.8125, "grad_norm": 0.03072966417955352, "learning_rate": 7.398776920433258e-05, "loss": 0.0055, "step": 477 }, { "epoch": 29.875, "grad_norm": 0.017660307308308684, "learning_rate": 7.35970755222007e-05, "loss": 0.0016, "step": 478 }, { "epoch": 29.9375, "grad_norm": 0.03283219124830058, "learning_rate": 7.320681458076871e-05, "loss": 0.0052, "step": 479 }, { "epoch": 30.0, "grad_norm": 0.03754750336262435, "learning_rate": 7.281699277636572e-05, "loss": 0.0059, "step": 480 }, { "epoch": 30.0625, "grad_norm": 0.0232647542444026, "learning_rate": 7.242761649812335e-05, "loss": 0.0026, "step": 481 }, { "epoch": 30.125, "grad_norm": 0.010280990700798192, "learning_rate": 7.20386921278711e-05, "loss": 0.0008, "step": 482 }, { "epoch": 30.1875, "grad_norm": 0.023701325742362663, "learning_rate": 7.165022604003186e-05, "loss": 0.0045, "step": 483 }, { "epoch": 30.25, "grad_norm": 0.02723103090348779, "learning_rate": 7.126222460151719e-05, "loss": 0.0036, "step": 484 }, { "epoch": 30.3125, "grad_norm": 0.022573066749760014, "learning_rate": 7.08746941716232e-05, "loss": 0.0028, "step": 485 }, { "epoch": 30.375, "grad_norm": 0.015662563973923037, "learning_rate": 7.048764110192618e-05, "loss": 0.0027, "step": 486 }, { "epoch": 30.4375, "grad_norm": 0.03142186630699031, "learning_rate": 7.010107173617857e-05, "loss": 0.0046, "step": 487 }, { "epoch": 30.5, "grad_norm": 0.010153995879187198, "learning_rate": 6.971499241020495e-05, "loss": 0.001, "step": 488 }, { "epoch": 30.5625, "grad_norm": 0.027760410207358414, "learning_rate": 6.932940945179818e-05, "loss": 0.0035, "step": 489 }, { "epoch": 30.625, "grad_norm": 0.02569158532689841, "learning_rate": 6.894432918061579e-05, "loss": 0.0042, "step": 490 }, { "epoch": 30.6875, "grad_norm": 0.03360414388929754, "learning_rate": 6.855975790807623e-05, "loss": 0.0052, "step": 491 }, { "epoch": 30.75, "grad_norm": 0.03809908186634646, "learning_rate": 6.817570193725564e-05, "loss": 0.0046, "step": 492 }, { "epoch": 30.8125, "grad_norm": 0.01683175407924854, "learning_rate": 6.77921675627843e-05, "loss": 0.0018, "step": 493 }, { "epoch": 30.875, "grad_norm": 0.025360976440516984, "learning_rate": 6.740916107074372e-05, "loss": 0.0037, "step": 494 }, { "epoch": 30.9375, "grad_norm": 0.023768876753983793, "learning_rate": 6.702668873856338e-05, "loss": 0.0031, "step": 495 }, { "epoch": 31.0, "grad_norm": 0.03989426250367044, "learning_rate": 6.664475683491796e-05, "loss": 0.008, "step": 496 }, { "epoch": 31.0625, "grad_norm": 0.010352984240734544, "learning_rate": 6.626337161962461e-05, "loss": 0.0008, "step": 497 }, { "epoch": 31.125, "grad_norm": 0.021953465861397056, "learning_rate": 6.588253934354039e-05, "loss": 0.0031, "step": 498 }, { "epoch": 31.1875, "grad_norm": 0.019549539753814318, "learning_rate": 6.550226624845961e-05, "loss": 0.0019, "step": 499 }, { "epoch": 31.25, "grad_norm": 0.02490729225984056, "learning_rate": 6.512255856701177e-05, "loss": 0.0027, "step": 500 }, { "epoch": 31.3125, "grad_norm": 0.019764190511534645, "learning_rate": 6.474342252255927e-05, "loss": 0.0027, "step": 501 }, { "epoch": 31.375, "grad_norm": 0.03215845958993517, "learning_rate": 6.43648643290955e-05, "loss": 0.0069, "step": 502 }, { "epoch": 31.4375, "grad_norm": 0.0244310478279658, "learning_rate": 6.398689019114289e-05, "loss": 0.0033, "step": 503 }, { "epoch": 31.5, "grad_norm": 0.018051312193521227, "learning_rate": 6.360950630365126e-05, "loss": 0.0025, "step": 504 }, { "epoch": 31.5625, "grad_norm": 0.00962728862133209, "learning_rate": 6.323271885189635e-05, "loss": 0.0007, "step": 505 }, { "epoch": 31.625, "grad_norm": 0.013047383716533472, "learning_rate": 6.285653401137837e-05, "loss": 0.0011, "step": 506 }, { "epoch": 31.6875, "grad_norm": 0.026493369355222587, "learning_rate": 6.248095794772079e-05, "loss": 0.0035, "step": 507 }, { "epoch": 31.75, "grad_norm": 0.023599070967015707, "learning_rate": 6.210599681656933e-05, "loss": 0.0039, "step": 508 }, { "epoch": 31.8125, "grad_norm": 0.02343395190062143, "learning_rate": 6.173165676349103e-05, "loss": 0.0037, "step": 509 }, { "epoch": 31.875, "grad_norm": 0.01983520736229701, "learning_rate": 6.135794392387353e-05, "loss": 0.0025, "step": 510 }, { "epoch": 31.9375, "grad_norm": 0.0318092460128266, "learning_rate": 6.0984864422824496e-05, "loss": 0.0045, "step": 511 }, { "epoch": 32.0, "grad_norm": 0.04242840537695658, "learning_rate": 6.061242437507131e-05, "loss": 0.009, "step": 512 }, { "epoch": 32.0625, "grad_norm": 0.02009402381973668, "learning_rate": 6.024062988486072e-05, "loss": 0.0032, "step": 513 }, { "epoch": 32.125, "grad_norm": 0.011445958109287144, "learning_rate": 5.986948704585895e-05, "loss": 0.0024, "step": 514 }, { "epoch": 32.1875, "grad_norm": 0.016981136262338714, "learning_rate": 5.949900194105167e-05, "loss": 0.0019, "step": 515 }, { "epoch": 32.25, "grad_norm": 0.03620971462935574, "learning_rate": 5.9129180642644414e-05, "loss": 0.0061, "step": 516 }, { "epoch": 32.3125, "grad_norm": 0.022511016593835396, "learning_rate": 5.8760029211962954e-05, "loss": 0.0045, "step": 517 }, { "epoch": 32.375, "grad_norm": 0.027254211809580164, "learning_rate": 5.839155369935407e-05, "loss": 0.0039, "step": 518 }, { "epoch": 32.4375, "grad_norm": 0.02011520557513548, "learning_rate": 5.802376014408632e-05, "loss": 0.002, "step": 519 }, { "epoch": 32.5, "grad_norm": 0.029228301710033645, "learning_rate": 5.765665457425102e-05, "loss": 0.0051, "step": 520 }, { "epoch": 32.5625, "grad_norm": 0.01687690442346917, "learning_rate": 5.729024300666349e-05, "loss": 0.0011, "step": 521 }, { "epoch": 32.625, "grad_norm": 0.02623446617433343, "learning_rate": 5.6924531446764504e-05, "loss": 0.0038, "step": 522 }, { "epoch": 32.6875, "grad_norm": 0.018653711010246068, "learning_rate": 5.6559525888521815e-05, "loss": 0.0019, "step": 523 }, { "epoch": 32.75, "grad_norm": 0.01760737201183457, "learning_rate": 5.6195232314331766e-05, "loss": 0.002, "step": 524 }, { "epoch": 32.8125, "grad_norm": 0.0142233875057376, "learning_rate": 5.5831656694921465e-05, "loss": 0.0017, "step": 525 }, { "epoch": 32.875, "grad_norm": 0.02523792596742103, "learning_rate": 5.5468804989250786e-05, "loss": 0.0034, "step": 526 }, { "epoch": 32.9375, "grad_norm": 0.017246202374674464, "learning_rate": 5.510668314441474e-05, "loss": 0.0024, "step": 527 }, { "epoch": 33.0, "grad_norm": 0.029238162207787115, "learning_rate": 5.474529709554612e-05, "loss": 0.0064, "step": 528 }, { "epoch": 33.0625, "grad_norm": 0.019900275674923293, "learning_rate": 5.438465276571796e-05, "loss": 0.0024, "step": 529 }, { "epoch": 33.125, "grad_norm": 0.022436670149349774, "learning_rate": 5.402475606584669e-05, "loss": 0.0024, "step": 530 }, { "epoch": 33.1875, "grad_norm": 0.026570662311707075, "learning_rate": 5.366561289459512e-05, "loss": 0.0038, "step": 531 }, { "epoch": 33.25, "grad_norm": 0.007873326312030244, "learning_rate": 5.3307229138275936e-05, "loss": 0.0007, "step": 532 }, { "epoch": 33.3125, "grad_norm": 0.010633636981874433, "learning_rate": 5.2949610670755e-05, "loss": 0.0007, "step": 533 }, { "epoch": 33.375, "grad_norm": 0.021848223370386288, "learning_rate": 5.259276335335521e-05, "loss": 0.0032, "step": 534 }, { "epoch": 33.4375, "grad_norm": 0.02671646079052259, "learning_rate": 5.223669303476041e-05, "loss": 0.004, "step": 535 }, { "epoch": 33.5, "grad_norm": 0.025191000069873352, "learning_rate": 5.1881405550919493e-05, "loss": 0.0038, "step": 536 }, { "epoch": 33.5625, "grad_norm": 0.02686634304764754, "learning_rate": 5.152690672495091e-05, "loss": 0.0036, "step": 537 }, { "epoch": 33.625, "grad_norm": 0.016683908355091075, "learning_rate": 5.117320236704697e-05, "loss": 0.0013, "step": 538 }, { "epoch": 33.6875, "grad_norm": 0.025955098003678897, "learning_rate": 5.08202982743788e-05, "loss": 0.0044, "step": 539 }, { "epoch": 33.75, "grad_norm": 0.03159147242134654, "learning_rate": 5.0468200231001286e-05, "loss": 0.0053, "step": 540 }, { "epoch": 33.8125, "grad_norm": 0.020310055743322283, "learning_rate": 5.01169140077582e-05, "loss": 0.0047, "step": 541 }, { "epoch": 33.875, "grad_norm": 0.02010895023321247, "learning_rate": 4.976644536218783e-05, "loss": 0.0022, "step": 542 }, { "epoch": 33.9375, "grad_norm": 0.030234549444115102, "learning_rate": 4.9416800038428324e-05, "loss": 0.007, "step": 543 }, { "epoch": 34.0, "grad_norm": 0.028337369877458187, "learning_rate": 4.9067983767123736e-05, "loss": 0.0037, "step": 544 }, { "epoch": 34.0625, "grad_norm": 0.016650841753729186, "learning_rate": 4.8720002265330015e-05, "loss": 0.0019, "step": 545 }, { "epoch": 34.125, "grad_norm": 0.024415205244997427, "learning_rate": 4.837286123642141e-05, "loss": 0.0039, "step": 546 }, { "epoch": 34.1875, "grad_norm": 0.030479700286681143, "learning_rate": 4.8026566369996926e-05, "loss": 0.0058, "step": 547 }, { "epoch": 34.25, "grad_norm": 0.015608780057852139, "learning_rate": 4.768112334178699e-05, "loss": 0.0024, "step": 548 }, { "epoch": 34.3125, "grad_norm": 0.015068945094923566, "learning_rate": 4.733653781356055e-05, "loss": 0.0022, "step": 549 }, { "epoch": 34.375, "grad_norm": 0.026394448783195724, "learning_rate": 4.699281543303222e-05, "loss": 0.0033, "step": 550 }, { "epoch": 34.4375, "grad_norm": 0.01722401756403534, "learning_rate": 4.6649961833769715e-05, "loss": 0.0019, "step": 551 }, { "epoch": 34.5, "grad_norm": 0.008965687726181431, "learning_rate": 4.630798263510162e-05, "loss": 0.0005, "step": 552 }, { "epoch": 34.5625, "grad_norm": 0.01778818504579463, "learning_rate": 4.596688344202509e-05, "loss": 0.0021, "step": 553 }, { "epoch": 34.625, "grad_norm": 0.025545188168124802, "learning_rate": 4.562666984511416e-05, "loss": 0.0029, "step": 554 }, { "epoch": 34.6875, "grad_norm": 0.022767194931364268, "learning_rate": 4.528734742042803e-05, "loss": 0.0026, "step": 555 }, { "epoch": 34.75, "grad_norm": 0.02949871036401744, "learning_rate": 4.494892172941965e-05, "loss": 0.005, "step": 556 }, { "epoch": 34.8125, "grad_norm": 0.027099445112904772, "learning_rate": 4.461139831884474e-05, "loss": 0.004, "step": 557 }, { "epoch": 34.875, "grad_norm": 0.02425594257821406, "learning_rate": 4.427478272067066e-05, "loss": 0.0038, "step": 558 }, { "epoch": 34.9375, "grad_norm": 0.03234728199101579, "learning_rate": 4.393908045198585e-05, "loss": 0.0051, "step": 559 }, { "epoch": 35.0, "grad_norm": 0.021712331665851, "learning_rate": 4.360429701490934e-05, "loss": 0.0031, "step": 560 }, { "epoch": 35.0625, "grad_norm": 0.030962991791000313, "learning_rate": 4.327043789650078e-05, "loss": 0.0061, "step": 561 }, { "epoch": 35.125, "grad_norm": 0.019615835902719432, "learning_rate": 4.2937508568670194e-05, "loss": 0.0025, "step": 562 }, { "epoch": 35.1875, "grad_norm": 0.019445691272185775, "learning_rate": 4.2605514488088515e-05, "loss": 0.0023, "step": 563 }, { "epoch": 35.25, "grad_norm": 0.025561602674203147, "learning_rate": 4.227446109609809e-05, "loss": 0.0043, "step": 564 }, { "epoch": 35.3125, "grad_norm": 0.021631887342721947, "learning_rate": 4.1944353818623424e-05, "loss": 0.0024, "step": 565 }, { "epoch": 35.375, "grad_norm": 0.023929421104664724, "learning_rate": 4.161519806608247e-05, "loss": 0.0053, "step": 566 }, { "epoch": 35.4375, "grad_norm": 0.026767345162646533, "learning_rate": 4.12869992332977e-05, "loss": 0.0033, "step": 567 }, { "epoch": 35.5, "grad_norm": 0.019265712551654487, "learning_rate": 4.0959762699407766e-05, "loss": 0.0019, "step": 568 }, { "epoch": 35.5625, "grad_norm": 0.024435718154567927, "learning_rate": 4.0633493827779425e-05, "loss": 0.0034, "step": 569 }, { "epoch": 35.625, "grad_norm": 0.01536328570946991, "learning_rate": 4.030819796591949e-05, "loss": 0.0024, "step": 570 }, { "epoch": 35.6875, "grad_norm": 0.016210675375350477, "learning_rate": 3.9983880445387366e-05, "loss": 0.0024, "step": 571 }, { "epoch": 35.75, "grad_norm": 0.01324379048342, "learning_rate": 3.966054658170754e-05, "loss": 0.0011, "step": 572 }, { "epoch": 35.8125, "grad_norm": 0.009268348242829393, "learning_rate": 3.9338201674282406e-05, "loss": 0.0008, "step": 573 }, { "epoch": 35.875, "grad_norm": 0.024688307625187852, "learning_rate": 3.9016851006305545e-05, "loss": 0.0029, "step": 574 }, { "epoch": 35.9375, "grad_norm": 0.04355703985093336, "learning_rate": 3.869649984467504e-05, "loss": 0.0094, "step": 575 }, { "epoch": 36.0, "grad_norm": 0.02001971316741883, "learning_rate": 3.8377153439907266e-05, "loss": 0.0021, "step": 576 }, { "epoch": 36.0625, "grad_norm": 0.025891577060147455, "learning_rate": 3.8058817026050677e-05, "loss": 0.0038, "step": 577 }, { "epoch": 36.125, "grad_norm": 0.022905458332606304, "learning_rate": 3.774149582060012e-05, "loss": 0.0029, "step": 578 }, { "epoch": 36.1875, "grad_norm": 0.020383820791147977, "learning_rate": 3.742519502441132e-05, "loss": 0.0042, "step": 579 }, { "epoch": 36.25, "grad_norm": 0.022297278719448848, "learning_rate": 3.710991982161555e-05, "loss": 0.0041, "step": 580 }, { "epoch": 36.3125, "grad_norm": 0.01800159818226978, "learning_rate": 3.679567537953485e-05, "loss": 0.002, "step": 581 }, { "epoch": 36.375, "grad_norm": 0.026472437396163635, "learning_rate": 3.648246684859716e-05, "loss": 0.004, "step": 582 }, { "epoch": 36.4375, "grad_norm": 0.034982086882290896, "learning_rate": 3.617029936225193e-05, "loss": 0.0044, "step": 583 }, { "epoch": 36.5, "grad_norm": 0.025380193547038656, "learning_rate": 3.585917803688603e-05, "loss": 0.004, "step": 584 }, { "epoch": 36.5625, "grad_norm": 0.024933823706227147, "learning_rate": 3.55491079717399e-05, "loss": 0.0034, "step": 585 }, { "epoch": 36.625, "grad_norm": 0.032395369351154765, "learning_rate": 3.5240094248824e-05, "loss": 0.0047, "step": 586 }, { "epoch": 36.6875, "grad_norm": 0.02426742332899305, "learning_rate": 3.493214193283536e-05, "loss": 0.0025, "step": 587 }, { "epoch": 36.75, "grad_norm": 0.016326134651246785, "learning_rate": 3.4625256071074773e-05, "loss": 0.0019, "step": 588 }, { "epoch": 36.8125, "grad_norm": 0.022068578736713734, "learning_rate": 3.4319441693363906e-05, "loss": 0.0026, "step": 589 }, { "epoch": 36.875, "grad_norm": 0.022624745551108428, "learning_rate": 3.4014703811963025e-05, "loss": 0.003, "step": 590 }, { "epoch": 36.9375, "grad_norm": 0.02155433442232093, "learning_rate": 3.3711047421488675e-05, "loss": 0.0031, "step": 591 }, { "epoch": 37.0, "grad_norm": 0.00020949892145012195, "learning_rate": 3.340847749883191e-05, "loss": 0.0, "step": 592 }, { "epoch": 37.0625, "grad_norm": 0.025941183284377157, "learning_rate": 3.3106999003076746e-05, "loss": 0.0042, "step": 593 }, { "epoch": 37.125, "grad_norm": 0.021731884621000654, "learning_rate": 3.280661687541876e-05, "loss": 0.0024, "step": 594 }, { "epoch": 37.1875, "grad_norm": 0.016413130007407132, "learning_rate": 3.2507336039084314e-05, "loss": 0.0033, "step": 595 }, { "epoch": 37.25, "grad_norm": 0.033791751348081316, "learning_rate": 3.2209161399249674e-05, "loss": 0.0055, "step": 596 }, { "epoch": 37.3125, "grad_norm": 0.007739198098749652, "learning_rate": 3.191209784296068e-05, "loss": 0.0006, "step": 597 }, { "epoch": 37.375, "grad_norm": 0.01698515372085753, "learning_rate": 3.161615023905265e-05, "loss": 0.0018, "step": 598 }, { "epoch": 37.4375, "grad_norm": 0.009237792411598502, "learning_rate": 3.132132343807056e-05, "loss": 0.002, "step": 599 }, { "epoch": 37.5, "grad_norm": 0.01638142995950318, "learning_rate": 3.102762227218957e-05, "loss": 0.0014, "step": 600 }, { "epoch": 37.5625, "grad_norm": 0.016279487432564795, "learning_rate": 3.073505155513591e-05, "loss": 0.0015, "step": 601 }, { "epoch": 37.625, "grad_norm": 0.011772194410100805, "learning_rate": 3.044361608210775e-05, "loss": 0.0009, "step": 602 }, { "epoch": 37.6875, "grad_norm": 0.030973278706032212, "learning_rate": 3.0153320629696846e-05, "loss": 0.0051, "step": 603 }, { "epoch": 37.75, "grad_norm": 0.026566291070872453, "learning_rate": 2.9864169955810084e-05, "loss": 0.006, "step": 604 }, { "epoch": 37.8125, "grad_norm": 0.01901261238275813, "learning_rate": 2.9576168799591664e-05, "loss": 0.0022, "step": 605 }, { "epoch": 37.875, "grad_norm": 0.022602205933752235, "learning_rate": 2.9289321881345254e-05, "loss": 0.0032, "step": 606 }, { "epoch": 37.9375, "grad_norm": 0.022472706538328845, "learning_rate": 2.900363390245674e-05, "loss": 0.0028, "step": 607 }, { "epoch": 38.0, "grad_norm": 0.02802199698579331, "learning_rate": 2.8719109545317103e-05, "loss": 0.0061, "step": 608 }, { "epoch": 38.0625, "grad_norm": 0.007286991170262413, "learning_rate": 2.8435753473245698e-05, "loss": 0.0005, "step": 609 }, { "epoch": 38.125, "grad_norm": 0.015507395834223437, "learning_rate": 2.8153570330413925e-05, "loss": 0.0017, "step": 610 }, { "epoch": 38.1875, "grad_norm": 0.020058482119342358, "learning_rate": 2.7872564741768913e-05, "loss": 0.0034, "step": 611 }, { "epoch": 38.25, "grad_norm": 0.01760996920462819, "learning_rate": 2.759274131295787e-05, "loss": 0.0034, "step": 612 }, { "epoch": 38.3125, "grad_norm": 0.029096639894173625, "learning_rate": 2.73141046302525e-05, "loss": 0.005, "step": 613 }, { "epoch": 38.375, "grad_norm": 0.013393983973238514, "learning_rate": 2.7036659260473974e-05, "loss": 0.0009, "step": 614 }, { "epoch": 38.4375, "grad_norm": 0.012945478401193455, "learning_rate": 2.6760409750917927e-05, "loss": 0.0019, "step": 615 }, { "epoch": 38.5, "grad_norm": 0.026715394934565314, "learning_rate": 2.6485360629279987e-05, "loss": 0.0038, "step": 616 }, { "epoch": 38.5625, "grad_norm": 0.017347535771603556, "learning_rate": 2.6211516403581582e-05, "loss": 0.002, "step": 617 }, { "epoch": 38.625, "grad_norm": 0.016971730219051892, "learning_rate": 2.593888156209603e-05, "loss": 0.0017, "step": 618 }, { "epoch": 38.6875, "grad_norm": 0.025742679582296178, "learning_rate": 2.5667460573275028e-05, "loss": 0.0048, "step": 619 }, { "epoch": 38.75, "grad_norm": 0.02333983935112299, "learning_rate": 2.5397257885675397e-05, "loss": 0.0029, "step": 620 }, { "epoch": 38.8125, "grad_norm": 0.010043103557887257, "learning_rate": 2.5128277927886055e-05, "loss": 0.0039, "step": 621 }, { "epoch": 38.875, "grad_norm": 0.030322831096171305, "learning_rate": 2.48605251084556e-05, "loss": 0.0046, "step": 622 }, { "epoch": 38.9375, "grad_norm": 0.027197588947040895, "learning_rate": 2.4594003815819966e-05, "loss": 0.006, "step": 623 }, { "epoch": 39.0, "grad_norm": 0.023325558051272913, "learning_rate": 2.432871841823047e-05, "loss": 0.0026, "step": 624 }, { "epoch": 39.0625, "grad_norm": 0.027132468126116745, "learning_rate": 2.406467326368237e-05, "loss": 0.005, "step": 625 }, { "epoch": 39.125, "grad_norm": 0.023365336051910712, "learning_rate": 2.3801872679843385e-05, "loss": 0.0047, "step": 626 }, { "epoch": 39.1875, "grad_norm": 0.019510221468659553, "learning_rate": 2.3540320973982922e-05, "loss": 0.0022, "step": 627 }, { "epoch": 39.25, "grad_norm": 0.016164965587762533, "learning_rate": 2.3280022432901383e-05, "loss": 0.0025, "step": 628 }, { "epoch": 39.3125, "grad_norm": 0.021451656557391466, "learning_rate": 2.302098132286006e-05, "loss": 0.003, "step": 629 }, { "epoch": 39.375, "grad_norm": 0.031011855175604085, "learning_rate": 2.2763201889510987e-05, "loss": 0.0064, "step": 630 }, { "epoch": 39.4375, "grad_norm": 0.0006149615487194747, "learning_rate": 2.2506688357827544e-05, "loss": 0.0, "step": 631 }, { "epoch": 39.5, "grad_norm": 0.02248522871311791, "learning_rate": 2.2251444932035094e-05, "loss": 0.0027, "step": 632 }, { "epoch": 39.5625, "grad_norm": 0.034693691996730266, "learning_rate": 2.199747579554211e-05, "loss": 0.0057, "step": 633 }, { "epoch": 39.625, "grad_norm": 0.02437060961249019, "learning_rate": 2.174478511087171e-05, "loss": 0.0032, "step": 634 }, { "epoch": 39.6875, "grad_norm": 0.015994793305714, "learning_rate": 2.149337701959325e-05, "loss": 0.0018, "step": 635 }, { "epoch": 39.75, "grad_norm": 0.0002863850040924982, "learning_rate": 2.1243255642254578e-05, "loss": 0.0, "step": 636 }, { "epoch": 39.8125, "grad_norm": 0.020151892270889014, "learning_rate": 2.099442507831444e-05, "loss": 0.002, "step": 637 }, { "epoch": 39.875, "grad_norm": 0.028381722329777846, "learning_rate": 2.074688940607529e-05, "loss": 0.0043, "step": 638 }, { "epoch": 39.9375, "grad_norm": 0.024166442271924462, "learning_rate": 2.050065268261655e-05, "loss": 0.0026, "step": 639 }, { "epoch": 40.0, "grad_norm": 0.02278485322740038, "learning_rate": 2.025571894372794e-05, "loss": 0.0028, "step": 640 }, { "epoch": 40.0625, "grad_norm": 0.013588204702480308, "learning_rate": 2.001209220384346e-05, "loss": 0.0008, "step": 641 }, { "epoch": 40.125, "grad_norm": 0.02222330407423336, "learning_rate": 1.976977645597552e-05, "loss": 0.0032, "step": 642 }, { "epoch": 40.1875, "grad_norm": 0.015083512876565244, "learning_rate": 1.9528775671649592e-05, "loss": 0.001, "step": 643 }, { "epoch": 40.25, "grad_norm": 0.025069080823856827, "learning_rate": 1.9289093800839066e-05, "loss": 0.0031, "step": 644 }, { "epoch": 40.3125, "grad_norm": 0.02025912896301005, "learning_rate": 1.9050734771900413e-05, "loss": 0.0025, "step": 645 }, { "epoch": 40.375, "grad_norm": 0.013091990888201397, "learning_rate": 1.8813702491508955e-05, "loss": 0.0015, "step": 646 }, { "epoch": 40.4375, "grad_norm": 0.021454823181291053, "learning_rate": 1.8578000844594747e-05, "loss": 0.0027, "step": 647 }, { "epoch": 40.5, "grad_norm": 0.02158593959614438, "learning_rate": 1.8343633694278895e-05, "loss": 0.0038, "step": 648 }, { "epoch": 40.5625, "grad_norm": 0.030736687344323808, "learning_rate": 1.8110604881810355e-05, "loss": 0.0064, "step": 649 }, { "epoch": 40.625, "grad_norm": 0.0186110157411597, "learning_rate": 1.7878918226502816e-05, "loss": 0.0029, "step": 650 }, { "epoch": 40.6875, "grad_norm": 0.021941314758172913, "learning_rate": 1.7648577525672194e-05, "loss": 0.0037, "step": 651 }, { "epoch": 40.75, "grad_norm": 0.021140994060252962, "learning_rate": 1.741958655457436e-05, "loss": 0.0042, "step": 652 }, { "epoch": 40.8125, "grad_norm": 0.022090278067811005, "learning_rate": 1.7191949066343305e-05, "loss": 0.0026, "step": 653 }, { "epoch": 40.875, "grad_norm": 0.020131252147946324, "learning_rate": 1.69656687919296e-05, "loss": 0.0034, "step": 654 }, { "epoch": 40.9375, "grad_norm": 0.015348430101637372, "learning_rate": 1.6740749440039262e-05, "loss": 0.0017, "step": 655 }, { "epoch": 41.0, "grad_norm": 0.029281984191582698, "learning_rate": 1.65171946970729e-05, "loss": 0.0049, "step": 656 }, { "epoch": 41.0625, "grad_norm": 0.024216109331731846, "learning_rate": 1.6295008227065367e-05, "loss": 0.0046, "step": 657 }, { "epoch": 41.125, "grad_norm": 0.021824879823787828, "learning_rate": 1.607419367162577e-05, "loss": 0.003, "step": 658 }, { "epoch": 41.1875, "grad_norm": 0.01662964866399249, "learning_rate": 1.5854754649877603e-05, "loss": 0.0017, "step": 659 }, { "epoch": 41.25, "grad_norm": 0.028069673310717295, "learning_rate": 1.563669475839956e-05, "loss": 0.005, "step": 660 }, { "epoch": 41.3125, "grad_norm": 0.02097056263367153, "learning_rate": 1.542001757116658e-05, "loss": 0.0021, "step": 661 }, { "epoch": 41.375, "grad_norm": 0.02751701662543221, "learning_rate": 1.5204726639491218e-05, "loss": 0.0043, "step": 662 }, { "epoch": 41.4375, "grad_norm": 0.02821649307326047, "learning_rate": 1.4990825491965522e-05, "loss": 0.0049, "step": 663 }, { "epoch": 41.5, "grad_norm": 0.0194640022449319, "learning_rate": 1.4778317634403083e-05, "loss": 0.0041, "step": 664 }, { "epoch": 41.5625, "grad_norm": 0.015127933678134073, "learning_rate": 1.4567206549781698e-05, "loss": 0.0009, "step": 665 }, { "epoch": 41.625, "grad_norm": 0.01576306460154544, "learning_rate": 1.4357495698186186e-05, "loss": 0.0016, "step": 666 }, { "epoch": 41.6875, "grad_norm": 0.022161541294332343, "learning_rate": 1.41491885167517e-05, "loss": 0.0032, "step": 667 }, { "epoch": 41.75, "grad_norm": 0.027773972314299573, "learning_rate": 1.3942288419607475e-05, "loss": 0.0048, "step": 668 }, { "epoch": 41.8125, "grad_norm": 0.018134086844876104, "learning_rate": 1.3736798797820782e-05, "loss": 0.0019, "step": 669 }, { "epoch": 41.875, "grad_norm": 0.010646599496077774, "learning_rate": 1.3532723019341375e-05, "loss": 0.0009, "step": 670 }, { "epoch": 41.9375, "grad_norm": 0.022768008186984896, "learning_rate": 1.3330064428946254e-05, "loss": 0.0034, "step": 671 }, { "epoch": 42.0, "grad_norm": 0.022539707407857736, "learning_rate": 1.3128826348184887e-05, "loss": 0.0026, "step": 672 }, { "epoch": 42.0625, "grad_norm": 0.017092620865285464, "learning_rate": 1.2929012075324831e-05, "loss": 0.002, "step": 673 }, { "epoch": 42.125, "grad_norm": 0.027898487565227375, "learning_rate": 1.2730624885297537e-05, "loss": 0.0036, "step": 674 }, { "epoch": 42.1875, "grad_norm": 0.01888830939120487, "learning_rate": 1.2533668029644751e-05, "loss": 0.0023, "step": 675 }, { "epoch": 42.25, "grad_norm": 0.034797071188688175, "learning_rate": 1.233814473646524e-05, "loss": 0.0047, "step": 676 }, { "epoch": 42.3125, "grad_norm": 0.029204011780076224, "learning_rate": 1.214405821036182e-05, "loss": 0.0045, "step": 677 }, { "epoch": 42.375, "grad_norm": 0.018166143974674018, "learning_rate": 1.195141163238892e-05, "loss": 0.0025, "step": 678 }, { "epoch": 42.4375, "grad_norm": 0.02110557824065673, "learning_rate": 1.1760208160000363e-05, "loss": 0.004, "step": 679 }, { "epoch": 42.5, "grad_norm": 0.025933927942001878, "learning_rate": 1.1570450926997655e-05, "loss": 0.0047, "step": 680 }, { "epoch": 42.5625, "grad_norm": 0.01153329092029045, "learning_rate": 1.13821430434786e-05, "loss": 0.0008, "step": 681 }, { "epoch": 42.625, "grad_norm": 0.02871409530487605, "learning_rate": 1.1195287595786352e-05, "loss": 0.0033, "step": 682 }, { "epoch": 42.6875, "grad_norm": 0.020158017546353265, "learning_rate": 1.1009887646458861e-05, "loss": 0.0038, "step": 683 }, { "epoch": 42.75, "grad_norm": 0.020818728203346376, "learning_rate": 1.0825946234178574e-05, "loss": 0.0035, "step": 684 }, { "epoch": 42.8125, "grad_norm": 0.019312470454732285, "learning_rate": 1.0643466373722711e-05, "loss": 0.0021, "step": 685 }, { "epoch": 42.875, "grad_norm": 0.01710427570336108, "learning_rate": 1.0462451055913847e-05, "loss": 0.002, "step": 686 }, { "epoch": 42.9375, "grad_norm": 0.024424739421454587, "learning_rate": 1.0282903247570908e-05, "loss": 0.0034, "step": 687 }, { "epoch": 43.0, "grad_norm": 0.014619911575136098, "learning_rate": 1.010482589146048e-05, "loss": 0.0016, "step": 688 }, { "epoch": 43.0625, "grad_norm": 0.018861495766737645, "learning_rate": 9.928221906248614e-06, "loss": 0.0018, "step": 689 }, { "epoch": 43.125, "grad_norm": 0.018818551498274123, "learning_rate": 9.753094186453026e-06, "loss": 0.0028, "step": 690 }, { "epoch": 43.1875, "grad_norm": 0.00016866140217513113, "learning_rate": 9.579445602395576e-06, "loss": 0.0, "step": 691 }, { "epoch": 43.25, "grad_norm": 0.02583319297082241, "learning_rate": 9.407279000155312e-06, "loss": 0.0036, "step": 692 }, { "epoch": 43.3125, "grad_norm": 0.023265334503398318, "learning_rate": 9.23659720152179e-06, "loss": 0.0031, "step": 693 }, { "epoch": 43.375, "grad_norm": 0.02996502207699758, "learning_rate": 9.067403003948782e-06, "loss": 0.0055, "step": 694 }, { "epoch": 43.4375, "grad_norm": 0.024920753877047955, "learning_rate": 8.89969918050847e-06, "loss": 0.0032, "step": 695 }, { "epoch": 43.5, "grad_norm": 0.020615629249821864, "learning_rate": 8.733488479845997e-06, "loss": 0.0023, "step": 696 }, { "epoch": 43.5625, "grad_norm": 0.02585899233303135, "learning_rate": 8.568773626134364e-06, "loss": 0.0039, "step": 697 }, { "epoch": 43.625, "grad_norm": 0.03234297820570824, "learning_rate": 8.405557319029912e-06, "loss": 0.0055, "step": 698 }, { "epoch": 43.6875, "grad_norm": 0.02361422744737024, "learning_rate": 8.243842233627896e-06, "loss": 0.004, "step": 699 }, { "epoch": 43.75, "grad_norm": 0.015111961904190768, "learning_rate": 8.083631020418791e-06, "loss": 0.0017, "step": 700 }, { "epoch": 43.8125, "grad_norm": 0.019087732617034506, "learning_rate": 7.924926305244728e-06, "loss": 0.0015, "step": 701 }, { "epoch": 43.875, "grad_norm": 0.019039580961815027, "learning_rate": 7.767730689256614e-06, "loss": 0.0032, "step": 702 }, { "epoch": 43.9375, "grad_norm": 0.01619120710381878, "learning_rate": 7.612046748871327e-06, "loss": 0.0022, "step": 703 }, { "epoch": 44.0, "grad_norm": 0.02963246039764041, "learning_rate": 7.457877035729588e-06, "loss": 0.0036, "step": 704 }, { "epoch": 44.0625, "grad_norm": 0.02289200860477135, "learning_rate": 7.305224076654127e-06, "loss": 0.0031, "step": 705 }, { "epoch": 44.125, "grad_norm": 0.022076965629593947, "learning_rate": 7.154090373608235e-06, "loss": 0.0025, "step": 706 }, { "epoch": 44.1875, "grad_norm": 0.017284412700166694, "learning_rate": 7.004478403654835e-06, "loss": 0.0028, "step": 707 }, { "epoch": 44.25, "grad_norm": 0.02994271330026706, "learning_rate": 6.856390618915775e-06, "loss": 0.0036, "step": 708 }, { "epoch": 44.3125, "grad_norm": 0.010544664066729796, "learning_rate": 6.709829446531734e-06, "loss": 0.0025, "step": 709 }, { "epoch": 44.375, "grad_norm": 0.01646307933490768, "learning_rate": 6.564797288622371e-06, "loss": 0.0017, "step": 710 }, { "epoch": 44.4375, "grad_norm": 0.02167552761136678, "learning_rate": 6.4212965222470115e-06, "loss": 0.0027, "step": 711 }, { "epoch": 44.5, "grad_norm": 0.011538751711468322, "learning_rate": 6.2793294993656494e-06, "loss": 0.001, "step": 712 }, { "epoch": 44.5625, "grad_norm": 0.026342695075247172, "learning_rate": 6.138898546800398e-06, "loss": 0.003, "step": 713 }, { "epoch": 44.625, "grad_norm": 0.01981392389008597, "learning_rate": 6.000005966197387e-06, "loss": 0.0023, "step": 714 }, { "epoch": 44.6875, "grad_norm": 0.025550367873884112, "learning_rate": 5.86265403398899e-06, "loss": 0.0046, "step": 715 }, { "epoch": 44.75, "grad_norm": 0.026765500781078307, "learning_rate": 5.726845001356573e-06, "loss": 0.0047, "step": 716 }, { "epoch": 44.8125, "grad_norm": 0.027059728322687618, "learning_rate": 5.592581094193583e-06, "loss": 0.0045, "step": 717 }, { "epoch": 44.875, "grad_norm": 0.02091280497177447, "learning_rate": 5.45986451306899e-06, "loss": 0.0031, "step": 718 }, { "epoch": 44.9375, "grad_norm": 0.023097246575829473, "learning_rate": 5.328697433191321e-06, "loss": 0.0035, "step": 719 }, { "epoch": 45.0, "grad_norm": 0.018080675383588967, "learning_rate": 5.199082004372957e-06, "loss": 0.0019, "step": 720 }, { "epoch": 45.0625, "grad_norm": 0.022599144419125274, "learning_rate": 5.0710203509948924e-06, "loss": 0.0041, "step": 721 }, { "epoch": 45.125, "grad_norm": 0.019069660618183874, "learning_rate": 4.944514571971981e-06, "loss": 0.0023, "step": 722 }, { "epoch": 45.1875, "grad_norm": 0.01673711361691797, "learning_rate": 4.819566740718439e-06, "loss": 0.0017, "step": 723 }, { "epoch": 45.25, "grad_norm": 0.017407725192059877, "learning_rate": 4.6961789051139124e-06, "loss": 0.0025, "step": 724 }, { "epoch": 45.3125, "grad_norm": 0.007214982705630053, "learning_rate": 4.574353087469929e-06, "loss": 0.0005, "step": 725 }, { "epoch": 45.375, "grad_norm": 0.02158166297132307, "learning_rate": 4.454091284496731e-06, "loss": 0.003, "step": 726 }, { "epoch": 45.4375, "grad_norm": 0.031535142634254686, "learning_rate": 4.335395467270553e-06, "loss": 0.0047, "step": 727 }, { "epoch": 45.5, "grad_norm": 0.030571739192627623, "learning_rate": 4.2182675812012965e-06, "loss": 0.0048, "step": 728 }, { "epoch": 45.5625, "grad_norm": 0.02876678712689496, "learning_rate": 4.102709546000671e-06, "loss": 0.0052, "step": 729 }, { "epoch": 45.625, "grad_norm": 0.0211381008090843, "learning_rate": 3.988723255650728e-06, "loss": 0.0036, "step": 730 }, { "epoch": 45.6875, "grad_norm": 0.02281744512369711, "learning_rate": 3.876310578372832e-06, "loss": 0.0027, "step": 731 }, { "epoch": 45.75, "grad_norm": 0.021897414013302245, "learning_rate": 3.7654733565969826e-06, "loss": 0.002, "step": 732 }, { "epoch": 45.8125, "grad_norm": 0.01919140132177964, "learning_rate": 3.6562134069316854e-06, "loss": 0.0017, "step": 733 }, { "epoch": 45.875, "grad_norm": 0.017425342650746963, "learning_rate": 3.548532520134129e-06, "loss": 0.0017, "step": 734 }, { "epoch": 45.9375, "grad_norm": 0.018230395206139328, "learning_rate": 3.442432461080858e-06, "loss": 0.0023, "step": 735 }, { "epoch": 46.0, "grad_norm": 0.024588454053071196, "learning_rate": 3.3379149687388867e-06, "loss": 0.0036, "step": 736 }, { "epoch": 46.0625, "grad_norm": 0.019822471128658536, "learning_rate": 3.23498175613709e-06, "loss": 0.0028, "step": 737 }, { "epoch": 46.125, "grad_norm": 0.02110858026864958, "learning_rate": 3.1336345103382346e-06, "loss": 0.0032, "step": 738 }, { "epoch": 46.1875, "grad_norm": 0.037579636108812396, "learning_rate": 3.0338748924112483e-06, "loss": 0.0046, "step": 739 }, { "epoch": 46.25, "grad_norm": 0.023073536580485558, "learning_rate": 2.9357045374040825e-06, "loss": 0.0024, "step": 740 }, { "epoch": 46.3125, "grad_norm": 0.019009441815773284, "learning_rate": 2.839125054316838e-06, "loss": 0.0018, "step": 741 }, { "epoch": 46.375, "grad_norm": 0.021241256500812482, "learning_rate": 2.7441380260754048e-06, "loss": 0.0032, "step": 742 }, { "epoch": 46.4375, "grad_norm": 0.02621426587903328, "learning_rate": 2.6507450095055618e-06, "loss": 0.0034, "step": 743 }, { "epoch": 46.5, "grad_norm": 0.03271609828252333, "learning_rate": 2.5589475353073988e-06, "loss": 0.0057, "step": 744 }, { "epoch": 46.5625, "grad_norm": 0.016395104429952994, "learning_rate": 2.4687471080302894e-06, "loss": 0.0014, "step": 745 }, { "epoch": 46.625, "grad_norm": 0.010491752550833084, "learning_rate": 2.380145206048201e-06, "loss": 0.0008, "step": 746 }, { "epoch": 46.6875, "grad_norm": 0.01997489278725962, "learning_rate": 2.2931432815354594e-06, "loss": 0.0028, "step": 747 }, { "epoch": 46.75, "grad_norm": 0.025314386929928892, "learning_rate": 2.2077427604429433e-06, "loss": 0.004, "step": 748 }, { "epoch": 46.8125, "grad_norm": 0.017939092098162242, "learning_rate": 2.1239450424747508e-06, "loss": 0.0032, "step": 749 }, { "epoch": 46.875, "grad_norm": 0.02330397906421762, "learning_rate": 2.041751501065203e-06, "loss": 0.0031, "step": 750 }, { "epoch": 46.9375, "grad_norm": 0.017815651039882807, "learning_rate": 1.9611634833564096e-06, "loss": 0.0023, "step": 751 }, { "epoch": 47.0, "grad_norm": 0.022593869860642213, "learning_rate": 1.882182310176095e-06, "loss": 0.0022, "step": 752 }, { "epoch": 47.0625, "grad_norm": 0.016063543491380557, "learning_rate": 1.8048092760160285e-06, "loss": 0.0021, "step": 753 }, { "epoch": 47.125, "grad_norm": 0.021873269229617312, "learning_rate": 1.729045649010752e-06, "loss": 0.0045, "step": 754 }, { "epoch": 47.1875, "grad_norm": 0.01569665609733916, "learning_rate": 1.6548926709168633e-06, "loss": 0.0009, "step": 755 }, { "epoch": 47.25, "grad_norm": 0.011413815570498292, "learning_rate": 1.5823515570925763e-06, "loss": 0.0009, "step": 756 }, { "epoch": 47.3125, "grad_norm": 0.029109810904750552, "learning_rate": 1.5114234964778707e-06, "loss": 0.0049, "step": 757 }, { "epoch": 47.375, "grad_norm": 0.024048997338655526, "learning_rate": 1.4421096515749855e-06, "loss": 0.0034, "step": 758 }, { "epoch": 47.4375, "grad_norm": 0.027172675685767346, "learning_rate": 1.3744111584293228e-06, "loss": 0.0037, "step": 759 }, { "epoch": 47.5, "grad_norm": 0.0035756082981969447, "learning_rate": 1.30832912661093e-06, "loss": 0.0017, "step": 760 }, { "epoch": 47.5625, "grad_norm": 0.02566413955432668, "learning_rate": 1.2438646391962129e-06, "loss": 0.0035, "step": 761 }, { "epoch": 47.625, "grad_norm": 0.032308326355990395, "learning_rate": 1.1810187527502182e-06, "loss": 0.0048, "step": 762 }, { "epoch": 47.6875, "grad_norm": 0.026149262681996552, "learning_rate": 1.1197924973093464e-06, "loss": 0.0034, "step": 763 }, { "epoch": 47.75, "grad_norm": 0.015097613235697617, "learning_rate": 1.0601868763643996e-06, "loss": 0.0015, "step": 764 }, { "epoch": 47.8125, "grad_norm": 0.019142453822043624, "learning_rate": 1.0022028668442375e-06, "loss": 0.002, "step": 765 }, { "epoch": 47.875, "grad_norm": 0.014333115422685562, "learning_rate": 9.458414190996689e-07, "loss": 0.0009, "step": 766 }, { "epoch": 47.9375, "grad_norm": 0.031884445051135024, "learning_rate": 8.911034568879207e-07, "loss": 0.0069, "step": 767 }, { "epoch": 48.0, "grad_norm": 0.017813313479240984, "learning_rate": 8.379898773574924e-07, "loss": 0.0017, "step": 768 }, { "epoch": 48.0625, "grad_norm": 0.021676976013209007, "learning_rate": 7.865015510334472e-07, "loss": 0.0052, "step": 769 }, { "epoch": 48.125, "grad_norm": 0.023872711159584236, "learning_rate": 7.366393218031564e-07, "loss": 0.0037, "step": 770 }, { "epoch": 48.1875, "grad_norm": 0.013560684928306489, "learning_rate": 6.884040069024434e-07, "loss": 0.0009, "step": 771 }, { "epoch": 48.25, "grad_norm": 0.027547746861488913, "learning_rate": 6.41796396902239e-07, "loss": 0.0041, "step": 772 }, { "epoch": 48.3125, "grad_norm": 0.02379869982898958, "learning_rate": 5.968172556955365e-07, "loss": 0.0025, "step": 773 }, { "epoch": 48.375, "grad_norm": 0.01470048963673093, "learning_rate": 5.534673204849572e-07, "loss": 0.0009, "step": 774 }, { "epoch": 48.4375, "grad_norm": 0.023911550191113507, "learning_rate": 5.117473017706486e-07, "loss": 0.0053, "step": 775 }, { "epoch": 48.5, "grad_norm": 0.022409728345083195, "learning_rate": 4.7165788333860536e-07, "loss": 0.0026, "step": 776 }, { "epoch": 48.5625, "grad_norm": 0.020732847008319438, "learning_rate": 4.331997222494777e-07, "loss": 0.0062, "step": 777 }, { "epoch": 48.625, "grad_norm": 0.017704460576832216, "learning_rate": 3.963734488278248e-07, "loss": 0.0016, "step": 778 }, { "epoch": 48.6875, "grad_norm": 0.019405071538624658, "learning_rate": 3.611796666517564e-07, "loss": 0.0029, "step": 779 }, { "epoch": 48.75, "grad_norm": 0.02358804433732502, "learning_rate": 3.2761895254306287e-07, "loss": 0.0033, "step": 780 }, { "epoch": 48.8125, "grad_norm": 0.00883061746234552, "learning_rate": 2.956918565577338e-07, "loss": 0.0005, "step": 781 }, { "epoch": 48.875, "grad_norm": 0.017159129222209444, "learning_rate": 2.6539890197695427e-07, "loss": 0.0016, "step": 782 }, { "epoch": 48.9375, "grad_norm": 0.027402061759511424, "learning_rate": 2.3674058529855602e-07, "loss": 0.0028, "step": 783 }, { "epoch": 49.0, "grad_norm": 0.02128952613240459, "learning_rate": 2.0971737622883515e-07, "loss": 0.0025, "step": 784 }, { "epoch": 49.0625, "grad_norm": 0.026668820402710704, "learning_rate": 1.843297176748804e-07, "loss": 0.0046, "step": 785 }, { "epoch": 49.125, "grad_norm": 0.022353610802999663, "learning_rate": 1.605780257373124e-07, "loss": 0.0039, "step": 786 }, { "epoch": 49.1875, "grad_norm": 0.010951321068786213, "learning_rate": 1.3846268970344466e-07, "loss": 0.0009, "step": 787 }, { "epoch": 49.25, "grad_norm": 0.014538946273216842, "learning_rate": 1.179840720409331e-07, "loss": 0.0014, "step": 788 }, { "epoch": 49.3125, "grad_norm": 0.01556854992979764, "learning_rate": 9.914250839180294e-08, "loss": 0.0015, "step": 789 }, { "epoch": 49.375, "grad_norm": 0.020104302093467665, "learning_rate": 8.193830756699772e-08, "loss": 0.0017, "step": 790 }, { "epoch": 49.4375, "grad_norm": 0.021695384943554476, "learning_rate": 6.637175154124986e-08, "loss": 0.0044, "step": 791 }, { "epoch": 49.5, "grad_norm": 0.018630437617792112, "learning_rate": 5.2443095448506674e-08, "loss": 0.0041, "step": 792 }, { "epoch": 49.5625, "grad_norm": 0.02382024881730757, "learning_rate": 4.015256757774477e-08, "loss": 0.0023, "step": 793 }, { "epoch": 49.625, "grad_norm": 0.03427139945682584, "learning_rate": 2.9500369369195312e-08, "loss": 0.0046, "step": 794 }, { "epoch": 49.6875, "grad_norm": 0.012461120044038388, "learning_rate": 2.0486675411102163e-08, "loss": 0.0011, "step": 795 }, { "epoch": 49.75, "grad_norm": 0.020179437876762667, "learning_rate": 1.3111633436779791e-08, "loss": 0.0017, "step": 796 }, { "epoch": 49.8125, "grad_norm": 0.02050009583056352, "learning_rate": 7.375364322292911e-09, "loss": 0.0021, "step": 797 }, { "epoch": 49.875, "grad_norm": 0.02353884448233026, "learning_rate": 3.2779620843692572e-09, "loss": 0.0039, "step": 798 }, { "epoch": 49.9375, "grad_norm": 0.02422128610193115, "learning_rate": 8.194938789451989e-10, "loss": 0.0033, "step": 799 }, { "epoch": 50.0, "grad_norm": 0.028631822991189302, "learning_rate": 0.0, "loss": 0.0048, "step": 800 } ], "logging_steps": 1.0, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9578559744000.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }