LocalC-12B-e2.0 / trainer_state.json
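The remainder of this file is the raw trainer_state.json written by the Hugging Face transformers Trainer: a few top-level run fields (best_metric, epoch, global_step, and so on) followed by a log_history array with one entry per logged optimizer step, each carrying epoch, grad_norm, learning_rate, loss, and step. As a minimal sketch of how the loss curve could be read back out of this structure — assuming a local copy of the file named trainer_state.json and matplotlib installed, neither of which is part of the repo itself:

import json
import matplotlib.pyplot as plt

# Load the trainer state dumped by the Trainer (assumed local path).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training entries; eval entries in log_history may lack "loss".
history = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]

# Plot training loss against optimizer step.
plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("LocalC-12B-e2.0 training loss")
plt.show()

The same pattern works for grad_norm or learning_rate by swapping the key pulled from each entry.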
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9853300733496333,
"eval_steps": 500,
"global_step": 1024,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019559902200488996,
"grad_norm": 16.282888412475586,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.0979,
"step": 1
},
{
"epoch": 0.003911980440097799,
"grad_norm": 12.96766471862793,
"learning_rate": 4.000000000000001e-06,
"loss": 1.075,
"step": 2
},
{
"epoch": 0.0058679706601467,
"grad_norm": 8.62896728515625,
"learning_rate": 6e-06,
"loss": 1.084,
"step": 3
},
{
"epoch": 0.007823960880195598,
"grad_norm": 28.233434677124023,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9951,
"step": 4
},
{
"epoch": 0.009779951100244499,
"grad_norm": 20.809677124023438,
"learning_rate": 1e-05,
"loss": 1.0048,
"step": 5
},
{
"epoch": 0.0117359413202934,
"grad_norm": 18.27992820739746,
"learning_rate": 9.99998943199786e-06,
"loss": 0.9456,
"step": 6
},
{
"epoch": 0.013691931540342298,
"grad_norm": 7.64124870300293,
"learning_rate": 9.999957728036109e-06,
"loss": 0.984,
"step": 7
},
{
"epoch": 0.015647921760391197,
"grad_norm": 6.329475402832031,
"learning_rate": 9.99990488824877e-06,
"loss": 0.9117,
"step": 8
},
{
"epoch": 0.017603911980440097,
"grad_norm": 3.9235880374908447,
"learning_rate": 9.999830912859204e-06,
"loss": 0.9326,
"step": 9
},
{
"epoch": 0.019559902200488997,
"grad_norm": 3.593456268310547,
"learning_rate": 9.999735802180121e-06,
"loss": 0.9041,
"step": 10
},
{
"epoch": 0.021515892420537898,
"grad_norm": 2.6196131706237793,
"learning_rate": 9.999619556613573e-06,
"loss": 0.8991,
"step": 11
},
{
"epoch": 0.0234718826405868,
"grad_norm": 3.5464529991149902,
"learning_rate": 9.999482176650956e-06,
"loss": 0.8731,
"step": 12
},
{
"epoch": 0.025427872860635695,
"grad_norm": 2.584135055541992,
"learning_rate": 9.999323662872998e-06,
"loss": 0.8496,
"step": 13
},
{
"epoch": 0.027383863080684596,
"grad_norm": 2.4647350311279297,
"learning_rate": 9.99914401594977e-06,
"loss": 0.8545,
"step": 14
},
{
"epoch": 0.029339853300733496,
"grad_norm": 2.52474308013916,
"learning_rate": 9.998943236640678e-06,
"loss": 0.8293,
"step": 15
},
{
"epoch": 0.03129584352078239,
"grad_norm": 2.1536269187927246,
"learning_rate": 9.998721325794454e-06,
"loss": 0.8203,
"step": 16
},
{
"epoch": 0.033251833740831294,
"grad_norm": 2.681109666824341,
"learning_rate": 9.998478284349163e-06,
"loss": 0.8426,
"step": 17
},
{
"epoch": 0.035207823960880194,
"grad_norm": 2.7063651084899902,
"learning_rate": 9.998214113332184e-06,
"loss": 0.8517,
"step": 18
},
{
"epoch": 0.037163814180929094,
"grad_norm": 2.0724613666534424,
"learning_rate": 9.997928813860228e-06,
"loss": 0.8007,
"step": 19
},
{
"epoch": 0.039119804400977995,
"grad_norm": 2.826772928237915,
"learning_rate": 9.997622387139306e-06,
"loss": 0.7952,
"step": 20
},
{
"epoch": 0.041075794621026895,
"grad_norm": 2.115604877471924,
"learning_rate": 9.99729483446475e-06,
"loss": 0.8614,
"step": 21
},
{
"epoch": 0.043031784841075796,
"grad_norm": 2.8321192264556885,
"learning_rate": 9.996946157221192e-06,
"loss": 0.8484,
"step": 22
},
{
"epoch": 0.044987775061124696,
"grad_norm": 2.0314548015594482,
"learning_rate": 9.996576356882558e-06,
"loss": 0.8133,
"step": 23
},
{
"epoch": 0.0469437652811736,
"grad_norm": 2.4443204402923584,
"learning_rate": 9.99618543501207e-06,
"loss": 0.7858,
"step": 24
},
{
"epoch": 0.0488997555012225,
"grad_norm": 2.3830316066741943,
"learning_rate": 9.99577339326223e-06,
"loss": 0.8126,
"step": 25
},
{
"epoch": 0.05085574572127139,
"grad_norm": 2.355252504348755,
"learning_rate": 9.995340233374824e-06,
"loss": 0.7963,
"step": 26
},
{
"epoch": 0.05281173594132029,
"grad_norm": 2.411180019378662,
"learning_rate": 9.994885957180905e-06,
"loss": 0.8081,
"step": 27
},
{
"epoch": 0.05476772616136919,
"grad_norm": 2.8028552532196045,
"learning_rate": 9.994410566600792e-06,
"loss": 0.8201,
"step": 28
},
{
"epoch": 0.05672371638141809,
"grad_norm": 2.396615505218506,
"learning_rate": 9.993914063644053e-06,
"loss": 0.7734,
"step": 29
},
{
"epoch": 0.05867970660146699,
"grad_norm": 1.850614309310913,
"learning_rate": 9.993396450409508e-06,
"loss": 0.7778,
"step": 30
},
{
"epoch": 0.06063569682151589,
"grad_norm": 2.12536883354187,
"learning_rate": 9.99285772908521e-06,
"loss": 0.7897,
"step": 31
},
{
"epoch": 0.06259168704156479,
"grad_norm": 2.718092203140259,
"learning_rate": 9.992297901948446e-06,
"loss": 0.7499,
"step": 32
},
{
"epoch": 0.06454767726161369,
"grad_norm": 2.013169288635254,
"learning_rate": 9.991716971365713e-06,
"loss": 0.7705,
"step": 33
},
{
"epoch": 0.06650366748166259,
"grad_norm": 2.0360498428344727,
"learning_rate": 9.991114939792725e-06,
"loss": 0.7935,
"step": 34
},
{
"epoch": 0.06845965770171149,
"grad_norm": 1.9136534929275513,
"learning_rate": 9.99049180977439e-06,
"loss": 0.7495,
"step": 35
},
{
"epoch": 0.07041564792176039,
"grad_norm": 1.7010070085525513,
"learning_rate": 9.989847583944801e-06,
"loss": 0.7608,
"step": 36
},
{
"epoch": 0.07237163814180929,
"grad_norm": 1.7865066528320312,
"learning_rate": 9.989182265027232e-06,
"loss": 0.7994,
"step": 37
},
{
"epoch": 0.07432762836185819,
"grad_norm": 1.774584412574768,
"learning_rate": 9.98849585583412e-06,
"loss": 0.7754,
"step": 38
},
{
"epoch": 0.07628361858190709,
"grad_norm": 1.7736550569534302,
"learning_rate": 9.987788359267053e-06,
"loss": 0.7688,
"step": 39
},
{
"epoch": 0.07823960880195599,
"grad_norm": 2.0538413524627686,
"learning_rate": 9.987059778316763e-06,
"loss": 0.7722,
"step": 40
},
{
"epoch": 0.08019559902200489,
"grad_norm": 1.742871880531311,
"learning_rate": 9.986310116063108e-06,
"loss": 0.782,
"step": 41
},
{
"epoch": 0.08215158924205379,
"grad_norm": 1.8255136013031006,
"learning_rate": 9.985539375675058e-06,
"loss": 0.7879,
"step": 42
},
{
"epoch": 0.08410757946210269,
"grad_norm": 1.8212188482284546,
"learning_rate": 9.98474756041069e-06,
"loss": 0.7656,
"step": 43
},
{
"epoch": 0.08606356968215159,
"grad_norm": 1.741162896156311,
"learning_rate": 9.983934673617165e-06,
"loss": 0.7877,
"step": 44
},
{
"epoch": 0.08801955990220049,
"grad_norm": 1.810950517654419,
"learning_rate": 9.98310071873072e-06,
"loss": 0.7854,
"step": 45
},
{
"epoch": 0.08997555012224939,
"grad_norm": 1.8254064321517944,
"learning_rate": 9.982245699276651e-06,
"loss": 0.784,
"step": 46
},
{
"epoch": 0.09193154034229829,
"grad_norm": 1.6075822114944458,
"learning_rate": 9.981369618869293e-06,
"loss": 0.7326,
"step": 47
},
{
"epoch": 0.0938875305623472,
"grad_norm": 1.767863392829895,
"learning_rate": 9.980472481212015e-06,
"loss": 0.7719,
"step": 48
},
{
"epoch": 0.0958435207823961,
"grad_norm": 1.6684534549713135,
"learning_rate": 9.979554290097201e-06,
"loss": 0.7841,
"step": 49
},
{
"epoch": 0.097799511002445,
"grad_norm": 1.8684815168380737,
"learning_rate": 9.978615049406228e-06,
"loss": 0.7519,
"step": 50
},
{
"epoch": 0.09975550122249388,
"grad_norm": 1.5016576051712036,
"learning_rate": 9.977654763109452e-06,
"loss": 0.7075,
"step": 51
},
{
"epoch": 0.10171149144254278,
"grad_norm": 1.8114155530929565,
"learning_rate": 9.9766734352662e-06,
"loss": 0.7666,
"step": 52
},
{
"epoch": 0.10366748166259168,
"grad_norm": 1.9040465354919434,
"learning_rate": 9.975671070024741e-06,
"loss": 0.7444,
"step": 53
},
{
"epoch": 0.10562347188264058,
"grad_norm": 1.6450563669204712,
"learning_rate": 9.974647671622271e-06,
"loss": 0.7803,
"step": 54
},
{
"epoch": 0.10757946210268948,
"grad_norm": 1.5921177864074707,
"learning_rate": 9.973603244384906e-06,
"loss": 0.7692,
"step": 55
},
{
"epoch": 0.10953545232273838,
"grad_norm": 1.5766173601150513,
"learning_rate": 9.972537792727645e-06,
"loss": 0.7744,
"step": 56
},
{
"epoch": 0.11149144254278728,
"grad_norm": 1.4984322786331177,
"learning_rate": 9.971451321154368e-06,
"loss": 0.7854,
"step": 57
},
{
"epoch": 0.11344743276283618,
"grad_norm": 1.6171250343322754,
"learning_rate": 9.97034383425781e-06,
"loss": 0.761,
"step": 58
},
{
"epoch": 0.11540342298288508,
"grad_norm": 1.7444490194320679,
"learning_rate": 9.969215336719537e-06,
"loss": 0.7397,
"step": 59
},
{
"epoch": 0.11735941320293398,
"grad_norm": 1.6317099332809448,
"learning_rate": 9.968065833309939e-06,
"loss": 0.7624,
"step": 60
},
{
"epoch": 0.11931540342298289,
"grad_norm": 1.6640548706054688,
"learning_rate": 9.966895328888195e-06,
"loss": 0.7247,
"step": 61
},
{
"epoch": 0.12127139364303179,
"grad_norm": 1.9074345827102661,
"learning_rate": 9.965703828402263e-06,
"loss": 0.7631,
"step": 62
},
{
"epoch": 0.12322738386308069,
"grad_norm": 1.5792888402938843,
"learning_rate": 9.964491336888853e-06,
"loss": 0.773,
"step": 63
},
{
"epoch": 0.12518337408312957,
"grad_norm": 1.8201286792755127,
"learning_rate": 9.963257859473414e-06,
"loss": 0.7659,
"step": 64
},
{
"epoch": 0.1271393643031785,
"grad_norm": 1.6881448030471802,
"learning_rate": 9.962003401370101e-06,
"loss": 0.7607,
"step": 65
},
{
"epoch": 0.12909535452322737,
"grad_norm": 1.5919877290725708,
"learning_rate": 9.960727967881758e-06,
"loss": 0.7369,
"step": 66
},
{
"epoch": 0.1310513447432763,
"grad_norm": 1.5349136590957642,
"learning_rate": 9.959431564399902e-06,
"loss": 0.7594,
"step": 67
},
{
"epoch": 0.13300733496332517,
"grad_norm": 1.640322208404541,
"learning_rate": 9.958114196404691e-06,
"loss": 0.7888,
"step": 68
},
{
"epoch": 0.1349633251833741,
"grad_norm": 1.7044633626937866,
"learning_rate": 9.956775869464901e-06,
"loss": 0.7486,
"step": 69
},
{
"epoch": 0.13691931540342298,
"grad_norm": 1.6809159517288208,
"learning_rate": 9.955416589237912e-06,
"loss": 0.7695,
"step": 70
},
{
"epoch": 0.1388753056234719,
"grad_norm": 1.6983811855316162,
"learning_rate": 9.954036361469672e-06,
"loss": 0.7524,
"step": 71
},
{
"epoch": 0.14083129584352078,
"grad_norm": 1.7974449396133423,
"learning_rate": 9.952635191994682e-06,
"loss": 0.7789,
"step": 72
},
{
"epoch": 0.1427872860635697,
"grad_norm": 1.8526934385299683,
"learning_rate": 9.951213086735967e-06,
"loss": 0.7605,
"step": 73
},
{
"epoch": 0.14474327628361858,
"grad_norm": 1.6601364612579346,
"learning_rate": 9.949770051705051e-06,
"loss": 0.7568,
"step": 74
},
{
"epoch": 0.1466992665036675,
"grad_norm": 1.9272420406341553,
"learning_rate": 9.948306093001933e-06,
"loss": 0.7335,
"step": 75
},
{
"epoch": 0.14865525672371638,
"grad_norm": 1.4759540557861328,
"learning_rate": 9.946821216815064e-06,
"loss": 0.7732,
"step": 76
},
{
"epoch": 0.1506112469437653,
"grad_norm": 1.5570530891418457,
"learning_rate": 9.945315429421307e-06,
"loss": 0.7545,
"step": 77
},
{
"epoch": 0.15256723716381418,
"grad_norm": 1.8461523056030273,
"learning_rate": 9.943788737185934e-06,
"loss": 0.7874,
"step": 78
},
{
"epoch": 0.1545232273838631,
"grad_norm": 1.7286914587020874,
"learning_rate": 9.942241146562575e-06,
"loss": 0.7571,
"step": 79
},
{
"epoch": 0.15647921760391198,
"grad_norm": 1.7849379777908325,
"learning_rate": 9.940672664093209e-06,
"loss": 0.7565,
"step": 80
},
{
"epoch": 0.15843520782396087,
"grad_norm": 1.6959819793701172,
"learning_rate": 9.939083296408127e-06,
"loss": 0.7777,
"step": 81
},
{
"epoch": 0.16039119804400978,
"grad_norm": 1.646620512008667,
"learning_rate": 9.937473050225905e-06,
"loss": 0.7579,
"step": 82
},
{
"epoch": 0.16234718826405867,
"grad_norm": 1.5833187103271484,
"learning_rate": 9.935841932353376e-06,
"loss": 0.7409,
"step": 83
},
{
"epoch": 0.16430317848410758,
"grad_norm": 1.6932284832000732,
"learning_rate": 9.934189949685602e-06,
"loss": 0.7684,
"step": 84
},
{
"epoch": 0.16625916870415647,
"grad_norm": 1.7056066989898682,
"learning_rate": 9.932517109205849e-06,
"loss": 0.7224,
"step": 85
},
{
"epoch": 0.16821515892420538,
"grad_norm": 1.7733569145202637,
"learning_rate": 9.930823417985546e-06,
"loss": 0.7517,
"step": 86
},
{
"epoch": 0.17017114914425427,
"grad_norm": 1.5914546251296997,
"learning_rate": 9.929108883184269e-06,
"loss": 0.7477,
"step": 87
},
{
"epoch": 0.17212713936430318,
"grad_norm": 1.7451390027999878,
"learning_rate": 9.9273735120497e-06,
"loss": 0.7354,
"step": 88
},
{
"epoch": 0.17408312958435207,
"grad_norm": 1.8151520490646362,
"learning_rate": 9.9256173119176e-06,
"loss": 0.769,
"step": 89
},
{
"epoch": 0.17603911980440098,
"grad_norm": 1.8201167583465576,
"learning_rate": 9.923840290211781e-06,
"loss": 0.7496,
"step": 90
},
{
"epoch": 0.17799511002444987,
"grad_norm": 1.7559887170791626,
"learning_rate": 9.92204245444407e-06,
"loss": 0.7174,
"step": 91
},
{
"epoch": 0.17995110024449879,
"grad_norm": 1.4874770641326904,
"learning_rate": 9.92022381221428e-06,
"loss": 0.7332,
"step": 92
},
{
"epoch": 0.18190709046454767,
"grad_norm": 1.624958872795105,
"learning_rate": 9.918384371210178e-06,
"loss": 0.7484,
"step": 93
},
{
"epoch": 0.18386308068459659,
"grad_norm": 1.7472434043884277,
"learning_rate": 9.916524139207449e-06,
"loss": 0.73,
"step": 94
},
{
"epoch": 0.18581907090464547,
"grad_norm": 1.5148112773895264,
"learning_rate": 9.914643124069667e-06,
"loss": 0.736,
"step": 95
},
{
"epoch": 0.1877750611246944,
"grad_norm": 1.5377767086029053,
"learning_rate": 9.912741333748264e-06,
"loss": 0.7635,
"step": 96
},
{
"epoch": 0.18973105134474327,
"grad_norm": 1.5089975595474243,
"learning_rate": 9.910818776282487e-06,
"loss": 0.7669,
"step": 97
},
{
"epoch": 0.1916870415647922,
"grad_norm": 1.7104604244232178,
"learning_rate": 9.908875459799373e-06,
"loss": 0.7411,
"step": 98
},
{
"epoch": 0.19364303178484107,
"grad_norm": 1.5175243616104126,
"learning_rate": 9.906911392513711e-06,
"loss": 0.7507,
"step": 99
},
{
"epoch": 0.19559902200489,
"grad_norm": 1.4044206142425537,
"learning_rate": 9.904926582728009e-06,
"loss": 0.7526,
"step": 100
},
{
"epoch": 0.19755501222493888,
"grad_norm": 1.5137357711791992,
"learning_rate": 9.902921038832456e-06,
"loss": 0.732,
"step": 101
},
{
"epoch": 0.19951100244498776,
"grad_norm": 1.5008641481399536,
"learning_rate": 9.900894769304888e-06,
"loss": 0.7577,
"step": 102
},
{
"epoch": 0.20146699266503668,
"grad_norm": 1.4948675632476807,
"learning_rate": 9.898847782710754e-06,
"loss": 0.7408,
"step": 103
},
{
"epoch": 0.20342298288508556,
"grad_norm": 1.6185336112976074,
"learning_rate": 9.896780087703077e-06,
"loss": 0.7127,
"step": 104
},
{
"epoch": 0.20537897310513448,
"grad_norm": 1.3796682357788086,
"learning_rate": 9.89469169302242e-06,
"loss": 0.7412,
"step": 105
},
{
"epoch": 0.20733496332518336,
"grad_norm": 1.80031156539917,
"learning_rate": 9.892582607496848e-06,
"loss": 0.7283,
"step": 106
},
{
"epoch": 0.20929095354523228,
"grad_norm": 1.3735039234161377,
"learning_rate": 9.890452840041885e-06,
"loss": 0.7252,
"step": 107
},
{
"epoch": 0.21124694376528116,
"grad_norm": 1.6003905534744263,
"learning_rate": 9.88830239966049e-06,
"loss": 0.7808,
"step": 108
},
{
"epoch": 0.21320293398533008,
"grad_norm": 1.4597164392471313,
"learning_rate": 9.886131295443003e-06,
"loss": 0.7477,
"step": 109
},
{
"epoch": 0.21515892420537897,
"grad_norm": 1.5035536289215088,
"learning_rate": 9.88393953656712e-06,
"loss": 0.7423,
"step": 110
},
{
"epoch": 0.21711491442542788,
"grad_norm": 1.7422672510147095,
"learning_rate": 9.881727132297847e-06,
"loss": 0.7254,
"step": 111
},
{
"epoch": 0.21907090464547677,
"grad_norm": 2.035033941268921,
"learning_rate": 9.879494091987459e-06,
"loss": 0.7576,
"step": 112
},
{
"epoch": 0.22102689486552568,
"grad_norm": 1.4937026500701904,
"learning_rate": 9.877240425075465e-06,
"loss": 0.7404,
"step": 113
},
{
"epoch": 0.22298288508557457,
"grad_norm": 1.6916754245758057,
"learning_rate": 9.874966141088569e-06,
"loss": 0.7392,
"step": 114
},
{
"epoch": 0.22493887530562348,
"grad_norm": 1.435537576675415,
"learning_rate": 9.872671249640627e-06,
"loss": 0.7465,
"step": 115
},
{
"epoch": 0.22689486552567237,
"grad_norm": 1.833910346031189,
"learning_rate": 9.870355760432607e-06,
"loss": 0.7427,
"step": 116
},
{
"epoch": 0.22885085574572128,
"grad_norm": 1.6560982465744019,
"learning_rate": 9.868019683252543e-06,
"loss": 0.7057,
"step": 117
},
{
"epoch": 0.23080684596577017,
"grad_norm": 1.4378702640533447,
"learning_rate": 9.865663027975504e-06,
"loss": 0.7727,
"step": 118
},
{
"epoch": 0.23276283618581908,
"grad_norm": 1.6583086252212524,
"learning_rate": 9.863285804563549e-06,
"loss": 0.7531,
"step": 119
},
{
"epoch": 0.23471882640586797,
"grad_norm": 1.5125632286071777,
"learning_rate": 9.860888023065676e-06,
"loss": 0.719,
"step": 120
},
{
"epoch": 0.23667481662591688,
"grad_norm": 1.8075984716415405,
"learning_rate": 9.858469693617787e-06,
"loss": 0.7474,
"step": 121
},
{
"epoch": 0.23863080684596577,
"grad_norm": 1.4673876762390137,
"learning_rate": 9.85603082644265e-06,
"loss": 0.737,
"step": 122
},
{
"epoch": 0.24058679706601466,
"grad_norm": 1.622917890548706,
"learning_rate": 9.853571431849844e-06,
"loss": 0.7213,
"step": 123
},
{
"epoch": 0.24254278728606357,
"grad_norm": 1.4568711519241333,
"learning_rate": 9.851091520235724e-06,
"loss": 0.7316,
"step": 124
},
{
"epoch": 0.24449877750611246,
"grad_norm": 1.7095931768417358,
"learning_rate": 9.848591102083375e-06,
"loss": 0.7003,
"step": 125
},
{
"epoch": 0.24645476772616137,
"grad_norm": 1.5975903272628784,
"learning_rate": 9.846070187962569e-06,
"loss": 0.724,
"step": 126
},
{
"epoch": 0.24841075794621026,
"grad_norm": 1.4630956649780273,
"learning_rate": 9.843528788529711e-06,
"loss": 0.726,
"step": 127
},
{
"epoch": 0.25036674816625915,
"grad_norm": 1.6113767623901367,
"learning_rate": 9.840966914527812e-06,
"loss": 0.7451,
"step": 128
},
{
"epoch": 0.2523227383863081,
"grad_norm": 1.5766589641571045,
"learning_rate": 9.838384576786427e-06,
"loss": 0.7455,
"step": 129
},
{
"epoch": 0.254278728606357,
"grad_norm": 1.3475068807601929,
"learning_rate": 9.835781786221612e-06,
"loss": 0.7102,
"step": 130
},
{
"epoch": 0.25623471882640586,
"grad_norm": 1.8732693195343018,
"learning_rate": 9.83315855383589e-06,
"loss": 0.74,
"step": 131
},
{
"epoch": 0.25819070904645475,
"grad_norm": 1.7925317287445068,
"learning_rate": 9.830514890718192e-06,
"loss": 0.7395,
"step": 132
},
{
"epoch": 0.2601466992665037,
"grad_norm": 1.5492051839828491,
"learning_rate": 9.82785080804381e-06,
"loss": 0.7333,
"step": 133
},
{
"epoch": 0.2621026894865526,
"grad_norm": 1.4603351354599,
"learning_rate": 9.825166317074357e-06,
"loss": 0.7409,
"step": 134
},
{
"epoch": 0.26405867970660146,
"grad_norm": 1.4673221111297607,
"learning_rate": 9.822461429157716e-06,
"loss": 0.7433,
"step": 135
},
{
"epoch": 0.26601466992665035,
"grad_norm": 1.5018287897109985,
"learning_rate": 9.819736155727992e-06,
"loss": 0.7505,
"step": 136
},
{
"epoch": 0.2679706601466993,
"grad_norm": 1.4042693376541138,
"learning_rate": 9.816990508305463e-06,
"loss": 0.6874,
"step": 137
},
{
"epoch": 0.2699266503667482,
"grad_norm": 1.86225426197052,
"learning_rate": 9.814224498496532e-06,
"loss": 0.7126,
"step": 138
},
{
"epoch": 0.27188264058679706,
"grad_norm": 1.697844386100769,
"learning_rate": 9.811438137993678e-06,
"loss": 0.7172,
"step": 139
},
{
"epoch": 0.27383863080684595,
"grad_norm": 1.8233097791671753,
"learning_rate": 9.808631438575404e-06,
"loss": 0.7504,
"step": 140
},
{
"epoch": 0.2757946210268949,
"grad_norm": 1.6390475034713745,
"learning_rate": 9.805804412106197e-06,
"loss": 0.7417,
"step": 141
},
{
"epoch": 0.2777506112469438,
"grad_norm": 1.8348608016967773,
"learning_rate": 9.802957070536464e-06,
"loss": 0.7371,
"step": 142
},
{
"epoch": 0.27970660146699267,
"grad_norm": 1.6390647888183594,
"learning_rate": 9.800089425902489e-06,
"loss": 0.7249,
"step": 143
},
{
"epoch": 0.28166259168704155,
"grad_norm": 1.775242805480957,
"learning_rate": 9.79720149032638e-06,
"loss": 0.7015,
"step": 144
},
{
"epoch": 0.28361858190709044,
"grad_norm": 1.653192162513733,
"learning_rate": 9.794293276016024e-06,
"loss": 0.7542,
"step": 145
},
{
"epoch": 0.2855745721271394,
"grad_norm": 1.8253936767578125,
"learning_rate": 9.791364795265027e-06,
"loss": 0.7097,
"step": 146
},
{
"epoch": 0.28753056234718827,
"grad_norm": 1.5707979202270508,
"learning_rate": 9.788416060452662e-06,
"loss": 0.7317,
"step": 147
},
{
"epoch": 0.28948655256723715,
"grad_norm": 2.0345988273620605,
"learning_rate": 9.785447084043825e-06,
"loss": 0.7368,
"step": 148
},
{
"epoch": 0.29144254278728604,
"grad_norm": 1.4038718938827515,
"learning_rate": 9.782457878588977e-06,
"loss": 0.7092,
"step": 149
},
{
"epoch": 0.293398533007335,
"grad_norm": 1.8474197387695312,
"learning_rate": 9.779448456724088e-06,
"loss": 0.7147,
"step": 150
},
{
"epoch": 0.29535452322738387,
"grad_norm": 1.4272691011428833,
"learning_rate": 9.776418831170591e-06,
"loss": 0.742,
"step": 151
},
{
"epoch": 0.29731051344743276,
"grad_norm": 1.7123950719833374,
"learning_rate": 9.77336901473532e-06,
"loss": 0.7593,
"step": 152
},
{
"epoch": 0.29926650366748164,
"grad_norm": 1.4355639219284058,
"learning_rate": 9.77029902031046e-06,
"loss": 0.7345,
"step": 153
},
{
"epoch": 0.3012224938875306,
"grad_norm": 2.2433369159698486,
"learning_rate": 9.767208860873498e-06,
"loss": 0.7319,
"step": 154
},
{
"epoch": 0.30317848410757947,
"grad_norm": 1.5576993227005005,
"learning_rate": 9.764098549487156e-06,
"loss": 0.7128,
"step": 155
},
{
"epoch": 0.30513447432762836,
"grad_norm": 1.508055567741394,
"learning_rate": 9.760968099299345e-06,
"loss": 0.7616,
"step": 156
},
{
"epoch": 0.30709046454767724,
"grad_norm": 1.7950801849365234,
"learning_rate": 9.75781752354311e-06,
"loss": 0.7136,
"step": 157
},
{
"epoch": 0.3090464547677262,
"grad_norm": 1.509214997291565,
"learning_rate": 9.754646835536561e-06,
"loss": 0.7316,
"step": 158
},
{
"epoch": 0.3110024449877751,
"grad_norm": 1.6463536024093628,
"learning_rate": 9.751456048682839e-06,
"loss": 0.7599,
"step": 159
},
{
"epoch": 0.31295843520782396,
"grad_norm": 1.566895842552185,
"learning_rate": 9.748245176470037e-06,
"loss": 0.7353,
"step": 160
},
{
"epoch": 0.31491442542787285,
"grad_norm": 1.568455696105957,
"learning_rate": 9.745014232471161e-06,
"loss": 0.7286,
"step": 161
},
{
"epoch": 0.31687041564792173,
"grad_norm": 1.6639809608459473,
"learning_rate": 9.741763230344055e-06,
"loss": 0.7405,
"step": 162
},
{
"epoch": 0.3188264058679707,
"grad_norm": 1.5624911785125732,
"learning_rate": 9.738492183831362e-06,
"loss": 0.7247,
"step": 163
},
{
"epoch": 0.32078239608801956,
"grad_norm": 1.589717149734497,
"learning_rate": 9.735201106760452e-06,
"loss": 0.7204,
"step": 164
},
{
"epoch": 0.32273838630806845,
"grad_norm": 1.5873618125915527,
"learning_rate": 9.731890013043367e-06,
"loss": 0.7329,
"step": 165
},
{
"epoch": 0.32469437652811733,
"grad_norm": 1.546643853187561,
"learning_rate": 9.728558916676769e-06,
"loss": 0.7195,
"step": 166
},
{
"epoch": 0.3266503667481663,
"grad_norm": 1.6865900754928589,
"learning_rate": 9.725207831741867e-06,
"loss": 0.7196,
"step": 167
},
{
"epoch": 0.32860635696821516,
"grad_norm": 1.6543104648590088,
"learning_rate": 9.721836772404372e-06,
"loss": 0.7536,
"step": 168
},
{
"epoch": 0.33056234718826405,
"grad_norm": 1.8691520690917969,
"learning_rate": 9.718445752914427e-06,
"loss": 0.7192,
"step": 169
},
{
"epoch": 0.33251833740831294,
"grad_norm": 1.7233076095581055,
"learning_rate": 9.715034787606556e-06,
"loss": 0.7166,
"step": 170
},
{
"epoch": 0.3344743276283619,
"grad_norm": 1.70209801197052,
"learning_rate": 9.711603890899593e-06,
"loss": 0.6976,
"step": 171
},
{
"epoch": 0.33643031784841076,
"grad_norm": 1.8248488903045654,
"learning_rate": 9.708153077296626e-06,
"loss": 0.743,
"step": 172
},
{
"epoch": 0.33838630806845965,
"grad_norm": 1.5105550289154053,
"learning_rate": 9.704682361384941e-06,
"loss": 0.6894,
"step": 173
},
{
"epoch": 0.34034229828850854,
"grad_norm": 1.6972191333770752,
"learning_rate": 9.701191757835948e-06,
"loss": 0.7204,
"step": 174
},
{
"epoch": 0.3422982885085575,
"grad_norm": 1.4798142910003662,
"learning_rate": 9.69768128140513e-06,
"loss": 0.7027,
"step": 175
},
{
"epoch": 0.34425427872860637,
"grad_norm": 1.7019851207733154,
"learning_rate": 9.694150946931973e-06,
"loss": 0.7172,
"step": 176
},
{
"epoch": 0.34621026894865525,
"grad_norm": 1.3944963216781616,
"learning_rate": 9.690600769339916e-06,
"loss": 0.7152,
"step": 177
},
{
"epoch": 0.34816625916870414,
"grad_norm": 1.5284974575042725,
"learning_rate": 9.68703076363627e-06,
"loss": 0.7266,
"step": 178
},
{
"epoch": 0.3501222493887531,
"grad_norm": 1.4864403009414673,
"learning_rate": 9.683440944912165e-06,
"loss": 0.6945,
"step": 179
},
{
"epoch": 0.35207823960880197,
"grad_norm": 1.5325855016708374,
"learning_rate": 9.679831328342486e-06,
"loss": 0.7228,
"step": 180
},
{
"epoch": 0.35403422982885085,
"grad_norm": 1.3994523286819458,
"learning_rate": 9.676201929185809e-06,
"loss": 0.7267,
"step": 181
},
{
"epoch": 0.35599022004889974,
"grad_norm": 1.4828211069107056,
"learning_rate": 9.672552762784331e-06,
"loss": 0.717,
"step": 182
},
{
"epoch": 0.35794621026894863,
"grad_norm": 1.4578241109848022,
"learning_rate": 9.668883844563814e-06,
"loss": 0.7107,
"step": 183
},
{
"epoch": 0.35990220048899757,
"grad_norm": 1.4730778932571411,
"learning_rate": 9.66519519003351e-06,
"loss": 0.6941,
"step": 184
},
{
"epoch": 0.36185819070904646,
"grad_norm": 1.375962257385254,
"learning_rate": 9.661486814786104e-06,
"loss": 0.7205,
"step": 185
},
{
"epoch": 0.36381418092909534,
"grad_norm": 1.5559496879577637,
"learning_rate": 9.657758734497642e-06,
"loss": 0.7394,
"step": 186
},
{
"epoch": 0.36577017114914423,
"grad_norm": 1.2486817836761475,
"learning_rate": 9.654010964927467e-06,
"loss": 0.7529,
"step": 187
},
{
"epoch": 0.36772616136919317,
"grad_norm": 1.6362264156341553,
"learning_rate": 9.650243521918157e-06,
"loss": 0.7205,
"step": 188
},
{
"epoch": 0.36968215158924206,
"grad_norm": 1.4043997526168823,
"learning_rate": 9.646456421395447e-06,
"loss": 0.7304,
"step": 189
},
{
"epoch": 0.37163814180929094,
"grad_norm": 1.4461724758148193,
"learning_rate": 9.642649679368175e-06,
"loss": 0.7138,
"step": 190
},
{
"epoch": 0.37359413202933983,
"grad_norm": 1.543683409690857,
"learning_rate": 9.6388233119282e-06,
"loss": 0.6915,
"step": 191
},
{
"epoch": 0.3755501222493888,
"grad_norm": 1.4317554235458374,
"learning_rate": 9.63497733525035e-06,
"loss": 0.7329,
"step": 192
},
{
"epoch": 0.37750611246943766,
"grad_norm": 1.5005362033843994,
"learning_rate": 9.631111765592339e-06,
"loss": 0.712,
"step": 193
},
{
"epoch": 0.37946210268948655,
"grad_norm": 1.4482682943344116,
"learning_rate": 9.627226619294706e-06,
"loss": 0.7119,
"step": 194
},
{
"epoch": 0.38141809290953543,
"grad_norm": 1.5683493614196777,
"learning_rate": 9.623321912780745e-06,
"loss": 0.7545,
"step": 195
},
{
"epoch": 0.3833740831295844,
"grad_norm": 1.335252046585083,
"learning_rate": 9.619397662556434e-06,
"loss": 0.7137,
"step": 196
},
{
"epoch": 0.38533007334963326,
"grad_norm": 1.6119199991226196,
"learning_rate": 9.615453885210368e-06,
"loss": 0.709,
"step": 197
},
{
"epoch": 0.38728606356968215,
"grad_norm": 1.2701472043991089,
"learning_rate": 9.611490597413687e-06,
"loss": 0.724,
"step": 198
},
{
"epoch": 0.38924205378973104,
"grad_norm": 1.5385199785232544,
"learning_rate": 9.607507815920002e-06,
"loss": 0.7067,
"step": 199
},
{
"epoch": 0.39119804400978,
"grad_norm": 1.324340581893921,
"learning_rate": 9.603505557565332e-06,
"loss": 0.6998,
"step": 200
},
{
"epoch": 0.39315403422982886,
"grad_norm": 1.5457934141159058,
"learning_rate": 9.599483839268027e-06,
"loss": 0.7429,
"step": 201
},
{
"epoch": 0.39511002444987775,
"grad_norm": 1.4941277503967285,
"learning_rate": 9.595442678028696e-06,
"loss": 0.727,
"step": 202
},
{
"epoch": 0.39706601466992664,
"grad_norm": 1.4549589157104492,
"learning_rate": 9.59138209093014e-06,
"loss": 0.7041,
"step": 203
},
{
"epoch": 0.3990220048899755,
"grad_norm": 1.423126459121704,
"learning_rate": 9.587302095137281e-06,
"loss": 0.7094,
"step": 204
},
{
"epoch": 0.40097799511002447,
"grad_norm": 1.347517967224121,
"learning_rate": 9.583202707897075e-06,
"loss": 0.7119,
"step": 205
},
{
"epoch": 0.40293398533007335,
"grad_norm": 1.3380125761032104,
"learning_rate": 9.579083946538457e-06,
"loss": 0.7291,
"step": 206
},
{
"epoch": 0.40488997555012224,
"grad_norm": 1.460777759552002,
"learning_rate": 9.574945828472257e-06,
"loss": 0.7008,
"step": 207
},
{
"epoch": 0.4068459657701711,
"grad_norm": 1.5004523992538452,
"learning_rate": 9.570788371191134e-06,
"loss": 0.6705,
"step": 208
},
{
"epoch": 0.40880195599022007,
"grad_norm": 1.3935741186141968,
"learning_rate": 9.566611592269495e-06,
"loss": 0.6983,
"step": 209
},
{
"epoch": 0.41075794621026895,
"grad_norm": 1.6443208456039429,
"learning_rate": 9.562415509363422e-06,
"loss": 0.7226,
"step": 210
},
{
"epoch": 0.41271393643031784,
"grad_norm": 1.3152751922607422,
"learning_rate": 9.558200140210598e-06,
"loss": 0.7476,
"step": 211
},
{
"epoch": 0.4146699266503667,
"grad_norm": 1.6871126890182495,
"learning_rate": 9.55396550263024e-06,
"loss": 0.7614,
"step": 212
},
{
"epoch": 0.41662591687041567,
"grad_norm": 1.5741229057312012,
"learning_rate": 9.549711614523007e-06,
"loss": 0.7494,
"step": 213
},
{
"epoch": 0.41858190709046456,
"grad_norm": 1.5038225650787354,
"learning_rate": 9.545438493870943e-06,
"loss": 0.6948,
"step": 214
},
{
"epoch": 0.42053789731051344,
"grad_norm": 1.4876679182052612,
"learning_rate": 9.541146158737383e-06,
"loss": 0.7081,
"step": 215
},
{
"epoch": 0.42249388753056233,
"grad_norm": 1.32923424243927,
"learning_rate": 9.536834627266893e-06,
"loss": 0.7197,
"step": 216
},
{
"epoch": 0.42444987775061127,
"grad_norm": 1.468302845954895,
"learning_rate": 9.532503917685179e-06,
"loss": 0.7222,
"step": 217
},
{
"epoch": 0.42640586797066016,
"grad_norm": 1.3885116577148438,
"learning_rate": 9.528154048299025e-06,
"loss": 0.7217,
"step": 218
},
{
"epoch": 0.42836185819070904,
"grad_norm": 1.516148328781128,
"learning_rate": 9.5237850374962e-06,
"loss": 0.7117,
"step": 219
},
{
"epoch": 0.43031784841075793,
"grad_norm": 1.317048192024231,
"learning_rate": 9.519396903745387e-06,
"loss": 0.7269,
"step": 220
},
{
"epoch": 0.4322738386308069,
"grad_norm": 1.3124562501907349,
"learning_rate": 9.514989665596114e-06,
"loss": 0.7323,
"step": 221
},
{
"epoch": 0.43422982885085576,
"grad_norm": 1.4711933135986328,
"learning_rate": 9.510563341678663e-06,
"loss": 0.7135,
"step": 222
},
{
"epoch": 0.43618581907090465,
"grad_norm": 1.4237111806869507,
"learning_rate": 9.506117950703988e-06,
"loss": 0.7042,
"step": 223
},
{
"epoch": 0.43814180929095353,
"grad_norm": 1.3235597610473633,
"learning_rate": 9.501653511463653e-06,
"loss": 0.7348,
"step": 224
},
{
"epoch": 0.4400977995110024,
"grad_norm": 1.3544509410858154,
"learning_rate": 9.497170042829737e-06,
"loss": 0.7138,
"step": 225
},
{
"epoch": 0.44205378973105136,
"grad_norm": 1.2997663021087646,
"learning_rate": 9.492667563754766e-06,
"loss": 0.728,
"step": 226
},
{
"epoch": 0.44400977995110025,
"grad_norm": 1.409828782081604,
"learning_rate": 9.488146093271625e-06,
"loss": 0.6897,
"step": 227
},
{
"epoch": 0.44596577017114913,
"grad_norm": 1.3433566093444824,
"learning_rate": 9.48360565049347e-06,
"loss": 0.7191,
"step": 228
},
{
"epoch": 0.447921760391198,
"grad_norm": 1.37416410446167,
"learning_rate": 9.479046254613673e-06,
"loss": 0.7203,
"step": 229
},
{
"epoch": 0.44987775061124696,
"grad_norm": 1.3755804300308228,
"learning_rate": 9.474467924905711e-06,
"loss": 0.7165,
"step": 230
},
{
"epoch": 0.45183374083129585,
"grad_norm": 1.2708872556686401,
"learning_rate": 9.469870680723104e-06,
"loss": 0.7014,
"step": 231
},
{
"epoch": 0.45378973105134474,
"grad_norm": 1.3506673574447632,
"learning_rate": 9.465254541499328e-06,
"loss": 0.7031,
"step": 232
},
{
"epoch": 0.4557457212713936,
"grad_norm": 1.3668538331985474,
"learning_rate": 9.460619526747732e-06,
"loss": 0.6983,
"step": 233
},
{
"epoch": 0.45770171149144256,
"grad_norm": 1.4770394563674927,
"learning_rate": 9.45596565606145e-06,
"loss": 0.6995,
"step": 234
},
{
"epoch": 0.45965770171149145,
"grad_norm": 1.4407926797866821,
"learning_rate": 9.451292949113332e-06,
"loss": 0.7108,
"step": 235
},
{
"epoch": 0.46161369193154034,
"grad_norm": 1.6618622541427612,
"learning_rate": 9.446601425655846e-06,
"loss": 0.7059,
"step": 236
},
{
"epoch": 0.4635696821515892,
"grad_norm": 1.3920097351074219,
"learning_rate": 9.441891105521005e-06,
"loss": 0.7289,
"step": 237
},
{
"epoch": 0.46552567237163817,
"grad_norm": 1.3779282569885254,
"learning_rate": 9.437162008620279e-06,
"loss": 0.6999,
"step": 238
},
{
"epoch": 0.46748166259168705,
"grad_norm": 1.389256238937378,
"learning_rate": 9.432414154944511e-06,
"loss": 0.6981,
"step": 239
},
{
"epoch": 0.46943765281173594,
"grad_norm": 1.5415723323822021,
"learning_rate": 9.42764756456383e-06,
"loss": 0.7239,
"step": 240
},
{
"epoch": 0.4713936430317848,
"grad_norm": 1.3362054824829102,
"learning_rate": 9.422862257627573e-06,
"loss": 0.719,
"step": 241
},
{
"epoch": 0.47334963325183377,
"grad_norm": 1.4279307126998901,
"learning_rate": 9.418058254364195e-06,
"loss": 0.7069,
"step": 242
},
{
"epoch": 0.47530562347188265,
"grad_norm": 1.4915199279785156,
"learning_rate": 9.413235575081177e-06,
"loss": 0.7379,
"step": 243
},
{
"epoch": 0.47726161369193154,
"grad_norm": 1.3085819482803345,
"learning_rate": 9.408394240164957e-06,
"loss": 0.7389,
"step": 244
},
{
"epoch": 0.4792176039119804,
"grad_norm": 1.5228544473648071,
"learning_rate": 9.40353427008083e-06,
"loss": 0.7209,
"step": 245
},
{
"epoch": 0.4811735941320293,
"grad_norm": 1.2931418418884277,
"learning_rate": 9.398655685372866e-06,
"loss": 0.715,
"step": 246
},
{
"epoch": 0.48312958435207826,
"grad_norm": 1.3800779581069946,
"learning_rate": 9.393758506663821e-06,
"loss": 0.7037,
"step": 247
},
{
"epoch": 0.48508557457212714,
"grad_norm": 1.42329740524292,
"learning_rate": 9.388842754655053e-06,
"loss": 0.7293,
"step": 248
},
{
"epoch": 0.48704156479217603,
"grad_norm": 1.424433708190918,
"learning_rate": 9.383908450126436e-06,
"loss": 0.7183,
"step": 249
},
{
"epoch": 0.4889975550122249,
"grad_norm": 1.3259403705596924,
"learning_rate": 9.378955613936261e-06,
"loss": 0.6899,
"step": 250
},
{
"epoch": 0.49095354523227386,
"grad_norm": 1.391248106956482,
"learning_rate": 9.373984267021167e-06,
"loss": 0.7169,
"step": 251
},
{
"epoch": 0.49290953545232274,
"grad_norm": 1.3917707204818726,
"learning_rate": 9.368994430396033e-06,
"loss": 0.7185,
"step": 252
},
{
"epoch": 0.49486552567237163,
"grad_norm": 1.3821173906326294,
"learning_rate": 9.3639861251539e-06,
"loss": 0.7217,
"step": 253
},
{
"epoch": 0.4968215158924205,
"grad_norm": 1.5511088371276855,
"learning_rate": 9.358959372465883e-06,
"loss": 0.7239,
"step": 254
},
{
"epoch": 0.49877750611246946,
"grad_norm": 1.2932369709014893,
"learning_rate": 9.353914193581073e-06,
"loss": 0.751,
"step": 255
},
{
"epoch": 0.5007334963325183,
"grad_norm": 1.3697410821914673,
"learning_rate": 9.348850609826454e-06,
"loss": 0.7072,
"step": 256
},
{
"epoch": 0.5026894865525673,
"grad_norm": 1.2831662893295288,
"learning_rate": 9.343768642606813e-06,
"loss": 0.6909,
"step": 257
},
{
"epoch": 0.5046454767726162,
"grad_norm": 1.2629653215408325,
"learning_rate": 9.338668313404647e-06,
"loss": 0.7182,
"step": 258
},
{
"epoch": 0.5066014669926651,
"grad_norm": 1.352612853050232,
"learning_rate": 9.33354964378007e-06,
"loss": 0.7159,
"step": 259
},
{
"epoch": 0.508557457212714,
"grad_norm": 1.2814078330993652,
"learning_rate": 9.32841265537073e-06,
"loss": 0.6925,
"step": 260
},
{
"epoch": 0.5105134474327628,
"grad_norm": 1.2791004180908203,
"learning_rate": 9.323257369891702e-06,
"loss": 0.7319,
"step": 261
},
{
"epoch": 0.5124694376528117,
"grad_norm": 1.4273337125778198,
"learning_rate": 9.318083809135421e-06,
"loss": 0.6868,
"step": 262
},
{
"epoch": 0.5144254278728606,
"grad_norm": 1.3843871355056763,
"learning_rate": 9.312891994971562e-06,
"loss": 0.7103,
"step": 263
},
{
"epoch": 0.5163814180929095,
"grad_norm": 1.3229299783706665,
"learning_rate": 9.307681949346969e-06,
"loss": 0.6953,
"step": 264
},
{
"epoch": 0.5183374083129584,
"grad_norm": 1.2690023183822632,
"learning_rate": 9.302453694285549e-06,
"loss": 0.7201,
"step": 265
},
{
"epoch": 0.5202933985330074,
"grad_norm": 1.544927954673767,
"learning_rate": 9.29720725188819e-06,
"loss": 0.7206,
"step": 266
},
{
"epoch": 0.5222493887530563,
"grad_norm": 1.2711515426635742,
"learning_rate": 9.291942644332654e-06,
"loss": 0.6915,
"step": 267
},
{
"epoch": 0.5242053789731052,
"grad_norm": 1.4151374101638794,
"learning_rate": 9.286659893873498e-06,
"loss": 0.7194,
"step": 268
},
{
"epoch": 0.526161369193154,
"grad_norm": 1.3871374130249023,
"learning_rate": 9.281359022841966e-06,
"loss": 0.7086,
"step": 269
},
{
"epoch": 0.5281173594132029,
"grad_norm": 1.3970173597335815,
"learning_rate": 9.276040053645907e-06,
"loss": 0.7111,
"step": 270
},
{
"epoch": 0.5300733496332518,
"grad_norm": 1.287168264389038,
"learning_rate": 9.27070300876967e-06,
"loss": 0.7032,
"step": 271
},
{
"epoch": 0.5320293398533007,
"grad_norm": 1.3781298398971558,
"learning_rate": 9.265347910774016e-06,
"loss": 0.7055,
"step": 272
},
{
"epoch": 0.5339853300733496,
"grad_norm": 1.4398114681243896,
"learning_rate": 9.259974782296023e-06,
"loss": 0.6926,
"step": 273
},
{
"epoch": 0.5359413202933986,
"grad_norm": 1.506633996963501,
"learning_rate": 9.254583646048981e-06,
"loss": 0.7201,
"step": 274
},
{
"epoch": 0.5378973105134475,
"grad_norm": 1.508634328842163,
"learning_rate": 9.249174524822307e-06,
"loss": 0.699,
"step": 275
},
{
"epoch": 0.5398533007334964,
"grad_norm": 1.469885230064392,
"learning_rate": 9.24374744148144e-06,
"loss": 0.728,
"step": 276
},
{
"epoch": 0.5418092909535452,
"grad_norm": 1.5133376121520996,
"learning_rate": 9.238302418967757e-06,
"loss": 0.698,
"step": 277
},
{
"epoch": 0.5437652811735941,
"grad_norm": 1.3522744178771973,
"learning_rate": 9.23283948029846e-06,
"loss": 0.7287,
"step": 278
},
{
"epoch": 0.545721271393643,
"grad_norm": 1.4715549945831299,
"learning_rate": 9.227358648566483e-06,
"loss": 0.7171,
"step": 279
},
{
"epoch": 0.5476772616136919,
"grad_norm": 1.37788987159729,
"learning_rate": 9.221859946940407e-06,
"loss": 0.7438,
"step": 280
},
{
"epoch": 0.5496332518337408,
"grad_norm": 1.3700509071350098,
"learning_rate": 9.216343398664349e-06,
"loss": 0.6981,
"step": 281
},
{
"epoch": 0.5515892420537898,
"grad_norm": 1.318377137184143,
"learning_rate": 9.210809027057866e-06,
"loss": 0.7176,
"step": 282
},
{
"epoch": 0.5535452322738387,
"grad_norm": 1.5471018552780151,
"learning_rate": 9.205256855515856e-06,
"loss": 0.6932,
"step": 283
},
{
"epoch": 0.5555012224938876,
"grad_norm": 1.4040894508361816,
"learning_rate": 9.199686907508465e-06,
"loss": 0.6905,
"step": 284
},
{
"epoch": 0.5574572127139364,
"grad_norm": 1.4304442405700684,
"learning_rate": 9.194099206580981e-06,
"loss": 0.7147,
"step": 285
},
{
"epoch": 0.5594132029339853,
"grad_norm": 1.401181697845459,
"learning_rate": 9.188493776353743e-06,
"loss": 0.6774,
"step": 286
},
{
"epoch": 0.5613691931540342,
"grad_norm": 1.4256865978240967,
"learning_rate": 9.182870640522023e-06,
"loss": 0.6909,
"step": 287
},
{
"epoch": 0.5633251833740831,
"grad_norm": 1.3554214239120483,
"learning_rate": 9.177229822855949e-06,
"loss": 0.7164,
"step": 288
},
{
"epoch": 0.565281173594132,
"grad_norm": 1.3488880395889282,
"learning_rate": 9.171571347200392e-06,
"loss": 0.6852,
"step": 289
},
{
"epoch": 0.5672371638141809,
"grad_norm": 1.3619459867477417,
"learning_rate": 9.165895237474863e-06,
"loss": 0.7171,
"step": 290
},
{
"epoch": 0.5691931540342299,
"grad_norm": 1.3099799156188965,
"learning_rate": 9.160201517673417e-06,
"loss": 0.6947,
"step": 291
},
{
"epoch": 0.5711491442542788,
"grad_norm": 1.4156692028045654,
"learning_rate": 9.154490211864554e-06,
"loss": 0.7026,
"step": 292
},
{
"epoch": 0.5731051344743276,
"grad_norm": 1.3235222101211548,
"learning_rate": 9.14876134419111e-06,
"loss": 0.6966,
"step": 293
},
{
"epoch": 0.5750611246943765,
"grad_norm": 1.3630427122116089,
"learning_rate": 9.143014938870157e-06,
"loss": 0.7155,
"step": 294
},
{
"epoch": 0.5770171149144254,
"grad_norm": 1.2701759338378906,
"learning_rate": 9.137251020192907e-06,
"loss": 0.7074,
"step": 295
},
{
"epoch": 0.5789731051344743,
"grad_norm": 1.2779312133789062,
"learning_rate": 9.131469612524602e-06,
"loss": 0.7118,
"step": 296
},
{
"epoch": 0.5809290953545232,
"grad_norm": 1.37440824508667,
"learning_rate": 9.125670740304412e-06,
"loss": 0.6779,
"step": 297
},
{
"epoch": 0.5828850855745721,
"grad_norm": 1.3591920137405396,
"learning_rate": 9.119854428045335e-06,
"loss": 0.722,
"step": 298
},
{
"epoch": 0.5848410757946211,
"grad_norm": 1.311881422996521,
"learning_rate": 9.114020700334092e-06,
"loss": 0.6914,
"step": 299
},
{
"epoch": 0.58679706601467,
"grad_norm": 1.415059208869934,
"learning_rate": 9.108169581831021e-06,
"loss": 0.6986,
"step": 300
},
{
"epoch": 0.5887530562347189,
"grad_norm": 1.5263363122940063,
"learning_rate": 9.102301097269974e-06,
"loss": 0.7066,
"step": 301
},
{
"epoch": 0.5907090464547677,
"grad_norm": 1.3773906230926514,
"learning_rate": 9.096415271458218e-06,
"loss": 0.6651,
"step": 302
},
{
"epoch": 0.5926650366748166,
"grad_norm": 1.4495782852172852,
"learning_rate": 9.090512129276316e-06,
"loss": 0.7033,
"step": 303
},
{
"epoch": 0.5946210268948655,
"grad_norm": 1.2831271886825562,
"learning_rate": 9.08459169567804e-06,
"loss": 0.7199,
"step": 304
},
{
"epoch": 0.5965770171149144,
"grad_norm": 1.3329228162765503,
"learning_rate": 9.078653995690248e-06,
"loss": 0.7101,
"step": 305
},
{
"epoch": 0.5985330073349633,
"grad_norm": 1.4745112657546997,
"learning_rate": 9.072699054412793e-06,
"loss": 0.6837,
"step": 306
},
{
"epoch": 0.6004889975550122,
"grad_norm": 1.4286351203918457,
"learning_rate": 9.066726897018408e-06,
"loss": 0.7067,
"step": 307
},
{
"epoch": 0.6024449877750612,
"grad_norm": 1.4778072834014893,
"learning_rate": 9.060737548752601e-06,
"loss": 0.6937,
"step": 308
},
{
"epoch": 0.6044009779951101,
"grad_norm": 1.3873180150985718,
"learning_rate": 9.05473103493355e-06,
"loss": 0.6972,
"step": 309
},
{
"epoch": 0.6063569682151589,
"grad_norm": 1.4970178604125977,
"learning_rate": 9.048707380951993e-06,
"loss": 0.7295,
"step": 310
},
{
"epoch": 0.6083129584352078,
"grad_norm": 1.4127295017242432,
"learning_rate": 9.042666612271131e-06,
"loss": 0.6977,
"step": 311
},
{
"epoch": 0.6102689486552567,
"grad_norm": 1.5276705026626587,
"learning_rate": 9.036608754426504e-06,
"loss": 0.6867,
"step": 312
},
{
"epoch": 0.6122249388753056,
"grad_norm": 1.4647181034088135,
"learning_rate": 9.03053383302589e-06,
"loss": 0.7076,
"step": 313
},
{
"epoch": 0.6141809290953545,
"grad_norm": 1.553812861442566,
"learning_rate": 9.024441873749208e-06,
"loss": 0.7213,
"step": 314
},
{
"epoch": 0.6161369193154034,
"grad_norm": 1.3352826833724976,
"learning_rate": 9.018332902348389e-06,
"loss": 0.6929,
"step": 315
},
{
"epoch": 0.6180929095354524,
"grad_norm": 1.5422905683517456,
"learning_rate": 9.012206944647284e-06,
"loss": 0.7556,
"step": 316
},
{
"epoch": 0.6200488997555013,
"grad_norm": 1.3888792991638184,
"learning_rate": 9.006064026541549e-06,
"loss": 0.7233,
"step": 317
},
{
"epoch": 0.6220048899755501,
"grad_norm": 1.4934391975402832,
"learning_rate": 8.999904173998525e-06,
"loss": 0.6935,
"step": 318
},
{
"epoch": 0.623960880195599,
"grad_norm": 1.393879771232605,
"learning_rate": 8.993727413057155e-06,
"loss": 0.6695,
"step": 319
},
{
"epoch": 0.6259168704156479,
"grad_norm": 1.422959566116333,
"learning_rate": 8.987533769827842e-06,
"loss": 0.7058,
"step": 320
},
{
"epoch": 0.6278728606356968,
"grad_norm": 1.3607128858566284,
"learning_rate": 8.981323270492367e-06,
"loss": 0.6757,
"step": 321
},
{
"epoch": 0.6298288508557457,
"grad_norm": 1.2379283905029297,
"learning_rate": 8.975095941303748e-06,
"loss": 0.698,
"step": 322
},
{
"epoch": 0.6317848410757946,
"grad_norm": 1.3851107358932495,
"learning_rate": 8.968851808586163e-06,
"loss": 0.7143,
"step": 323
},
{
"epoch": 0.6337408312958435,
"grad_norm": 1.3429324626922607,
"learning_rate": 8.962590898734814e-06,
"loss": 0.7217,
"step": 324
},
{
"epoch": 0.6356968215158925,
"grad_norm": 1.202528476715088,
"learning_rate": 8.956313238215824e-06,
"loss": 0.7085,
"step": 325
},
{
"epoch": 0.6376528117359413,
"grad_norm": 1.4195929765701294,
"learning_rate": 8.950018853566128e-06,
"loss": 0.7069,
"step": 326
},
{
"epoch": 0.6396088019559902,
"grad_norm": 1.2432851791381836,
"learning_rate": 8.943707771393347e-06,
"loss": 0.7148,
"step": 327
},
{
"epoch": 0.6415647921760391,
"grad_norm": 1.3695796728134155,
"learning_rate": 8.9373800183757e-06,
"loss": 0.7414,
"step": 328
},
{
"epoch": 0.643520782396088,
"grad_norm": 1.32993745803833,
"learning_rate": 8.931035621261865e-06,
"loss": 0.6864,
"step": 329
},
{
"epoch": 0.6454767726161369,
"grad_norm": 1.3239011764526367,
"learning_rate": 8.924674606870887e-06,
"loss": 0.6929,
"step": 330
},
{
"epoch": 0.6474327628361858,
"grad_norm": 1.267788290977478,
"learning_rate": 8.918297002092048e-06,
"loss": 0.6877,
"step": 331
},
{
"epoch": 0.6493887530562347,
"grad_norm": 1.2890385389328003,
"learning_rate": 8.911902833884769e-06,
"loss": 0.7101,
"step": 332
},
{
"epoch": 0.6513447432762837,
"grad_norm": 1.2896732091903687,
"learning_rate": 8.905492129278478e-06,
"loss": 0.6929,
"step": 333
},
{
"epoch": 0.6533007334963326,
"grad_norm": 1.2869383096694946,
"learning_rate": 8.899064915372513e-06,
"loss": 0.7076,
"step": 334
},
{
"epoch": 0.6552567237163814,
"grad_norm": 1.2590776681900024,
"learning_rate": 8.892621219336001e-06,
"loss": 0.7104,
"step": 335
},
{
"epoch": 0.6572127139364303,
"grad_norm": 1.4171830415725708,
"learning_rate": 8.886161068407734e-06,
"loss": 0.6914,
"step": 336
},
{
"epoch": 0.6591687041564792,
"grad_norm": 1.2935525178909302,
"learning_rate": 8.879684489896073e-06,
"loss": 0.6931,
"step": 337
},
{
"epoch": 0.6611246943765281,
"grad_norm": 1.3660129308700562,
"learning_rate": 8.873191511178812e-06,
"loss": 0.7125,
"step": 338
},
{
"epoch": 0.663080684596577,
"grad_norm": 1.2660232782363892,
"learning_rate": 8.86668215970308e-06,
"loss": 0.698,
"step": 339
},
{
"epoch": 0.6650366748166259,
"grad_norm": 1.313506841659546,
"learning_rate": 8.860156462985207e-06,
"loss": 0.6944,
"step": 340
},
{
"epoch": 0.6669926650366749,
"grad_norm": 1.371457815170288,
"learning_rate": 8.85361444861063e-06,
"loss": 0.6942,
"step": 341
},
{
"epoch": 0.6689486552567238,
"grad_norm": 1.3678666353225708,
"learning_rate": 8.847056144233756e-06,
"loss": 0.6804,
"step": 342
},
{
"epoch": 0.6709046454767726,
"grad_norm": 1.537421464920044,
"learning_rate": 8.840481577577856e-06,
"loss": 0.7163,
"step": 343
},
{
"epoch": 0.6728606356968215,
"grad_norm": 1.311813473701477,
"learning_rate": 8.83389077643494e-06,
"loss": 0.6952,
"step": 344
},
{
"epoch": 0.6748166259168704,
"grad_norm": 1.392874836921692,
"learning_rate": 8.82728376866565e-06,
"loss": 0.7025,
"step": 345
},
{
"epoch": 0.6767726161369193,
"grad_norm": 1.3374087810516357,
"learning_rate": 8.820660582199137e-06,
"loss": 0.718,
"step": 346
},
{
"epoch": 0.6787286063569682,
"grad_norm": 1.3735182285308838,
"learning_rate": 8.814021245032938e-06,
"loss": 0.6913,
"step": 347
},
{
"epoch": 0.6806845965770171,
"grad_norm": 1.4343420267105103,
"learning_rate": 8.807365785232865e-06,
"loss": 0.6984,
"step": 348
},
{
"epoch": 0.682640586797066,
"grad_norm": 1.3286428451538086,
"learning_rate": 8.800694230932885e-06,
"loss": 0.6441,
"step": 349
},
{
"epoch": 0.684596577017115,
"grad_norm": 1.3408572673797607,
"learning_rate": 8.794006610334995e-06,
"loss": 0.7161,
"step": 350
},
{
"epoch": 0.6865525672371638,
"grad_norm": 1.422874927520752,
"learning_rate": 8.787302951709113e-06,
"loss": 0.6852,
"step": 351
},
{
"epoch": 0.6885085574572127,
"grad_norm": 1.374058723449707,
"learning_rate": 8.780583283392952e-06,
"loss": 0.6915,
"step": 352
},
{
"epoch": 0.6904645476772616,
"grad_norm": 1.323232889175415,
"learning_rate": 8.773847633791897e-06,
"loss": 0.7256,
"step": 353
},
{
"epoch": 0.6924205378973105,
"grad_norm": 1.3007184267044067,
"learning_rate": 8.767096031378891e-06,
"loss": 0.7001,
"step": 354
},
{
"epoch": 0.6943765281173594,
"grad_norm": 1.3835595846176147,
"learning_rate": 8.760328504694317e-06,
"loss": 0.7094,
"step": 355
},
{
"epoch": 0.6963325183374083,
"grad_norm": 1.2669827938079834,
"learning_rate": 8.753545082345866e-06,
"loss": 0.6707,
"step": 356
},
{
"epoch": 0.6982885085574572,
"grad_norm": 1.3541840314865112,
"learning_rate": 8.74674579300843e-06,
"loss": 0.6948,
"step": 357
},
{
"epoch": 0.7002444987775062,
"grad_norm": 1.350557565689087,
"learning_rate": 8.739930665423968e-06,
"loss": 0.703,
"step": 358
},
{
"epoch": 0.702200488997555,
"grad_norm": 1.3906623125076294,
"learning_rate": 8.733099728401392e-06,
"loss": 0.6959,
"step": 359
},
{
"epoch": 0.7041564792176039,
"grad_norm": 1.3461555242538452,
"learning_rate": 8.72625301081645e-06,
"loss": 0.6823,
"step": 360
},
{
"epoch": 0.7061124694376528,
"grad_norm": 1.3507471084594727,
"learning_rate": 8.71939054161159e-06,
"loss": 0.707,
"step": 361
},
{
"epoch": 0.7080684596577017,
"grad_norm": 1.3323566913604736,
"learning_rate": 8.712512349795845e-06,
"loss": 0.7087,
"step": 362
},
{
"epoch": 0.7100244498777506,
"grad_norm": 1.3335025310516357,
"learning_rate": 8.705618464444715e-06,
"loss": 0.7094,
"step": 363
},
{
"epoch": 0.7119804400977995,
"grad_norm": 1.300357699394226,
"learning_rate": 8.69870891470004e-06,
"loss": 0.6792,
"step": 364
},
{
"epoch": 0.7139364303178484,
"grad_norm": 1.409981369972229,
"learning_rate": 8.691783729769874e-06,
"loss": 0.6814,
"step": 365
},
{
"epoch": 0.7158924205378973,
"grad_norm": 1.2382073402404785,
"learning_rate": 8.684842938928362e-06,
"loss": 0.7074,
"step": 366
},
{
"epoch": 0.7178484107579463,
"grad_norm": 1.3354192972183228,
"learning_rate": 8.677886571515624e-06,
"loss": 0.7045,
"step": 367
},
{
"epoch": 0.7198044009779951,
"grad_norm": 1.3605114221572876,
"learning_rate": 8.670914656937619e-06,
"loss": 0.6841,
"step": 368
},
{
"epoch": 0.721760391198044,
"grad_norm": 1.3090858459472656,
"learning_rate": 8.663927224666034e-06,
"loss": 0.7118,
"step": 369
},
{
"epoch": 0.7237163814180929,
"grad_norm": 1.4557610750198364,
"learning_rate": 8.656924304238149e-06,
"loss": 0.6813,
"step": 370
},
{
"epoch": 0.7256723716381418,
"grad_norm": 1.2979029417037964,
"learning_rate": 8.64990592525671e-06,
"loss": 0.6909,
"step": 371
},
{
"epoch": 0.7276283618581907,
"grad_norm": 1.3519365787506104,
"learning_rate": 8.642872117389818e-06,
"loss": 0.7083,
"step": 372
},
{
"epoch": 0.7295843520782396,
"grad_norm": 1.3600205183029175,
"learning_rate": 8.635822910370793e-06,
"loss": 0.7012,
"step": 373
},
{
"epoch": 0.7315403422982885,
"grad_norm": 1.3810431957244873,
"learning_rate": 8.628758333998047e-06,
"loss": 0.6888,
"step": 374
},
{
"epoch": 0.7334963325183375,
"grad_norm": 1.4093194007873535,
"learning_rate": 8.621678418134964e-06,
"loss": 0.6932,
"step": 375
},
{
"epoch": 0.7354523227383863,
"grad_norm": 1.3567487001419067,
"learning_rate": 8.61458319270977e-06,
"loss": 0.6693,
"step": 376
},
{
"epoch": 0.7374083129584352,
"grad_norm": 1.2569823265075684,
"learning_rate": 8.607472687715408e-06,
"loss": 0.701,
"step": 377
},
{
"epoch": 0.7393643031784841,
"grad_norm": 1.379348874092102,
"learning_rate": 8.60034693320941e-06,
"loss": 0.6779,
"step": 378
},
{
"epoch": 0.741320293398533,
"grad_norm": 1.288761854171753,
"learning_rate": 8.593205959313774e-06,
"loss": 0.7077,
"step": 379
},
{
"epoch": 0.7432762836185819,
"grad_norm": 1.3518379926681519,
"learning_rate": 8.58604979621483e-06,
"loss": 0.6698,
"step": 380
},
{
"epoch": 0.7452322738386308,
"grad_norm": 1.3594914674758911,
"learning_rate": 8.578878474163115e-06,
"loss": 0.699,
"step": 381
},
{
"epoch": 0.7471882640586797,
"grad_norm": 1.3028552532196045,
"learning_rate": 8.57169202347325e-06,
"loss": 0.6914,
"step": 382
},
{
"epoch": 0.7491442542787287,
"grad_norm": 1.4065419435501099,
"learning_rate": 8.564490474523803e-06,
"loss": 0.6663,
"step": 383
},
{
"epoch": 0.7511002444987775,
"grad_norm": 1.31602942943573,
"learning_rate": 8.557273857757172e-06,
"loss": 0.6854,
"step": 384
},
{
"epoch": 0.7530562347188264,
"grad_norm": 1.6082147359848022,
"learning_rate": 8.550042203679441e-06,
"loss": 0.6861,
"step": 385
},
{
"epoch": 0.7550122249388753,
"grad_norm": 1.2542592287063599,
"learning_rate": 8.542795542860265e-06,
"loss": 0.6934,
"step": 386
},
{
"epoch": 0.7569682151589242,
"grad_norm": 1.5243535041809082,
"learning_rate": 8.535533905932739e-06,
"loss": 0.686,
"step": 387
},
{
"epoch": 0.7589242053789731,
"grad_norm": 1.364704966545105,
"learning_rate": 8.528257323593257e-06,
"loss": 0.6709,
"step": 388
},
{
"epoch": 0.760880195599022,
"grad_norm": 1.4337162971496582,
"learning_rate": 8.520965826601394e-06,
"loss": 0.7024,
"step": 389
},
{
"epoch": 0.7628361858190709,
"grad_norm": 1.4389928579330444,
"learning_rate": 8.513659445779773e-06,
"loss": 0.7034,
"step": 390
},
{
"epoch": 0.7647921760391198,
"grad_norm": 1.3389860391616821,
"learning_rate": 8.506338212013937e-06,
"loss": 0.6864,
"step": 391
},
{
"epoch": 0.7667481662591688,
"grad_norm": 1.2770016193389893,
"learning_rate": 8.499002156252205e-06,
"loss": 0.6864,
"step": 392
},
{
"epoch": 0.7687041564792176,
"grad_norm": 1.3530707359313965,
"learning_rate": 8.491651309505562e-06,
"loss": 0.6982,
"step": 393
},
{
"epoch": 0.7706601466992665,
"grad_norm": 1.3368823528289795,
"learning_rate": 8.484285702847513e-06,
"loss": 0.6942,
"step": 394
},
{
"epoch": 0.7726161369193154,
"grad_norm": 1.242735743522644,
"learning_rate": 8.476905367413958e-06,
"loss": 0.6499,
"step": 395
},
{
"epoch": 0.7745721271393643,
"grad_norm": 1.307515025138855,
"learning_rate": 8.469510334403054e-06,
"loss": 0.688,
"step": 396
},
{
"epoch": 0.7765281173594132,
"grad_norm": 1.2474210262298584,
"learning_rate": 8.462100635075097e-06,
"loss": 0.7273,
"step": 397
},
{
"epoch": 0.7784841075794621,
"grad_norm": 1.3087382316589355,
"learning_rate": 8.454676300752367e-06,
"loss": 0.6874,
"step": 398
},
{
"epoch": 0.780440097799511,
"grad_norm": 1.3236266374588013,
"learning_rate": 8.447237362819022e-06,
"loss": 0.701,
"step": 399
},
{
"epoch": 0.78239608801956,
"grad_norm": 1.3533902168273926,
"learning_rate": 8.439783852720941e-06,
"loss": 0.6639,
"step": 400
},
{
"epoch": 0.7843520782396088,
"grad_norm": 1.3087717294692993,
"learning_rate": 8.432315801965616e-06,
"loss": 0.6982,
"step": 401
},
{
"epoch": 0.7863080684596577,
"grad_norm": 1.4343737363815308,
"learning_rate": 8.424833242121993e-06,
"loss": 0.718,
"step": 402
},
{
"epoch": 0.7882640586797066,
"grad_norm": 1.3762623071670532,
"learning_rate": 8.417336204820353e-06,
"loss": 0.6884,
"step": 403
},
{
"epoch": 0.7902200488997555,
"grad_norm": 1.2853654623031616,
"learning_rate": 8.409824721752183e-06,
"loss": 0.67,
"step": 404
},
{
"epoch": 0.7921760391198044,
"grad_norm": 1.2903891801834106,
"learning_rate": 8.40229882467003e-06,
"loss": 0.6781,
"step": 405
},
{
"epoch": 0.7941320293398533,
"grad_norm": 1.3627846240997314,
"learning_rate": 8.394758545387369e-06,
"loss": 0.6932,
"step": 406
},
{
"epoch": 0.7960880195599022,
"grad_norm": 1.298510193824768,
"learning_rate": 8.38720391577848e-06,
"loss": 0.6856,
"step": 407
},
{
"epoch": 0.798044009779951,
"grad_norm": 1.180311918258667,
"learning_rate": 8.379634967778297e-06,
"loss": 0.684,
"step": 408
},
{
"epoch": 0.8,
"grad_norm": 1.1478567123413086,
"learning_rate": 8.372051733382283e-06,
"loss": 0.7289,
"step": 409
},
{
"epoch": 0.8019559902200489,
"grad_norm": 1.3660131692886353,
"learning_rate": 8.364454244646294e-06,
"loss": 0.6792,
"step": 410
},
{
"epoch": 0.8039119804400978,
"grad_norm": 1.2295174598693848,
"learning_rate": 8.35684253368644e-06,
"loss": 0.6725,
"step": 411
},
{
"epoch": 0.8058679706601467,
"grad_norm": 1.2348603010177612,
"learning_rate": 8.349216632678954e-06,
"loss": 0.6629,
"step": 412
},
{
"epoch": 0.8078239608801956,
"grad_norm": 1.32919180393219,
"learning_rate": 8.341576573860049e-06,
"loss": 0.6994,
"step": 413
},
{
"epoch": 0.8097799511002445,
"grad_norm": 1.2249308824539185,
"learning_rate": 8.333922389525789e-06,
"loss": 0.6935,
"step": 414
},
{
"epoch": 0.8117359413202934,
"grad_norm": 1.288668155670166,
"learning_rate": 8.32625411203195e-06,
"loss": 0.6852,
"step": 415
},
{
"epoch": 0.8136919315403423,
"grad_norm": 1.252305507659912,
"learning_rate": 8.318571773793879e-06,
"loss": 0.6985,
"step": 416
},
{
"epoch": 0.8156479217603912,
"grad_norm": 1.2717095613479614,
"learning_rate": 8.310875407286364e-06,
"loss": 0.6902,
"step": 417
},
{
"epoch": 0.8176039119804401,
"grad_norm": 1.2788197994232178,
"learning_rate": 8.30316504504349e-06,
"loss": 0.6828,
"step": 418
},
{
"epoch": 0.819559902200489,
"grad_norm": 1.186546802520752,
"learning_rate": 8.295440719658512e-06,
"loss": 0.6842,
"step": 419
},
{
"epoch": 0.8215158924205379,
"grad_norm": 1.2258790731430054,
"learning_rate": 8.2877024637837e-06,
"loss": 0.7062,
"step": 420
},
{
"epoch": 0.8234718826405868,
"grad_norm": 1.313541054725647,
"learning_rate": 8.279950310130218e-06,
"loss": 0.6903,
"step": 421
},
{
"epoch": 0.8254278728606357,
"grad_norm": 1.2598004341125488,
"learning_rate": 8.272184291467976e-06,
"loss": 0.7117,
"step": 422
},
{
"epoch": 0.8273838630806846,
"grad_norm": 1.2314584255218506,
"learning_rate": 8.264404440625493e-06,
"loss": 0.7166,
"step": 423
},
{
"epoch": 0.8293398533007335,
"grad_norm": 1.3035787343978882,
"learning_rate": 8.256610790489765e-06,
"loss": 0.6667,
"step": 424
},
{
"epoch": 0.8312958435207825,
"grad_norm": 1.3895798921585083,
"learning_rate": 8.248803374006113e-06,
"loss": 0.6708,
"step": 425
},
{
"epoch": 0.8332518337408313,
"grad_norm": 1.2241226434707642,
"learning_rate": 8.240982224178058e-06,
"loss": 0.6889,
"step": 426
},
{
"epoch": 0.8352078239608802,
"grad_norm": 1.1916440725326538,
"learning_rate": 8.233147374067166e-06,
"loss": 0.6858,
"step": 427
},
{
"epoch": 0.8371638141809291,
"grad_norm": 1.2678576707839966,
"learning_rate": 8.225298856792929e-06,
"loss": 0.695,
"step": 428
},
{
"epoch": 0.839119804400978,
"grad_norm": 1.2337770462036133,
"learning_rate": 8.2174367055326e-06,
"loss": 0.702,
"step": 429
},
{
"epoch": 0.8410757946210269,
"grad_norm": 1.171006202697754,
"learning_rate": 8.209560953521075e-06,
"loss": 0.7058,
"step": 430
},
{
"epoch": 0.8430317848410758,
"grad_norm": 1.2427887916564941,
"learning_rate": 8.201671634050738e-06,
"loss": 0.695,
"step": 431
},
{
"epoch": 0.8449877750611247,
"grad_norm": 1.2962589263916016,
"learning_rate": 8.193768780471326e-06,
"loss": 0.7152,
"step": 432
},
{
"epoch": 0.8469437652811735,
"grad_norm": 1.3337562084197998,
"learning_rate": 8.185852426189794e-06,
"loss": 0.7168,
"step": 433
},
{
"epoch": 0.8488997555012225,
"grad_norm": 1.352417230606079,
"learning_rate": 8.177922604670155e-06,
"loss": 0.6958,
"step": 434
},
{
"epoch": 0.8508557457212714,
"grad_norm": 1.186039924621582,
"learning_rate": 8.169979349433358e-06,
"loss": 0.6631,
"step": 435
},
{
"epoch": 0.8528117359413203,
"grad_norm": 1.1488475799560547,
"learning_rate": 8.162022694057143e-06,
"loss": 0.6991,
"step": 436
},
{
"epoch": 0.8547677261613692,
"grad_norm": 1.2668110132217407,
"learning_rate": 8.154052672175888e-06,
"loss": 0.6866,
"step": 437
},
{
"epoch": 0.8567237163814181,
"grad_norm": 1.266473412513733,
"learning_rate": 8.146069317480475e-06,
"loss": 0.6932,
"step": 438
},
{
"epoch": 0.858679706601467,
"grad_norm": 1.241896152496338,
"learning_rate": 8.138072663718149e-06,
"loss": 0.6788,
"step": 439
},
{
"epoch": 0.8606356968215159,
"grad_norm": 1.3747978210449219,
"learning_rate": 8.130062744692371e-06,
"loss": 0.6899,
"step": 440
},
{
"epoch": 0.8625916870415647,
"grad_norm": 1.393964171409607,
"learning_rate": 8.122039594262679e-06,
"loss": 0.6886,
"step": 441
},
{
"epoch": 0.8645476772616137,
"grad_norm": 1.1397488117218018,
"learning_rate": 8.114003246344539e-06,
"loss": 0.6797,
"step": 442
},
{
"epoch": 0.8665036674816626,
"grad_norm": 1.2788703441619873,
"learning_rate": 8.10595373490921e-06,
"loss": 0.6767,
"step": 443
},
{
"epoch": 0.8684596577017115,
"grad_norm": 1.2286616563796997,
"learning_rate": 8.097891093983592e-06,
"loss": 0.6616,
"step": 444
},
{
"epoch": 0.8704156479217604,
"grad_norm": 1.294381022453308,
"learning_rate": 8.08981535765009e-06,
"loss": 0.7023,
"step": 445
},
{
"epoch": 0.8723716381418093,
"grad_norm": 1.421319842338562,
"learning_rate": 8.08172656004646e-06,
"loss": 0.7068,
"step": 446
},
{
"epoch": 0.8743276283618582,
"grad_norm": 1.3619354963302612,
"learning_rate": 8.073624735365677e-06,
"loss": 0.6629,
"step": 447
},
{
"epoch": 0.8762836185819071,
"grad_norm": 1.2635352611541748,
"learning_rate": 8.06550991785578e-06,
"loss": 0.6986,
"step": 448
},
{
"epoch": 0.878239608801956,
"grad_norm": 1.3464255332946777,
"learning_rate": 8.057382141819734e-06,
"loss": 0.7125,
"step": 449
},
{
"epoch": 0.8801955990220048,
"grad_norm": 1.241179347038269,
"learning_rate": 8.049241441615279e-06,
"loss": 0.6943,
"step": 450
},
{
"epoch": 0.8821515892420538,
"grad_norm": 1.343956708908081,
"learning_rate": 8.04108785165479e-06,
"loss": 0.74,
"step": 451
},
{
"epoch": 0.8841075794621027,
"grad_norm": 1.3890380859375,
"learning_rate": 8.032921406405132e-06,
"loss": 0.6938,
"step": 452
},
{
"epoch": 0.8860635696821516,
"grad_norm": 1.2220715284347534,
"learning_rate": 8.024742140387506e-06,
"loss": 0.6873,
"step": 453
},
{
"epoch": 0.8880195599022005,
"grad_norm": 1.322405219078064,
"learning_rate": 8.016550088177313e-06,
"loss": 0.6721,
"step": 454
},
{
"epoch": 0.8899755501222494,
"grad_norm": 1.2938501834869385,
"learning_rate": 8.008345284404005e-06,
"loss": 0.722,
"step": 455
},
{
"epoch": 0.8919315403422983,
"grad_norm": 1.2954498529434204,
"learning_rate": 8.000127763750934e-06,
"loss": 0.6788,
"step": 456
},
{
"epoch": 0.8938875305623472,
"grad_norm": 1.3062814474105835,
"learning_rate": 7.99189756095521e-06,
"loss": 0.6423,
"step": 457
},
{
"epoch": 0.895843520782396,
"grad_norm": 1.2911349534988403,
"learning_rate": 7.983654710807556e-06,
"loss": 0.7107,
"step": 458
},
{
"epoch": 0.897799511002445,
"grad_norm": 1.4652104377746582,
"learning_rate": 7.975399248152151e-06,
"loss": 0.6706,
"step": 459
},
{
"epoch": 0.8997555012224939,
"grad_norm": 1.2224496603012085,
"learning_rate": 7.967131207886497e-06,
"loss": 0.687,
"step": 460
},
{
"epoch": 0.9017114914425428,
"grad_norm": 1.3361140489578247,
"learning_rate": 7.95885062496126e-06,
"loss": 0.6558,
"step": 461
},
{
"epoch": 0.9036674816625917,
"grad_norm": 1.3264832496643066,
"learning_rate": 7.950557534380126e-06,
"loss": 0.6966,
"step": 462
},
{
"epoch": 0.9056234718826406,
"grad_norm": 1.3627907037734985,
"learning_rate": 7.942251971199657e-06,
"loss": 0.694,
"step": 463
},
{
"epoch": 0.9075794621026895,
"grad_norm": 1.2352824211120605,
"learning_rate": 7.933933970529135e-06,
"loss": 0.7287,
"step": 464
},
{
"epoch": 0.9095354523227384,
"grad_norm": 1.3050729036331177,
"learning_rate": 7.92560356753042e-06,
"loss": 0.6898,
"step": 465
},
{
"epoch": 0.9114914425427872,
"grad_norm": 1.3547155857086182,
"learning_rate": 7.917260797417801e-06,
"loss": 0.6589,
"step": 466
},
{
"epoch": 0.9134474327628361,
"grad_norm": 1.2764095067977905,
"learning_rate": 7.908905695457838e-06,
"loss": 0.6777,
"step": 467
},
{
"epoch": 0.9154034229828851,
"grad_norm": 1.2692286968231201,
"learning_rate": 7.900538296969228e-06,
"loss": 0.675,
"step": 468
},
{
"epoch": 0.917359413202934,
"grad_norm": 1.2980645895004272,
"learning_rate": 7.892158637322647e-06,
"loss": 0.7241,
"step": 469
},
{
"epoch": 0.9193154034229829,
"grad_norm": 1.285950779914856,
"learning_rate": 7.883766751940595e-06,
"loss": 0.6724,
"step": 470
},
{
"epoch": 0.9212713936430318,
"grad_norm": 1.2841320037841797,
"learning_rate": 7.87536267629726e-06,
"loss": 0.6553,
"step": 471
},
{
"epoch": 0.9232273838630807,
"grad_norm": 1.2985844612121582,
"learning_rate": 7.866946445918359e-06,
"loss": 0.6767,
"step": 472
},
{
"epoch": 0.9251833740831296,
"grad_norm": 1.2050354480743408,
"learning_rate": 7.858518096380984e-06,
"loss": 0.6954,
"step": 473
},
{
"epoch": 0.9271393643031784,
"grad_norm": 1.1968133449554443,
"learning_rate": 7.850077663313466e-06,
"loss": 0.6921,
"step": 474
},
{
"epoch": 0.9290953545232273,
"grad_norm": 1.2390793561935425,
"learning_rate": 7.841625182395207e-06,
"loss": 0.7176,
"step": 475
},
{
"epoch": 0.9310513447432763,
"grad_norm": 1.2606759071350098,
"learning_rate": 7.833160689356545e-06,
"loss": 0.7002,
"step": 476
},
{
"epoch": 0.9330073349633252,
"grad_norm": 1.265933632850647,
"learning_rate": 7.824684219978591e-06,
"loss": 0.6572,
"step": 477
},
{
"epoch": 0.9349633251833741,
"grad_norm": 1.3236520290374756,
"learning_rate": 7.816195810093081e-06,
"loss": 0.7176,
"step": 478
},
{
"epoch": 0.936919315403423,
"grad_norm": 1.1737425327301025,
"learning_rate": 7.807695495582233e-06,
"loss": 0.6785,
"step": 479
},
{
"epoch": 0.9388753056234719,
"grad_norm": 1.1625640392303467,
"learning_rate": 7.79918331237858e-06,
"loss": 0.6656,
"step": 480
},
{
"epoch": 0.9408312958435208,
"grad_norm": 1.2079232931137085,
"learning_rate": 7.790659296464833e-06,
"loss": 0.6991,
"step": 481
},
{
"epoch": 0.9427872860635697,
"grad_norm": 1.1734462976455688,
"learning_rate": 7.782123483873716e-06,
"loss": 0.6793,
"step": 482
},
{
"epoch": 0.9447432762836185,
"grad_norm": 1.4000263214111328,
"learning_rate": 7.773575910687827e-06,
"loss": 0.6928,
"step": 483
},
{
"epoch": 0.9466992665036675,
"grad_norm": 1.368007779121399,
"learning_rate": 7.76501661303947e-06,
"loss": 0.676,
"step": 484
},
{
"epoch": 0.9486552567237164,
"grad_norm": 1.211277961730957,
"learning_rate": 7.756445627110523e-06,
"loss": 0.666,
"step": 485
},
{
"epoch": 0.9506112469437653,
"grad_norm": 1.3525391817092896,
"learning_rate": 7.747862989132257e-06,
"loss": 0.6932,
"step": 486
},
{
"epoch": 0.9525672371638142,
"grad_norm": 1.30064058303833,
"learning_rate": 7.739268735385212e-06,
"loss": 0.6976,
"step": 487
},
{
"epoch": 0.9545232273838631,
"grad_norm": 1.3304917812347412,
"learning_rate": 7.730662902199022e-06,
"loss": 0.6721,
"step": 488
},
{
"epoch": 0.956479217603912,
"grad_norm": 1.5623983144760132,
"learning_rate": 7.722045525952272e-06,
"loss": 0.7155,
"step": 489
},
{
"epoch": 0.9584352078239609,
"grad_norm": 1.258493423461914,
"learning_rate": 7.713416643072345e-06,
"loss": 0.6694,
"step": 490
},
{
"epoch": 0.9603911980440097,
"grad_norm": 1.3739190101623535,
"learning_rate": 7.70477629003526e-06,
"loss": 0.7087,
"step": 491
},
{
"epoch": 0.9623471882640586,
"grad_norm": 1.3824055194854736,
"learning_rate": 7.696124503365526e-06,
"loss": 0.6822,
"step": 492
},
{
"epoch": 0.9643031784841076,
"grad_norm": 1.2122122049331665,
"learning_rate": 7.68746131963598e-06,
"loss": 0.6785,
"step": 493
},
{
"epoch": 0.9662591687041565,
"grad_norm": 1.250258445739746,
"learning_rate": 7.678786775467645e-06,
"loss": 0.705,
"step": 494
},
{
"epoch": 0.9682151589242054,
"grad_norm": 1.287483811378479,
"learning_rate": 7.670100907529558e-06,
"loss": 0.6785,
"step": 495
},
{
"epoch": 0.9701711491442543,
"grad_norm": 1.213124394416809,
"learning_rate": 7.661403752538628e-06,
"loss": 0.7136,
"step": 496
},
{
"epoch": 0.9721271393643032,
"grad_norm": 1.2306246757507324,
"learning_rate": 7.652695347259476e-06,
"loss": 0.6748,
"step": 497
},
{
"epoch": 0.9740831295843521,
"grad_norm": 1.2954891920089722,
"learning_rate": 7.64397572850428e-06,
"loss": 0.6732,
"step": 498
},
{
"epoch": 0.976039119804401,
"grad_norm": 1.207684874534607,
"learning_rate": 7.635244933132618e-06,
"loss": 0.671,
"step": 499
},
{
"epoch": 0.9779951100244498,
"grad_norm": 1.3937175273895264,
"learning_rate": 7.626502998051321e-06,
"loss": 0.6884,
"step": 500
},
{
"epoch": 0.9799511002444988,
"grad_norm": 1.3250900506973267,
"learning_rate": 7.6177499602143e-06,
"loss": 0.6701,
"step": 501
},
{
"epoch": 0.9819070904645477,
"grad_norm": 1.2760993242263794,
"learning_rate": 7.608985856622405e-06,
"loss": 0.6703,
"step": 502
},
{
"epoch": 0.9838630806845966,
"grad_norm": 1.2552558183670044,
"learning_rate": 7.6002107243232625e-06,
"loss": 0.7043,
"step": 503
},
{
"epoch": 0.9858190709046455,
"grad_norm": 1.3188880681991577,
"learning_rate": 7.5914246004111195e-06,
"loss": 0.6893,
"step": 504
},
{
"epoch": 0.9877750611246944,
"grad_norm": 1.3230962753295898,
"learning_rate": 7.582627522026686e-06,
"loss": 0.6458,
"step": 505
},
{
"epoch": 0.9897310513447433,
"grad_norm": 1.212945818901062,
"learning_rate": 7.573819526356979e-06,
"loss": 0.684,
"step": 506
},
{
"epoch": 0.9916870415647921,
"grad_norm": 1.265489935874939,
"learning_rate": 7.565000650635167e-06,
"loss": 0.6697,
"step": 507
},
{
"epoch": 0.993643031784841,
"grad_norm": 1.2421671152114868,
"learning_rate": 7.556170932140407e-06,
"loss": 0.6875,
"step": 508
},
{
"epoch": 0.9955990220048899,
"grad_norm": 1.2173808813095093,
"learning_rate": 7.547330408197695e-06,
"loss": 0.6872,
"step": 509
},
{
"epoch": 0.9975550122249389,
"grad_norm": 1.2676969766616821,
"learning_rate": 7.538479116177699e-06,
"loss": 0.6543,
"step": 510
},
{
"epoch": 0.9995110024449878,
"grad_norm": 1.2565542459487915,
"learning_rate": 7.529617093496609e-06,
"loss": 0.7153,
"step": 511
},
{
"epoch": 1.0014669926650366,
"grad_norm": 1.333875298500061,
"learning_rate": 7.520744377615975e-06,
"loss": 0.6843,
"step": 512
},
{
"epoch": 1.0034229828850856,
"grad_norm": 1.2860937118530273,
"learning_rate": 7.511861006042549e-06,
"loss": 0.684,
"step": 513
},
{
"epoch": 1.0053789731051346,
"grad_norm": 1.1821390390396118,
"learning_rate": 7.502967016328128e-06,
"loss": 0.6788,
"step": 514
},
{
"epoch": 1.0073349633251834,
"grad_norm": 1.285091757774353,
"learning_rate": 7.494062446069391e-06,
"loss": 0.6679,
"step": 515
},
{
"epoch": 1.0092909535452323,
"grad_norm": 1.298203468322754,
"learning_rate": 7.485147332907745e-06,
"loss": 0.6622,
"step": 516
},
{
"epoch": 1.0112469437652811,
"grad_norm": 1.230342149734497,
"learning_rate": 7.476221714529167e-06,
"loss": 0.6829,
"step": 517
},
{
"epoch": 1.0132029339853301,
"grad_norm": 1.2091991901397705,
"learning_rate": 7.467285628664036e-06,
"loss": 0.6977,
"step": 518
},
{
"epoch": 1.015158924205379,
"grad_norm": 1.2483344078063965,
"learning_rate": 7.458339113086983e-06,
"loss": 0.6874,
"step": 519
},
{
"epoch": 1.017114914425428,
"grad_norm": 1.2287633419036865,
"learning_rate": 7.4493822056167255e-06,
"loss": 0.6984,
"step": 520
},
{
"epoch": 1.0014669926650366,
"grad_norm": 1.9640307426452637,
"learning_rate": 7.440414944115909e-06,
"loss": 0.5247,
"step": 521
},
{
"epoch": 1.0034229828850856,
"grad_norm": 1.8829586505889893,
"learning_rate": 7.431437366490952e-06,
"loss": 0.4741,
"step": 522
},
{
"epoch": 1.0053789731051346,
"grad_norm": 1.5505880117416382,
"learning_rate": 7.422449510691878e-06,
"loss": 0.4706,
"step": 523
},
{
"epoch": 1.0073349633251834,
"grad_norm": 1.6880980730056763,
"learning_rate": 7.413451414712156e-06,
"loss": 0.4542,
"step": 524
},
{
"epoch": 1.0092909535452323,
"grad_norm": 1.934731125831604,
"learning_rate": 7.404443116588548e-06,
"loss": 0.4819,
"step": 525
},
{
"epoch": 1.0112469437652811,
"grad_norm": 1.9934979677200317,
"learning_rate": 7.395424654400938e-06,
"loss": 0.4653,
"step": 526
},
{
"epoch": 1.0132029339853301,
"grad_norm": 1.781922698020935,
"learning_rate": 7.386396066272177e-06,
"loss": 0.4566,
"step": 527
},
{
"epoch": 1.015158924205379,
"grad_norm": 1.3940856456756592,
"learning_rate": 7.377357390367922e-06,
"loss": 0.4616,
"step": 528
},
{
"epoch": 1.017114914425428,
"grad_norm": 1.4322494268417358,
"learning_rate": 7.368308664896471e-06,
"loss": 0.4535,
"step": 529
},
{
"epoch": 1.0190709046454767,
"grad_norm": 1.6584018468856812,
"learning_rate": 7.3592499281086e-06,
"loss": 0.466,
"step": 530
},
{
"epoch": 1.0210268948655257,
"grad_norm": 1.39903724193573,
"learning_rate": 7.350181218297417e-06,
"loss": 0.4639,
"step": 531
},
{
"epoch": 1.0229828850855747,
"grad_norm": 1.3164935111999512,
"learning_rate": 7.341102573798171e-06,
"loss": 0.4354,
"step": 532
},
{
"epoch": 1.0249388753056234,
"grad_norm": 1.417481541633606,
"learning_rate": 7.332014032988123e-06,
"loss": 0.4512,
"step": 533
},
{
"epoch": 1.0268948655256724,
"grad_norm": 1.36899995803833,
"learning_rate": 7.322915634286356e-06,
"loss": 0.4386,
"step": 534
},
{
"epoch": 1.0288508557457212,
"grad_norm": 1.4433590173721313,
"learning_rate": 7.3138074161536306e-06,
"loss": 0.4639,
"step": 535
},
{
"epoch": 1.0308068459657702,
"grad_norm": 1.6212917566299438,
"learning_rate": 7.304689417092215e-06,
"loss": 0.43,
"step": 536
},
{
"epoch": 1.032762836185819,
"grad_norm": 1.329058051109314,
"learning_rate": 7.29556167564572e-06,
"loss": 0.4361,
"step": 537
},
{
"epoch": 1.034718826405868,
"grad_norm": 1.3723002672195435,
"learning_rate": 7.286424230398946e-06,
"loss": 0.4312,
"step": 538
},
{
"epoch": 1.0366748166259168,
"grad_norm": 1.408575177192688,
"learning_rate": 7.277277119977706e-06,
"loss": 0.4326,
"step": 539
},
{
"epoch": 1.0386308068459658,
"grad_norm": 1.2816498279571533,
"learning_rate": 7.268120383048674e-06,
"loss": 0.452,
"step": 540
},
{
"epoch": 1.0405867970660148,
"grad_norm": 1.340277910232544,
"learning_rate": 7.2589540583192165e-06,
"loss": 0.4617,
"step": 541
},
{
"epoch": 1.0425427872860635,
"grad_norm": 1.253343105316162,
"learning_rate": 7.249778184537228e-06,
"loss": 0.4422,
"step": 542
},
{
"epoch": 1.0444987775061125,
"grad_norm": 1.3268951177597046,
"learning_rate": 7.240592800490972e-06,
"loss": 0.4406,
"step": 543
},
{
"epoch": 1.0464547677261613,
"grad_norm": 1.3422157764434814,
"learning_rate": 7.231397945008912e-06,
"loss": 0.4434,
"step": 544
},
{
"epoch": 1.0484107579462103,
"grad_norm": 1.4334876537322998,
"learning_rate": 7.222193656959546e-06,
"loss": 0.4359,
"step": 545
},
{
"epoch": 1.050366748166259,
"grad_norm": 1.4176690578460693,
"learning_rate": 7.212979975251252e-06,
"loss": 0.4595,
"step": 546
},
{
"epoch": 1.052322738386308,
"grad_norm": 1.4059594869613647,
"learning_rate": 7.20375693883211e-06,
"loss": 0.4699,
"step": 547
},
{
"epoch": 1.054278728606357,
"grad_norm": 1.381970763206482,
"learning_rate": 7.194524586689749e-06,
"loss": 0.4439,
"step": 548
},
{
"epoch": 1.0562347188264058,
"grad_norm": 1.5071953535079956,
"learning_rate": 7.185282957851175e-06,
"loss": 0.4403,
"step": 549
},
{
"epoch": 1.0581907090464548,
"grad_norm": 1.374406337738037,
"learning_rate": 7.176032091382611e-06,
"loss": 0.4309,
"step": 550
},
{
"epoch": 1.0601466992665036,
"grad_norm": 1.5083845853805542,
"learning_rate": 7.166772026389327e-06,
"loss": 0.4333,
"step": 551
},
{
"epoch": 1.0621026894865526,
"grad_norm": 1.3600280284881592,
"learning_rate": 7.157502802015477e-06,
"loss": 0.4173,
"step": 552
},
{
"epoch": 1.0640586797066014,
"grad_norm": 1.3899999856948853,
"learning_rate": 7.148224457443933e-06,
"loss": 0.4414,
"step": 553
},
{
"epoch": 1.0660146699266504,
"grad_norm": 1.32930326461792,
"learning_rate": 7.138937031896125e-06,
"loss": 0.4489,
"step": 554
},
{
"epoch": 1.0679706601466992,
"grad_norm": 1.2142601013183594,
"learning_rate": 7.129640564631863e-06,
"loss": 0.4372,
"step": 555
},
{
"epoch": 1.0699266503667482,
"grad_norm": 1.2110955715179443,
"learning_rate": 7.1203350949491824e-06,
"loss": 0.4396,
"step": 556
},
{
"epoch": 1.0718826405867972,
"grad_norm": 1.2478694915771484,
"learning_rate": 7.111020662184174e-06,
"loss": 0.4467,
"step": 557
},
{
"epoch": 1.073838630806846,
"grad_norm": 1.2674793004989624,
"learning_rate": 7.101697305710812e-06,
"loss": 0.4595,
"step": 558
},
{
"epoch": 1.075794621026895,
"grad_norm": 1.280213475227356,
"learning_rate": 7.092365064940801e-06,
"loss": 0.4367,
"step": 559
},
{
"epoch": 1.0777506112469437,
"grad_norm": 1.2007982730865479,
"learning_rate": 7.083023979323396e-06,
"loss": 0.4328,
"step": 560
},
{
"epoch": 1.0797066014669927,
"grad_norm": 1.2380990982055664,
"learning_rate": 7.073674088345239e-06,
"loss": 0.4263,
"step": 561
},
{
"epoch": 1.0816625916870415,
"grad_norm": 1.1389713287353516,
"learning_rate": 7.064315431530202e-06,
"loss": 0.453,
"step": 562
},
{
"epoch": 1.0836185819070905,
"grad_norm": 1.2735354900360107,
"learning_rate": 7.054948048439204e-06,
"loss": 0.4342,
"step": 563
},
{
"epoch": 1.0855745721271393,
"grad_norm": 1.2365597486495972,
"learning_rate": 7.045571978670057e-06,
"loss": 0.4549,
"step": 564
},
{
"epoch": 1.0875305623471883,
"grad_norm": 1.2870897054672241,
"learning_rate": 7.036187261857289e-06,
"loss": 0.4347,
"step": 565
},
{
"epoch": 1.0894865525672373,
"grad_norm": 1.2662715911865234,
"learning_rate": 7.026793937671984e-06,
"loss": 0.4244,
"step": 566
},
{
"epoch": 1.091442542787286,
"grad_norm": 1.3069432973861694,
"learning_rate": 7.017392045821609e-06,
"loss": 0.4442,
"step": 567
},
{
"epoch": 1.093398533007335,
"grad_norm": 1.2944248914718628,
"learning_rate": 7.007981626049851e-06,
"loss": 0.4261,
"step": 568
},
{
"epoch": 1.0953545232273838,
"grad_norm": 1.271885633468628,
"learning_rate": 6.998562718136445e-06,
"loss": 0.4423,
"step": 569
},
{
"epoch": 1.0973105134474328,
"grad_norm": 1.2481279373168945,
"learning_rate": 6.989135361897002e-06,
"loss": 0.4146,
"step": 570
},
{
"epoch": 1.0992665036674816,
"grad_norm": 1.229149341583252,
"learning_rate": 6.979699597182856e-06,
"loss": 0.4413,
"step": 571
},
{
"epoch": 1.1012224938875306,
"grad_norm": 1.3400832414627075,
"learning_rate": 6.970255463880879e-06,
"loss": 0.4481,
"step": 572
},
{
"epoch": 1.1031784841075796,
"grad_norm": 1.1787781715393066,
"learning_rate": 6.960803001913315e-06,
"loss": 0.4434,
"step": 573
},
{
"epoch": 1.1051344743276283,
"grad_norm": 1.3244268894195557,
"learning_rate": 6.9513422512376214e-06,
"loss": 0.4567,
"step": 574
},
{
"epoch": 1.1070904645476773,
"grad_norm": 1.2756062746047974,
"learning_rate": 6.9418732518462935e-06,
"loss": 0.4472,
"step": 575
},
{
"epoch": 1.1090464547677261,
"grad_norm": 1.2949943542480469,
"learning_rate": 6.932396043766694e-06,
"loss": 0.4545,
"step": 576
},
{
"epoch": 1.1110024449877751,
"grad_norm": 1.2295341491699219,
"learning_rate": 6.922910667060881e-06,
"loss": 0.4519,
"step": 577
},
{
"epoch": 1.112958435207824,
"grad_norm": 1.3025312423706055,
"learning_rate": 6.913417161825449e-06,
"loss": 0.4649,
"step": 578
},
{
"epoch": 1.114914425427873,
"grad_norm": 1.303686261177063,
"learning_rate": 6.903915568191353e-06,
"loss": 0.4484,
"step": 579
},
{
"epoch": 1.1168704156479217,
"grad_norm": 1.2657026052474976,
"learning_rate": 6.894405926323737e-06,
"loss": 0.4664,
"step": 580
},
{
"epoch": 1.1188264058679707,
"grad_norm": 1.9421648979187012,
"learning_rate": 6.884888276421766e-06,
"loss": 0.4503,
"step": 581
},
{
"epoch": 1.1207823960880197,
"grad_norm": 1.2472649812698364,
"learning_rate": 6.875362658718459e-06,
"loss": 0.4636,
"step": 582
},
{
"epoch": 1.1227383863080684,
"grad_norm": 1.337275505065918,
"learning_rate": 6.8658291134805155e-06,
"loss": 0.4575,
"step": 583
},
{
"epoch": 1.1246943765281174,
"grad_norm": 1.438635230064392,
"learning_rate": 6.856287681008145e-06,
"loss": 0.45,
"step": 584
},
{
"epoch": 1.1266503667481662,
"grad_norm": 1.2741761207580566,
"learning_rate": 6.846738401634899e-06,
"loss": 0.4443,
"step": 585
},
{
"epoch": 1.1286063569682152,
"grad_norm": 1.4363293647766113,
"learning_rate": 6.837181315727501e-06,
"loss": 0.432,
"step": 586
},
{
"epoch": 1.130562347188264,
"grad_norm": 1.398519515991211,
"learning_rate": 6.827616463685671e-06,
"loss": 0.443,
"step": 587
},
{
"epoch": 1.132518337408313,
"grad_norm": 1.3236083984375,
"learning_rate": 6.818043885941962e-06,
"loss": 0.465,
"step": 588
},
{
"epoch": 1.1344743276283618,
"grad_norm": 1.2787197828292847,
"learning_rate": 6.8084636229615786e-06,
"loss": 0.4429,
"step": 589
},
{
"epoch": 1.1364303178484108,
"grad_norm": 1.1890900135040283,
"learning_rate": 6.798875715242221e-06,
"loss": 0.4437,
"step": 590
},
{
"epoch": 1.1383863080684598,
"grad_norm": 1.3473844528198242,
"learning_rate": 6.789280203313899e-06,
"loss": 0.4475,
"step": 591
},
{
"epoch": 1.1403422982885085,
"grad_norm": 1.2148065567016602,
"learning_rate": 6.7796771277387705e-06,
"loss": 0.4421,
"step": 592
},
{
"epoch": 1.1422982885085575,
"grad_norm": 1.3973183631896973,
"learning_rate": 6.770066529110964e-06,
"loss": 0.4344,
"step": 593
},
{
"epoch": 1.1442542787286063,
"grad_norm": 1.1743921041488647,
"learning_rate": 6.760448448056407e-06,
"loss": 0.4249,
"step": 594
},
{
"epoch": 1.1462102689486553,
"grad_norm": 1.3278478384017944,
"learning_rate": 6.750822925232664e-06,
"loss": 0.4542,
"step": 595
},
{
"epoch": 1.148166259168704,
"grad_norm": 1.2853714227676392,
"learning_rate": 6.741190001328751e-06,
"loss": 0.4457,
"step": 596
},
{
"epoch": 1.150122249388753,
"grad_norm": 1.217772126197815,
"learning_rate": 6.731549717064975e-06,
"loss": 0.4587,
"step": 597
},
{
"epoch": 1.152078239608802,
"grad_norm": 1.3278999328613281,
"learning_rate": 6.721902113192752e-06,
"loss": 0.4607,
"step": 598
},
{
"epoch": 1.1540342298288508,
"grad_norm": 1.2875677347183228,
"learning_rate": 6.71224723049444e-06,
"loss": 0.4445,
"step": 599
},
{
"epoch": 1.1559902200488998,
"grad_norm": 1.2930001020431519,
"learning_rate": 6.702585109783169e-06,
"loss": 0.4308,
"step": 600
},
{
"epoch": 1.1579462102689486,
"grad_norm": 1.2925857305526733,
"learning_rate": 6.6929157919026645e-06,
"loss": 0.4179,
"step": 601
},
{
"epoch": 1.1599022004889976,
"grad_norm": 1.2512215375900269,
"learning_rate": 6.683239317727075e-06,
"loss": 0.428,
"step": 602
},
{
"epoch": 1.1618581907090464,
"grad_norm": 1.1987147331237793,
"learning_rate": 6.6735557281608e-06,
"loss": 0.4581,
"step": 603
},
{
"epoch": 1.1638141809290954,
"grad_norm": 1.2744587659835815,
"learning_rate": 6.663865064138316e-06,
"loss": 0.4489,
"step": 604
},
{
"epoch": 1.1657701711491442,
"grad_norm": 1.248136043548584,
"learning_rate": 6.654167366624009e-06,
"loss": 0.4415,
"step": 605
},
{
"epoch": 1.1677261613691932,
"grad_norm": 1.239186406135559,
"learning_rate": 6.644462676611993e-06,
"loss": 0.4552,
"step": 606
},
{
"epoch": 1.169682151589242,
"grad_norm": 1.2165157794952393,
"learning_rate": 6.634751035125943e-06,
"loss": 0.4383,
"step": 607
},
{
"epoch": 1.171638141809291,
"grad_norm": 1.3180150985717773,
"learning_rate": 6.625032483218917e-06,
"loss": 0.4427,
"step": 608
},
{
"epoch": 1.17359413202934,
"grad_norm": 1.3215945959091187,
"learning_rate": 6.615307061973185e-06,
"loss": 0.4434,
"step": 609
},
{
"epoch": 1.1755501222493887,
"grad_norm": 1.2843338251113892,
"learning_rate": 6.605574812500057e-06,
"loss": 0.4563,
"step": 610
},
{
"epoch": 1.1775061124694377,
"grad_norm": 1.2404955625534058,
"learning_rate": 6.595835775939709e-06,
"loss": 0.4446,
"step": 611
},
{
"epoch": 1.1794621026894865,
"grad_norm": 1.203095555305481,
"learning_rate": 6.586089993461e-06,
"loss": 0.4288,
"step": 612
},
{
"epoch": 1.1814180929095355,
"grad_norm": 1.2812986373901367,
"learning_rate": 6.576337506261314e-06,
"loss": 0.4431,
"step": 613
},
{
"epoch": 1.1833740831295843,
"grad_norm": 1.2452881336212158,
"learning_rate": 6.566578355566371e-06,
"loss": 0.4602,
"step": 614
},
{
"epoch": 1.1853300733496333,
"grad_norm": 1.2354931831359863,
"learning_rate": 6.55681258263006e-06,
"loss": 0.4319,
"step": 615
},
{
"epoch": 1.1872860635696822,
"grad_norm": 1.230417251586914,
"learning_rate": 6.547040228734268e-06,
"loss": 0.4493,
"step": 616
},
{
"epoch": 1.189242053789731,
"grad_norm": 1.218149185180664,
"learning_rate": 6.537261335188696e-06,
"loss": 0.4361,
"step": 617
},
{
"epoch": 1.19119804400978,
"grad_norm": 1.2415145635604858,
"learning_rate": 6.527475943330691e-06,
"loss": 0.4396,
"step": 618
},
{
"epoch": 1.1931540342298288,
"grad_norm": 1.282718300819397,
"learning_rate": 6.517684094525071e-06,
"loss": 0.4397,
"step": 619
},
{
"epoch": 1.1951100244498778,
"grad_norm": 1.1451632976531982,
"learning_rate": 6.507885830163946e-06,
"loss": 0.4564,
"step": 620
},
{
"epoch": 1.1970660146699266,
"grad_norm": 1.249971628189087,
"learning_rate": 6.498081191666549e-06,
"loss": 0.4565,
"step": 621
},
{
"epoch": 1.1990220048899756,
"grad_norm": 1.2496131658554077,
"learning_rate": 6.488270220479055e-06,
"loss": 0.4351,
"step": 622
},
{
"epoch": 1.2009779951100246,
"grad_norm": 1.2109962701797485,
"learning_rate": 6.478452958074411e-06,
"loss": 0.4459,
"step": 623
},
{
"epoch": 1.2029339853300733,
"grad_norm": 1.2337491512298584,
"learning_rate": 6.468629445952156e-06,
"loss": 0.4373,
"step": 624
},
{
"epoch": 1.2048899755501223,
"grad_norm": 1.3091925382614136,
"learning_rate": 6.458799725638249e-06,
"loss": 0.4352,
"step": 625
},
{
"epoch": 1.2068459657701711,
"grad_norm": 1.2573388814926147,
"learning_rate": 6.448963838684893e-06,
"loss": 0.4419,
"step": 626
},
{
"epoch": 1.2088019559902201,
"grad_norm": 1.2169891595840454,
"learning_rate": 6.439121826670357e-06,
"loss": 0.4326,
"step": 627
},
{
"epoch": 1.2107579462102689,
"grad_norm": 1.1968295574188232,
"learning_rate": 6.429273731198803e-06,
"loss": 0.4814,
"step": 628
},
{
"epoch": 1.2127139364303179,
"grad_norm": 1.1939140558242798,
"learning_rate": 6.419419593900109e-06,
"loss": 0.4449,
"step": 629
},
{
"epoch": 1.2146699266503667,
"grad_norm": 1.280173420906067,
"learning_rate": 6.40955945642969e-06,
"loss": 0.4637,
"step": 630
},
{
"epoch": 1.2166259168704157,
"grad_norm": 1.2060580253601074,
"learning_rate": 6.399693360468332e-06,
"loss": 0.4399,
"step": 631
},
{
"epoch": 1.2185819070904644,
"grad_norm": 1.2972716093063354,
"learning_rate": 6.3898213477220005e-06,
"loss": 0.4381,
"step": 632
},
{
"epoch": 1.2205378973105134,
"grad_norm": 1.238663911819458,
"learning_rate": 6.379943459921677e-06,
"loss": 0.4497,
"step": 633
},
{
"epoch": 1.2224938875305624,
"grad_norm": 1.3543317317962646,
"learning_rate": 6.37005973882318e-06,
"loss": 0.4278,
"step": 634
},
{
"epoch": 1.2244498777506112,
"grad_norm": 1.2317306995391846,
"learning_rate": 6.360170226206981e-06,
"loss": 0.4512,
"step": 635
},
{
"epoch": 1.2264058679706602,
"grad_norm": 1.3280153274536133,
"learning_rate": 6.350274963878035e-06,
"loss": 0.4476,
"step": 636
},
{
"epoch": 1.228361858190709,
"grad_norm": 1.1947249174118042,
"learning_rate": 6.340373993665607e-06,
"loss": 0.4531,
"step": 637
},
{
"epoch": 1.230317848410758,
"grad_norm": 1.2678272724151611,
"learning_rate": 6.330467357423084e-06,
"loss": 0.4426,
"step": 638
},
{
"epoch": 1.2322738386308068,
"grad_norm": 1.2685966491699219,
"learning_rate": 6.32055509702781e-06,
"loss": 0.4241,
"step": 639
},
{
"epoch": 1.2342298288508557,
"grad_norm": 1.3568520545959473,
"learning_rate": 6.310637254380898e-06,
"loss": 0.4347,
"step": 640
},
{
"epoch": 1.2361858190709047,
"grad_norm": 1.3837932348251343,
"learning_rate": 6.300713871407062e-06,
"loss": 0.4571,
"step": 641
},
{
"epoch": 1.2381418092909535,
"grad_norm": 1.2901196479797363,
"learning_rate": 6.2907849900544345e-06,
"loss": 0.4476,
"step": 642
},
{
"epoch": 1.2400977995110025,
"grad_norm": 1.3630965948104858,
"learning_rate": 6.280850652294391e-06,
"loss": 0.4545,
"step": 643
},
{
"epoch": 1.2420537897310513,
"grad_norm": 1.279327392578125,
"learning_rate": 6.2709109001213744e-06,
"loss": 0.4747,
"step": 644
},
{
"epoch": 1.2440097799511003,
"grad_norm": 1.284744381904602,
"learning_rate": 6.2609657755527135e-06,
"loss": 0.4349,
"step": 645
},
{
"epoch": 1.245965770171149,
"grad_norm": 1.3701122999191284,
"learning_rate": 6.251015320628443e-06,
"loss": 0.4491,
"step": 646
},
{
"epoch": 1.247921760391198,
"grad_norm": 1.430738091468811,
"learning_rate": 6.24105957741114e-06,
"loss": 0.4472,
"step": 647
},
{
"epoch": 1.249877750611247,
"grad_norm": 1.243706226348877,
"learning_rate": 6.231098587985727e-06,
"loss": 0.4539,
"step": 648
},
{
"epoch": 1.2518337408312958,
"grad_norm": 1.4228640794754028,
"learning_rate": 6.22113239445931e-06,
"loss": 0.4669,
"step": 649
},
{
"epoch": 1.2537897310513446,
"grad_norm": 1.194962501525879,
"learning_rate": 6.211161038960989e-06,
"loss": 0.4454,
"step": 650
},
{
"epoch": 1.2557457212713936,
"grad_norm": 1.2327982187271118,
"learning_rate": 6.201184563641687e-06,
"loss": 0.4355,
"step": 651
},
{
"epoch": 1.2577017114914426,
"grad_norm": 1.328300952911377,
"learning_rate": 6.191203010673969e-06,
"loss": 0.4593,
"step": 652
},
{
"epoch": 1.2596577017114914,
"grad_norm": 1.146906852722168,
"learning_rate": 6.1812164222518626e-06,
"loss": 0.4655,
"step": 653
},
{
"epoch": 1.2616136919315404,
"grad_norm": 1.2471123933792114,
"learning_rate": 6.171224840590684e-06,
"loss": 0.4396,
"step": 654
},
{
"epoch": 1.2635696821515894,
"grad_norm": 1.2813622951507568,
"learning_rate": 6.161228307926859e-06,
"loss": 0.4462,
"step": 655
},
{
"epoch": 1.2655256723716382,
"grad_norm": 1.3747153282165527,
"learning_rate": 6.151226866517734e-06,
"loss": 0.4441,
"step": 656
},
{
"epoch": 1.267481662591687,
"grad_norm": 1.335888385772705,
"learning_rate": 6.141220558641416e-06,
"loss": 0.4454,
"step": 657
},
{
"epoch": 1.269437652811736,
"grad_norm": 1.3105989694595337,
"learning_rate": 6.131209426596571e-06,
"loss": 0.4362,
"step": 658
},
{
"epoch": 1.271393643031785,
"grad_norm": 1.3146666288375854,
"learning_rate": 6.12119351270227e-06,
"loss": 0.4589,
"step": 659
},
{
"epoch": 1.2733496332518337,
"grad_norm": 1.1910253763198853,
"learning_rate": 6.111172859297794e-06,
"loss": 0.4445,
"step": 660
},
{
"epoch": 1.2753056234718827,
"grad_norm": 1.3139874935150146,
"learning_rate": 6.101147508742456e-06,
"loss": 0.4617,
"step": 661
},
{
"epoch": 1.2772616136919315,
"grad_norm": 1.2148922681808472,
"learning_rate": 6.0911175034154236e-06,
"loss": 0.4349,
"step": 662
},
{
"epoch": 1.2792176039119805,
"grad_norm": 1.3743414878845215,
"learning_rate": 6.081082885715547e-06,
"loss": 0.4399,
"step": 663
},
{
"epoch": 1.2811735941320292,
"grad_norm": 1.2060234546661377,
"learning_rate": 6.07104369806117e-06,
"loss": 0.422,
"step": 664
},
{
"epoch": 1.2831295843520782,
"grad_norm": 1.2871215343475342,
"learning_rate": 6.060999982889955e-06,
"loss": 0.4484,
"step": 665
},
{
"epoch": 1.2850855745721272,
"grad_norm": 1.739700436592102,
"learning_rate": 6.050951782658705e-06,
"loss": 0.4488,
"step": 666
},
{
"epoch": 1.287041564792176,
"grad_norm": 1.2554281949996948,
"learning_rate": 6.040899139843177e-06,
"loss": 0.4337,
"step": 667
},
{
"epoch": 1.288997555012225,
"grad_norm": 1.2270786762237549,
"learning_rate": 6.030842096937916e-06,
"loss": 0.4533,
"step": 668
},
{
"epoch": 1.2909535452322738,
"grad_norm": 1.1629031896591187,
"learning_rate": 6.020780696456059e-06,
"loss": 0.461,
"step": 669
},
{
"epoch": 1.2929095354523228,
"grad_norm": 1.3253370523452759,
"learning_rate": 6.010714980929168e-06,
"loss": 0.4264,
"step": 670
},
{
"epoch": 1.2948655256723716,
"grad_norm": 1.1790844202041626,
"learning_rate": 6.000644992907044e-06,
"loss": 0.4556,
"step": 671
},
{
"epoch": 1.2968215158924206,
"grad_norm": 1.265358328819275,
"learning_rate": 5.990570774957548e-06,
"loss": 0.4357,
"step": 672
},
{
"epoch": 1.2987775061124696,
"grad_norm": 1.7498961687088013,
"learning_rate": 5.9804923696664255e-06,
"loss": 0.4429,
"step": 673
},
{
"epoch": 1.3007334963325183,
"grad_norm": 1.264091968536377,
"learning_rate": 5.970409819637116e-06,
"loss": 0.448,
"step": 674
},
{
"epoch": 1.302689486552567,
"grad_norm": 1.2229095697402954,
"learning_rate": 5.960323167490588e-06,
"loss": 0.4601,
"step": 675
},
{
"epoch": 1.304645476772616,
"grad_norm": 1.2520877122879028,
"learning_rate": 5.950232455865142e-06,
"loss": 0.4421,
"step": 676
},
{
"epoch": 1.306601466992665,
"grad_norm": 1.170745849609375,
"learning_rate": 5.940137727416247e-06,
"loss": 0.4475,
"step": 677
},
{
"epoch": 1.3085574572127139,
"grad_norm": 1.2222089767456055,
"learning_rate": 5.930039024816344e-06,
"loss": 0.4657,
"step": 678
},
{
"epoch": 1.3105134474327629,
"grad_norm": 1.185941457748413,
"learning_rate": 5.919936390754679e-06,
"loss": 0.4552,
"step": 679
},
{
"epoch": 1.3124694376528117,
"grad_norm": 1.2470475435256958,
"learning_rate": 5.9098298679371155e-06,
"loss": 0.4536,
"step": 680
},
{
"epoch": 1.3144254278728607,
"grad_norm": 1.2581121921539307,
"learning_rate": 5.8997194990859545e-06,
"loss": 0.4358,
"step": 681
},
{
"epoch": 1.3163814180929094,
"grad_norm": 1.1885335445404053,
"learning_rate": 5.889605326939757e-06,
"loss": 0.4403,
"step": 682
},
{
"epoch": 1.3183374083129584,
"grad_norm": 1.1669498682022095,
"learning_rate": 5.87948739425316e-06,
"loss": 0.4673,
"step": 683
},
{
"epoch": 1.3202933985330074,
"grad_norm": 1.2828259468078613,
"learning_rate": 5.8693657437966955e-06,
"loss": 0.4404,
"step": 684
},
{
"epoch": 1.3222493887530562,
"grad_norm": 1.2900339365005493,
"learning_rate": 5.859240418356614e-06,
"loss": 0.4643,
"step": 685
},
{
"epoch": 1.3242053789731052,
"grad_norm": 1.2165307998657227,
"learning_rate": 5.849111460734702e-06,
"loss": 0.4378,
"step": 686
},
{
"epoch": 1.326161369193154,
"grad_norm": 1.3515350818634033,
"learning_rate": 5.838978913748096e-06,
"loss": 0.4428,
"step": 687
},
{
"epoch": 1.328117359413203,
"grad_norm": 1.233166217803955,
"learning_rate": 5.828842820229106e-06,
"loss": 0.4541,
"step": 688
},
{
"epoch": 1.3300733496332517,
"grad_norm": 1.1359091997146606,
"learning_rate": 5.818703223025036e-06,
"loss": 0.4565,
"step": 689
},
{
"epoch": 1.3320293398533007,
"grad_norm": 1.3372867107391357,
"learning_rate": 5.808560164998002e-06,
"loss": 0.4324,
"step": 690
},
{
"epoch": 1.3339853300733497,
"grad_norm": 1.1790531873703003,
"learning_rate": 5.7984136890247455e-06,
"loss": 0.4347,
"step": 691
},
{
"epoch": 1.3359413202933985,
"grad_norm": 1.2366808652877808,
"learning_rate": 5.78826383799646e-06,
"loss": 0.4502,
"step": 692
},
{
"epoch": 1.3378973105134475,
"grad_norm": 1.2719463109970093,
"learning_rate": 5.778110654818602e-06,
"loss": 0.4473,
"step": 693
},
{
"epoch": 1.3398533007334963,
"grad_norm": 1.146131157875061,
"learning_rate": 5.767954182410717e-06,
"loss": 0.4467,
"step": 694
},
{
"epoch": 1.3418092909535453,
"grad_norm": 1.1366889476776123,
"learning_rate": 5.7577944637062545e-06,
"loss": 0.4312,
"step": 695
},
{
"epoch": 1.343765281173594,
"grad_norm": 1.2757666110992432,
"learning_rate": 5.747631541652388e-06,
"loss": 0.4372,
"step": 696
},
{
"epoch": 1.345721271393643,
"grad_norm": 1.239727258682251,
"learning_rate": 5.737465459209825e-06,
"loss": 0.4536,
"step": 697
},
{
"epoch": 1.347677261613692,
"grad_norm": 1.815891146659851,
"learning_rate": 5.727296259352645e-06,
"loss": 0.4632,
"step": 698
},
{
"epoch": 1.3496332518337408,
"grad_norm": 1.2289308309555054,
"learning_rate": 5.717123985068094e-06,
"loss": 0.457,
"step": 699
},
{
"epoch": 1.3515892420537896,
"grad_norm": 1.3102158308029175,
"learning_rate": 5.706948679356417e-06,
"loss": 0.4296,
"step": 700
},
{
"epoch": 1.3535452322738386,
"grad_norm": 1.2064359188079834,
"learning_rate": 5.696770385230679e-06,
"loss": 0.4511,
"step": 701
},
{
"epoch": 1.3555012224938876,
"grad_norm": 1.2940292358398438,
"learning_rate": 5.68658914571657e-06,
"loss": 0.4729,
"step": 702
},
{
"epoch": 1.3574572127139364,
"grad_norm": 1.1980189085006714,
"learning_rate": 5.676405003852238e-06,
"loss": 0.429,
"step": 703
},
{
"epoch": 1.3594132029339854,
"grad_norm": 1.2084240913391113,
"learning_rate": 5.666218002688094e-06,
"loss": 0.4438,
"step": 704
},
{
"epoch": 1.3613691931540342,
"grad_norm": 1.3325614929199219,
"learning_rate": 5.656028185286638e-06,
"loss": 0.4286,
"step": 705
},
{
"epoch": 1.3633251833740831,
"grad_norm": 1.2942312955856323,
"learning_rate": 5.645835594722276e-06,
"loss": 0.4467,
"step": 706
},
{
"epoch": 1.365281173594132,
"grad_norm": 1.263159990310669,
"learning_rate": 5.635640274081135e-06,
"loss": 0.4596,
"step": 707
},
{
"epoch": 1.367237163814181,
"grad_norm": 1.2454265356063843,
"learning_rate": 5.625442266460882e-06,
"loss": 0.4175,
"step": 708
},
{
"epoch": 1.36919315403423,
"grad_norm": 1.3009308576583862,
"learning_rate": 5.615241614970546e-06,
"loss": 0.4525,
"step": 709
},
{
"epoch": 1.3711491442542787,
"grad_norm": 1.222086787223816,
"learning_rate": 5.605038362730326e-06,
"loss": 0.45,
"step": 710
},
{
"epoch": 1.3731051344743277,
"grad_norm": 1.1456713676452637,
"learning_rate": 5.594832552871423e-06,
"loss": 0.4653,
"step": 711
},
{
"epoch": 1.3750611246943765,
"grad_norm": 1.2631255388259888,
"learning_rate": 5.5846242285358424e-06,
"loss": 0.4392,
"step": 712
},
{
"epoch": 1.3770171149144255,
"grad_norm": 1.1547008752822876,
"learning_rate": 5.5744134328762225e-06,
"loss": 0.4483,
"step": 713
},
{
"epoch": 1.3789731051344742,
"grad_norm": 1.2225359678268433,
"learning_rate": 5.564200209055647e-06,
"loss": 0.4491,
"step": 714
},
{
"epoch": 1.3809290953545232,
"grad_norm": 1.2074551582336426,
"learning_rate": 5.553984600247464e-06,
"loss": 0.4555,
"step": 715
},
{
"epoch": 1.3828850855745722,
"grad_norm": 1.287567138671875,
"learning_rate": 5.543766649635104e-06,
"loss": 0.4539,
"step": 716
},
{
"epoch": 1.384841075794621,
"grad_norm": 1.195177674293518,
"learning_rate": 5.533546400411899e-06,
"loss": 0.4323,
"step": 717
},
{
"epoch": 1.3867970660146698,
"grad_norm": 1.2475037574768066,
"learning_rate": 5.523323895780891e-06,
"loss": 0.4428,
"step": 718
},
{
"epoch": 1.3887530562347188,
"grad_norm": 1.2410409450531006,
"learning_rate": 5.513099178954664e-06,
"loss": 0.431,
"step": 719
},
{
"epoch": 1.3907090464547678,
"grad_norm": 1.2587379217147827,
"learning_rate": 5.502872293155148e-06,
"loss": 0.4299,
"step": 720
},
{
"epoch": 1.3926650366748166,
"grad_norm": 1.306764006614685,
"learning_rate": 5.492643281613444e-06,
"loss": 0.4626,
"step": 721
},
{
"epoch": 1.3946210268948656,
"grad_norm": 1.2155743837356567,
"learning_rate": 5.482412187569638e-06,
"loss": 0.4547,
"step": 722
},
{
"epoch": 1.3965770171149146,
"grad_norm": 1.238143801689148,
"learning_rate": 5.472179054272618e-06,
"loss": 0.4338,
"step": 723
},
{
"epoch": 1.3985330073349633,
"grad_norm": 1.276924729347229,
"learning_rate": 5.4619439249798975e-06,
"loss": 0.4446,
"step": 724
},
{
"epoch": 1.400488997555012,
"grad_norm": 1.1909351348876953,
"learning_rate": 5.4517068429574215e-06,
"loss": 0.461,
"step": 725
},
{
"epoch": 1.402444987775061,
"grad_norm": 1.248591661453247,
"learning_rate": 5.441467851479391e-06,
"loss": 0.4404,
"step": 726
},
{
"epoch": 1.40440097799511,
"grad_norm": 1.197350025177002,
"learning_rate": 5.431226993828081e-06,
"loss": 0.4338,
"step": 727
},
{
"epoch": 1.4063569682151589,
"grad_norm": 1.2777246236801147,
"learning_rate": 5.420984313293653e-06,
"loss": 0.4535,
"step": 728
},
{
"epoch": 1.4083129584352079,
"grad_norm": 1.214824914932251,
"learning_rate": 5.4107398531739765e-06,
"loss": 0.4535,
"step": 729
},
{
"epoch": 1.4102689486552566,
"grad_norm": 1.1792421340942383,
"learning_rate": 5.400493656774441e-06,
"loss": 0.4493,
"step": 730
},
{
"epoch": 1.4122249388753056,
"grad_norm": 1.18449068069458,
"learning_rate": 5.3902457674077746e-06,
"loss": 0.4536,
"step": 731
},
{
"epoch": 1.4141809290953544,
"grad_norm": 1.2407759428024292,
"learning_rate": 5.379996228393868e-06,
"loss": 0.454,
"step": 732
},
{
"epoch": 1.4161369193154034,
"grad_norm": 1.1889723539352417,
"learning_rate": 5.3697450830595775e-06,
"loss": 0.4459,
"step": 733
},
{
"epoch": 1.4180929095354524,
"grad_norm": 1.1876051425933838,
"learning_rate": 5.359492374738557e-06,
"loss": 0.4635,
"step": 734
},
{
"epoch": 1.4200488997555012,
"grad_norm": 1.1337804794311523,
"learning_rate": 5.349238146771062e-06,
"loss": 0.4414,
"step": 735
},
{
"epoch": 1.4220048899755502,
"grad_norm": 1.1479378938674927,
"learning_rate": 5.3389824425037725e-06,
"loss": 0.4517,
"step": 736
},
{
"epoch": 1.423960880195599,
"grad_norm": 1.1429476737976074,
"learning_rate": 5.3287253052896125e-06,
"loss": 0.453,
"step": 737
},
{
"epoch": 1.425916870415648,
"grad_norm": 1.1652535200119019,
"learning_rate": 5.31846677848756e-06,
"loss": 0.4603,
"step": 738
},
{
"epoch": 1.4278728606356967,
"grad_norm": 1.2222468852996826,
"learning_rate": 5.308206905462468e-06,
"loss": 0.4297,
"step": 739
},
{
"epoch": 1.4298288508557457,
"grad_norm": 1.1538606882095337,
"learning_rate": 5.297945729584884e-06,
"loss": 0.4306,
"step": 740
},
{
"epoch": 1.4317848410757947,
"grad_norm": 1.150429606437683,
"learning_rate": 5.287683294230855e-06,
"loss": 0.4585,
"step": 741
},
{
"epoch": 1.4337408312958435,
"grad_norm": 1.1533079147338867,
"learning_rate": 5.277419642781759e-06,
"loss": 0.4561,
"step": 742
},
{
"epoch": 1.4356968215158923,
"grad_norm": 1.2049838304519653,
"learning_rate": 5.26715481862411e-06,
"loss": 0.4361,
"step": 743
},
{
"epoch": 1.4376528117359413,
"grad_norm": 1.1533024311065674,
"learning_rate": 5.256888865149383e-06,
"loss": 0.438,
"step": 744
},
{
"epoch": 1.4396088019559903,
"grad_norm": 1.23749577999115,
"learning_rate": 5.246621825753827e-06,
"loss": 0.4632,
"step": 745
},
{
"epoch": 1.441564792176039,
"grad_norm": 1.206140160560608,
"learning_rate": 5.236353743838277e-06,
"loss": 0.4426,
"step": 746
},
{
"epoch": 1.443520782396088,
"grad_norm": 1.162009835243225,
"learning_rate": 5.226084662807978e-06,
"loss": 0.4433,
"step": 747
},
{
"epoch": 1.445476772616137,
"grad_norm": 1.247118353843689,
"learning_rate": 5.2158146260724006e-06,
"loss": 0.4787,
"step": 748
},
{
"epoch": 1.4474327628361858,
"grad_norm": 1.2741855382919312,
"learning_rate": 5.20554367704505e-06,
"loss": 0.4552,
"step": 749
},
{
"epoch": 1.4493887530562346,
"grad_norm": 1.235175609588623,
"learning_rate": 5.1952718591432914e-06,
"loss": 0.4606,
"step": 750
},
{
"epoch": 1.4513447432762836,
"grad_norm": 1.1611201763153076,
"learning_rate": 5.184999215788164e-06,
"loss": 0.4549,
"step": 751
},
{
"epoch": 1.4533007334963326,
"grad_norm": 1.2934192419052124,
"learning_rate": 5.17472579040419e-06,
"loss": 0.4549,
"step": 752
},
{
"epoch": 1.4552567237163814,
"grad_norm": 1.1462373733520508,
"learning_rate": 5.1644516264192075e-06,
"loss": 0.4494,
"step": 753
},
{
"epoch": 1.4572127139364304,
"grad_norm": 1.1914169788360596,
"learning_rate": 5.154176767264168e-06,
"loss": 0.4581,
"step": 754
},
{
"epoch": 1.4591687041564791,
"grad_norm": 1.1715911626815796,
"learning_rate": 5.1439012563729676e-06,
"loss": 0.4401,
"step": 755
},
{
"epoch": 1.4611246943765281,
"grad_norm": 1.205413579940796,
"learning_rate": 5.133625137182252e-06,
"loss": 0.4524,
"step": 756
},
{
"epoch": 1.463080684596577,
"grad_norm": 1.3116756677627563,
"learning_rate": 5.123348453131242e-06,
"loss": 0.4542,
"step": 757
},
{
"epoch": 1.465036674816626,
"grad_norm": 1.2234801054000854,
"learning_rate": 5.113071247661545e-06,
"loss": 0.4622,
"step": 758
},
{
"epoch": 1.466992665036675,
"grad_norm": 1.1696038246154785,
"learning_rate": 5.102793564216972e-06,
"loss": 0.4622,
"step": 759
},
{
"epoch": 1.4689486552567237,
"grad_norm": 1.1506364345550537,
"learning_rate": 5.092515446243359e-06,
"loss": 0.4483,
"step": 760
},
{
"epoch": 1.4709046454767727,
"grad_norm": 1.2517993450164795,
"learning_rate": 5.08223693718837e-06,
"loss": 0.4265,
"step": 761
},
{
"epoch": 1.4728606356968215,
"grad_norm": 1.1876658201217651,
"learning_rate": 5.071958080501331e-06,
"loss": 0.4341,
"step": 762
},
{
"epoch": 1.4748166259168705,
"grad_norm": 1.1794910430908203,
"learning_rate": 5.061678919633033e-06,
"loss": 0.4442,
"step": 763
},
{
"epoch": 1.4767726161369192,
"grad_norm": 1.1299569606781006,
"learning_rate": 5.051399498035552e-06,
"loss": 0.4679,
"step": 764
},
{
"epoch": 1.4787286063569682,
"grad_norm": 1.1907482147216797,
"learning_rate": 5.041119859162068e-06,
"loss": 0.4547,
"step": 765
},
{
"epoch": 1.4806845965770172,
"grad_norm": 1.182114839553833,
"learning_rate": 5.030840046466682e-06,
"loss": 0.4497,
"step": 766
},
{
"epoch": 1.482640586797066,
"grad_norm": 1.119644045829773,
"learning_rate": 5.020560103404225e-06,
"loss": 0.4554,
"step": 767
},
{
"epoch": 1.4845965770171148,
"grad_norm": 1.1454432010650635,
"learning_rate": 5.01028007343008e-06,
"loss": 0.4172,
"step": 768
},
{
"epoch": 1.4865525672371638,
"grad_norm": 1.2797613143920898,
"learning_rate": 5e-06,
"loss": 0.449,
"step": 769
},
{
"epoch": 1.4885085574572128,
"grad_norm": 1.1603306531906128,
"learning_rate": 4.9897199265699205e-06,
"loss": 0.4649,
"step": 770
},
{
"epoch": 1.4904645476772616,
"grad_norm": 1.2365909814834595,
"learning_rate": 4.979439896595777e-06,
"loss": 0.4654,
"step": 771
},
{
"epoch": 1.4924205378973106,
"grad_norm": 1.1844136714935303,
"learning_rate": 4.9691599535333196e-06,
"loss": 0.4438,
"step": 772
},
{
"epoch": 1.4943765281173595,
"grad_norm": 1.1799577474594116,
"learning_rate": 4.958880140837934e-06,
"loss": 0.4445,
"step": 773
},
{
"epoch": 1.4963325183374083,
"grad_norm": 1.2819160223007202,
"learning_rate": 4.94860050196445e-06,
"loss": 0.4355,
"step": 774
},
{
"epoch": 1.498288508557457,
"grad_norm": 1.1956461668014526,
"learning_rate": 4.938321080366969e-06,
"loss": 0.4289,
"step": 775
},
{
"epoch": 1.500244498777506,
"grad_norm": 1.2148059606552124,
"learning_rate": 4.928041919498669e-06,
"loss": 0.4284,
"step": 776
},
{
"epoch": 1.502200488997555,
"grad_norm": 1.1130567789077759,
"learning_rate": 4.917763062811631e-06,
"loss": 0.445,
"step": 777
},
{
"epoch": 1.5041564792176039,
"grad_norm": 1.1467176675796509,
"learning_rate": 4.907484553756644e-06,
"loss": 0.4366,
"step": 778
},
{
"epoch": 1.5061124694376526,
"grad_norm": 1.1647167205810547,
"learning_rate": 4.897206435783029e-06,
"loss": 0.4455,
"step": 779
},
{
"epoch": 1.5080684596577019,
"grad_norm": 1.1800556182861328,
"learning_rate": 4.8869287523384564e-06,
"loss": 0.4619,
"step": 780
},
{
"epoch": 1.5100244498777506,
"grad_norm": 1.2697025537490845,
"learning_rate": 4.876651546868759e-06,
"loss": 0.4578,
"step": 781
},
{
"epoch": 1.5119804400977994,
"grad_norm": 1.1891041994094849,
"learning_rate": 4.86637486281775e-06,
"loss": 0.4418,
"step": 782
},
{
"epoch": 1.5139364303178484,
"grad_norm": 1.183430552482605,
"learning_rate": 4.856098743627035e-06,
"loss": 0.4305,
"step": 783
},
{
"epoch": 1.5158924205378974,
"grad_norm": 1.1615206003189087,
"learning_rate": 4.845823232735833e-06,
"loss": 0.4285,
"step": 784
},
{
"epoch": 1.5178484107579462,
"grad_norm": 1.1649718284606934,
"learning_rate": 4.835548373580793e-06,
"loss": 0.4363,
"step": 785
},
{
"epoch": 1.519804400977995,
"grad_norm": 1.166445255279541,
"learning_rate": 4.82527420959581e-06,
"loss": 0.4441,
"step": 786
},
{
"epoch": 1.521760391198044,
"grad_norm": 1.2622777223587036,
"learning_rate": 4.815000784211839e-06,
"loss": 0.4278,
"step": 787
},
{
"epoch": 1.523716381418093,
"grad_norm": 1.223970890045166,
"learning_rate": 4.80472814085671e-06,
"loss": 0.4214,
"step": 788
},
{
"epoch": 1.5256723716381417,
"grad_norm": 1.2236289978027344,
"learning_rate": 4.794456322954953e-06,
"loss": 0.4461,
"step": 789
},
{
"epoch": 1.5276283618581907,
"grad_norm": 1.2569670677185059,
"learning_rate": 4.784185373927601e-06,
"loss": 0.4604,
"step": 790
},
{
"epoch": 1.5295843520782397,
"grad_norm": 1.1970974206924438,
"learning_rate": 4.773915337192022e-06,
"loss": 0.4328,
"step": 791
},
{
"epoch": 1.5315403422982885,
"grad_norm": 1.2718005180358887,
"learning_rate": 4.763646256161724e-06,
"loss": 0.4356,
"step": 792
},
{
"epoch": 1.5334963325183373,
"grad_norm": 1.260063648223877,
"learning_rate": 4.753378174246174e-06,
"loss": 0.4635,
"step": 793
},
{
"epoch": 1.5354523227383863,
"grad_norm": 1.3000822067260742,
"learning_rate": 4.743111134850618e-06,
"loss": 0.4376,
"step": 794
},
{
"epoch": 1.5374083129584353,
"grad_norm": 1.159167766571045,
"learning_rate": 4.7328451813758905e-06,
"loss": 0.4489,
"step": 795
},
{
"epoch": 1.539364303178484,
"grad_norm": 1.2092034816741943,
"learning_rate": 4.722580357218242e-06,
"loss": 0.4416,
"step": 796
},
{
"epoch": 1.541320293398533,
"grad_norm": 1.2961796522140503,
"learning_rate": 4.7123167057691446e-06,
"loss": 0.4428,
"step": 797
},
{
"epoch": 1.543276283618582,
"grad_norm": 1.1921921968460083,
"learning_rate": 4.702054270415118e-06,
"loss": 0.4429,
"step": 798
},
{
"epoch": 1.5452322738386308,
"grad_norm": 1.2297002077102661,
"learning_rate": 4.6917930945375325e-06,
"loss": 0.4415,
"step": 799
},
{
"epoch": 1.5471882640586796,
"grad_norm": 1.1777838468551636,
"learning_rate": 4.681533221512441e-06,
"loss": 0.457,
"step": 800
},
{
"epoch": 1.5491442542787286,
"grad_norm": 1.2187058925628662,
"learning_rate": 4.671274694710388e-06,
"loss": 0.4543,
"step": 801
},
{
"epoch": 1.5511002444987776,
"grad_norm": 1.138017177581787,
"learning_rate": 4.661017557496228e-06,
"loss": 0.4923,
"step": 802
},
{
"epoch": 1.5530562347188264,
"grad_norm": 1.2310172319412231,
"learning_rate": 4.65076185322894e-06,
"loss": 0.4381,
"step": 803
},
{
"epoch": 1.5550122249388751,
"grad_norm": 1.2169889211654663,
"learning_rate": 4.640507625261446e-06,
"loss": 0.4541,
"step": 804
},
{
"epoch": 1.5569682151589244,
"grad_norm": 1.1405876874923706,
"learning_rate": 4.630254916940424e-06,
"loss": 0.4409,
"step": 805
},
{
"epoch": 1.5589242053789731,
"grad_norm": 1.1320923566818237,
"learning_rate": 4.6200037716061334e-06,
"loss": 0.4141,
"step": 806
},
{
"epoch": 1.560880195599022,
"grad_norm": 1.2325752973556519,
"learning_rate": 4.609754232592225e-06,
"loss": 0.4283,
"step": 807
},
{
"epoch": 1.562836185819071,
"grad_norm": 1.3088810443878174,
"learning_rate": 4.599506343225562e-06,
"loss": 0.4247,
"step": 808
},
{
"epoch": 1.56479217603912,
"grad_norm": 1.162494421005249,
"learning_rate": 4.589260146826025e-06,
"loss": 0.4396,
"step": 809
},
{
"epoch": 1.5667481662591687,
"grad_norm": 1.2321451902389526,
"learning_rate": 4.579015686706348e-06,
"loss": 0.4313,
"step": 810
},
{
"epoch": 1.5687041564792175,
"grad_norm": 1.2772425413131714,
"learning_rate": 4.56877300617192e-06,
"loss": 0.4392,
"step": 811
},
{
"epoch": 1.5706601466992665,
"grad_norm": 1.1612458229064941,
"learning_rate": 4.55853214852061e-06,
"loss": 0.4758,
"step": 812
},
{
"epoch": 1.5726161369193155,
"grad_norm": 1.1374866962432861,
"learning_rate": 4.548293157042581e-06,
"loss": 0.4502,
"step": 813
},
{
"epoch": 1.5745721271393642,
"grad_norm": 1.2353960275650024,
"learning_rate": 4.538056075020104e-06,
"loss": 0.4193,
"step": 814
},
{
"epoch": 1.5765281173594132,
"grad_norm": 1.155655026435852,
"learning_rate": 4.527820945727383e-06,
"loss": 0.4514,
"step": 815
},
{
"epoch": 1.5784841075794622,
"grad_norm": 1.2046425342559814,
"learning_rate": 4.517587812430364e-06,
"loss": 0.4365,
"step": 816
},
{
"epoch": 1.580440097799511,
"grad_norm": 1.2539176940917969,
"learning_rate": 4.507356718386557e-06,
"loss": 0.4466,
"step": 817
},
{
"epoch": 1.5823960880195598,
"grad_norm": 1.1303858757019043,
"learning_rate": 4.497127706844852e-06,
"loss": 0.4547,
"step": 818
},
{
"epoch": 1.5843520782396088,
"grad_norm": 1.285823106765747,
"learning_rate": 4.486900821045337e-06,
"loss": 0.4502,
"step": 819
},
{
"epoch": 1.5863080684596578,
"grad_norm": 1.2383862733840942,
"learning_rate": 4.47667610421911e-06,
"loss": 0.4685,
"step": 820
},
{
"epoch": 1.5882640586797065,
"grad_norm": 1.2859257459640503,
"learning_rate": 4.466453599588103e-06,
"loss": 0.4407,
"step": 821
},
{
"epoch": 1.5902200488997555,
"grad_norm": 1.242066502571106,
"learning_rate": 4.4562333503648966e-06,
"loss": 0.4465,
"step": 822
},
{
"epoch": 1.5921760391198045,
"grad_norm": 1.121509313583374,
"learning_rate": 4.446015399752536e-06,
"loss": 0.4516,
"step": 823
},
{
"epoch": 1.5941320293398533,
"grad_norm": 1.2806360721588135,
"learning_rate": 4.435799790944356e-06,
"loss": 0.455,
"step": 824
},
{
"epoch": 1.596088019559902,
"grad_norm": 1.2540717124938965,
"learning_rate": 4.42558656712378e-06,
"loss": 0.447,
"step": 825
},
{
"epoch": 1.598044009779951,
"grad_norm": 1.1869229078292847,
"learning_rate": 4.41537577146416e-06,
"loss": 0.4575,
"step": 826
},
{
"epoch": 1.6,
"grad_norm": 1.2167538404464722,
"learning_rate": 4.405167447128578e-06,
"loss": 0.4326,
"step": 827
},
{
"epoch": 1.6019559902200489,
"grad_norm": 1.1809440851211548,
"learning_rate": 4.394961637269674e-06,
"loss": 0.4367,
"step": 828
},
{
"epoch": 1.6039119804400976,
"grad_norm": 1.223663568496704,
"learning_rate": 4.384758385029457e-06,
"loss": 0.454,
"step": 829
},
{
"epoch": 1.6058679706601469,
"grad_norm": 1.1459578275680542,
"learning_rate": 4.374557733539119e-06,
"loss": 0.4498,
"step": 830
},
{
"epoch": 1.6078239608801956,
"grad_norm": 1.174612283706665,
"learning_rate": 4.364359725918868e-06,
"loss": 0.449,
"step": 831
},
{
"epoch": 1.6097799511002444,
"grad_norm": 1.1853433847427368,
"learning_rate": 4.354164405277725e-06,
"loss": 0.4354,
"step": 832
},
{
"epoch": 1.6117359413202934,
"grad_norm": 1.134567141532898,
"learning_rate": 4.3439718147133625e-06,
"loss": 0.4581,
"step": 833
},
{
"epoch": 1.6136919315403424,
"grad_norm": 1.2028684616088867,
"learning_rate": 4.333781997311909e-06,
"loss": 0.4814,
"step": 834
},
{
"epoch": 1.6156479217603912,
"grad_norm": 1.2520511150360107,
"learning_rate": 4.323594996147763e-06,
"loss": 0.4519,
"step": 835
},
{
"epoch": 1.61760391198044,
"grad_norm": 1.163762092590332,
"learning_rate": 4.3134108542834315e-06,
"loss": 0.4534,
"step": 836
},
{
"epoch": 1.619559902200489,
"grad_norm": 1.194406270980835,
"learning_rate": 4.3032296147693225e-06,
"loss": 0.4575,
"step": 837
},
{
"epoch": 1.621515892420538,
"grad_norm": 1.2662689685821533,
"learning_rate": 4.293051320643583e-06,
"loss": 0.439,
"step": 838
},
{
"epoch": 1.6234718826405867,
"grad_norm": 1.2627875804901123,
"learning_rate": 4.28287601493191e-06,
"loss": 0.4275,
"step": 839
},
{
"epoch": 1.6254278728606357,
"grad_norm": 1.2528843879699707,
"learning_rate": 4.272703740647356e-06,
"loss": 0.4659,
"step": 840
},
{
"epoch": 1.6273838630806847,
"grad_norm": 1.188919186592102,
"learning_rate": 4.262534540790176e-06,
"loss": 0.4387,
"step": 841
},
{
"epoch": 1.6293398533007335,
"grad_norm": 1.3087656497955322,
"learning_rate": 4.252368458347614e-06,
"loss": 0.4281,
"step": 842
},
{
"epoch": 1.6312958435207823,
"grad_norm": 1.19478440284729,
"learning_rate": 4.2422055362937455e-06,
"loss": 0.4403,
"step": 843
},
{
"epoch": 1.6332518337408313,
"grad_norm": 1.2115154266357422,
"learning_rate": 4.232045817589285e-06,
"loss": 0.4712,
"step": 844
},
{
"epoch": 1.6352078239608803,
"grad_norm": 1.2290632724761963,
"learning_rate": 4.2218893451814005e-06,
"loss": 0.4436,
"step": 845
},
{
"epoch": 1.637163814180929,
"grad_norm": 1.1975177526474,
"learning_rate": 4.211736162003543e-06,
"loss": 0.4334,
"step": 846
},
{
"epoch": 1.639119804400978,
"grad_norm": 1.1870671510696411,
"learning_rate": 4.201586310975256e-06,
"loss": 0.4428,
"step": 847
},
{
"epoch": 1.641075794621027,
"grad_norm": 1.3159699440002441,
"learning_rate": 4.191439835001999e-06,
"loss": 0.4347,
"step": 848
},
{
"epoch": 1.6430317848410758,
"grad_norm": 1.3310630321502686,
"learning_rate": 4.181296776974963e-06,
"loss": 0.4486,
"step": 849
},
{
"epoch": 1.6449877750611246,
"grad_norm": 1.192111849784851,
"learning_rate": 4.171157179770896e-06,
"loss": 0.4456,
"step": 850
},
{
"epoch": 1.6469437652811736,
"grad_norm": 1.2716262340545654,
"learning_rate": 4.161021086251906e-06,
"loss": 0.4547,
"step": 851
},
{
"epoch": 1.6488997555012226,
"grad_norm": 1.2495301961898804,
"learning_rate": 4.1508885392653e-06,
"loss": 0.4401,
"step": 852
},
{
"epoch": 1.6508557457212714,
"grad_norm": 1.1762745380401611,
"learning_rate": 4.140759581643386e-06,
"loss": 0.4634,
"step": 853
},
{
"epoch": 1.6528117359413201,
"grad_norm": 1.2814241647720337,
"learning_rate": 4.1306342562033045e-06,
"loss": 0.4472,
"step": 854
},
{
"epoch": 1.6547677261613694,
"grad_norm": 1.3218616247177124,
"learning_rate": 4.120512605746842e-06,
"loss": 0.4415,
"step": 855
},
{
"epoch": 1.6567237163814181,
"grad_norm": 1.1885900497436523,
"learning_rate": 4.110394673060244e-06,
"loss": 0.4274,
"step": 856
},
{
"epoch": 1.658679706601467,
"grad_norm": 1.1911470890045166,
"learning_rate": 4.100280500914046e-06,
"loss": 0.4321,
"step": 857
},
{
"epoch": 1.660635696821516,
"grad_norm": 2.320434808731079,
"learning_rate": 4.090170132062885e-06,
"loss": 0.4374,
"step": 858
},
{
"epoch": 1.662591687041565,
"grad_norm": 1.2307498455047607,
"learning_rate": 4.080063609245322e-06,
"loss": 0.4508,
"step": 859
},
{
"epoch": 1.6645476772616137,
"grad_norm": 1.2727370262145996,
"learning_rate": 4.0699609751836575e-06,
"loss": 0.4483,
"step": 860
},
{
"epoch": 1.6665036674816625,
"grad_norm": 1.1779570579528809,
"learning_rate": 4.059862272583755e-06,
"loss": 0.4548,
"step": 861
},
{
"epoch": 1.6684596577017115,
"grad_norm": 1.2141393423080444,
"learning_rate": 4.049767544134859e-06,
"loss": 0.4404,
"step": 862
},
{
"epoch": 1.6704156479217604,
"grad_norm": 1.2709059715270996,
"learning_rate": 4.0396768325094135e-06,
"loss": 0.4378,
"step": 863
},
{
"epoch": 1.6723716381418092,
"grad_norm": 1.163887619972229,
"learning_rate": 4.029590180362884e-06,
"loss": 0.4399,
"step": 864
},
{
"epoch": 1.6743276283618582,
"grad_norm": 1.1120645999908447,
"learning_rate": 4.019507630333577e-06,
"loss": 0.4359,
"step": 865
},
{
"epoch": 1.6762836185819072,
"grad_norm": 1.220585584640503,
"learning_rate": 4.009429225042453e-06,
"loss": 0.4357,
"step": 866
},
{
"epoch": 1.678239608801956,
"grad_norm": 1.2459168434143066,
"learning_rate": 3.999355007092958e-06,
"loss": 0.4301,
"step": 867
},
{
"epoch": 1.6801955990220048,
"grad_norm": 1.132286548614502,
"learning_rate": 3.989285019070834e-06,
"loss": 0.4194,
"step": 868
},
{
"epoch": 1.6821515892420538,
"grad_norm": 1.3133598566055298,
"learning_rate": 3.979219303543942e-06,
"loss": 0.4526,
"step": 869
},
{
"epoch": 1.6841075794621028,
"grad_norm": 1.2260628938674927,
"learning_rate": 3.969157903062086e-06,
"loss": 0.4506,
"step": 870
},
{
"epoch": 1.6860635696821515,
"grad_norm": 1.1542868614196777,
"learning_rate": 3.959100860156824e-06,
"loss": 0.4433,
"step": 871
},
{
"epoch": 1.6880195599022005,
"grad_norm": 1.1528064012527466,
"learning_rate": 3.949048217341297e-06,
"loss": 0.4328,
"step": 872
},
{
"epoch": 1.6899755501222495,
"grad_norm": 1.2643218040466309,
"learning_rate": 3.939000017110046e-06,
"loss": 0.4186,
"step": 873
},
{
"epoch": 1.6919315403422983,
"grad_norm": 1.1736092567443848,
"learning_rate": 3.928956301938831e-06,
"loss": 0.4105,
"step": 874
},
{
"epoch": 1.693887530562347,
"grad_norm": 1.2194820642471313,
"learning_rate": 3.918917114284456e-06,
"loss": 0.4274,
"step": 875
},
{
"epoch": 1.695843520782396,
"grad_norm": 1.1990725994110107,
"learning_rate": 3.908882496584578e-06,
"loss": 0.4539,
"step": 876
},
{
"epoch": 1.697799511002445,
"grad_norm": 1.1804009675979614,
"learning_rate": 3.898852491257547e-06,
"loss": 0.4599,
"step": 877
},
{
"epoch": 1.6997555012224939,
"grad_norm": 1.1332404613494873,
"learning_rate": 3.888827140702207e-06,
"loss": 0.4666,
"step": 878
},
{
"epoch": 1.7017114914425426,
"grad_norm": 1.2241950035095215,
"learning_rate": 3.87880648729773e-06,
"loss": 0.44,
"step": 879
},
{
"epoch": 1.7036674816625916,
"grad_norm": 1.224329948425293,
"learning_rate": 3.868790573403431e-06,
"loss": 0.4587,
"step": 880
},
{
"epoch": 1.7056234718826406,
"grad_norm": 1.2016890048980713,
"learning_rate": 3.858779441358588e-06,
"loss": 0.4683,
"step": 881
},
{
"epoch": 1.7075794621026894,
"grad_norm": 1.190882682800293,
"learning_rate": 3.848773133482267e-06,
"loss": 0.4436,
"step": 882
},
{
"epoch": 1.7095354523227384,
"grad_norm": 1.2475522756576538,
"learning_rate": 3.838771692073144e-06,
"loss": 0.4367,
"step": 883
},
{
"epoch": 1.7114914425427874,
"grad_norm": 1.1946569681167603,
"learning_rate": 3.828775159409316e-06,
"loss": 0.4514,
"step": 884
},
{
"epoch": 1.7134474327628362,
"grad_norm": 1.1988422870635986,
"learning_rate": 3.818783577748138e-06,
"loss": 0.4411,
"step": 885
},
{
"epoch": 1.715403422982885,
"grad_norm": 1.2769694328308105,
"learning_rate": 3.808796989326034e-06,
"loss": 0.4413,
"step": 886
},
{
"epoch": 1.717359413202934,
"grad_norm": 1.108680009841919,
"learning_rate": 3.7988154363583153e-06,
"loss": 0.4479,
"step": 887
},
{
"epoch": 1.719315403422983,
"grad_norm": 1.19581937789917,
"learning_rate": 3.788838961039012e-06,
"loss": 0.446,
"step": 888
},
{
"epoch": 1.7212713936430317,
"grad_norm": 1.181363821029663,
"learning_rate": 3.7788676055406913e-06,
"loss": 0.4359,
"step": 889
},
{
"epoch": 1.7232273838630807,
"grad_norm": 1.1199826002120972,
"learning_rate": 3.768901412014273e-06,
"loss": 0.4236,
"step": 890
},
{
"epoch": 1.7251833740831297,
"grad_norm": 1.1632542610168457,
"learning_rate": 3.7589404225888624e-06,
"loss": 0.4402,
"step": 891
},
{
"epoch": 1.7271393643031785,
"grad_norm": 1.205407738685608,
"learning_rate": 3.748984679371558e-06,
"loss": 0.4678,
"step": 892
},
{
"epoch": 1.7290953545232273,
"grad_norm": 1.1697795391082764,
"learning_rate": 3.7390342244472886e-06,
"loss": 0.4114,
"step": 893
},
{
"epoch": 1.7310513447432763,
"grad_norm": 1.1678640842437744,
"learning_rate": 3.729089099878627e-06,
"loss": 0.4443,
"step": 894
},
{
"epoch": 1.7330073349633253,
"grad_norm": 1.1855502128601074,
"learning_rate": 3.719149347705609e-06,
"loss": 0.4461,
"step": 895
},
{
"epoch": 1.734963325183374,
"grad_norm": 1.1547439098358154,
"learning_rate": 3.7092150099455676e-06,
"loss": 0.4517,
"step": 896
},
{
"epoch": 1.736919315403423,
"grad_norm": 1.163944959640503,
"learning_rate": 3.6992861285929395e-06,
"loss": 0.4456,
"step": 897
},
{
"epoch": 1.738875305623472,
"grad_norm": 1.1487215757369995,
"learning_rate": 3.689362745619103e-06,
"loss": 0.4514,
"step": 898
},
{
"epoch": 1.7408312958435208,
"grad_norm": 1.2750896215438843,
"learning_rate": 3.6794449029721913e-06,
"loss": 0.4227,
"step": 899
},
{
"epoch": 1.7427872860635696,
"grad_norm": 1.1833292245864868,
"learning_rate": 3.6695326425769156e-06,
"loss": 0.473,
"step": 900
},
{
"epoch": 1.7447432762836186,
"grad_norm": 1.2018202543258667,
"learning_rate": 3.659626006334395e-06,
"loss": 0.4332,
"step": 901
},
{
"epoch": 1.7466992665036676,
"grad_norm": 1.1667495965957642,
"learning_rate": 3.649725036121966e-06,
"loss": 0.4529,
"step": 902
},
{
"epoch": 1.7486552567237164,
"grad_norm": 1.1951237916946411,
"learning_rate": 3.6398297737930212e-06,
"loss": 0.4355,
"step": 903
},
{
"epoch": 1.7506112469437651,
"grad_norm": 1.1440187692642212,
"learning_rate": 3.629940261176822e-06,
"loss": 0.423,
"step": 904
},
{
"epoch": 1.7525672371638141,
"grad_norm": 1.2192459106445312,
"learning_rate": 3.620056540078323e-06,
"loss": 0.4473,
"step": 905
},
{
"epoch": 1.7545232273838631,
"grad_norm": 1.1615161895751953,
"learning_rate": 3.6101786522780024e-06,
"loss": 0.4584,
"step": 906
},
{
"epoch": 1.756479217603912,
"grad_norm": 1.1762903928756714,
"learning_rate": 3.6003066395316704e-06,
"loss": 0.4569,
"step": 907
},
{
"epoch": 1.758435207823961,
"grad_norm": 1.2474490404129028,
"learning_rate": 3.590440543570311e-06,
"loss": 0.4491,
"step": 908
},
{
"epoch": 1.76039119804401,
"grad_norm": 1.166661024093628,
"learning_rate": 3.5805804060998926e-06,
"loss": 0.4195,
"step": 909
},
{
"epoch": 1.7623471882640587,
"grad_norm": 1.1908670663833618,
"learning_rate": 3.5707262688011983e-06,
"loss": 0.4402,
"step": 910
},
{
"epoch": 1.7643031784841074,
"grad_norm": 1.1561764478683472,
"learning_rate": 3.560878173329646e-06,
"loss": 0.4321,
"step": 911
},
{
"epoch": 1.7662591687041564,
"grad_norm": 1.2901417016983032,
"learning_rate": 3.551036161315109e-06,
"loss": 0.4435,
"step": 912
},
{
"epoch": 1.7682151589242054,
"grad_norm": 1.0962755680084229,
"learning_rate": 3.5412002743617525e-06,
"loss": 0.4285,
"step": 913
},
{
"epoch": 1.7701711491442542,
"grad_norm": 1.1892595291137695,
"learning_rate": 3.531370554047845e-06,
"loss": 0.4598,
"step": 914
},
{
"epoch": 1.7721271393643032,
"grad_norm": 1.151934266090393,
"learning_rate": 3.5215470419255905e-06,
"loss": 0.4365,
"step": 915
},
{
"epoch": 1.7740831295843522,
"grad_norm": 1.1396538019180298,
"learning_rate": 3.511729779520946e-06,
"loss": 0.4396,
"step": 916
},
{
"epoch": 1.776039119804401,
"grad_norm": 1.131398320198059,
"learning_rate": 3.501918808333453e-06,
"loss": 0.4312,
"step": 917
},
{
"epoch": 1.7779951100244498,
"grad_norm": 1.106647253036499,
"learning_rate": 3.4921141698360554e-06,
"loss": 0.4439,
"step": 918
},
{
"epoch": 1.7799511002444988,
"grad_norm": 1.2006912231445312,
"learning_rate": 3.48231590547493e-06,
"loss": 0.44,
"step": 919
},
{
"epoch": 1.7819070904645478,
"grad_norm": 1.1762027740478516,
"learning_rate": 3.4725240566693104e-06,
"loss": 0.4496,
"step": 920
},
{
"epoch": 1.7838630806845965,
"grad_norm": 1.1079798936843872,
"learning_rate": 3.4627386648113046e-06,
"loss": 0.4402,
"step": 921
},
{
"epoch": 1.7858190709046453,
"grad_norm": 1.1334391832351685,
"learning_rate": 3.4529597712657342e-06,
"loss": 0.4793,
"step": 922
},
{
"epoch": 1.7877750611246945,
"grad_norm": 1.139093279838562,
"learning_rate": 3.4431874173699415e-06,
"loss": 0.429,
"step": 923
},
{
"epoch": 1.7897310513447433,
"grad_norm": 1.0803550481796265,
"learning_rate": 3.433421644433631e-06,
"loss": 0.4133,
"step": 924
},
{
"epoch": 1.791687041564792,
"grad_norm": 1.0991169214248657,
"learning_rate": 3.4236624937386874e-06,
"loss": 0.419,
"step": 925
},
{
"epoch": 1.793643031784841,
"grad_norm": 1.1221754550933838,
"learning_rate": 3.4139100065390007e-06,
"loss": 0.4391,
"step": 926
},
{
"epoch": 1.79559902200489,
"grad_norm": 1.1930972337722778,
"learning_rate": 3.404164224060294e-06,
"loss": 0.423,
"step": 927
},
{
"epoch": 1.7975550122249389,
"grad_norm": 1.2077200412750244,
"learning_rate": 3.394425187499944e-06,
"loss": 0.4282,
"step": 928
},
{
"epoch": 1.7995110024449876,
"grad_norm": 1.159081220626831,
"learning_rate": 3.384692938026816e-06,
"loss": 0.4221,
"step": 929
},
{
"epoch": 1.8014669926650366,
"grad_norm": 1.1197729110717773,
"learning_rate": 3.374967516781085e-06,
"loss": 0.4441,
"step": 930
},
{
"epoch": 1.8034229828850856,
"grad_norm": 1.1913528442382812,
"learning_rate": 3.365248964874058e-06,
"loss": 0.4472,
"step": 931
},
{
"epoch": 1.8053789731051344,
"grad_norm": 1.1669801473617554,
"learning_rate": 3.3555373233880096e-06,
"loss": 0.4434,
"step": 932
},
{
"epoch": 1.8073349633251834,
"grad_norm": 1.1273599863052368,
"learning_rate": 3.3458326333759927e-06,
"loss": 0.4467,
"step": 933
},
{
"epoch": 1.8092909535452324,
"grad_norm": 1.144005298614502,
"learning_rate": 3.3361349358616853e-06,
"loss": 0.4387,
"step": 934
},
{
"epoch": 1.8112469437652812,
"grad_norm": 1.1881077289581299,
"learning_rate": 3.326444271839202e-06,
"loss": 0.4177,
"step": 935
},
{
"epoch": 1.81320293398533,
"grad_norm": 1.1392303705215454,
"learning_rate": 3.316760682272927e-06,
"loss": 0.4339,
"step": 936
},
{
"epoch": 1.815158924205379,
"grad_norm": 1.19828200340271,
"learning_rate": 3.307084208097337e-06,
"loss": 0.4247,
"step": 937
},
{
"epoch": 1.817114914425428,
"grad_norm": 1.1880228519439697,
"learning_rate": 3.297414890216833e-06,
"loss": 0.4519,
"step": 938
},
{
"epoch": 1.8190709046454767,
"grad_norm": 1.124159812927246,
"learning_rate": 3.2877527695055615e-06,
"loss": 0.4305,
"step": 939
},
{
"epoch": 1.8210268948655257,
"grad_norm": 1.1801859140396118,
"learning_rate": 3.27809788680725e-06,
"loss": 0.4734,
"step": 940
},
{
"epoch": 1.8229828850855747,
"grad_norm": 1.1997686624526978,
"learning_rate": 3.268450282935026e-06,
"loss": 0.4653,
"step": 941
},
{
"epoch": 1.8249388753056235,
"grad_norm": 1.1670591831207275,
"learning_rate": 3.2588099986712496e-06,
"loss": 0.4208,
"step": 942
},
{
"epoch": 1.8268948655256723,
"grad_norm": 1.1300679445266724,
"learning_rate": 3.2491770747673384e-06,
"loss": 0.4505,
"step": 943
},
{
"epoch": 1.8288508557457213,
"grad_norm": 1.1710411310195923,
"learning_rate": 3.239551551943595e-06,
"loss": 0.4439,
"step": 944
},
{
"epoch": 1.8308068459657703,
"grad_norm": 1.1152379512786865,
"learning_rate": 3.2299334708890384e-06,
"loss": 0.445,
"step": 945
},
{
"epoch": 1.832762836185819,
"grad_norm": 1.1259523630142212,
"learning_rate": 3.22032287226123e-06,
"loss": 0.4336,
"step": 946
},
{
"epoch": 1.8347188264058678,
"grad_norm": 1.1431976556777954,
"learning_rate": 3.2107197966861003e-06,
"loss": 0.4196,
"step": 947
},
{
"epoch": 1.836674816625917,
"grad_norm": 1.2426306009292603,
"learning_rate": 3.2011242847577804e-06,
"loss": 0.435,
"step": 948
},
{
"epoch": 1.8386308068459658,
"grad_norm": 1.17112135887146,
"learning_rate": 3.1915363770384223e-06,
"loss": 0.4609,
"step": 949
},
{
"epoch": 1.8405867970660146,
"grad_norm": 1.1593024730682373,
"learning_rate": 3.18195611405804e-06,
"loss": 0.4384,
"step": 950
},
{
"epoch": 1.8425427872860636,
"grad_norm": 1.166476845741272,
"learning_rate": 3.1723835363143296e-06,
"loss": 0.4238,
"step": 951
},
{
"epoch": 1.8444987775061126,
"grad_norm": 1.0797832012176514,
"learning_rate": 3.1628186842725e-06,
"loss": 0.4433,
"step": 952
},
{
"epoch": 1.8464547677261614,
"grad_norm": 1.215981364250183,
"learning_rate": 3.1532615983651027e-06,
"loss": 0.4326,
"step": 953
},
{
"epoch": 1.8484107579462101,
"grad_norm": 1.095432162284851,
"learning_rate": 3.1437123189918574e-06,
"loss": 0.4485,
"step": 954
},
{
"epoch": 1.8503667481662591,
"grad_norm": 1.2197959423065186,
"learning_rate": 3.1341708865194866e-06,
"loss": 0.4572,
"step": 955
},
{
"epoch": 1.8523227383863081,
"grad_norm": 1.2195861339569092,
"learning_rate": 3.124637341281541e-06,
"loss": 0.4407,
"step": 956
},
{
"epoch": 1.854278728606357,
"grad_norm": 1.1470884084701538,
"learning_rate": 3.1151117235782346e-06,
"loss": 0.4136,
"step": 957
},
{
"epoch": 1.856234718826406,
"grad_norm": 1.132288932800293,
"learning_rate": 3.1055940736762647e-06,
"loss": 0.4199,
"step": 958
},
{
"epoch": 1.858190709046455,
"grad_norm": 1.128574013710022,
"learning_rate": 3.0960844318086482e-06,
"loss": 0.4661,
"step": 959
},
{
"epoch": 1.8601466992665037,
"grad_norm": 1.1992732286453247,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.4415,
"step": 960
},
{
"epoch": 1.8621026894865524,
"grad_norm": 1.1346315145492554,
"learning_rate": 3.0770893329391207e-06,
"loss": 0.421,
"step": 961
},
{
"epoch": 1.8640586797066014,
"grad_norm": 1.160053014755249,
"learning_rate": 3.067603956233308e-06,
"loss": 0.4625,
"step": 962
},
{
"epoch": 1.8660146699266504,
"grad_norm": 1.204193115234375,
"learning_rate": 3.0581267481537073e-06,
"loss": 0.4367,
"step": 963
},
{
"epoch": 1.8679706601466992,
"grad_norm": 1.2072075605392456,
"learning_rate": 3.0486577487623802e-06,
"loss": 0.4574,
"step": 964
},
{
"epoch": 1.8699266503667482,
"grad_norm": 1.0959627628326416,
"learning_rate": 3.0391969980866874e-06,
"loss": 0.4288,
"step": 965
},
{
"epoch": 1.8718826405867972,
"grad_norm": 1.1430832147598267,
"learning_rate": 3.0297445361191235e-06,
"loss": 0.445,
"step": 966
},
{
"epoch": 1.873838630806846,
"grad_norm": 1.256406307220459,
"learning_rate": 3.0203004028171434e-06,
"loss": 0.4315,
"step": 967
},
{
"epoch": 1.8757946210268948,
"grad_norm": 1.163475751876831,
"learning_rate": 3.010864638102998e-06,
"loss": 0.4357,
"step": 968
},
{
"epoch": 1.8777506112469438,
"grad_norm": 1.1195828914642334,
"learning_rate": 3.001437281863558e-06,
"loss": 0.4656,
"step": 969
},
{
"epoch": 1.8797066014669928,
"grad_norm": 1.169587254524231,
"learning_rate": 2.9920183739501503e-06,
"loss": 0.4269,
"step": 970
},
{
"epoch": 1.8816625916870415,
"grad_norm": 1.112953782081604,
"learning_rate": 2.9826079541783914e-06,
"loss": 0.4408,
"step": 971
},
{
"epoch": 1.8836185819070903,
"grad_norm": 1.107088327407837,
"learning_rate": 2.973206062328017e-06,
"loss": 0.4311,
"step": 972
},
{
"epoch": 1.8855745721271395,
"grad_norm": 1.1068124771118164,
"learning_rate": 2.963812738142713e-06,
"loss": 0.4286,
"step": 973
},
{
"epoch": 1.8875305623471883,
"grad_norm": 1.0876468420028687,
"learning_rate": 2.954428021329946e-06,
"loss": 0.429,
"step": 974
},
{
"epoch": 1.889486552567237,
"grad_norm": 1.160590410232544,
"learning_rate": 2.9450519515607966e-06,
"loss": 0.4167,
"step": 975
},
{
"epoch": 1.891442542787286,
"grad_norm": 1.0903960466384888,
"learning_rate": 2.935684568469799e-06,
"loss": 0.4385,
"step": 976
},
{
"epoch": 1.893398533007335,
"grad_norm": 1.1685900688171387,
"learning_rate": 2.9263259116547606e-06,
"loss": 0.4521,
"step": 977
},
{
"epoch": 1.8953545232273838,
"grad_norm": 1.1978154182434082,
"learning_rate": 2.916976020676606e-06,
"loss": 0.4342,
"step": 978
},
{
"epoch": 1.8973105134474326,
"grad_norm": 1.1818468570709229,
"learning_rate": 2.9076349350592014e-06,
"loss": 0.439,
"step": 979
},
{
"epoch": 1.8992665036674816,
"grad_norm": 1.194822072982788,
"learning_rate": 2.898302694289189e-06,
"loss": 0.444,
"step": 980
},
{
"epoch": 1.9012224938875306,
"grad_norm": 1.2437156438827515,
"learning_rate": 2.8889793378158284e-06,
"loss": 0.4409,
"step": 981
},
{
"epoch": 1.9031784841075794,
"grad_norm": 1.2146315574645996,
"learning_rate": 2.8796649050508175e-06,
"loss": 0.4173,
"step": 982
},
{
"epoch": 1.9051344743276284,
"grad_norm": 1.228637456893921,
"learning_rate": 2.870359435368136e-06,
"loss": 0.447,
"step": 983
},
{
"epoch": 1.9070904645476774,
"grad_norm": 1.2419463396072388,
"learning_rate": 2.861062968103876e-06,
"loss": 0.4208,
"step": 984
},
{
"epoch": 1.9090464547677262,
"grad_norm": 1.2601884603500366,
"learning_rate": 2.8517755425560665e-06,
"loss": 0.4361,
"step": 985
},
{
"epoch": 1.911002444987775,
"grad_norm": 1.2036101818084717,
"learning_rate": 2.842497197984524e-06,
"loss": 0.435,
"step": 986
},
{
"epoch": 1.912958435207824,
"grad_norm": 1.1465144157409668,
"learning_rate": 2.8332279736106747e-06,
"loss": 0.4325,
"step": 987
},
{
"epoch": 1.914914425427873,
"grad_norm": 1.1453237533569336,
"learning_rate": 2.82396790861739e-06,
"loss": 0.4444,
"step": 988
},
{
"epoch": 1.9168704156479217,
"grad_norm": 1.1577181816101074,
"learning_rate": 2.814717042148827e-06,
"loss": 0.4625,
"step": 989
},
{
"epoch": 1.9188264058679707,
"grad_norm": 1.2406984567642212,
"learning_rate": 2.8054754133102535e-06,
"loss": 0.4326,
"step": 990
},
{
"epoch": 1.9207823960880197,
"grad_norm": 1.2444988489151,
"learning_rate": 2.796243061167892e-06,
"loss": 0.4323,
"step": 991
},
{
"epoch": 1.9227383863080685,
"grad_norm": 1.1628625392913818,
"learning_rate": 2.78702002474875e-06,
"loss": 0.4349,
"step": 992
},
{
"epoch": 1.9246943765281173,
"grad_norm": 1.1444227695465088,
"learning_rate": 2.7778063430404544e-06,
"loss": 0.4552,
"step": 993
},
{
"epoch": 1.9266503667481663,
"grad_norm": 1.1746978759765625,
"learning_rate": 2.76860205499109e-06,
"loss": 0.4119,
"step": 994
},
{
"epoch": 1.9286063569682153,
"grad_norm": 1.1781418323516846,
"learning_rate": 2.759407199509029e-06,
"loss": 0.4105,
"step": 995
},
{
"epoch": 1.930562347188264,
"grad_norm": 1.1440287828445435,
"learning_rate": 2.7502218154627718e-06,
"loss": 0.4351,
"step": 996
},
{
"epoch": 1.9325183374083128,
"grad_norm": 1.1332827806472778,
"learning_rate": 2.7410459416807856e-06,
"loss": 0.4525,
"step": 997
},
{
"epoch": 1.934474327628362,
"grad_norm": 1.6635149717330933,
"learning_rate": 2.7318796169513275e-06,
"loss": 0.4511,
"step": 998
},
{
"epoch": 1.9364303178484108,
"grad_norm": 1.176467776298523,
"learning_rate": 2.722722880022297e-06,
"loss": 0.4331,
"step": 999
},
{
"epoch": 1.9383863080684596,
"grad_norm": 1.1627734899520874,
"learning_rate": 2.7135757696010565e-06,
"loss": 0.4403,
"step": 1000
},
{
"epoch": 1.9403422982885086,
"grad_norm": 1.1340796947479248,
"learning_rate": 2.7044383243542804e-06,
"loss": 0.4188,
"step": 1001
},
{
"epoch": 1.9422982885085576,
"grad_norm": 1.1349642276763916,
"learning_rate": 2.6953105829077863e-06,
"loss": 0.4385,
"step": 1002
},
{
"epoch": 1.9442542787286063,
"grad_norm": 1.1615030765533447,
"learning_rate": 2.6861925838463694e-06,
"loss": 0.4272,
"step": 1003
},
{
"epoch": 1.9462102689486551,
"grad_norm": 1.1460556983947754,
"learning_rate": 2.6770843657136457e-06,
"loss": 0.4321,
"step": 1004
},
{
"epoch": 1.9481662591687041,
"grad_norm": 1.1438319683074951,
"learning_rate": 2.6679859670118785e-06,
"loss": 0.461,
"step": 1005
},
{
"epoch": 1.9501222493887531,
"grad_norm": 1.1441649198532104,
"learning_rate": 2.658897426201829e-06,
"loss": 0.4531,
"step": 1006
},
{
"epoch": 1.952078239608802,
"grad_norm": 1.1989426612854004,
"learning_rate": 2.6498187817025845e-06,
"loss": 0.4326,
"step": 1007
},
{
"epoch": 1.9540342298288509,
"grad_norm": 1.1358593702316284,
"learning_rate": 2.6407500718914e-06,
"loss": 0.4549,
"step": 1008
},
{
"epoch": 1.9559902200488999,
"grad_norm": 1.2670133113861084,
"learning_rate": 2.6316913351035313e-06,
"loss": 0.4377,
"step": 1009
},
{
"epoch": 1.9579462102689487,
"grad_norm": 1.186421513557434,
"learning_rate": 2.62264260963208e-06,
"loss": 0.4507,
"step": 1010
},
{
"epoch": 1.9599022004889974,
"grad_norm": 1.122685432434082,
"learning_rate": 2.613603933727824e-06,
"loss": 0.4506,
"step": 1011
},
{
"epoch": 1.9618581907090464,
"grad_norm": 1.1497164964675903,
"learning_rate": 2.604575345599063e-06,
"loss": 0.4369,
"step": 1012
},
{
"epoch": 1.9638141809290954,
"grad_norm": 1.1628155708312988,
"learning_rate": 2.5955568834114523e-06,
"loss": 0.4508,
"step": 1013
},
{
"epoch": 1.9657701711491442,
"grad_norm": 1.1360890865325928,
"learning_rate": 2.5865485852878435e-06,
"loss": 0.4286,
"step": 1014
},
{
"epoch": 1.9677261613691932,
"grad_norm": 1.121633529663086,
"learning_rate": 2.577550489308124e-06,
"loss": 0.4265,
"step": 1015
},
{
"epoch": 1.9696821515892422,
"grad_norm": 1.1679707765579224,
"learning_rate": 2.5685626335090487e-06,
"loss": 0.4341,
"step": 1016
},
{
"epoch": 1.971638141809291,
"grad_norm": 1.1223803758621216,
"learning_rate": 2.5595850558840908e-06,
"loss": 0.4364,
"step": 1017
},
{
"epoch": 1.9735941320293398,
"grad_norm": 1.2146944999694824,
"learning_rate": 2.550617794383278e-06,
"loss": 0.4591,
"step": 1018
},
{
"epoch": 1.9755501222493888,
"grad_norm": 1.1352821588516235,
"learning_rate": 2.541660886913019e-06,
"loss": 0.4316,
"step": 1019
},
{
"epoch": 1.9775061124694377,
"grad_norm": 1.1358625888824463,
"learning_rate": 2.5327143713359668e-06,
"loss": 0.438,
"step": 1020
},
{
"epoch": 1.9794621026894865,
"grad_norm": 1.1737252473831177,
"learning_rate": 2.523778285470835e-06,
"loss": 0.4313,
"step": 1021
},
{
"epoch": 1.9814180929095353,
"grad_norm": 1.1773995161056519,
"learning_rate": 2.5148526670922556e-06,
"loss": 0.4359,
"step": 1022
},
{
"epoch": 1.9833740831295843,
"grad_norm": 1.132659912109375,
"learning_rate": 2.5059375539306103e-06,
"loss": 0.4517,
"step": 1023
},
{
"epoch": 1.9853300733496333,
"grad_norm": 1.1384680271148682,
"learning_rate": 2.497032983671873e-06,
"loss": 0.4323,
"step": 1024
}
],
"logging_steps": 1,
"max_steps": 1533,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 256,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.8180416233323626e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}