LongWriter-V-7B-DPO / trainer_state.json
wyuc's picture
Upload folder using huggingface_hub
a0c3985 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.943820224719101,
"eval_steps": 500,
"global_step": 132,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02247191011235955,
"grad_norm": 489.5653076171875,
"learning_rate": 2.1428571428571428e-07,
"logits/chosen": 1.4551408290863037,
"logits/rejected": 1.478129267692566,
"logps/chosen": -2968.771240234375,
"logps/rejected": -3035.35302734375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0449438202247191,
"grad_norm": 419.54876708984375,
"learning_rate": 4.2857142857142857e-07,
"logits/chosen": 1.5314003229141235,
"logits/rejected": 1.4525893926620483,
"logps/chosen": -3010.43994140625,
"logps/rejected": -2926.948974609375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.06741573033707865,
"grad_norm": 789.9224243164062,
"learning_rate": 6.428571428571428e-07,
"logits/chosen": 1.482939600944519,
"logits/rejected": 1.5616533756256104,
"logps/chosen": -2998.501708984375,
"logps/rejected": -3179.81982421875,
"loss": 0.9204,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.08596238493919373,
"rewards/margins": -0.19251862168312073,
"rewards/rejected": 0.10655620694160461,
"step": 3
},
{
"epoch": 0.0898876404494382,
"grad_norm": 378.14190673828125,
"learning_rate": 8.571428571428571e-07,
"logits/chosen": 1.6036081314086914,
"logits/rejected": 1.7028334140777588,
"logps/chosen": -2979.7060546875,
"logps/rejected": -2913.69091796875,
"loss": 0.6588,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.28274667263031006,
"rewards/margins": 0.22482016682624817,
"rewards/rejected": -0.5075668692588806,
"step": 4
},
{
"epoch": 0.11235955056179775,
"grad_norm": 307.0648193359375,
"learning_rate": 1.0714285714285716e-06,
"logits/chosen": 1.3923085927963257,
"logits/rejected": 1.4200749397277832,
"logps/chosen": -3015.828125,
"logps/rejected": -3068.435302734375,
"loss": 0.5615,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03155745938420296,
"rewards/margins": 0.5133614540100098,
"rewards/rejected": -0.5449188947677612,
"step": 5
},
{
"epoch": 0.1348314606741573,
"grad_norm": 282.67034912109375,
"learning_rate": 1.2857142857142856e-06,
"logits/chosen": 1.5581945180892944,
"logits/rejected": 1.405899167060852,
"logps/chosen": -3204.767333984375,
"logps/rejected": -3163.357177734375,
"loss": 0.5469,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.25397348403930664,
"rewards/margins": 0.6482839584350586,
"rewards/rejected": -0.9022574424743652,
"step": 6
},
{
"epoch": 0.15730337078651685,
"grad_norm": 218.5866241455078,
"learning_rate": 1.5e-06,
"logits/chosen": 1.496790885925293,
"logits/rejected": 1.4303985834121704,
"logps/chosen": -3185.8203125,
"logps/rejected": -3225.123046875,
"loss": 0.4709,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.04301854223012924,
"rewards/margins": 1.6269282102584839,
"rewards/rejected": -1.6699466705322266,
"step": 7
},
{
"epoch": 0.1797752808988764,
"grad_norm": 181.489501953125,
"learning_rate": 1.7142857142857143e-06,
"logits/chosen": 1.6130130290985107,
"logits/rejected": 1.5007115602493286,
"logps/chosen": -3087.791748046875,
"logps/rejected": -2948.8115234375,
"loss": 0.396,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.08753497898578644,
"rewards/margins": 2.817833185195923,
"rewards/rejected": -2.9053683280944824,
"step": 8
},
{
"epoch": 0.20224719101123595,
"grad_norm": 188.34768676757812,
"learning_rate": 1.928571428571429e-06,
"logits/chosen": 1.5587732791900635,
"logits/rejected": 1.6744489669799805,
"logps/chosen": -2374.6494140625,
"logps/rejected": -2492.75537109375,
"loss": 0.448,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.14219728112220764,
"rewards/margins": 2.7199909687042236,
"rewards/rejected": -2.8621885776519775,
"step": 9
},
{
"epoch": 0.2247191011235955,
"grad_norm": 167.6234588623047,
"learning_rate": 2.142857142857143e-06,
"logits/chosen": 1.581652283668518,
"logits/rejected": 1.5243756771087646,
"logps/chosen": -2837.341552734375,
"logps/rejected": -2842.2666015625,
"loss": 0.3618,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.06367100775241852,
"rewards/margins": 6.429449081420898,
"rewards/rejected": -6.493120193481445,
"step": 10
},
{
"epoch": 0.24719101123595505,
"grad_norm": 195.05810546875,
"learning_rate": 2.357142857142857e-06,
"logits/chosen": 1.531968355178833,
"logits/rejected": 1.5490195751190186,
"logps/chosen": -2785.763427734375,
"logps/rejected": -2938.71533203125,
"loss": 0.3962,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.2717077136039734,
"rewards/margins": 8.072213172912598,
"rewards/rejected": -8.343921661376953,
"step": 11
},
{
"epoch": 0.2696629213483146,
"grad_norm": 204.53872680664062,
"learning_rate": 2.571428571428571e-06,
"logits/chosen": 1.5632414817810059,
"logits/rejected": 1.5352647304534912,
"logps/chosen": -2883.001220703125,
"logps/rejected": -3065.4296875,
"loss": 0.4155,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.09219703823328018,
"rewards/margins": 11.51332950592041,
"rewards/rejected": -11.421133041381836,
"step": 12
},
{
"epoch": 0.29213483146067415,
"grad_norm": 181.2421112060547,
"learning_rate": 2.785714285714286e-06,
"logits/chosen": 1.5124785900115967,
"logits/rejected": 1.4263392686843872,
"logps/chosen": -3015.5341796875,
"logps/rejected": -3136.56982421875,
"loss": 0.3343,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1826700121164322,
"rewards/margins": 16.418424606323242,
"rewards/rejected": -16.601093292236328,
"step": 13
},
{
"epoch": 0.3146067415730337,
"grad_norm": 178.02650451660156,
"learning_rate": 3e-06,
"logits/chosen": 1.4881091117858887,
"logits/rejected": 1.4641259908676147,
"logps/chosen": -2906.181396484375,
"logps/rejected": -3083.74755859375,
"loss": 0.3189,
"rewards/accuracies": 0.890625,
"rewards/chosen": -0.07007797807455063,
"rewards/margins": 18.051210403442383,
"rewards/rejected": -18.121288299560547,
"step": 14
},
{
"epoch": 0.33707865168539325,
"grad_norm": 188.4379425048828,
"learning_rate": 2.999468416685179e-06,
"logits/chosen": 1.4958661794662476,
"logits/rejected": 1.5740702152252197,
"logps/chosen": -2589.415771484375,
"logps/rejected": -2884.312744140625,
"loss": 0.3903,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.1765696406364441,
"rewards/margins": 17.232072830200195,
"rewards/rejected": -17.408641815185547,
"step": 15
},
{
"epoch": 0.3595505617977528,
"grad_norm": 161.3037872314453,
"learning_rate": 2.9978740435151427e-06,
"logits/chosen": 1.5349267721176147,
"logits/rejected": 1.491062045097351,
"logps/chosen": -2951.84619140625,
"logps/rejected": -3206.8662109375,
"loss": 0.3059,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.7078287601470947,
"rewards/margins": 23.868520736694336,
"rewards/rejected": -25.57634925842285,
"step": 16
},
{
"epoch": 0.38202247191011235,
"grad_norm": 186.13180541992188,
"learning_rate": 2.995218010546125e-06,
"logits/chosen": 1.4998528957366943,
"logits/rejected": 1.4576878547668457,
"logps/chosen": -3011.727783203125,
"logps/rejected": -3261.4501953125,
"loss": 0.3808,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.25169306993484497,
"rewards/margins": 35.25308609008789,
"rewards/rejected": -35.50477600097656,
"step": 17
},
{
"epoch": 0.4044943820224719,
"grad_norm": 185.6712188720703,
"learning_rate": 2.9915022003152055e-06,
"logits/chosen": 1.6139241456985474,
"logits/rejected": 1.5550901889801025,
"logps/chosen": -2965.4423828125,
"logps/rejected": -3224.514404296875,
"loss": 0.3542,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.8823347091674805,
"rewards/margins": 39.0025634765625,
"rewards/rejected": -37.12023162841797,
"step": 18
},
{
"epoch": 0.42696629213483145,
"grad_norm": 182.43603515625,
"learning_rate": 2.986729246506011e-06,
"logits/chosen": 1.244603157043457,
"logits/rejected": 1.2053301334381104,
"logps/chosen": -2764.19189453125,
"logps/rejected": -3084.441650390625,
"loss": 0.367,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.6243125200271606,
"rewards/margins": 43.56684112548828,
"rewards/rejected": -45.1911506652832,
"step": 19
},
{
"epoch": 0.449438202247191,
"grad_norm": 198.76722717285156,
"learning_rate": 2.980902532082017e-06,
"logits/chosen": 1.4910385608673096,
"logits/rejected": 1.4667646884918213,
"logps/chosen": -2632.417724609375,
"logps/rejected": -2912.476806640625,
"loss": 0.4946,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.317056179046631,
"rewards/margins": 34.359012603759766,
"rewards/rejected": -36.676063537597656,
"step": 20
},
{
"epoch": 0.47191011235955055,
"grad_norm": 203.78700256347656,
"learning_rate": 2.9740261868887817e-06,
"logits/chosen": 1.4394636154174805,
"logits/rejected": 1.3155745267868042,
"logps/chosen": -2808.47509765625,
"logps/rejected": -3043.707763671875,
"loss": 0.4802,
"rewards/accuracies": 0.71875,
"rewards/chosen": 1.6056139469146729,
"rewards/margins": 43.16130065917969,
"rewards/rejected": -41.555686950683594,
"step": 21
},
{
"epoch": 0.4943820224719101,
"grad_norm": 199.40330505371094,
"learning_rate": 2.9661050847268e-06,
"logits/chosen": 1.3054568767547607,
"logits/rejected": 1.2870110273361206,
"logps/chosen": -2704.07568359375,
"logps/rejected": -3091.42626953125,
"loss": 0.4924,
"rewards/accuracies": 0.828125,
"rewards/chosen": -4.835676670074463,
"rewards/margins": 40.92457580566406,
"rewards/rejected": -45.76025390625,
"step": 22
},
{
"epoch": 0.5168539325842697,
"grad_norm": 184.34901428222656,
"learning_rate": 2.957144839897065e-06,
"logits/chosen": 1.5794934034347534,
"logits/rejected": 1.374954104423523,
"logps/chosen": -2828.36083984375,
"logps/rejected": -3111.46875,
"loss": 0.4932,
"rewards/accuracies": 0.734375,
"rewards/chosen": 3.432398796081543,
"rewards/margins": 62.3823356628418,
"rewards/rejected": -58.9499397277832,
"step": 23
},
{
"epoch": 0.5393258426966292,
"grad_norm": 198.54269409179688,
"learning_rate": 2.947151803221774e-06,
"logits/chosen": 1.6772565841674805,
"logits/rejected": 1.6362934112548828,
"logps/chosen": -2880.4677734375,
"logps/rejected": -3303.3857421875,
"loss": 0.3869,
"rewards/accuracies": 0.796875,
"rewards/chosen": 0.12497274577617645,
"rewards/margins": 53.7283821105957,
"rewards/rejected": -53.60340881347656,
"step": 24
},
{
"epoch": 0.5617977528089888,
"grad_norm": 173.3833465576172,
"learning_rate": 2.936133057543008e-06,
"logits/chosen": 1.4493129253387451,
"logits/rejected": 1.3350006341934204,
"logps/chosen": -2721.460693359375,
"logps/rejected": -3138.864990234375,
"loss": 0.3981,
"rewards/accuracies": 0.78125,
"rewards/chosen": 2.794492244720459,
"rewards/margins": 69.71061706542969,
"rewards/rejected": -66.91613006591797,
"step": 25
},
{
"epoch": 0.5842696629213483,
"grad_norm": 232.13525390625,
"learning_rate": 2.924096412702572e-06,
"logits/chosen": 1.7099878787994385,
"logits/rejected": 1.5226480960845947,
"logps/chosen": -2983.288330078125,
"logps/rejected": -3093.673095703125,
"loss": 0.613,
"rewards/accuracies": 0.703125,
"rewards/chosen": 2.1761527061462402,
"rewards/margins": 59.57087326049805,
"rewards/rejected": -57.394718170166016,
"step": 26
},
{
"epoch": 0.6067415730337079,
"grad_norm": 162.77978515625,
"learning_rate": 2.91105040000655e-06,
"logits/chosen": 1.4071202278137207,
"logits/rejected": 1.4425785541534424,
"logps/chosen": -2522.546630859375,
"logps/rejected": -3321.0537109375,
"loss": 0.4005,
"rewards/accuracies": 0.859375,
"rewards/chosen": 1.8253318071365356,
"rewards/margins": 63.75608825683594,
"rewards/rejected": -61.930755615234375,
"step": 27
},
{
"epoch": 0.6292134831460674,
"grad_norm": 207.4031219482422,
"learning_rate": 2.897004266178508e-06,
"logits/chosen": 1.5841655731201172,
"logits/rejected": 1.4097201824188232,
"logps/chosen": -3239.787841796875,
"logps/rejected": -3663.88232421875,
"loss": 0.522,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.2217176854610443,
"rewards/margins": 58.664180755615234,
"rewards/rejected": -58.88589859008789,
"step": 28
},
{
"epoch": 0.651685393258427,
"grad_norm": 172.96218872070312,
"learning_rate": 2.8819679668056195e-06,
"logits/chosen": 1.6320128440856934,
"logits/rejected": 1.5467625856399536,
"logps/chosen": -2654.78271484375,
"logps/rejected": -3225.193359375,
"loss": 0.3816,
"rewards/accuracies": 0.765625,
"rewards/chosen": 2.769482374191284,
"rewards/margins": 65.22299194335938,
"rewards/rejected": -62.453514099121094,
"step": 29
},
{
"epoch": 0.6741573033707865,
"grad_norm": 200.36915588378906,
"learning_rate": 2.8659521592823702e-06,
"logits/chosen": 1.6264617443084717,
"logits/rejected": 1.421095848083496,
"logps/chosen": -2914.17529296875,
"logps/rejected": -3396.08544921875,
"loss": 0.4913,
"rewards/accuracies": 0.765625,
"rewards/chosen": 7.334710121154785,
"rewards/margins": 89.93038177490234,
"rewards/rejected": -82.59567260742188,
"step": 30
},
{
"epoch": 0.6966292134831461,
"grad_norm": 250.5316162109375,
"learning_rate": 2.848968195256829e-06,
"logits/chosen": 1.6201553344726562,
"logits/rejected": 1.4870961904525757,
"logps/chosen": -3036.192138671875,
"logps/rejected": -3605.6904296875,
"loss": 0.708,
"rewards/accuracies": 0.65625,
"rewards/chosen": 4.598369121551514,
"rewards/margins": 79.35784149169922,
"rewards/rejected": -74.75946807861328,
"step": 31
},
{
"epoch": 0.7191011235955056,
"grad_norm": 228.1786346435547,
"learning_rate": 2.831028112584857e-06,
"logits/chosen": 1.3086817264556885,
"logits/rejected": 1.2920796871185303,
"logps/chosen": -2828.72900390625,
"logps/rejected": -3492.97802734375,
"loss": 0.5514,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.8046822547912598,
"rewards/margins": 77.88575744628906,
"rewards/rejected": -77.08108520507812,
"step": 32
},
{
"epoch": 0.7415730337078652,
"grad_norm": 156.25662231445312,
"learning_rate": 2.812144626797942e-06,
"logits/chosen": 1.3912537097930908,
"logits/rejected": 1.1646690368652344,
"logps/chosen": -3173.48388671875,
"logps/rejected": -3708.0390625,
"loss": 0.4043,
"rewards/accuracies": 0.875,
"rewards/chosen": 2.820896863937378,
"rewards/margins": 82.55420684814453,
"rewards/rejected": -79.73331451416016,
"step": 33
},
{
"epoch": 0.7640449438202247,
"grad_norm": 189.89682006835938,
"learning_rate": 2.792331122090709e-06,
"logits/chosen": 1.525010108947754,
"logits/rejected": 1.4141947031021118,
"logps/chosen": -2818.591064453125,
"logps/rejected": -3415.1484375,
"loss": 0.4825,
"rewards/accuracies": 0.796875,
"rewards/chosen": 1.3273561000823975,
"rewards/margins": 81.49795532226562,
"rewards/rejected": -80.17059326171875,
"step": 34
},
{
"epoch": 0.7865168539325843,
"grad_norm": 198.3324432373047,
"learning_rate": 2.7716016418345064e-06,
"logits/chosen": 1.5669187307357788,
"logits/rejected": 1.3444348573684692,
"logps/chosen": -2831.2744140625,
"logps/rejected": -3359.554931640625,
"loss": 0.4821,
"rewards/accuracies": 0.84375,
"rewards/chosen": 4.969450950622559,
"rewards/margins": 95.5076675415039,
"rewards/rejected": -90.53821563720703,
"step": 35
},
{
"epoch": 0.8089887640449438,
"grad_norm": 202.50929260253906,
"learning_rate": 2.7499708786237724e-06,
"logits/chosen": 1.6073535680770874,
"logits/rejected": 1.5690536499023438,
"logps/chosen": -2898.311279296875,
"logps/rejected": -3199.489013671875,
"loss": 0.5359,
"rewards/accuracies": 0.796875,
"rewards/chosen": -3.0962305068969727,
"rewards/margins": 49.8695182800293,
"rewards/rejected": -52.96574783325195,
"step": 36
},
{
"epoch": 0.8314606741573034,
"grad_norm": 172.3883056640625,
"learning_rate": 2.7274541638622533e-06,
"logits/chosen": 1.5025634765625,
"logits/rejected": 1.2939093112945557,
"logps/chosen": -2682.772705078125,
"logps/rejected": -3070.16259765625,
"loss": 0.5118,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.5182172060012817,
"rewards/margins": 86.14014434814453,
"rewards/rejected": -86.65835571289062,
"step": 37
},
{
"epoch": 0.8539325842696629,
"grad_norm": 200.7554473876953,
"learning_rate": 2.7040674568964452e-06,
"logits/chosen": 1.4808025360107422,
"logits/rejected": 1.3251252174377441,
"logps/chosen": -2854.599365234375,
"logps/rejected": -3208.1640625,
"loss": 0.5253,
"rewards/accuracies": 0.8125,
"rewards/chosen": 1.5150139331817627,
"rewards/margins": 78.78499603271484,
"rewards/rejected": -77.26997375488281,
"step": 38
},
{
"epoch": 0.8764044943820225,
"grad_norm": 217.05526733398438,
"learning_rate": 2.679827333703964e-06,
"logits/chosen": 1.5550140142440796,
"logits/rejected": 1.5405230522155762,
"logps/chosen": -2775.199951171875,
"logps/rejected": -3292.66650390625,
"loss": 0.5094,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5831690430641174,
"rewards/margins": 75.25239562988281,
"rewards/rejected": -75.8355712890625,
"step": 39
},
{
"epoch": 0.898876404494382,
"grad_norm": 260.61224365234375,
"learning_rate": 2.6547509751448593e-06,
"logits/chosen": 1.5327131748199463,
"logits/rejected": 1.404789924621582,
"logps/chosen": -2995.2666015625,
"logps/rejected": -3701.7333984375,
"loss": 0.7054,
"rewards/accuracies": 0.703125,
"rewards/chosen": 4.574828147888184,
"rewards/margins": 96.09221649169922,
"rewards/rejected": -91.51737976074219,
"step": 40
},
{
"epoch": 0.9213483146067416,
"grad_norm": 210.46607971191406,
"learning_rate": 2.6288561547842076e-06,
"logits/chosen": 1.5143060684204102,
"logits/rejected": 1.2557826042175293,
"logps/chosen": -2932.751953125,
"logps/rejected": -3389.65185546875,
"loss": 0.6426,
"rewards/accuracies": 0.78125,
"rewards/chosen": 3.5902769565582275,
"rewards/margins": 102.1531982421875,
"rewards/rejected": -98.56291198730469,
"step": 41
},
{
"epoch": 0.9438202247191011,
"grad_norm": 203.90863037109375,
"learning_rate": 2.602161226294601e-06,
"logits/chosen": 1.4669859409332275,
"logits/rejected": 1.254248023033142,
"logps/chosen": -3275.650146484375,
"logps/rejected": -3885.744873046875,
"loss": 0.5032,
"rewards/accuracies": 0.796875,
"rewards/chosen": -7.145351886749268,
"rewards/margins": 94.66647338867188,
"rewards/rejected": -101.81182861328125,
"step": 42
},
{
"epoch": 0.9662921348314607,
"grad_norm": 190.71495056152344,
"learning_rate": 2.5746851104474728e-06,
"logits/chosen": 1.4877179861068726,
"logits/rejected": 1.3816105127334595,
"logps/chosen": -2700.980224609375,
"logps/rejected": -3283.328125,
"loss": 0.4432,
"rewards/accuracies": 0.828125,
"rewards/chosen": 1.710632085800171,
"rewards/margins": 75.0985107421875,
"rewards/rejected": -73.38786315917969,
"step": 43
},
{
"epoch": 0.9887640449438202,
"grad_norm": 192.31964111328125,
"learning_rate": 2.5464472817024772e-06,
"logits/chosen": 1.3617230653762817,
"logits/rejected": 1.2478257417678833,
"logps/chosen": -2841.803466796875,
"logps/rejected": -3503.9794921875,
"loss": 0.5194,
"rewards/accuracies": 0.78125,
"rewards/chosen": 4.092733383178711,
"rewards/margins": 110.31430053710938,
"rewards/rejected": -106.22156524658203,
"step": 44
},
{
"epoch": 1.0,
"grad_norm": 192.31964111328125,
"learning_rate": 2.517467754404424e-06,
"logits/chosen": 1.3865031003952026,
"logits/rejected": 1.2281872034072876,
"logps/chosen": -2563.0751953125,
"logps/rejected": -2940.1357421875,
"loss": 0.2103,
"rewards/accuracies": 0.84375,
"rewards/chosen": 4.377815246582031,
"rewards/margins": 81.93372344970703,
"rewards/rejected": -77.555908203125,
"step": 45
},
{
"epoch": 1.0224719101123596,
"grad_norm": 135.86026000976562,
"learning_rate": 2.487767068597558e-06,
"logits/chosen": 1.5341211557388306,
"logits/rejected": 1.4015753269195557,
"logps/chosen": -3250.149658203125,
"logps/rejected": -3893.629150390625,
"loss": 0.0037,
"rewards/accuracies": 1.0,
"rewards/chosen": 19.023714065551758,
"rewards/margins": 134.42942810058594,
"rewards/rejected": -115.40570068359375,
"step": 46
},
{
"epoch": 1.0449438202247192,
"grad_norm": 1.9560177326202393,
"learning_rate": 2.4573662754672303e-06,
"logits/chosen": 1.4638060331344604,
"logits/rejected": 1.396654486656189,
"logps/chosen": -2667.339599609375,
"logps/rejected": -3516.595703125,
"loss": 0.0114,
"rewards/accuracies": 0.984375,
"rewards/chosen": 8.45435905456543,
"rewards/margins": 107.95783996582031,
"rewards/rejected": -99.50347900390625,
"step": 47
},
{
"epoch": 1.0674157303370786,
"grad_norm": 14.909017562866211,
"learning_rate": 2.426286922419288e-06,
"logits/chosen": 1.6447203159332275,
"logits/rejected": 1.6282371282577515,
"logps/chosen": -2377.240478515625,
"logps/rejected": -2950.48583984375,
"loss": 0.0154,
"rewards/accuracies": 0.984375,
"rewards/chosen": 7.06836462020874,
"rewards/margins": 84.36599731445312,
"rewards/rejected": -77.29763793945312,
"step": 48
},
{
"epoch": 1.0898876404494382,
"grad_norm": 4.328535556793213,
"learning_rate": 2.3945510378077523e-06,
"logits/chosen": 1.3356518745422363,
"logits/rejected": 1.2965461015701294,
"logps/chosen": -2788.0400390625,
"logps/rejected": -3457.5185546875,
"loss": 0.0024,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.870361328125,
"rewards/margins": 103.6649169921875,
"rewards/rejected": -91.79456329345703,
"step": 49
},
{
"epoch": 1.1123595505617978,
"grad_norm": 6.1306352615356445,
"learning_rate": 2.3621811153216106e-06,
"logits/chosen": 1.3586758375167847,
"logits/rejected": 1.2172551155090332,
"logps/chosen": -3142.0791015625,
"logps/rejected": -3848.3056640625,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.018255710601807,
"rewards/margins": 121.07866668701172,
"rewards/rejected": -116.06040954589844,
"step": 50
},
{
"epoch": 1.1348314606741572,
"grad_norm": 2.2042205333709717,
"learning_rate": 2.32920009804179e-06,
"logits/chosen": 1.676792860031128,
"logits/rejected": 1.4110440015792847,
"logps/chosen": -2846.33056640625,
"logps/rejected": -3573.93359375,
"loss": 0.0116,
"rewards/accuracies": 0.984375,
"rewards/chosen": 16.190317153930664,
"rewards/margins": 119.14263153076172,
"rewards/rejected": -102.95230102539062,
"step": 51
},
{
"epoch": 1.1573033707865168,
"grad_norm": 13.62660026550293,
"learning_rate": 2.2956313621796135e-06,
"logits/chosen": 1.5751538276672363,
"logits/rejected": 1.4073097705841064,
"logps/chosen": -2536.8515625,
"logps/rejected": -3102.68896484375,
"loss": 0.0147,
"rewards/accuracies": 0.984375,
"rewards/chosen": 7.306772232055664,
"rewards/margins": 98.24702453613281,
"rewards/rejected": -90.94024658203125,
"step": 52
},
{
"epoch": 1.1797752808988764,
"grad_norm": 1.355103850364685,
"learning_rate": 2.26149870050826e-06,
"logits/chosen": 1.363991618156433,
"logits/rejected": 1.1863415241241455,
"logps/chosen": -3056.833740234375,
"logps/rejected": -3680.160888671875,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.664068222045898,
"rewards/margins": 112.41234588623047,
"rewards/rejected": -102.74827575683594,
"step": 53
},
{
"epoch": 1.202247191011236,
"grad_norm": 2.3306772708892822,
"learning_rate": 2.2268263054989753e-06,
"logits/chosen": 1.54270339012146,
"logits/rejected": 1.475841760635376,
"logps/chosen": -2780.744384765625,
"logps/rejected": -3487.5322265625,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.756105422973633,
"rewards/margins": 107.98931884765625,
"rewards/rejected": -96.23321533203125,
"step": 54
},
{
"epoch": 1.2247191011235956,
"grad_norm": 1.47923743724823,
"learning_rate": 2.191638752173989e-06,
"logits/chosen": 1.6175808906555176,
"logits/rejected": 1.5379141569137573,
"logps/chosen": -2748.61328125,
"logps/rejected": -3274.468017578125,
"loss": 0.0117,
"rewards/accuracies": 0.984375,
"rewards/chosen": 8.739614486694336,
"rewards/margins": 110.58942413330078,
"rewards/rejected": -101.84980010986328,
"step": 55
},
{
"epoch": 1.247191011235955,
"grad_norm": 3.0752482414245605,
"learning_rate": 2.1559609806882834e-06,
"logits/chosen": 1.4324688911437988,
"logits/rejected": 1.2107815742492676,
"logps/chosen": -2790.97509765625,
"logps/rejected": -3406.87744140625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.457365989685059,
"rewards/margins": 89.03166198730469,
"rewards/rejected": -83.57430267333984,
"step": 56
},
{
"epoch": 1.2696629213483146,
"grad_norm": 0.07106953859329224,
"learning_rate": 2.1198182786525674e-06,
"logits/chosen": 1.409006118774414,
"logits/rejected": 1.2638301849365234,
"logps/chosen": -2571.373046875,
"logps/rejected": -3436.89892578125,
"loss": 0.0112,
"rewards/accuracies": 0.984375,
"rewards/chosen": 12.910816192626953,
"rewards/margins": 133.70639038085938,
"rewards/rejected": -120.79557800292969,
"step": 57
},
{
"epoch": 1.2921348314606742,
"grad_norm": 1.3202946186065674,
"learning_rate": 2.0832362632099813e-06,
"logits/chosen": 1.4980010986328125,
"logits/rejected": 1.1623045206069946,
"logps/chosen": -3144.611083984375,
"logps/rejected": -3731.18212890625,
"loss": 0.0051,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.096885681152344,
"rewards/margins": 142.87937927246094,
"rewards/rejected": -133.78250122070312,
"step": 58
},
{
"epoch": 1.3146067415730336,
"grad_norm": 2.9557082653045654,
"learning_rate": 2.0462408628792335e-06,
"logits/chosen": 1.6109601259231567,
"logits/rejected": 1.4365208148956299,
"logps/chosen": -2812.40625,
"logps/rejected": -3437.3193359375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.375179290771484,
"rewards/margins": 111.16755676269531,
"rewards/rejected": -102.79237365722656,
"step": 59
},
{
"epoch": 1.3370786516853932,
"grad_norm": 0.2892356514930725,
"learning_rate": 2.008858299177045e-06,
"logits/chosen": 1.4753564596176147,
"logits/rejected": 1.2640880346298218,
"logps/chosen": -2899.793212890625,
"logps/rejected": -3406.771240234375,
"loss": 0.0157,
"rewards/accuracies": 0.984375,
"rewards/chosen": 7.380945682525635,
"rewards/margins": 106.26220703125,
"rewards/rejected": -98.88125610351562,
"step": 60
},
{
"epoch": 1.3595505617977528,
"grad_norm": 50.00154495239258,
"learning_rate": 1.9711150680329234e-06,
"logits/chosen": 1.6642662286758423,
"logits/rejected": 1.473952054977417,
"logps/chosen": -2834.24072265625,
"logps/rejected": -3363.942138671875,
"loss": 0.0175,
"rewards/accuracies": 0.984375,
"rewards/chosen": 8.414569854736328,
"rewards/margins": 110.77262115478516,
"rewards/rejected": -102.35804748535156,
"step": 61
},
{
"epoch": 1.3820224719101124,
"grad_norm": 0.07520447671413422,
"learning_rate": 1.9330379210094315e-06,
"logits/chosen": 1.5798277854919434,
"logits/rejected": 1.4446996450424194,
"logps/chosen": -2692.41162109375,
"logps/rejected": -3175.50830078125,
"loss": 0.0118,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.677203178405762,
"rewards/margins": 96.32395935058594,
"rewards/rejected": -90.64675903320312,
"step": 62
},
{
"epoch": 1.404494382022472,
"grad_norm": 3.16860032081604,
"learning_rate": 1.8946538463412818e-06,
"logits/chosen": 1.606536865234375,
"logits/rejected": 1.5855745077133179,
"logps/chosen": -2659.635986328125,
"logps/rejected": -3431.36572265625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.329705238342285,
"rewards/margins": 98.20384216308594,
"rewards/rejected": -87.87415313720703,
"step": 63
},
{
"epoch": 1.4269662921348314,
"grad_norm": 0.042245469987392426,
"learning_rate": 1.8559900498066726e-06,
"logits/chosen": 1.605839490890503,
"logits/rejected": 1.3888914585113525,
"logps/chosen": -2774.67529296875,
"logps/rejected": -3620.492431640625,
"loss": 0.0092,
"rewards/accuracies": 1.0,
"rewards/chosen": 14.000102996826172,
"rewards/margins": 140.67535400390625,
"rewards/rejected": -126.67523956298828,
"step": 64
},
{
"epoch": 1.449438202247191,
"grad_norm": 28.373090744018555,
"learning_rate": 1.8170739354444366e-06,
"logits/chosen": 1.5468522310256958,
"logits/rejected": 1.316043734550476,
"logps/chosen": -2898.541015625,
"logps/rejected": -3607.741943359375,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.336808204650879,
"rewards/margins": 125.04135131835938,
"rewards/rejected": -115.70454406738281,
"step": 65
},
{
"epoch": 1.4719101123595506,
"grad_norm": 3.688307046890259,
"learning_rate": 1.7779330861306717e-06,
"logits/chosen": 1.4648973941802979,
"logits/rejected": 1.3168296813964844,
"logps/chosen": -3060.658935546875,
"logps/rejected": -4020.65185546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 3.3615617752075195,
"rewards/margins": 130.01849365234375,
"rewards/rejected": -126.65692138671875,
"step": 66
},
{
"epoch": 1.49438202247191,
"grad_norm": 21.308137893676758,
"learning_rate": 1.738595244028608e-06,
"logits/chosen": 1.4748642444610596,
"logits/rejected": 1.3131040334701538,
"logps/chosen": -2794.14599609375,
"logps/rejected": -3351.5478515625,
"loss": 0.0081,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.8835487365722656,
"rewards/margins": 98.07205963134766,
"rewards/rejected": -95.18850708007812,
"step": 67
},
{
"epoch": 1.5168539325842696,
"grad_norm": 1.3383527994155884,
"learning_rate": 1.699088290925583e-06,
"logits/chosen": 1.372517704963684,
"logits/rejected": 1.302228569984436,
"logps/chosen": -2794.654052734375,
"logps/rejected": -3820.33837890625,
"loss": 0.0112,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.68542766571045,
"rewards/margins": 141.4244842529297,
"rewards/rejected": -131.73907470703125,
"step": 68
},
{
"epoch": 1.5393258426966292,
"grad_norm": 1.4769072532653809,
"learning_rate": 1.6594402284710481e-06,
"logits/chosen": 1.5602664947509766,
"logits/rejected": 1.4328043460845947,
"logps/chosen": -2850.06640625,
"logps/rejected": -3549.932861328125,
"loss": 0.026,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.793665409088135,
"rewards/margins": 124.38016510009766,
"rewards/rejected": -118.58650970458984,
"step": 69
},
{
"epoch": 1.5617977528089888,
"grad_norm": 5.262300968170166,
"learning_rate": 1.6196791583296247e-06,
"logits/chosen": 1.4012134075164795,
"logits/rejected": 1.2154825925827026,
"logps/chosen": -2862.569580078125,
"logps/rejected": -3687.36328125,
"loss": 0.003,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.932228088378906,
"rewards/margins": 135.03558349609375,
"rewards/rejected": -122.10337829589844,
"step": 70
},
{
"epoch": 1.5842696629213484,
"grad_norm": 2.9438984394073486,
"learning_rate": 1.579833262263268e-06,
"logits/chosen": 1.4590383768081665,
"logits/rejected": 1.1356399059295654,
"logps/chosen": -2651.068603515625,
"logps/rejected": -3142.91455078125,
"loss": 0.0118,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.391037940979004,
"rewards/margins": 119.59295654296875,
"rewards/rejected": -110.2019271850586,
"step": 71
},
{
"epoch": 1.606741573033708,
"grad_norm": 0.6242117881774902,
"learning_rate": 1.5399307821566623e-06,
"logits/chosen": 1.5220391750335693,
"logits/rejected": 1.2139172554016113,
"logps/chosen": -2834.0634765625,
"logps/rejected": -3674.3623046875,
"loss": 0.0218,
"rewards/accuracies": 0.984375,
"rewards/chosen": 14.53393268585205,
"rewards/margins": 154.6046142578125,
"rewards/rejected": -140.0706787109375,
"step": 72
},
{
"epoch": 1.6292134831460674,
"grad_norm": 0.17758429050445557,
"learning_rate": 1.5e-06,
"logits/chosen": 1.531368374824524,
"logits/rejected": 1.3681552410125732,
"logps/chosen": -2943.841064453125,
"logps/rejected": -3831.00927734375,
"loss": 0.0117,
"rewards/accuracies": 0.984375,
"rewards/chosen": 11.650660514831543,
"rewards/margins": 151.18350219726562,
"rewards/rejected": -139.5328369140625,
"step": 73
},
{
"epoch": 1.651685393258427,
"grad_norm": 12.694519996643066,
"learning_rate": 1.460069217843338e-06,
"logits/chosen": 1.416333794593811,
"logits/rejected": 1.1884994506835938,
"logps/chosen": -3090.49658203125,
"logps/rejected": -3794.48095703125,
"loss": 0.004,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.209739685058594,
"rewards/margins": 145.9217529296875,
"rewards/rejected": -133.71200561523438,
"step": 74
},
{
"epoch": 1.6741573033707864,
"grad_norm": 5.181153774261475,
"learning_rate": 1.4201667377367324e-06,
"logits/chosen": 1.5291459560394287,
"logits/rejected": 1.390205979347229,
"logps/chosen": -2819.557861328125,
"logps/rejected": -3400.41748046875,
"loss": 0.0112,
"rewards/accuracies": 0.984375,
"rewards/chosen": 6.913262367248535,
"rewards/margins": 108.99024200439453,
"rewards/rejected": -102.07699584960938,
"step": 75
},
{
"epoch": 1.696629213483146,
"grad_norm": 5.866981506347656,
"learning_rate": 1.3803208416703752e-06,
"logits/chosen": 1.509679913520813,
"logits/rejected": 1.3863307237625122,
"logps/chosen": -2517.104736328125,
"logps/rejected": -3187.1181640625,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015058517456055,
"rewards/margins": 110.0936508178711,
"rewards/rejected": -104.07859802246094,
"step": 76
},
{
"epoch": 1.7191011235955056,
"grad_norm": 3.792738199234009,
"learning_rate": 1.3405597715289522e-06,
"logits/chosen": 1.4075974225997925,
"logits/rejected": 1.297675609588623,
"logps/chosen": -3116.082275390625,
"logps/rejected": -3820.78271484375,
"loss": 0.0017,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.922908782958984,
"rewards/margins": 124.51133728027344,
"rewards/rejected": -117.58842468261719,
"step": 77
},
{
"epoch": 1.7415730337078652,
"grad_norm": 8.345385551452637,
"learning_rate": 1.3009117090744173e-06,
"logits/chosen": 1.5826494693756104,
"logits/rejected": 1.2875326871871948,
"logps/chosen": -2909.03515625,
"logps/rejected": -3438.2587890625,
"loss": 0.0111,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.310379981994629,
"rewards/margins": 140.91641235351562,
"rewards/rejected": -132.6060333251953,
"step": 78
},
{
"epoch": 1.7640449438202248,
"grad_norm": 0.4116104245185852,
"learning_rate": 1.2614047559713923e-06,
"logits/chosen": 1.4220818281173706,
"logits/rejected": 1.2691839933395386,
"logps/chosen": -3212.60693359375,
"logps/rejected": -3793.721435546875,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.4821667671203613,
"rewards/margins": 128.71267700195312,
"rewards/rejected": -126.23049926757812,
"step": 79
},
{
"epoch": 1.7865168539325844,
"grad_norm": 0.8209803700447083,
"learning_rate": 1.2220669138693288e-06,
"logits/chosen": 1.3909624814987183,
"logits/rejected": 1.1474812030792236,
"logps/chosen": -2994.385009765625,
"logps/rejected": -3750.771728515625,
"loss": 0.0112,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.527303695678711,
"rewards/margins": 137.7163543701172,
"rewards/rejected": -128.18905639648438,
"step": 80
},
{
"epoch": 1.8089887640449438,
"grad_norm": 1.4425156116485596,
"learning_rate": 1.1829260645555634e-06,
"logits/chosen": 1.3281006813049316,
"logits/rejected": 1.039908766746521,
"logps/chosen": -3059.208251953125,
"logps/rejected": -3867.33349609375,
"loss": 0.0108,
"rewards/accuracies": 1.0,
"rewards/chosen": 12.086620330810547,
"rewards/margins": 160.84959411621094,
"rewards/rejected": -148.76295471191406,
"step": 81
},
{
"epoch": 1.8314606741573034,
"grad_norm": 0.7217972278594971,
"learning_rate": 1.1440099501933277e-06,
"logits/chosen": 1.3363004922866821,
"logits/rejected": 1.2744730710983276,
"logps/chosen": -3156.716796875,
"logps/rejected": -4011.334716796875,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.8549346923828125,
"rewards/margins": 134.17984008789062,
"rewards/rejected": -129.3249053955078,
"step": 82
},
{
"epoch": 1.8539325842696628,
"grad_norm": 1.5164899826049805,
"learning_rate": 1.1053461536587183e-06,
"logits/chosen": 1.4580892324447632,
"logits/rejected": 1.2366647720336914,
"logps/chosen": -2984.4619140625,
"logps/rejected": -3910.234375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.195051193237305,
"rewards/margins": 148.3942413330078,
"rewards/rejected": -139.1991729736328,
"step": 83
},
{
"epoch": 1.8764044943820224,
"grad_norm": 3.071080446243286,
"learning_rate": 1.0669620789905688e-06,
"logits/chosen": 1.5336228609085083,
"logits/rejected": 1.3450926542282104,
"logps/chosen": -2671.64892578125,
"logps/rejected": -3312.888427734375,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.30421257019043,
"rewards/margins": 96.9708023071289,
"rewards/rejected": -91.66659545898438,
"step": 84
},
{
"epoch": 1.898876404494382,
"grad_norm": 0.2966591715812683,
"learning_rate": 1.0288849319670773e-06,
"logits/chosen": 1.5615055561065674,
"logits/rejected": 1.4262051582336426,
"logps/chosen": -2924.010498046875,
"logps/rejected": -3439.7509765625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.811070919036865,
"rewards/margins": 107.32271575927734,
"rewards/rejected": -102.51164245605469,
"step": 85
},
{
"epoch": 1.9213483146067416,
"grad_norm": 0.05935266241431236,
"learning_rate": 9.911417008229545e-07,
"logits/chosen": 1.4063825607299805,
"logits/rejected": 1.1860499382019043,
"logps/chosen": -2746.5126953125,
"logps/rejected": -3493.92578125,
"loss": 0.0325,
"rewards/accuracies": 0.953125,
"rewards/chosen": 11.67589282989502,
"rewards/margins": 137.2821502685547,
"rewards/rejected": -125.60626220703125,
"step": 86
},
{
"epoch": 1.9438202247191012,
"grad_norm": 0.21089386940002441,
"learning_rate": 9.537591371207668e-07,
"logits/chosen": 1.5266857147216797,
"logits/rejected": 1.4005635976791382,
"logps/chosen": -2387.665771484375,
"logps/rejected": -3293.546630859375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.131157875061035,
"rewards/margins": 137.9029083251953,
"rewards/rejected": -132.77175903320312,
"step": 87
},
{
"epoch": 1.9662921348314608,
"grad_norm": 0.4727032780647278,
"learning_rate": 9.167637367900192e-07,
"logits/chosen": 1.5321190357208252,
"logits/rejected": 1.3832690715789795,
"logps/chosen": -2469.994384765625,
"logps/rejected": -3097.712890625,
"loss": 0.0117,
"rewards/accuracies": 0.984375,
"rewards/chosen": 13.177355766296387,
"rewards/margins": 116.04686737060547,
"rewards/rejected": -102.8695068359375,
"step": 88
},
{
"epoch": 1.9887640449438202,
"grad_norm": 0.39027953147888184,
"learning_rate": 8.801817213474331e-07,
"logits/chosen": 1.5794587135314941,
"logits/rejected": 1.3486638069152832,
"logps/chosen": -2815.1982421875,
"logps/rejected": -3435.67919921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.544872283935547,
"rewards/margins": 112.28601837158203,
"rewards/rejected": -103.74114227294922,
"step": 89
},
{
"epoch": 2.0,
"grad_norm": 0.14720159769058228,
"learning_rate": 8.44039019311717e-07,
"logits/chosen": 1.492700457572937,
"logits/rejected": 1.3120732307434082,
"logps/chosen": -3285.24267578125,
"logps/rejected": -3985.763916015625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 11.00776481628418,
"rewards/margins": 157.06927490234375,
"rewards/rejected": -146.06150817871094,
"step": 90
},
{
"epoch": 2.0224719101123596,
"grad_norm": 0.019609661772847176,
"learning_rate": 8.08361247826011e-07,
"logits/chosen": 1.3633915185928345,
"logits/rejected": 1.1915699243545532,
"logps/chosen": -3307.618408203125,
"logps/rejected": -4103.1875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.502930760383606,
"rewards/margins": 150.0188446044922,
"rewards/rejected": -151.52178955078125,
"step": 91
},
{
"epoch": 2.044943820224719,
"grad_norm": 0.026041870936751366,
"learning_rate": 7.731736945010249e-07,
"logits/chosen": 1.4235529899597168,
"logits/rejected": 1.0836195945739746,
"logps/chosen": -3224.001708984375,
"logps/rejected": -3803.459228515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.049484252929688,
"rewards/margins": 149.46070861816406,
"rewards/rejected": -140.41123962402344,
"step": 92
},
{
"epoch": 2.067415730337079,
"grad_norm": 0.36662229895591736,
"learning_rate": 7.385012994917405e-07,
"logits/chosen": 1.461303949356079,
"logits/rejected": 1.401003360748291,
"logps/chosen": -2710.856689453125,
"logps/rejected": -3409.259765625,
"loss": 0.0109,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.063204765319824,
"rewards/margins": 96.820068359375,
"rewards/rejected": -91.75686645507812,
"step": 93
},
{
"epoch": 2.0898876404494384,
"grad_norm": 0.22327114641666412,
"learning_rate": 7.043686378203864e-07,
"logits/chosen": 1.5914536714553833,
"logits/rejected": 1.3907164335250854,
"logps/chosen": -2657.873291015625,
"logps/rejected": -3420.0283203125,
"loss": 0.0109,
"rewards/accuracies": 0.984375,
"rewards/chosen": 12.433341979980469,
"rewards/margins": 118.74362182617188,
"rewards/rejected": -106.31027221679688,
"step": 94
},
{
"epoch": 2.1123595505617976,
"grad_norm": 0.006661942228674889,
"learning_rate": 6.707999019582104e-07,
"logits/chosen": 1.4297124147415161,
"logits/rejected": 1.2694649696350098,
"logps/chosen": -2567.587890625,
"logps/rejected": -3557.106201171875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.91953182220459,
"rewards/margins": 146.32005310058594,
"rewards/rejected": -138.4005126953125,
"step": 95
},
{
"epoch": 2.134831460674157,
"grad_norm": 0.010272935964167118,
"learning_rate": 6.378188846783898e-07,
"logits/chosen": 1.584874153137207,
"logits/rejected": 1.3883558511734009,
"logps/chosen": -2836.077880859375,
"logps/rejected": -3408.93115234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.626905918121338,
"rewards/margins": 121.95980834960938,
"rewards/rejected": -115.33291625976562,
"step": 96
},
{
"epoch": 2.157303370786517,
"grad_norm": 0.006059441715478897,
"learning_rate": 6.054489621922477e-07,
"logits/chosen": 1.6233469247817993,
"logits/rejected": 1.4364811182022095,
"logps/chosen": -2997.014404296875,
"logps/rejected": -3488.54150390625,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 12.179953575134277,
"rewards/margins": 123.74882507324219,
"rewards/rejected": -111.56886291503906,
"step": 97
},
{
"epoch": 2.1797752808988764,
"grad_norm": 0.23592473566532135,
"learning_rate": 5.737130775807122e-07,
"logits/chosen": 1.4150291681289673,
"logits/rejected": 1.3036937713623047,
"logps/chosen": -2623.100830078125,
"logps/rejected": -3417.743408203125,
"loss": 0.011,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.777491569519043,
"rewards/margins": 126.9278335571289,
"rewards/rejected": -117.15032196044922,
"step": 98
},
{
"epoch": 2.202247191011236,
"grad_norm": 0.0040085772052407265,
"learning_rate": 5.426337245327703e-07,
"logits/chosen": 1.3026162385940552,
"logits/rejected": 1.194283127784729,
"logps/chosen": -2882.58154296875,
"logps/rejected": -3794.05078125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.322346687316895,
"rewards/margins": 140.7698211669922,
"rewards/rejected": -130.44747924804688,
"step": 99
},
{
"epoch": 2.2247191011235956,
"grad_norm": 0.005036317277699709,
"learning_rate": 5.122329314024422e-07,
"logits/chosen": 1.4347069263458252,
"logits/rejected": 1.2561771869659424,
"logps/chosen": -2425.357177734375,
"logps/rejected": -3138.833740234375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.752297401428223,
"rewards/margins": 120.6755599975586,
"rewards/rejected": -106.92326354980469,
"step": 100
},
{
"epoch": 2.247191011235955,
"grad_norm": 0.267286479473114,
"learning_rate": 4.825322455955759e-07,
"logits/chosen": 1.376643419265747,
"logits/rejected": 1.2739124298095703,
"logps/chosen": -2709.716796875,
"logps/rejected": -3520.384765625,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 10.822145462036133,
"rewards/margins": 141.28472900390625,
"rewards/rejected": -130.4625701904297,
"step": 101
},
{
"epoch": 2.2696629213483144,
"grad_norm": 0.37806662917137146,
"learning_rate": 4.5355271829752307e-07,
"logits/chosen": 1.4881722927093506,
"logits/rejected": 1.346581220626831,
"logps/chosen": -2821.6923828125,
"logps/rejected": -3442.4619140625,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.021244049072266,
"rewards/margins": 126.26439666748047,
"rewards/rejected": -117.2431640625,
"step": 102
},
{
"epoch": 2.292134831460674,
"grad_norm": 0.0023486721329391003,
"learning_rate": 4.2531488955252726e-07,
"logits/chosen": 1.4559850692749023,
"logits/rejected": 1.1960179805755615,
"logps/chosen": -2982.266357421875,
"logps/rejected": -3776.720458984375,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 13.267072677612305,
"rewards/margins": 156.5282440185547,
"rewards/rejected": -143.26113891601562,
"step": 103
},
{
"epoch": 2.3146067415730336,
"grad_norm": 0.006942716892808676,
"learning_rate": 3.978387737053994e-07,
"logits/chosen": 1.5748894214630127,
"logits/rejected": 1.4408270120620728,
"logps/chosen": -2752.75634765625,
"logps/rejected": -3425.216064453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.169326782226562,
"rewards/margins": 107.41685485839844,
"rewards/rejected": -92.24752807617188,
"step": 104
},
{
"epoch": 2.337078651685393,
"grad_norm": 0.1621246337890625,
"learning_rate": 3.7114384521579234e-07,
"logits/chosen": 1.6052483320236206,
"logits/rejected": 1.446576714515686,
"logps/chosen": -2733.099609375,
"logps/rejected": -3558.54931640625,
"loss": 0.0108,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.2836151123046875,
"rewards/margins": 120.5184326171875,
"rewards/rejected": -114.23482513427734,
"step": 105
},
{
"epoch": 2.359550561797753,
"grad_norm": 0.0010318144923076034,
"learning_rate": 3.4524902485514043e-07,
"logits/chosen": 1.5261331796646118,
"logits/rejected": 1.2617827653884888,
"logps/chosen": -2832.090576171875,
"logps/rejected": -3448.433837890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.34963607788086,
"rewards/margins": 127.82434844970703,
"rewards/rejected": -119.47471618652344,
"step": 106
},
{
"epoch": 2.3820224719101124,
"grad_norm": 0.001886666170321405,
"learning_rate": 3.201726662960363e-07,
"logits/chosen": 1.4487926959991455,
"logits/rejected": 1.2953495979309082,
"logps/chosen": -2931.4873046875,
"logps/rejected": -3765.528564453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.4385576248168945,
"rewards/margins": 141.048583984375,
"rewards/rejected": -135.6100311279297,
"step": 107
},
{
"epoch": 2.404494382022472,
"grad_norm": 0.0003725312708411366,
"learning_rate": 2.9593254310355485e-07,
"logits/chosen": 1.5249533653259277,
"logits/rejected": 1.36188805103302,
"logps/chosen": -2958.6279296875,
"logps/rejected": -3625.80859375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.046311378479004,
"rewards/margins": 136.48867797851562,
"rewards/rejected": -128.44235229492188,
"step": 108
},
{
"epoch": 2.4269662921348316,
"grad_norm": 0.0058527453802526,
"learning_rate": 2.725458361377465e-07,
"logits/chosen": 1.449507236480713,
"logits/rejected": 1.195552110671997,
"logps/chosen": -3101.913330078125,
"logps/rejected": -3919.42626953125,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.668648719787598,
"rewards/margins": 170.04879760742188,
"rewards/rejected": -160.38015747070312,
"step": 109
},
{
"epoch": 2.449438202247191,
"grad_norm": 0.004259227309376001,
"learning_rate": 2.5002912137622743e-07,
"logits/chosen": 1.3936243057250977,
"logits/rejected": 1.1740200519561768,
"logps/chosen": -2701.333740234375,
"logps/rejected": -3472.6923828125,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 11.122644424438477,
"rewards/margins": 145.8236083984375,
"rewards/rejected": -134.70095825195312,
"step": 110
},
{
"epoch": 2.4719101123595504,
"grad_norm": 0.010651292279362679,
"learning_rate": 2.2839835816549365e-07,
"logits/chosen": 1.711632490158081,
"logits/rejected": 1.4845446348190308,
"logps/chosen": -3014.84912109375,
"logps/rejected": -3401.6298828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 8.185779571533203,
"rewards/margins": 117.65122985839844,
"rewards/rejected": -109.4654541015625,
"step": 111
},
{
"epoch": 2.49438202247191,
"grad_norm": 0.21365472674369812,
"learning_rate": 2.0766887790929072e-07,
"logits/chosen": 1.5201102495193481,
"logits/rejected": 1.3360121250152588,
"logps/chosen": -2596.279296875,
"logps/rejected": -3536.295166015625,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 9.575386047363281,
"rewards/margins": 136.92886352539062,
"rewards/rejected": -127.35346221923828,
"step": 112
},
{
"epoch": 2.5168539325842696,
"grad_norm": 0.06359975039958954,
"learning_rate": 1.8785537320205808e-07,
"logits/chosen": 1.4054570198059082,
"logits/rejected": 1.304233431816101,
"logps/chosen": -2882.770263671875,
"logps/rejected": -3637.910888671875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.500956535339355,
"rewards/margins": 114.78219604492188,
"rewards/rejected": -105.28123474121094,
"step": 113
},
{
"epoch": 2.539325842696629,
"grad_norm": 0.039696987718343735,
"learning_rate": 1.6897188741514286e-07,
"logits/chosen": 1.3486000299453735,
"logits/rejected": 1.2321511507034302,
"logps/chosen": -2972.344970703125,
"logps/rejected": -3984.229248046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.131504535675049,
"rewards/margins": 162.7792205810547,
"rewards/rejected": -157.64772033691406,
"step": 114
},
{
"epoch": 2.561797752808989,
"grad_norm": 0.002948309760540724,
"learning_rate": 1.510318047431713e-07,
"logits/chosen": 1.4727129936218262,
"logits/rejected": 1.3785285949707031,
"logps/chosen": -2675.683837890625,
"logps/rejected": -3297.158447265625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.861666679382324,
"rewards/margins": 110.47186279296875,
"rewards/rejected": -102.61019134521484,
"step": 115
},
{
"epoch": 2.5842696629213484,
"grad_norm": 0.07731137424707413,
"learning_rate": 1.3404784071763015e-07,
"logits/chosen": 1.4941082000732422,
"logits/rejected": 1.4053186178207397,
"logps/chosen": -2728.80615234375,
"logps/rejected": -3415.1708984375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.857705116271973,
"rewards/margins": 109.21708679199219,
"rewards/rejected": -98.35939025878906,
"step": 116
},
{
"epoch": 2.606741573033708,
"grad_norm": 0.01123058795928955,
"learning_rate": 1.1803203319438056e-07,
"logits/chosen": 1.4337643384933472,
"logits/rejected": 1.2645751237869263,
"logps/chosen": -2684.67041015625,
"logps/rejected": -3446.0908203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 13.534300804138184,
"rewards/margins": 135.90628051757812,
"rewards/rejected": -122.37198638916016,
"step": 117
},
{
"epoch": 2.629213483146067,
"grad_norm": 0.7818881869316101,
"learning_rate": 1.0299573382149235e-07,
"logits/chosen": 1.4340091943740845,
"logits/rejected": 1.2151674032211304,
"logps/chosen": -3169.663330078125,
"logps/rejected": -4115.5751953125,
"loss": 0.0219,
"rewards/accuracies": 0.984375,
"rewards/chosen": 11.765824317932129,
"rewards/margins": 178.14181518554688,
"rewards/rejected": -166.37596130371094,
"step": 118
},
{
"epoch": 2.6516853932584272,
"grad_norm": 0.11178380995988846,
"learning_rate": 8.894959999345015e-08,
"logits/chosen": 1.4085586071014404,
"logits/rejected": 1.317073941230774,
"logps/chosen": -2706.8623046875,
"logps/rejected": -3629.9091796875,
"loss": 0.0109,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.750637531280518,
"rewards/margins": 140.9330291748047,
"rewards/rejected": -134.18240356445312,
"step": 119
},
{
"epoch": 2.6741573033707864,
"grad_norm": 0.009486271999776363,
"learning_rate": 7.590358729742808e-08,
"logits/chosen": 1.5044245719909668,
"logits/rejected": 1.3787866830825806,
"logps/chosen": -2867.752197265625,
"logps/rejected": -3833.509765625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.230460166931152,
"rewards/margins": 134.28904724121094,
"rewards/rejected": -128.05857849121094,
"step": 120
},
{
"epoch": 2.696629213483146,
"grad_norm": 0.009250489063560963,
"learning_rate": 6.386694245699181e-08,
"logits/chosen": 1.5157657861709595,
"logits/rejected": 1.2433254718780518,
"logps/chosen": -3022.373046875,
"logps/rejected": -3732.22900390625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.7557570934295654,
"rewards/margins": 130.84677124023438,
"rewards/rejected": -128.0910186767578,
"step": 121
},
{
"epoch": 2.7191011235955056,
"grad_norm": 0.1917319893836975,
"learning_rate": 5.284819677822611e-08,
"logits/chosen": 1.6072005033493042,
"logits/rejected": 1.528849720954895,
"logps/chosen": -2894.672119140625,
"logps/rejected": -3495.853515625,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 3.3133740425109863,
"rewards/margins": 105.75206756591797,
"rewards/rejected": -102.43870544433594,
"step": 122
},
{
"epoch": 2.741573033707865,
"grad_norm": 0.03384300321340561,
"learning_rate": 4.285516010293522e-08,
"logits/chosen": 1.4517195224761963,
"logits/rejected": 1.3014264106750488,
"logps/chosen": -2851.070556640625,
"logps/rejected": -3593.665771484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.4544267654418945,
"rewards/margins": 122.42274475097656,
"rewards/rejected": -114.96832275390625,
"step": 123
},
{
"epoch": 2.764044943820225,
"grad_norm": 0.24889694154262543,
"learning_rate": 3.389491527319999e-08,
"logits/chosen": 1.4583051204681396,
"logits/rejected": 1.2614139318466187,
"logps/chosen": -2827.8134765625,
"logps/rejected": -3561.30810546875,
"loss": 0.0217,
"rewards/accuracies": 0.984375,
"rewards/chosen": 0.6058197617530823,
"rewards/margins": 129.5867919921875,
"rewards/rejected": -128.98095703125,
"step": 124
},
{
"epoch": 2.7865168539325844,
"grad_norm": 0.06888113170862198,
"learning_rate": 2.5973813111218548e-08,
"logits/chosen": 1.529250144958496,
"logits/rejected": 1.247063159942627,
"logps/chosen": -2882.323974609375,
"logps/rejected": -3656.96044921875,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": 9.58204174041748,
"rewards/margins": 154.1719970703125,
"rewards/rejected": -144.5899658203125,
"step": 125
},
{
"epoch": 2.808988764044944,
"grad_norm": 0.0029755791183561087,
"learning_rate": 1.909746791798317e-08,
"logits/chosen": 1.4555425643920898,
"logits/rejected": 1.2920844554901123,
"logps/chosen": -2807.64208984375,
"logps/rejected": -3475.54931640625,
"loss": 0.0217,
"rewards/accuracies": 0.984375,
"rewards/chosen": 5.643215179443359,
"rewards/margins": 125.7391128540039,
"rewards/rejected": -120.09590148925781,
"step": 126
},
{
"epoch": 2.831460674157303,
"grad_norm": 0.009821542538702488,
"learning_rate": 1.3270753493989374e-08,
"logits/chosen": 1.535863995552063,
"logits/rejected": 1.3580735921859741,
"logps/chosen": -2754.88818359375,
"logps/rejected": -3732.697021484375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 7.623423099517822,
"rewards/margins": 136.6768035888672,
"rewards/rejected": -129.05337524414062,
"step": 127
},
{
"epoch": 2.853932584269663,
"grad_norm": 0.5018057227134705,
"learning_rate": 8.49779968479436e-09,
"logits/chosen": 1.3728063106536865,
"logits/rejected": 1.154386281967163,
"logps/chosen": -3219.5546875,
"logps/rejected": -3955.0615234375,
"loss": 0.0108,
"rewards/accuracies": 1.0,
"rewards/chosen": 2.1031904220581055,
"rewards/margins": 135.11688232421875,
"rewards/rejected": -133.01368713378906,
"step": 128
},
{
"epoch": 2.8764044943820224,
"grad_norm": 0.0029928251169621944,
"learning_rate": 4.781989453874814e-09,
"logits/chosen": 1.589327335357666,
"logits/rejected": 1.44749116897583,
"logps/chosen": -2659.24462890625,
"logps/rejected": -3233.244873046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 10.386514663696289,
"rewards/margins": 102.26481628417969,
"rewards/rejected": -91.87830352783203,
"step": 129
},
{
"epoch": 2.898876404494382,
"grad_norm": 0.009541017934679985,
"learning_rate": 2.1259564848570834e-09,
"logits/chosen": 1.5677722692489624,
"logits/rejected": 1.2758667469024658,
"logps/chosen": -2889.547607421875,
"logps/rejected": -3603.37109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 15.972006797790527,
"rewards/margins": 140.3019256591797,
"rewards/rejected": -124.32991790771484,
"step": 130
},
{
"epoch": 2.9213483146067416,
"grad_norm": 0.007502752356231213,
"learning_rate": 5.315833148210603e-10,
"logits/chosen": 1.6323837041854858,
"logits/rejected": 1.446678876876831,
"logps/chosen": -2922.07568359375,
"logps/rejected": -3691.432373046875,
"loss": 0.0108,
"rewards/accuracies": 0.984375,
"rewards/chosen": 12.317670822143555,
"rewards/margins": 135.18690490722656,
"rewards/rejected": -122.86924743652344,
"step": 131
},
{
"epoch": 2.943820224719101,
"grad_norm": 0.2958358824253082,
"learning_rate": 0.0,
"logits/chosen": 1.4742579460144043,
"logits/rejected": 1.2774202823638916,
"logps/chosen": -2621.55615234375,
"logps/rejected": -3527.73193359375,
"loss": 0.0217,
"rewards/accuracies": 0.984375,
"rewards/chosen": 11.16303539276123,
"rewards/margins": 133.13824462890625,
"rewards/rejected": -121.9752197265625,
"step": 132
},
{
"epoch": 2.943820224719101,
"step": 132,
"total_flos": 228521444442112.0,
"train_loss": 0.17045999738028772,
"train_runtime": 5166.54,
"train_samples_per_second": 1.651,
"train_steps_per_second": 0.026
}
],
"logging_steps": 1,
"max_steps": 132,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 228521444442112.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}