{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 39.71950149536133, "learning_rate": 5.0000000000000004e-08, "logits/chosen": -1.2357934713363647, "logits/rejected": -0.7058947682380676, "logps/chosen": -220.3852081298828, "logps/rejected": -257.87994384765625, "loss": 0.8354, "rewards/accuracies": 0.5, "rewards/chosen": -0.054030876606702805, "rewards/margins": 0.5196583867073059, "rewards/rejected": -0.573689341545105, "step": 10 }, { "epoch": 0.002, "grad_norm": 27.253252029418945, "learning_rate": 1.0000000000000001e-07, "logits/chosen": -1.2473429441452026, "logits/rejected": -0.5767286419868469, "logps/chosen": -336.3631896972656, "logps/rejected": -438.285400390625, "loss": 1.0858, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.097655177116394, "rewards/margins": 0.39187589287757874, "rewards/rejected": -1.4895310401916504, "step": 20 }, { "epoch": 0.003, "grad_norm": 2.4999659061431885, "learning_rate": 1.5000000000000002e-07, "logits/chosen": -1.7245657444000244, "logits/rejected": -0.5277743935585022, "logps/chosen": -109.9853286743164, "logps/rejected": -330.37506103515625, "loss": 1.3177, "rewards/accuracies": 0.5, "rewards/chosen": -0.6884216666221619, "rewards/margins": 0.37672415375709534, "rewards/rejected": -1.06514573097229, "step": 30 }, { "epoch": 0.004, "grad_norm": 193.43942260742188, "learning_rate": 2.0000000000000002e-07, "logits/chosen": -1.815363883972168, "logits/rejected": -0.4569586217403412, "logps/chosen": -248.228759765625, "logps/rejected": -446.4189453125, "loss": 0.6804, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4480059146881104, "rewards/margins": 1.2186508178710938, "rewards/rejected": -2.666656732559204, "step": 40 }, { "epoch": 0.005, "grad_norm": 6.413777828216553, "learning_rate": 2.5000000000000004e-07, "logits/chosen": -1.0549522638320923, "logits/rejected": -0.3890644907951355, "logps/chosen": -196.66943359375, "logps/rejected": -329.15325927734375, "loss": 1.0372, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -1.0999469757080078, "rewards/margins": 0.1718175858259201, "rewards/rejected": -1.2717645168304443, "step": 50 }, { "epoch": 0.006, "grad_norm": 44.80453109741211, "learning_rate": 3.0000000000000004e-07, "logits/chosen": -1.1564265489578247, "logits/rejected": -0.7624977231025696, "logps/chosen": -212.6322479248047, "logps/rejected": -237.65902709960938, "loss": 1.6069, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -1.7349717617034912, "rewards/margins": -0.5888309478759766, "rewards/rejected": -1.146140694618225, "step": 60 }, { "epoch": 0.007, "grad_norm": 51.307899475097656, "learning_rate": 3.5000000000000004e-07, "logits/chosen": -1.3306106328964233, "logits/rejected": -0.49902287125587463, "logps/chosen": -187.5395965576172, "logps/rejected": -324.9596252441406, "loss": 1.3632, "rewards/accuracies": 0.5, "rewards/chosen": -1.4904448986053467, "rewards/margins": -0.47634443640708923, "rewards/rejected": -1.014100432395935, "step": 70 }, { "epoch": 0.008, "grad_norm": 196.0592498779297, "learning_rate": 4.0000000000000003e-07, "logits/chosen": -1.136975884437561, "logits/rejected": -0.4456964135169983, "logps/chosen": -223.9425048828125, "logps/rejected": -334.194580078125, "loss": 0.5711, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8610240817070007, "rewards/margins": 1.452368974685669, "rewards/rejected": -2.3133931159973145, "step": 80 }, { "epoch": 0.009, "grad_norm": 58.4428596496582, "learning_rate": 4.5000000000000003e-07, "logits/chosen": -1.4022276401519775, "logits/rejected": -0.5699952244758606, "logps/chosen": -159.07693481445312, "logps/rejected": -277.25469970703125, "loss": 0.3843, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.21312984824180603, "rewards/margins": 1.7562767267227173, "rewards/rejected": -1.9694064855575562, "step": 90 }, { "epoch": 0.01, "grad_norm": 117.02696228027344, "learning_rate": 5.000000000000001e-07, "logits/chosen": -1.142896056175232, "logits/rejected": -0.6683061718940735, "logps/chosen": -203.57827758789062, "logps/rejected": -340.763671875, "loss": 1.0243, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9323091506958008, "rewards/margins": 0.5186213254928589, "rewards/rejected": -1.4509305953979492, "step": 100 }, { "epoch": 0.011, "grad_norm": 80.57569885253906, "learning_rate": 5.5e-07, "logits/chosen": -1.0948550701141357, "logits/rejected": -0.6490032076835632, "logps/chosen": -186.33920288085938, "logps/rejected": -293.6062316894531, "loss": 0.579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.49649447202682495, "rewards/margins": 1.350467562675476, "rewards/rejected": -1.8469619750976562, "step": 110 }, { "epoch": 0.012, "grad_norm": 34.219322204589844, "learning_rate": 6.000000000000001e-07, "logits/chosen": -1.7040865421295166, "logits/rejected": -0.4477524161338806, "logps/chosen": -151.40792846679688, "logps/rejected": -396.0971984863281, "loss": 1.1696, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -1.2884950637817383, "rewards/margins": -0.33469006419181824, "rewards/rejected": -0.9538049697875977, "step": 120 }, { "epoch": 0.013, "grad_norm": 3.8949766159057617, "learning_rate": 6.5e-07, "logits/chosen": -1.3104069232940674, "logits/rejected": -0.5389059782028198, "logps/chosen": -209.498291015625, "logps/rejected": -299.7087097167969, "loss": 0.7542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2069940567016602, "rewards/margins": 1.0890920162200928, "rewards/rejected": -2.296085834503174, "step": 130 }, { "epoch": 0.014, "grad_norm": 12.070839881896973, "learning_rate": 7.000000000000001e-07, "logits/chosen": -1.1613848209381104, "logits/rejected": -0.6944864392280579, "logps/chosen": -353.5556335449219, "logps/rejected": -409.09857177734375, "loss": 0.2059, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.0929415225982666, "rewards/margins": 2.8078646659851074, "rewards/rejected": -3.900806427001953, "step": 140 }, { "epoch": 0.015, "grad_norm": 22.197856903076172, "learning_rate": 7.5e-07, "logits/chosen": -0.8517942428588867, "logits/rejected": -0.323356568813324, "logps/chosen": -480.0712890625, "logps/rejected": -459.5816955566406, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": -0.7829483151435852, "rewards/margins": 3.3262500762939453, "rewards/rejected": -4.109198093414307, "step": 150 }, { "epoch": 0.016, "grad_norm": 208.33311462402344, "learning_rate": 8.000000000000001e-07, "logits/chosen": -1.3437260389328003, "logits/rejected": -0.5882256031036377, "logps/chosen": -356.7096862792969, "logps/rejected": -366.26605224609375, "loss": 0.9627, "rewards/accuracies": 0.5, "rewards/chosen": -1.2848951816558838, "rewards/margins": 1.0965783596038818, "rewards/rejected": -2.3814735412597656, "step": 160 }, { "epoch": 0.017, "grad_norm": 0.00499558774754405, "learning_rate": 8.500000000000001e-07, "logits/chosen": -1.5300209522247314, "logits/rejected": -0.47886618971824646, "logps/chosen": -170.71347045898438, "logps/rejected": -377.40386962890625, "loss": 0.122, "rewards/accuracies": 1.0, "rewards/chosen": -0.4191989004611969, "rewards/margins": 3.7101001739501953, "rewards/rejected": -4.129299163818359, "step": 170 }, { "epoch": 0.018, "grad_norm": 0.31560298800468445, "learning_rate": 9.000000000000001e-07, "logits/chosen": -1.1970791816711426, "logits/rejected": -0.5007954835891724, "logps/chosen": -171.7400665283203, "logps/rejected": -343.6292419433594, "loss": 0.3737, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.7679370641708374, "rewards/margins": 2.9629974365234375, "rewards/rejected": -4.7309346199035645, "step": 180 }, { "epoch": 0.019, "grad_norm": 690.4447631835938, "learning_rate": 9.500000000000001e-07, "logits/chosen": -1.0829429626464844, "logits/rejected": -0.633941113948822, "logps/chosen": -325.78363037109375, "logps/rejected": -416.4991760253906, "loss": 0.806, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.0230331420898438, "rewards/margins": 1.7831542491912842, "rewards/rejected": -4.806187629699707, "step": 190 }, { "epoch": 0.02, "grad_norm": 28.971961975097656, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -1.0844286680221558, "logits/rejected": -0.4173709750175476, "logps/chosen": -377.9032897949219, "logps/rejected": -432.1002502441406, "loss": 0.2169, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.7420201301574707, "rewards/margins": 3.7154648303985596, "rewards/rejected": -6.457485198974609, "step": 200 }, { "epoch": 0.021, "grad_norm": 92.62471771240234, "learning_rate": 1.0500000000000001e-06, "logits/chosen": -1.0448482036590576, "logits/rejected": -0.4706448018550873, "logps/chosen": -294.4362487792969, "logps/rejected": -384.4219970703125, "loss": 0.136, "rewards/accuracies": 1.0, "rewards/chosen": -2.8045363426208496, "rewards/margins": 5.647749423980713, "rewards/rejected": -8.452284812927246, "step": 210 }, { "epoch": 0.022, "grad_norm": 6.008492946624756, "learning_rate": 1.1e-06, "logits/chosen": -1.2321628332138062, "logits/rejected": -0.7217515707015991, "logps/chosen": -389.41192626953125, "logps/rejected": -409.8401794433594, "loss": 1.2207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.440738677978516, "rewards/margins": 1.8787305355072021, "rewards/rejected": -6.319469451904297, "step": 220 }, { "epoch": 0.023, "grad_norm": 0.00596708245575428, "learning_rate": 1.1500000000000002e-06, "logits/chosen": -1.1558558940887451, "logits/rejected": -0.4543713629245758, "logps/chosen": -224.6162567138672, "logps/rejected": -349.88677978515625, "loss": 0.8165, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.1587278842926025, "rewards/margins": 2.9656124114990234, "rewards/rejected": -6.124340057373047, "step": 230 }, { "epoch": 0.024, "grad_norm": 9.121099472045898, "learning_rate": 1.2000000000000002e-06, "logits/chosen": -1.4009921550750732, "logits/rejected": -0.6744478940963745, "logps/chosen": -300.53961181640625, "logps/rejected": -365.09185791015625, "loss": 0.2748, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.324301242828369, "rewards/margins": 2.40966796875, "rewards/rejected": -5.733969211578369, "step": 240 }, { "epoch": 0.025, "grad_norm": 262.2490234375, "learning_rate": 1.25e-06, "logits/chosen": -0.9986278414726257, "logits/rejected": -0.6002539396286011, "logps/chosen": -246.1274871826172, "logps/rejected": -286.7688903808594, "loss": 0.6446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.917508363723755, "rewards/margins": 2.402920961380005, "rewards/rejected": -6.320428371429443, "step": 250 }, { "epoch": 0.026, "grad_norm": 2.1036490579717793e-05, "learning_rate": 1.3e-06, "logits/chosen": -1.250001072883606, "logits/rejected": -0.5378462672233582, "logps/chosen": -335.57952880859375, "logps/rejected": -407.97222900390625, "loss": 0.3227, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.664595127105713, "rewards/margins": 4.549985408782959, "rewards/rejected": -9.214579582214355, "step": 260 }, { "epoch": 0.027, "grad_norm": 4.041048049926758, "learning_rate": 1.3500000000000002e-06, "logits/chosen": -0.947732150554657, "logits/rejected": -0.7202231884002686, "logps/chosen": -326.36785888671875, "logps/rejected": -365.80517578125, "loss": 0.631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -5.729133129119873, "rewards/margins": 1.104986310005188, "rewards/rejected": -6.834118843078613, "step": 270 }, { "epoch": 0.028, "grad_norm": 18.695680618286133, "learning_rate": 1.4000000000000001e-06, "logits/chosen": -0.966740608215332, "logits/rejected": -0.5254305005073547, "logps/chosen": -485.3287658691406, "logps/rejected": -571.0950927734375, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -5.315737724304199, "rewards/margins": 8.604612350463867, "rewards/rejected": -13.920351028442383, "step": 280 }, { "epoch": 0.029, "grad_norm": 0.7097064256668091, "learning_rate": 1.45e-06, "logits/chosen": -1.3276937007904053, "logits/rejected": -0.7066922187805176, "logps/chosen": -268.1031494140625, "logps/rejected": -441.033935546875, "loss": 0.0695, "rewards/accuracies": 1.0, "rewards/chosen": -3.8867766857147217, "rewards/margins": 8.865959167480469, "rewards/rejected": -12.75273609161377, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.003646882250905037, "learning_rate": 1.5e-06, "logits/chosen": -1.3603465557098389, "logits/rejected": -0.40795159339904785, "logps/chosen": -275.05108642578125, "logps/rejected": -438.65948486328125, "loss": 0.0554, "rewards/accuracies": 1.0, "rewards/chosen": -4.856437683105469, "rewards/margins": 7.816611289978027, "rewards/rejected": -12.673048973083496, "step": 300 }, { "epoch": 0.031, "grad_norm": 0.048718515783548355, "learning_rate": 1.5500000000000002e-06, "logits/chosen": -1.641971230506897, "logits/rejected": -0.41742610931396484, "logps/chosen": -204.53335571289062, "logps/rejected": -590.5496826171875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -5.203894138336182, "rewards/margins": 12.468754768371582, "rewards/rejected": -17.67264747619629, "step": 310 }, { "epoch": 0.032, "grad_norm": 2.144958972930908, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -1.0715140104293823, "logits/rejected": -0.611113429069519, "logps/chosen": -185.79281616210938, "logps/rejected": -347.6463623046875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -4.1656951904296875, "rewards/margins": 8.577848434448242, "rewards/rejected": -12.743544578552246, "step": 320 }, { "epoch": 0.033, "grad_norm": 15.040058135986328, "learning_rate": 1.6500000000000003e-06, "logits/chosen": -1.2094731330871582, "logits/rejected": -0.5106289386749268, "logps/chosen": -277.95721435546875, "logps/rejected": -438.82293701171875, "loss": 0.027, "rewards/accuracies": 1.0, "rewards/chosen": -7.8129706382751465, "rewards/margins": 7.339131832122803, "rewards/rejected": -15.15210247039795, "step": 330 }, { "epoch": 0.034, "grad_norm": 484.30682373046875, "learning_rate": 1.7000000000000002e-06, "logits/chosen": -0.9956814050674438, "logits/rejected": -0.4930063784122467, "logps/chosen": -486.2621154785156, "logps/rejected": -595.5841064453125, "loss": 0.8495, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.835428237915039, "rewards/margins": 12.453630447387695, "rewards/rejected": -23.289060592651367, "step": 340 }, { "epoch": 0.035, "grad_norm": 80.75074768066406, "learning_rate": 1.75e-06, "logits/chosen": -0.7541705369949341, "logits/rejected": -0.5088824033737183, "logps/chosen": -468.2110290527344, "logps/rejected": -566.8349609375, "loss": 0.0637, "rewards/accuracies": 1.0, "rewards/chosen": -12.546435356140137, "rewards/margins": 11.740583419799805, "rewards/rejected": -24.287019729614258, "step": 350 }, { "epoch": 0.036, "grad_norm": 5.622852086162311e-07, "learning_rate": 1.8000000000000001e-06, "logits/chosen": -1.2729134559631348, "logits/rejected": -0.425194650888443, "logps/chosen": -341.8668212890625, "logps/rejected": -553.38818359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -7.173413276672363, "rewards/margins": 13.597102165222168, "rewards/rejected": -20.77051544189453, "step": 360 }, { "epoch": 0.037, "grad_norm": 4.885341020610667e-10, "learning_rate": 1.85e-06, "logits/chosen": -1.3934372663497925, "logits/rejected": -0.39337393641471863, "logps/chosen": -333.2331848144531, "logps/rejected": -671.9036254882812, "loss": 0.0656, "rewards/accuracies": 1.0, "rewards/chosen": -9.352590560913086, "rewards/margins": 14.552865982055664, "rewards/rejected": -23.90545654296875, "step": 370 }, { "epoch": 0.038, "grad_norm": 0.003559031756594777, "learning_rate": 1.9000000000000002e-06, "logits/chosen": -1.3900734186172485, "logits/rejected": -0.4074910581111908, "logps/chosen": -415.312744140625, "logps/rejected": -717.7049560546875, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -5.72658634185791, "rewards/margins": 18.283475875854492, "rewards/rejected": -24.01006317138672, "step": 380 }, { "epoch": 0.039, "grad_norm": 618.7286376953125, "learning_rate": 1.9500000000000004e-06, "logits/chosen": -1.2673394680023193, "logits/rejected": -0.5207124948501587, "logps/chosen": -428.8023376464844, "logps/rejected": -595.412353515625, "loss": 0.9008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -10.832281112670898, "rewards/margins": 11.771617889404297, "rewards/rejected": -22.603899002075195, "step": 390 }, { "epoch": 0.04, "grad_norm": 2.328858670352929e-07, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -1.086723804473877, "logits/rejected": -0.42811456322669983, "logps/chosen": -209.59927368164062, "logps/rejected": -433.703125, "loss": 0.0888, "rewards/accuracies": 1.0, "rewards/chosen": -6.72649621963501, "rewards/margins": 12.687189102172852, "rewards/rejected": -19.413684844970703, "step": 400 }, { "epoch": 0.041, "grad_norm": 0.0021633415017277002, "learning_rate": 2.05e-06, "logits/chosen": -1.3655961751937866, "logits/rejected": -0.2905605435371399, "logps/chosen": -250.3804473876953, "logps/rejected": -517.5274047851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.846493721008301, "rewards/margins": 13.903947830200195, "rewards/rejected": -19.75044059753418, "step": 410 }, { "epoch": 0.042, "grad_norm": 0.0014651113888248801, "learning_rate": 2.1000000000000002e-06, "logits/chosen": -0.803868293762207, "logits/rejected": -0.6163159012794495, "logps/chosen": -296.12939453125, "logps/rejected": -395.0677490234375, "loss": 0.5212, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.992659568786621, "rewards/margins": 8.027377128601074, "rewards/rejected": -16.020038604736328, "step": 420 }, { "epoch": 0.043, "grad_norm": 268.1305847167969, "learning_rate": 2.15e-06, "logits/chosen": -1.315779447555542, "logits/rejected": -0.3827098309993744, "logps/chosen": -395.30950927734375, "logps/rejected": -537.7557373046875, "loss": 0.5238, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.556341648101807, "rewards/margins": 8.877912521362305, "rewards/rejected": -16.434253692626953, "step": 430 }, { "epoch": 0.044, "grad_norm": 2.4609992124169366e-06, "learning_rate": 2.2e-06, "logits/chosen": -1.0926239490509033, "logits/rejected": -0.5294037461280823, "logps/chosen": -199.20274353027344, "logps/rejected": -308.11114501953125, "loss": 0.1815, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.401622772216797, "rewards/margins": 7.701199531555176, "rewards/rejected": -14.102824211120605, "step": 440 }, { "epoch": 0.045, "grad_norm": 2.4242572180810384e-05, "learning_rate": 2.25e-06, "logits/chosen": -1.2972681522369385, "logits/rejected": -0.34338143467903137, "logps/chosen": -239.63858032226562, "logps/rejected": -592.6209106445312, "loss": 0.0933, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.572920799255371, "rewards/margins": 17.80848503112793, "rewards/rejected": -25.381404876708984, "step": 450 }, { "epoch": 0.046, "grad_norm": 2.4251374242112433e-09, "learning_rate": 2.3000000000000004e-06, "logits/chosen": -1.2712339162826538, "logits/rejected": -0.3596915602684021, "logps/chosen": -288.246337890625, "logps/rejected": -570.5148315429688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.5534467697143555, "rewards/margins": 17.334653854370117, "rewards/rejected": -23.888103485107422, "step": 460 }, { "epoch": 0.047, "grad_norm": 0.00030805158894509077, "learning_rate": 2.35e-06, "logits/chosen": -1.4388515949249268, "logits/rejected": -0.34689822793006897, "logps/chosen": -268.9949035644531, "logps/rejected": -542.9310302734375, "loss": 0.1292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.221593856811523, "rewards/margins": 14.034378051757812, "rewards/rejected": -23.255971908569336, "step": 470 }, { "epoch": 0.048, "grad_norm": 0.32315555214881897, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -1.1578876972198486, "logits/rejected": -0.5471175312995911, "logps/chosen": -498.71588134765625, "logps/rejected": -660.1671752929688, "loss": 0.3037, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.711542129516602, "rewards/margins": 13.745755195617676, "rewards/rejected": -27.45729637145996, "step": 480 }, { "epoch": 0.049, "grad_norm": 3.87372857789804e-15, "learning_rate": 2.4500000000000003e-06, "logits/chosen": -0.8546003103256226, "logits/rejected": -0.39226824045181274, "logps/chosen": -500.3169860839844, "logps/rejected": -690.2376098632812, "loss": 0.6083, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.116811752319336, "rewards/margins": 16.512969970703125, "rewards/rejected": -30.62978172302246, "step": 490 }, { "epoch": 0.05, "grad_norm": 132.11865234375, "learning_rate": 2.5e-06, "logits/chosen": -0.9677556753158569, "logits/rejected": -0.3073219656944275, "logps/chosen": -347.23687744140625, "logps/rejected": -564.311279296875, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": -13.032984733581543, "rewards/margins": 16.148744583129883, "rewards/rejected": -29.181732177734375, "step": 500 }, { "epoch": 0.051, "grad_norm": 5.183964965193438e-14, "learning_rate": 2.55e-06, "logits/chosen": -0.9282774925231934, "logits/rejected": -0.3282240927219391, "logps/chosen": -284.4826354980469, "logps/rejected": -543.125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -9.672806739807129, "rewards/margins": 19.902965545654297, "rewards/rejected": -29.57577133178711, "step": 510 }, { "epoch": 0.052, "grad_norm": 0.0002157751878257841, "learning_rate": 2.6e-06, "logits/chosen": -1.4002107381820679, "logits/rejected": -0.2400444746017456, "logps/chosen": -269.77008056640625, "logps/rejected": -616.1730346679688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.905624389648438, "rewards/margins": 22.148664474487305, "rewards/rejected": -31.054290771484375, "step": 520 }, { "epoch": 0.053, "grad_norm": 7.059870767989196e-06, "learning_rate": 2.6500000000000005e-06, "logits/chosen": -0.8650500178337097, "logits/rejected": -0.3586713671684265, "logps/chosen": -244.1106719970703, "logps/rejected": -523.0113525390625, "loss": 0.1536, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.576187133789062, "rewards/margins": 16.308609008789062, "rewards/rejected": -26.884796142578125, "step": 530 }, { "epoch": 0.054, "grad_norm": 5.5057416958881333e-11, "learning_rate": 2.7000000000000004e-06, "logits/chosen": -1.052830696105957, "logits/rejected": -0.8504500389099121, "logps/chosen": -346.8498229980469, "logps/rejected": -457.3299255371094, "loss": 2.0321, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -11.125707626342773, "rewards/margins": 10.678735733032227, "rewards/rejected": -21.804443359375, "step": 540 }, { "epoch": 0.055, "grad_norm": 1.947714372363407e-05, "learning_rate": 2.7500000000000004e-06, "logits/chosen": -1.0468206405639648, "logits/rejected": -0.416832834482193, "logps/chosen": -450.96832275390625, "logps/rejected": -750.3878173828125, "loss": 0.2331, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.610055923461914, "rewards/margins": 20.343788146972656, "rewards/rejected": -33.9538459777832, "step": 550 }, { "epoch": 0.056, "grad_norm": 5.5382918383977406e-17, "learning_rate": 2.8000000000000003e-06, "logits/chosen": -1.5744976997375488, "logits/rejected": -0.3118807077407837, "logps/chosen": -226.67398071289062, "logps/rejected": -629.4376831054688, "loss": 0.9147, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.122535705566406, "rewards/margins": 20.281320571899414, "rewards/rejected": -29.403854370117188, "step": 560 }, { "epoch": 0.057, "grad_norm": 0.5190900564193726, "learning_rate": 2.85e-06, "logits/chosen": -1.0775177478790283, "logits/rejected": -0.6953670382499695, "logps/chosen": -338.7844543457031, "logps/rejected": -576.4647216796875, "loss": 0.5182, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.195273399353027, "rewards/margins": 19.976835250854492, "rewards/rejected": -25.172109603881836, "step": 570 }, { "epoch": 0.058, "grad_norm": 2.0154071535216644e-05, "learning_rate": 2.9e-06, "logits/chosen": -0.8628554344177246, "logits/rejected": -0.36490216851234436, "logps/chosen": -266.5061340332031, "logps/rejected": -409.20050048828125, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -6.164038181304932, "rewards/margins": 11.171427726745605, "rewards/rejected": -17.335468292236328, "step": 580 }, { "epoch": 0.059, "grad_norm": 0.007016741205006838, "learning_rate": 2.95e-06, "logits/chosen": -1.1316006183624268, "logits/rejected": -0.4583218991756439, "logps/chosen": -468.5044860839844, "logps/rejected": -696.3089599609375, "loss": 0.0496, "rewards/accuracies": 1.0, "rewards/chosen": -9.475954055786133, "rewards/margins": 20.412355422973633, "rewards/rejected": -29.8883113861084, "step": 590 }, { "epoch": 0.06, "grad_norm": 713.2523803710938, "learning_rate": 3e-06, "logits/chosen": -1.00506591796875, "logits/rejected": -0.45325785875320435, "logps/chosen": -389.286865234375, "logps/rejected": -647.7471923828125, "loss": 0.5942, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.856890678405762, "rewards/margins": 18.655689239501953, "rewards/rejected": -33.51258087158203, "step": 600 }, { "epoch": 0.061, "grad_norm": 0.0021474172826856375, "learning_rate": 3.05e-06, "logits/chosen": -1.1691076755523682, "logits/rejected": -0.463383287191391, "logps/chosen": -381.8735046386719, "logps/rejected": -664.1435546875, "loss": 0.5041, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.865372657775879, "rewards/margins": 17.90451431274414, "rewards/rejected": -28.769886016845703, "step": 610 }, { "epoch": 0.062, "grad_norm": 5.3986898285174334e-11, "learning_rate": 3.1000000000000004e-06, "logits/chosen": -1.3650020360946655, "logits/rejected": -0.35710233449935913, "logps/chosen": -326.7483215332031, "logps/rejected": -618.0527954101562, "loss": 0.1221, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.112927436828613, "rewards/margins": 21.5576171875, "rewards/rejected": -28.670541763305664, "step": 620 }, { "epoch": 0.063, "grad_norm": 402.3836669921875, "learning_rate": 3.1500000000000003e-06, "logits/chosen": -1.0086907148361206, "logits/rejected": -0.33054283261299133, "logps/chosen": -393.40740966796875, "logps/rejected": -667.0352783203125, "loss": 0.4131, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.809080123901367, "rewards/margins": 17.650827407836914, "rewards/rejected": -27.459903717041016, "step": 630 }, { "epoch": 0.064, "grad_norm": 0.00016561997472308576, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -0.7735254168510437, "logits/rejected": -0.4900113642215729, "logps/chosen": -365.861572265625, "logps/rejected": -569.7633056640625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -11.899888038635254, "rewards/margins": 16.789934158325195, "rewards/rejected": -28.6898193359375, "step": 640 }, { "epoch": 0.065, "grad_norm": 59.89263916015625, "learning_rate": 3.2500000000000002e-06, "logits/chosen": -0.9595259428024292, "logits/rejected": -0.35813766717910767, "logps/chosen": -261.4634094238281, "logps/rejected": -478.71539306640625, "loss": 0.0998, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.686963081359863, "rewards/margins": 16.35685157775879, "rewards/rejected": -24.043813705444336, "step": 650 }, { "epoch": 0.066, "grad_norm": 1.7982006161876285e-10, "learning_rate": 3.3000000000000006e-06, "logits/chosen": -1.4923456907272339, "logits/rejected": -0.4172714352607727, "logps/chosen": -519.4299926757812, "logps/rejected": -703.1495971679688, "loss": 1.4947, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.720640182495117, "rewards/margins": 21.21224594116211, "rewards/rejected": -30.932886123657227, "step": 660 }, { "epoch": 0.067, "grad_norm": 2.2147442436571483e-18, "learning_rate": 3.3500000000000005e-06, "logits/chosen": -1.2784268856048584, "logits/rejected": -0.3738354742527008, "logps/chosen": -467.1693420410156, "logps/rejected": -785.3607177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.926630020141602, "rewards/margins": 24.06552505493164, "rewards/rejected": -34.992156982421875, "step": 670 }, { "epoch": 0.068, "grad_norm": 2.3722683973464997e-12, "learning_rate": 3.4000000000000005e-06, "logits/chosen": -1.2794643640518188, "logits/rejected": -0.5191472172737122, "logps/chosen": -374.89996337890625, "logps/rejected": -682.2388916015625, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -9.074565887451172, "rewards/margins": 20.966999053955078, "rewards/rejected": -30.04156494140625, "step": 680 }, { "epoch": 0.069, "grad_norm": 3.0734440058070855e-13, "learning_rate": 3.45e-06, "logits/chosen": -1.0704734325408936, "logits/rejected": -0.36915844678878784, "logps/chosen": -282.9461975097656, "logps/rejected": -606.0206909179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.159383773803711, "rewards/margins": 23.78475570678711, "rewards/rejected": -31.944141387939453, "step": 690 }, { "epoch": 0.07, "grad_norm": 0.0012723951367661357, "learning_rate": 3.5e-06, "logits/chosen": -1.42892324924469, "logits/rejected": -0.40216541290283203, "logps/chosen": -279.84637451171875, "logps/rejected": -653.5191040039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.145906448364258, "rewards/margins": 22.8740234375, "rewards/rejected": -31.019927978515625, "step": 700 }, { "epoch": 0.071, "grad_norm": 14.484932899475098, "learning_rate": 3.5500000000000003e-06, "logits/chosen": -1.364404320716858, "logits/rejected": -0.25203460454940796, "logps/chosen": -239.27487182617188, "logps/rejected": -695.9551391601562, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -8.248779296875, "rewards/margins": 23.697933197021484, "rewards/rejected": -31.946712493896484, "step": 710 }, { "epoch": 0.072, "grad_norm": 3.2707375794416294e-06, "learning_rate": 3.6000000000000003e-06, "logits/chosen": -1.447434663772583, "logits/rejected": -0.28548040986061096, "logps/chosen": -227.4899139404297, "logps/rejected": -585.2236938476562, "loss": 0.1291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.030697345733643, "rewards/margins": 25.32795524597168, "rewards/rejected": -30.358654022216797, "step": 720 }, { "epoch": 0.073, "grad_norm": 0.06150020286440849, "learning_rate": 3.65e-06, "logits/chosen": -1.1551002264022827, "logits/rejected": -0.3233141005039215, "logps/chosen": -284.21990966796875, "logps/rejected": -589.0573120117188, "loss": 0.4251, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.090296745300293, "rewards/margins": 24.372478485107422, "rewards/rejected": -32.46277618408203, "step": 730 }, { "epoch": 0.074, "grad_norm": 226.98077392578125, "learning_rate": 3.7e-06, "logits/chosen": -1.0338466167449951, "logits/rejected": -0.30206966400146484, "logps/chosen": -294.2165832519531, "logps/rejected": -544.0670166015625, "loss": 0.0822, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.504319190979004, "rewards/margins": 18.94892692565918, "rewards/rejected": -28.4532470703125, "step": 740 }, { "epoch": 0.075, "grad_norm": 0.00013524027599487454, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -1.2259372472763062, "logits/rejected": -0.408051073551178, "logps/chosen": -450.68182373046875, "logps/rejected": -706.7760620117188, "loss": 0.0146, "rewards/accuracies": 1.0, "rewards/chosen": -12.297445297241211, "rewards/margins": 19.66615867614746, "rewards/rejected": -31.963603973388672, "step": 750 }, { "epoch": 0.076, "grad_norm": 3.848186491683947e-10, "learning_rate": 3.8000000000000005e-06, "logits/chosen": -0.7417271137237549, "logits/rejected": -0.5415674448013306, "logps/chosen": -414.8028259277344, "logps/rejected": -559.133544921875, "loss": 0.1328, "rewards/accuracies": 1.0, "rewards/chosen": -11.061386108398438, "rewards/margins": 16.20886993408203, "rewards/rejected": -27.270259857177734, "step": 760 }, { "epoch": 0.077, "grad_norm": 7.364305562931883e-13, "learning_rate": 3.85e-06, "logits/chosen": -1.117499589920044, "logits/rejected": -0.2406352460384369, "logps/chosen": -344.18463134765625, "logps/rejected": -646.6490478515625, "loss": 0.0194, "rewards/accuracies": 1.0, "rewards/chosen": -13.47022819519043, "rewards/margins": 19.650503158569336, "rewards/rejected": -33.12073516845703, "step": 770 }, { "epoch": 0.078, "grad_norm": 0.0572834387421608, "learning_rate": 3.900000000000001e-06, "logits/chosen": -1.432673454284668, "logits/rejected": -0.17732997238636017, "logps/chosen": -254.5869903564453, "logps/rejected": -625.7298583984375, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -10.25689697265625, "rewards/margins": 25.62160301208496, "rewards/rejected": -35.878501892089844, "step": 780 }, { "epoch": 0.079, "grad_norm": 6.07656394820133e-13, "learning_rate": 3.95e-06, "logits/chosen": -0.8815616369247437, "logits/rejected": -0.30584144592285156, "logps/chosen": -766.3916625976562, "logps/rejected": -845.8123779296875, "loss": 0.1973, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.423290252685547, "rewards/margins": 19.526325225830078, "rewards/rejected": -35.94961166381836, "step": 790 }, { "epoch": 0.08, "grad_norm": 3.452759151782028e-20, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.6766525506973267, "logits/rejected": -0.2753424346446991, "logps/chosen": -246.43896484375, "logps/rejected": -725.5803833007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.301102638244629, "rewards/margins": 28.94573974609375, "rewards/rejected": -41.2468376159668, "step": 800 }, { "epoch": 0.081, "grad_norm": 6.12564349466993e-07, "learning_rate": 4.05e-06, "logits/chosen": -0.8568048477172852, "logits/rejected": -0.28903594613075256, "logps/chosen": -604.75244140625, "logps/rejected": -807.2450561523438, "loss": 0.1291, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.469955444335938, "rewards/margins": 20.681560516357422, "rewards/rejected": -42.151512145996094, "step": 810 }, { "epoch": 0.082, "grad_norm": 0.0315057635307312, "learning_rate": 4.1e-06, "logits/chosen": -0.8729526400566101, "logits/rejected": -0.36734262108802795, "logps/chosen": -472.77264404296875, "logps/rejected": -788.4251708984375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -15.225198745727539, "rewards/margins": 28.6204776763916, "rewards/rejected": -43.845680236816406, "step": 820 }, { "epoch": 0.083, "grad_norm": 2.3944246768951416, "learning_rate": 4.15e-06, "logits/chosen": -1.023874044418335, "logits/rejected": -0.07073228061199188, "logps/chosen": -164.30848693847656, "logps/rejected": -547.1531982421875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -7.095312595367432, "rewards/margins": 27.135452270507812, "rewards/rejected": -34.23076629638672, "step": 830 }, { "epoch": 0.084, "grad_norm": 0.006124300882220268, "learning_rate": 4.2000000000000004e-06, "logits/chosen": -0.9126752614974976, "logits/rejected": -0.3035658299922943, "logps/chosen": -397.1790466308594, "logps/rejected": -735.6592407226562, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -18.258554458618164, "rewards/margins": 26.314579010009766, "rewards/rejected": -44.57312774658203, "step": 840 }, { "epoch": 0.085, "grad_norm": 3.089939588841582e-18, "learning_rate": 4.25e-06, "logits/chosen": -1.015491247177124, "logits/rejected": -0.30377858877182007, "logps/chosen": -352.0212097167969, "logps/rejected": -681.9417724609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.543761253356934, "rewards/margins": 27.535030364990234, "rewards/rejected": -39.07879638671875, "step": 850 }, { "epoch": 0.086, "grad_norm": 2.16502828340149e-15, "learning_rate": 4.3e-06, "logits/chosen": -1.081601858139038, "logits/rejected": -0.15704300999641418, "logps/chosen": -481.53973388671875, "logps/rejected": -850.2933349609375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -18.392127990722656, "rewards/margins": 25.024600982666016, "rewards/rejected": -43.416725158691406, "step": 860 }, { "epoch": 0.087, "grad_norm": 4.893959339824505e-05, "learning_rate": 4.350000000000001e-06, "logits/chosen": -1.2104352712631226, "logits/rejected": -0.07070871442556381, "logps/chosen": -370.2465515136719, "logps/rejected": -821.4221801757812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -15.73651123046875, "rewards/margins": 32.19016647338867, "rewards/rejected": -47.92667770385742, "step": 870 }, { "epoch": 0.088, "grad_norm": 1201.88623046875, "learning_rate": 4.4e-06, "logits/chosen": -1.0273677110671997, "logits/rejected": -0.21135012805461884, "logps/chosen": -353.56280517578125, "logps/rejected": -722.2070922851562, "loss": 1.691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.636672973632812, "rewards/margins": 27.89716148376465, "rewards/rejected": -46.533836364746094, "step": 880 }, { "epoch": 0.089, "grad_norm": 3.308795930320285e-15, "learning_rate": 4.450000000000001e-06, "logits/chosen": -1.327782154083252, "logits/rejected": -0.0714651569724083, "logps/chosen": -502.51007080078125, "logps/rejected": -904.955078125, "loss": 1.6639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -24.43936538696289, "rewards/margins": 28.087310791015625, "rewards/rejected": -52.52668380737305, "step": 890 }, { "epoch": 0.09, "grad_norm": 4.506963812572762e-12, "learning_rate": 4.5e-06, "logits/chosen": -1.178138017654419, "logits/rejected": -0.036952096968889236, "logps/chosen": -337.31756591796875, "logps/rejected": -817.9973754882812, "loss": 0.1378, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.041440963745117, "rewards/margins": 30.560115814208984, "rewards/rejected": -45.60155487060547, "step": 900 }, { "epoch": 0.091, "grad_norm": 5.517401950783096e-05, "learning_rate": 4.5500000000000005e-06, "logits/chosen": -1.073327898979187, "logits/rejected": -0.4490521550178528, "logps/chosen": -490.901611328125, "logps/rejected": -721.9581909179688, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -14.316640853881836, "rewards/margins": 29.4993953704834, "rewards/rejected": -43.8160400390625, "step": 910 }, { "epoch": 0.092, "grad_norm": 3.1243777994617106e-15, "learning_rate": 4.600000000000001e-06, "logits/chosen": -1.8327325582504272, "logits/rejected": -0.01891680620610714, "logps/chosen": -203.77676391601562, "logps/rejected": -841.4299926757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.827959060668945, "rewards/margins": 43.51868438720703, "rewards/rejected": -53.34663772583008, "step": 920 }, { "epoch": 0.093, "grad_norm": 3.558796279889975e-08, "learning_rate": 4.65e-06, "logits/chosen": -1.027311086654663, "logits/rejected": -0.20076104998588562, "logps/chosen": -326.1154479980469, "logps/rejected": -669.1527099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.236804962158203, "rewards/margins": 28.084369659423828, "rewards/rejected": -40.32117462158203, "step": 930 }, { "epoch": 0.094, "grad_norm": 7.486784133018432e-23, "learning_rate": 4.7e-06, "logits/chosen": -0.8670459985733032, "logits/rejected": -0.19873929023742676, "logps/chosen": -254.91256713867188, "logps/rejected": -628.9371948242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.05659008026123, "rewards/margins": 27.286083221435547, "rewards/rejected": -38.342674255371094, "step": 940 }, { "epoch": 0.095, "grad_norm": 4.481721733536244e-16, "learning_rate": 4.75e-06, "logits/chosen": -1.0405689477920532, "logits/rejected": -0.050393976271152496, "logps/chosen": -293.336669921875, "logps/rejected": -628.1806640625, "loss": 0.8787, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.126751899719238, "rewards/margins": 26.10247802734375, "rewards/rejected": -39.229225158691406, "step": 950 }, { "epoch": 0.096, "grad_norm": 0.6695166826248169, "learning_rate": 4.800000000000001e-06, "logits/chosen": -1.4227111339569092, "logits/rejected": -0.11280278861522675, "logps/chosen": -249.92056274414062, "logps/rejected": -675.5670776367188, "loss": 0.2883, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.505855560302734, "rewards/margins": 25.19675064086914, "rewards/rejected": -34.702606201171875, "step": 960 }, { "epoch": 0.097, "grad_norm": 4.120292729226094e-08, "learning_rate": 4.85e-06, "logits/chosen": -1.407098412513733, "logits/rejected": -0.2420966923236847, "logps/chosen": -257.65374755859375, "logps/rejected": -694.7686157226562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.209831237792969, "rewards/margins": 27.865127563476562, "rewards/rejected": -38.07495880126953, "step": 970 }, { "epoch": 0.098, "grad_norm": 1.1203562983556366e-19, "learning_rate": 4.9000000000000005e-06, "logits/chosen": -1.0926339626312256, "logits/rejected": -0.24714651703834534, "logps/chosen": -388.4772644042969, "logps/rejected": -831.8443603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.813573837280273, "rewards/margins": 32.880027770996094, "rewards/rejected": -43.693603515625, "step": 980 }, { "epoch": 0.099, "grad_norm": 0.03495605289936066, "learning_rate": 4.95e-06, "logits/chosen": -1.2097073793411255, "logits/rejected": -0.46265825629234314, "logps/chosen": -424.83245849609375, "logps/rejected": -783.1539916992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.046907424926758, "rewards/margins": 30.06099510192871, "rewards/rejected": -40.10790252685547, "step": 990 }, { "epoch": 0.1, "grad_norm": 1.0239310510584687e-09, "learning_rate": 5e-06, "logits/chosen": -1.1762347221374512, "logits/rejected": -0.045468103140592575, "logps/chosen": -182.35528564453125, "logps/rejected": -574.6000366210938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.4294304847717285, "rewards/margins": 24.870960235595703, "rewards/rejected": -31.30039405822754, "step": 1000 }, { "epoch": 0.101, "grad_norm": 0.08554869890213013, "learning_rate": 4.999984769144476e-06, "logits/chosen": -1.0135505199432373, "logits/rejected": -0.1592954695224762, "logps/chosen": -429.33612060546875, "logps/rejected": -704.2899169921875, "loss": 0.3016, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.181246757507324, "rewards/margins": 22.7156925201416, "rewards/rejected": -32.89693832397461, "step": 1010 }, { "epoch": 0.102, "grad_norm": 202.30699157714844, "learning_rate": 4.999939076763487e-06, "logits/chosen": -1.2693045139312744, "logits/rejected": -0.2478354275226593, "logps/chosen": -160.41709899902344, "logps/rejected": -502.111083984375, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": -7.172953128814697, "rewards/margins": 22.934494018554688, "rewards/rejected": -30.107446670532227, "step": 1020 }, { "epoch": 0.103, "grad_norm": 6.811764317040607e-16, "learning_rate": 4.999862923413781e-06, "logits/chosen": -0.9106602668762207, "logits/rejected": -0.4323008060455322, "logps/chosen": -550.83837890625, "logps/rejected": -812.7847900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.511116981506348, "rewards/margins": 26.43093490600586, "rewards/rejected": -34.94205093383789, "step": 1030 }, { "epoch": 0.104, "grad_norm": 6.298041515390151e-13, "learning_rate": 4.999756310023261e-06, "logits/chosen": -1.1837074756622314, "logits/rejected": -0.42569518089294434, "logps/chosen": -483.1153259277344, "logps/rejected": -856.2667236328125, "loss": 1.8217, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.297500610351562, "rewards/margins": 29.203968048095703, "rewards/rejected": -42.501468658447266, "step": 1040 }, { "epoch": 0.105, "grad_norm": 0.015196479856967926, "learning_rate": 4.9996192378909785e-06, "logits/chosen": -1.2038367986679077, "logits/rejected": -0.36220741271972656, "logps/chosen": -272.37530517578125, "logps/rejected": -554.7415161132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.584593772888184, "rewards/margins": 23.410310745239258, "rewards/rejected": -28.994903564453125, "step": 1050 }, { "epoch": 0.106, "grad_norm": 0.07376130670309067, "learning_rate": 4.999451708687114e-06, "logits/chosen": -1.2006012201309204, "logits/rejected": -0.46303287148475647, "logps/chosen": -300.4767761230469, "logps/rejected": -579.3289794921875, "loss": 1.0262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.391415119171143, "rewards/margins": 13.561741828918457, "rewards/rejected": -18.953155517578125, "step": 1060 }, { "epoch": 0.107, "grad_norm": 5.374231726307244e-09, "learning_rate": 4.9992537244529585e-06, "logits/chosen": -0.9604480862617493, "logits/rejected": -0.30329519510269165, "logps/chosen": -316.0964660644531, "logps/rejected": -505.31396484375, "loss": 0.0341, "rewards/accuracies": 1.0, "rewards/chosen": -4.198069095611572, "rewards/margins": 17.348926544189453, "rewards/rejected": -21.546995162963867, "step": 1070 }, { "epoch": 0.108, "grad_norm": 0.00011263292981311679, "learning_rate": 4.999025287600886e-06, "logits/chosen": -1.090127944946289, "logits/rejected": -0.5740963220596313, "logps/chosen": -243.5624237060547, "logps/rejected": -525.05126953125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -6.5334296226501465, "rewards/margins": 18.794811248779297, "rewards/rejected": -25.3282413482666, "step": 1080 }, { "epoch": 0.109, "grad_norm": 112.01258850097656, "learning_rate": 4.998766400914329e-06, "logits/chosen": -1.1208736896514893, "logits/rejected": -0.2704886198043823, "logps/chosen": -215.65731811523438, "logps/rejected": -516.1124267578125, "loss": 0.0734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.3899407386779785, "rewards/margins": 19.51913070678711, "rewards/rejected": -25.909076690673828, "step": 1090 }, { "epoch": 0.11, "grad_norm": 2.881012919550563e-12, "learning_rate": 4.99847706754774e-06, "logits/chosen": -1.248564600944519, "logits/rejected": -0.23786959052085876, "logps/chosen": -295.33233642578125, "logps/rejected": -660.6017456054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.113610744476318, "rewards/margins": 29.882919311523438, "rewards/rejected": -35.99652862548828, "step": 1100 }, { "epoch": 0.111, "grad_norm": 8.900973014203117e-11, "learning_rate": 4.998157291026553e-06, "logits/chosen": -0.8095799684524536, "logits/rejected": -0.5679833889007568, "logps/chosen": -364.50518798828125, "logps/rejected": -604.2783203125, "loss": 1.2719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -10.384870529174805, "rewards/margins": 17.38606834411621, "rewards/rejected": -27.77094078063965, "step": 1110 }, { "epoch": 0.112, "grad_norm": 1.76853864886084e-10, "learning_rate": 4.997807075247147e-06, "logits/chosen": -1.0766984224319458, "logits/rejected": -0.40137988328933716, "logps/chosen": -344.74493408203125, "logps/rejected": -616.8096923828125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -6.687139987945557, "rewards/margins": 19.36139488220215, "rewards/rejected": -26.048538208007812, "step": 1120 }, { "epoch": 0.113, "grad_norm": 0.18956993520259857, "learning_rate": 4.997426424476787e-06, "logits/chosen": -1.1012532711029053, "logits/rejected": -0.47621792554855347, "logps/chosen": -413.59075927734375, "logps/rejected": -553.7152099609375, "loss": 0.0778, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.19697380065918, "rewards/margins": 17.737911224365234, "rewards/rejected": -22.934885025024414, "step": 1130 }, { "epoch": 0.114, "grad_norm": 4.789621829986572, "learning_rate": 4.9970153433535855e-06, "logits/chosen": -0.9956803321838379, "logits/rejected": -0.15662881731987, "logps/chosen": -178.8624725341797, "logps/rejected": -406.60443115234375, "loss": 0.0942, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.233434200286865, "rewards/margins": 15.290153503417969, "rewards/rejected": -21.523588180541992, "step": 1140 }, { "epoch": 0.115, "grad_norm": 1.3092710560158594e-07, "learning_rate": 4.9965738368864345e-06, "logits/chosen": -1.0208417177200317, "logits/rejected": -0.3284724950790405, "logps/chosen": -319.4895935058594, "logps/rejected": -578.760009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.61468505859375, "rewards/margins": 19.40301513671875, "rewards/rejected": -28.017696380615234, "step": 1150 }, { "epoch": 0.116, "grad_norm": 2.5855173589661717e-05, "learning_rate": 4.996101910454953e-06, "logits/chosen": -1.500450611114502, "logits/rejected": -0.21137702465057373, "logps/chosen": -271.80279541015625, "logps/rejected": -712.6658935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.964298248291016, "rewards/margins": 27.46476173400879, "rewards/rejected": -36.42905807495117, "step": 1160 }, { "epoch": 0.117, "grad_norm": 3.229709277796644e-10, "learning_rate": 4.995599569809414e-06, "logits/chosen": -1.355883240699768, "logits/rejected": -0.21876247227191925, "logps/chosen": -178.31338500976562, "logps/rejected": -683.889892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.416735649108887, "rewards/margins": 30.885372161865234, "rewards/rejected": -38.30210876464844, "step": 1170 }, { "epoch": 0.118, "grad_norm": 0.000150295440107584, "learning_rate": 4.9950668210706795e-06, "logits/chosen": -0.8052603602409363, "logits/rejected": -0.5494809746742249, "logps/chosen": -378.8117370605469, "logps/rejected": -612.0853271484375, "loss": 0.268, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.668352127075195, "rewards/margins": 22.803325653076172, "rewards/rejected": -33.4716796875, "step": 1180 }, { "epoch": 0.119, "grad_norm": 9.76726077794865e-09, "learning_rate": 4.994503670730126e-06, "logits/chosen": -0.9996203184127808, "logits/rejected": -0.3798294961452484, "logps/chosen": -486.4266052246094, "logps/rejected": -707.1964111328125, "loss": 0.2434, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.749849319458008, "rewards/margins": 24.026813507080078, "rewards/rejected": -35.77666473388672, "step": 1190 }, { "epoch": 0.12, "grad_norm": 7.2385314409118e-10, "learning_rate": 4.993910125649561e-06, "logits/chosen": -1.1892848014831543, "logits/rejected": -0.22287265956401825, "logps/chosen": -295.4376525878906, "logps/rejected": -631.9881591796875, "loss": 0.0264, "rewards/accuracies": 1.0, "rewards/chosen": -5.681472301483154, "rewards/margins": 18.257129669189453, "rewards/rejected": -23.9385986328125, "step": 1200 }, { "epoch": 0.121, "grad_norm": 4.749460824626794e-10, "learning_rate": 4.993286193061145e-06, "logits/chosen": -1.0906132459640503, "logits/rejected": -0.20747146010398865, "logps/chosen": -257.7426452636719, "logps/rejected": -655.1525268554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.186422348022461, "rewards/margins": 27.005573272705078, "rewards/rejected": -35.19198989868164, "step": 1210 }, { "epoch": 0.122, "grad_norm": 6.223758930445911e-08, "learning_rate": 4.992631880567301e-06, "logits/chosen": -1.656432867050171, "logits/rejected": -0.3102510869503021, "logps/chosen": -348.0914306640625, "logps/rejected": -825.349609375, "loss": 0.0834, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.202226638793945, "rewards/margins": 29.386104583740234, "rewards/rejected": -39.58833312988281, "step": 1220 }, { "epoch": 0.123, "grad_norm": 1.3237407074484508e-05, "learning_rate": 4.991947196140619e-06, "logits/chosen": -1.154322624206543, "logits/rejected": -0.4798669219017029, "logps/chosen": -273.0415344238281, "logps/rejected": -574.0501708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.697595119476318, "rewards/margins": 23.833728790283203, "rewards/rejected": -30.531320571899414, "step": 1230 }, { "epoch": 0.124, "grad_norm": 0.009432843886315823, "learning_rate": 4.9912321481237616e-06, "logits/chosen": -1.3529380559921265, "logits/rejected": -0.28025001287460327, "logps/chosen": -214.38296508789062, "logps/rejected": -579.0044555664062, "loss": 0.0135, "rewards/accuracies": 1.0, "rewards/chosen": -5.91778564453125, "rewards/margins": 20.354816436767578, "rewards/rejected": -26.27260398864746, "step": 1240 }, { "epoch": 0.125, "grad_norm": 1.7649913475192847e-10, "learning_rate": 4.990486745229364e-06, "logits/chosen": -1.2412010431289673, "logits/rejected": -0.19147519767284393, "logps/chosen": -286.7811279296875, "logps/rejected": -753.50341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.454304218292236, "rewards/margins": 29.799936294555664, "rewards/rejected": -35.25423812866211, "step": 1250 }, { "epoch": 0.126, "grad_norm": 1.9105982074218986e-10, "learning_rate": 4.989710996539926e-06, "logits/chosen": -1.4346072673797607, "logits/rejected": -0.3352917730808258, "logps/chosen": -304.7495422363281, "logps/rejected": -753.637939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.5735673904418945, "rewards/margins": 38.21772003173828, "rewards/rejected": -42.791290283203125, "step": 1260 }, { "epoch": 0.127, "grad_norm": 2.4584993596477034e-08, "learning_rate": 4.9889049115077e-06, "logits/chosen": -1.0849168300628662, "logits/rejected": -0.21878328919410706, "logps/chosen": -340.9671325683594, "logps/rejected": -746.2839965820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.222436904907227, "rewards/margins": 35.21474838256836, "rewards/rejected": -43.437191009521484, "step": 1270 }, { "epoch": 0.128, "grad_norm": 9.595669325790368e-07, "learning_rate": 4.988068499954578e-06, "logits/chosen": -1.0678956508636475, "logits/rejected": -0.3092970848083496, "logps/chosen": -186.111572265625, "logps/rejected": -427.2423400878906, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -6.491952419281006, "rewards/margins": 16.506975173950195, "rewards/rejected": -22.998926162719727, "step": 1280 }, { "epoch": 0.129, "grad_norm": 6.616324241953686e-12, "learning_rate": 4.987201772071971e-06, "logits/chosen": -0.824511706829071, "logits/rejected": -0.7110381126403809, "logps/chosen": -341.7249450683594, "logps/rejected": -489.94921875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -6.455601692199707, "rewards/margins": 21.240055084228516, "rewards/rejected": -27.69565773010254, "step": 1290 }, { "epoch": 0.13, "grad_norm": 0.00014927101437933743, "learning_rate": 4.986304738420684e-06, "logits/chosen": -1.0902425050735474, "logits/rejected": -0.05907214805483818, "logps/chosen": -247.59384155273438, "logps/rejected": -632.3173828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.22404956817627, "rewards/margins": 30.62563705444336, "rewards/rejected": -39.84968566894531, "step": 1300 }, { "epoch": 0.131, "grad_norm": 0.0, "learning_rate": 4.985377409930789e-06, "logits/chosen": -1.1010768413543701, "logits/rejected": -0.34575071930885315, "logps/chosen": -489.7151794433594, "logps/rejected": -749.903564453125, "loss": 1.3087, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.157520294189453, "rewards/margins": 30.689640045166016, "rewards/rejected": -39.84716033935547, "step": 1310 }, { "epoch": 0.132, "grad_norm": 1.2251684909647675e-11, "learning_rate": 4.984419797901491e-06, "logits/chosen": -0.7680908441543579, "logits/rejected": -0.5849398374557495, "logps/chosen": -323.7127685546875, "logps/rejected": -463.91302490234375, "loss": 0.1654, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.299878120422363, "rewards/margins": 16.57649040222168, "rewards/rejected": -25.876373291015625, "step": 1320 }, { "epoch": 0.133, "grad_norm": 4.208574894831729e-12, "learning_rate": 4.983431914000991e-06, "logits/chosen": -0.7924326658248901, "logits/rejected": -0.30698415637016296, "logps/chosen": -515.7220458984375, "logps/rejected": -723.1101684570312, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": -14.746968269348145, "rewards/margins": 22.760456085205078, "rewards/rejected": -37.507423400878906, "step": 1330 }, { "epoch": 0.134, "grad_norm": 1.1323597230195467e-19, "learning_rate": 4.9824137702663424e-06, "logits/chosen": -1.2904155254364014, "logits/rejected": 0.13465338945388794, "logps/chosen": -374.4416198730469, "logps/rejected": -968.5813598632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.409658432006836, "rewards/margins": 43.10974884033203, "rewards/rejected": -60.5194091796875, "step": 1340 }, { "epoch": 0.135, "grad_norm": 1.4024823240688794e-13, "learning_rate": 4.981365379103306e-06, "logits/chosen": -0.8922684788703918, "logits/rejected": -0.23835989832878113, "logps/chosen": -437.84259033203125, "logps/rejected": -763.39013671875, "loss": 0.765, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.31096649169922, "rewards/margins": 28.40448570251465, "rewards/rejected": -47.7154541015625, "step": 1350 }, { "epoch": 0.136, "grad_norm": 8.513531676510033e-13, "learning_rate": 4.980286753286196e-06, "logits/chosen": -1.1230740547180176, "logits/rejected": -0.06482603400945663, "logps/chosen": -502.35980224609375, "logps/rejected": -846.9089965820312, "loss": 0.2196, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.695874214172363, "rewards/margins": 28.752582550048828, "rewards/rejected": -40.448455810546875, "step": 1360 }, { "epoch": 0.137, "grad_norm": 3.9868555505584435e-12, "learning_rate": 4.979177905957726e-06, "logits/chosen": -0.9407769441604614, "logits/rejected": 0.15109823644161224, "logps/chosen": -332.65191650390625, "logps/rejected": -824.77392578125, "loss": 0.4068, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.708237648010254, "rewards/margins": 37.805335998535156, "rewards/rejected": -45.513572692871094, "step": 1370 }, { "epoch": 0.138, "grad_norm": 0.012242639437317848, "learning_rate": 4.978038850628855e-06, "logits/chosen": -0.960767924785614, "logits/rejected": -0.6809446811676025, "logps/chosen": -340.8134765625, "logps/rejected": -660.6018676757812, "loss": 1.068, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.148280143737793, "rewards/margins": 26.553054809570312, "rewards/rejected": -38.70133590698242, "step": 1380 }, { "epoch": 0.139, "grad_norm": 1.2719906408165116e-05, "learning_rate": 4.9768696011786095e-06, "logits/chosen": -1.1656242609024048, "logits/rejected": -0.02300162985920906, "logps/chosen": -227.0721893310547, "logps/rejected": -624.4703369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.238847732543945, "rewards/margins": 24.26767921447754, "rewards/rejected": -34.50652313232422, "step": 1390 }, { "epoch": 0.14, "grad_norm": 18.823410034179688, "learning_rate": 4.975670171853926e-06, "logits/chosen": -1.070709466934204, "logits/rejected": -0.17539706826210022, "logps/chosen": -367.56500244140625, "logps/rejected": -743.7613525390625, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -7.786466121673584, "rewards/margins": 26.3763484954834, "rewards/rejected": -34.162811279296875, "step": 1400 }, { "epoch": 0.141, "grad_norm": 0.0017390275606885552, "learning_rate": 4.974440577269473e-06, "logits/chosen": -0.8289377093315125, "logits/rejected": -0.35058996081352234, "logps/chosen": -452.47021484375, "logps/rejected": -699.7356567382812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -9.515665054321289, "rewards/margins": 25.03122329711914, "rewards/rejected": -34.54689025878906, "step": 1410 }, { "epoch": 0.142, "grad_norm": 4.752753739012405e-05, "learning_rate": 4.973180832407471e-06, "logits/chosen": -0.5075998902320862, "logits/rejected": -0.3585384786128998, "logps/chosen": -574.3099365234375, "logps/rejected": -675.3820190429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.680159568786621, "rewards/margins": 25.572025299072266, "rewards/rejected": -33.25218200683594, "step": 1420 }, { "epoch": 0.143, "grad_norm": 2.4035329156310446e-17, "learning_rate": 4.971890952617515e-06, "logits/chosen": -1.3064539432525635, "logits/rejected": 0.013357448391616344, "logps/chosen": -317.8672790527344, "logps/rejected": -717.834716796875, "loss": 0.399, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.242793083190918, "rewards/margins": 26.032501220703125, "rewards/rejected": -36.275291442871094, "step": 1430 }, { "epoch": 0.144, "grad_norm": 6.969142060317401e-13, "learning_rate": 4.970570953616383e-06, "logits/chosen": -1.1992871761322021, "logits/rejected": -0.16932205855846405, "logps/chosen": -274.62652587890625, "logps/rejected": -713.0188598632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.339066505432129, "rewards/margins": 26.44083595275879, "rewards/rejected": -34.77989959716797, "step": 1440 }, { "epoch": 0.145, "grad_norm": 2.2085606946926776e-15, "learning_rate": 4.9692208514878445e-06, "logits/chosen": -1.0547511577606201, "logits/rejected": -0.20317073166370392, "logps/chosen": -360.881103515625, "logps/rejected": -724.4630737304688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.842625141143799, "rewards/margins": 26.726482391357422, "rewards/rejected": -32.5691032409668, "step": 1450 }, { "epoch": 0.146, "grad_norm": 1.7620060965839457e-09, "learning_rate": 4.96784066268247e-06, "logits/chosen": -1.0773088932037354, "logits/rejected": -0.14823777973651886, "logps/chosen": -206.27163696289062, "logps/rejected": -585.3616333007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.059189796447754, "rewards/margins": 25.024410247802734, "rewards/rejected": -31.083599090576172, "step": 1460 }, { "epoch": 0.147, "grad_norm": 5.868219886906445e-05, "learning_rate": 4.966430404017424e-06, "logits/chosen": -0.9046875238418579, "logits/rejected": -0.4450520873069763, "logps/chosen": -223.2705535888672, "logps/rejected": -525.7073974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.235110282897949, "rewards/margins": 22.660573959350586, "rewards/rejected": -28.89568519592285, "step": 1470 }, { "epoch": 0.148, "grad_norm": 7.98956989456201e-09, "learning_rate": 4.964990092676263e-06, "logits/chosen": -1.0882080793380737, "logits/rejected": -0.07989266514778137, "logps/chosen": -289.68768310546875, "logps/rejected": -685.7807006835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.977403163909912, "rewards/margins": 24.825477600097656, "rewards/rejected": -31.802881240844727, "step": 1480 }, { "epoch": 0.149, "grad_norm": 3.933014531458737e-14, "learning_rate": 4.963519746208726e-06, "logits/chosen": -1.6329920291900635, "logits/rejected": -0.03951167315244675, "logps/chosen": -350.8020935058594, "logps/rejected": -891.4786987304688, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -4.784034729003906, "rewards/margins": 27.927047729492188, "rewards/rejected": -32.711082458496094, "step": 1490 }, { "epoch": 0.15, "grad_norm": 2.4590647220611572, "learning_rate": 4.962019382530521e-06, "logits/chosen": -0.8447348475456238, "logits/rejected": -0.46400079131126404, "logps/chosen": -466.80584716796875, "logps/rejected": -604.7357788085938, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -6.835474967956543, "rewards/margins": 16.11906623840332, "rewards/rejected": -22.954544067382812, "step": 1500 }, { "epoch": 0.151, "grad_norm": 6.710806227123306e-14, "learning_rate": 4.960489019923105e-06, "logits/chosen": -1.0921003818511963, "logits/rejected": -0.08635418117046356, "logps/chosen": -269.69671630859375, "logps/rejected": -679.5653686523438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.616307258605957, "rewards/margins": 28.96999740600586, "rewards/rejected": -35.5863037109375, "step": 1510 }, { "epoch": 0.152, "grad_norm": 3.868865228184859e-10, "learning_rate": 4.958928677033465e-06, "logits/chosen": -1.3451675176620483, "logits/rejected": -0.0031303453724831343, "logps/chosen": -277.45928955078125, "logps/rejected": -788.315673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.414937973022461, "rewards/margins": 32.910972595214844, "rewards/rejected": -41.3259162902832, "step": 1520 }, { "epoch": 0.153, "grad_norm": 3.853562976531555e-13, "learning_rate": 4.957338372873886e-06, "logits/chosen": -0.8953266143798828, "logits/rejected": -0.23004481196403503, "logps/chosen": -368.2197265625, "logps/rejected": -699.2587280273438, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -8.798627853393555, "rewards/margins": 28.176212310791016, "rewards/rejected": -36.97483444213867, "step": 1530 }, { "epoch": 0.154, "grad_norm": 3.0009095668792725, "learning_rate": 4.9557181268217225e-06, "logits/chosen": -1.0614800453186035, "logits/rejected": -0.3679501712322235, "logps/chosen": -328.5702819824219, "logps/rejected": -513.2587890625, "loss": 1.0402, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.790850639343262, "rewards/margins": 17.721195220947266, "rewards/rejected": -26.512048721313477, "step": 1540 }, { "epoch": 0.155, "grad_norm": 1.3953936096877673e-11, "learning_rate": 4.9540679586191605e-06, "logits/chosen": -0.8217973709106445, "logits/rejected": -0.2370542585849762, "logps/chosen": -196.01568603515625, "logps/rejected": -472.8330078125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.229257106781006, "rewards/margins": 22.110279083251953, "rewards/rejected": -26.33953857421875, "step": 1550 }, { "epoch": 0.156, "grad_norm": 5.3244052141692783e-20, "learning_rate": 4.9523878883729794e-06, "logits/chosen": -1.1830456256866455, "logits/rejected": -0.013311699032783508, "logps/chosen": -356.19744873046875, "logps/rejected": -797.1712646484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.410015106201172, "rewards/margins": 31.568416595458984, "rewards/rejected": -39.978431701660156, "step": 1560 }, { "epoch": 0.157, "grad_norm": 8.391878054681001e-07, "learning_rate": 4.9506779365543054e-06, "logits/chosen": -0.6549090147018433, "logits/rejected": -0.2016439139842987, "logps/chosen": -334.9619140625, "logps/rejected": -612.3310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.262808799743652, "rewards/margins": 28.336029052734375, "rewards/rejected": -35.598838806152344, "step": 1570 }, { "epoch": 0.158, "grad_norm": 105.29165649414062, "learning_rate": 4.94893812399836e-06, "logits/chosen": -1.018554449081421, "logits/rejected": -0.3415969908237457, "logps/chosen": -261.0390625, "logps/rejected": -572.3067626953125, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -6.529544830322266, "rewards/margins": 26.439651489257812, "rewards/rejected": -32.96919631958008, "step": 1580 }, { "epoch": 0.159, "grad_norm": 0.03807740658521652, "learning_rate": 4.947168471904213e-06, "logits/chosen": -0.9052284955978394, "logits/rejected": -0.2753170132637024, "logps/chosen": -466.19232177734375, "logps/rejected": -726.24755859375, "loss": 0.0445, "rewards/accuracies": 1.0, "rewards/chosen": -9.326812744140625, "rewards/margins": 23.40597152709961, "rewards/rejected": -32.732784271240234, "step": 1590 }, { "epoch": 0.16, "grad_norm": 1.2223373897259082e-13, "learning_rate": 4.9453690018345144e-06, "logits/chosen": -1.063926100730896, "logits/rejected": -0.015577336773276329, "logps/chosen": -194.1289825439453, "logps/rejected": -624.2291259765625, "loss": 0.0661, "rewards/accuracies": 1.0, "rewards/chosen": -6.718228340148926, "rewards/margins": 30.950618743896484, "rewards/rejected": -37.66884994506836, "step": 1600 }, { "epoch": 0.161, "grad_norm": 3.4670115628236686e-13, "learning_rate": 4.9435397357152406e-06, "logits/chosen": -0.7654945850372314, "logits/rejected": -0.07977879047393799, "logps/chosen": -287.39947509765625, "logps/rejected": -611.6721801757812, "loss": 0.0506, "rewards/accuracies": 1.0, "rewards/chosen": -11.743795394897461, "rewards/margins": 24.993785858154297, "rewards/rejected": -36.737579345703125, "step": 1610 }, { "epoch": 0.162, "grad_norm": 5.27442256716182e-19, "learning_rate": 4.9416806958354206e-06, "logits/chosen": -1.0056555271148682, "logits/rejected": -0.032809026539325714, "logps/chosen": -188.92874145507812, "logps/rejected": -566.5960693359375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -7.255312442779541, "rewards/margins": 29.489330291748047, "rewards/rejected": -36.74464797973633, "step": 1620 }, { "epoch": 0.163, "grad_norm": 2.0571063841061388e-13, "learning_rate": 4.939791904846869e-06, "logits/chosen": -1.0535011291503906, "logits/rejected": 0.179019033908844, "logps/chosen": -235.6096649169922, "logps/rejected": -637.8615112304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.38923168182373, "rewards/margins": 27.224655151367188, "rewards/rejected": -37.61388397216797, "step": 1630 }, { "epoch": 0.164, "grad_norm": 0.007213903125375509, "learning_rate": 4.937873385763909e-06, "logits/chosen": -0.9827578663825989, "logits/rejected": 0.1371154934167862, "logps/chosen": -250.7228546142578, "logps/rejected": -675.5855712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.908210754394531, "rewards/margins": 28.934223175048828, "rewards/rejected": -36.842437744140625, "step": 1640 }, { "epoch": 0.165, "grad_norm": 0.0, "learning_rate": 4.935925161963089e-06, "logits/chosen": -0.8703482747077942, "logits/rejected": 0.06418517976999283, "logps/chosen": -367.58697509765625, "logps/rejected": -749.47509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.039400100708008, "rewards/margins": 35.043800354003906, "rewards/rejected": -49.08320236206055, "step": 1650 }, { "epoch": 0.166, "grad_norm": 6.4045049645578e-12, "learning_rate": 4.933947257182901e-06, "logits/chosen": -0.9856742024421692, "logits/rejected": 0.08307775110006332, "logps/chosen": -305.7294006347656, "logps/rejected": -904.21240234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.920564651489258, "rewards/margins": 46.30979919433594, "rewards/rejected": -55.23036575317383, "step": 1660 }, { "epoch": 0.167, "grad_norm": 1.2415427403392098e-22, "learning_rate": 4.9319396955234925e-06, "logits/chosen": -0.8148317337036133, "logits/rejected": -0.1875368058681488, "logps/chosen": -424.0921936035156, "logps/rejected": -860.6064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.587820053100586, "rewards/margins": 38.1686897277832, "rewards/rejected": -49.75651168823242, "step": 1670 }, { "epoch": 0.168, "grad_norm": 0.0003383158764336258, "learning_rate": 4.9299025014463665e-06, "logits/chosen": -1.0005062818527222, "logits/rejected": 0.22835354506969452, "logps/chosen": -508.1582946777344, "logps/rejected": -1031.61572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.51125717163086, "rewards/margins": 43.42790603637695, "rewards/rejected": -61.93916702270508, "step": 1680 }, { "epoch": 0.169, "grad_norm": 1.0438952626574613e-13, "learning_rate": 4.92783569977409e-06, "logits/chosen": -0.7535207867622375, "logits/rejected": 0.13958851993083954, "logps/chosen": -349.7854309082031, "logps/rejected": -842.4319458007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.62995433807373, "rewards/margins": 43.466609954833984, "rewards/rejected": -59.09656524658203, "step": 1690 }, { "epoch": 0.17, "grad_norm": 3.889030228090189e-15, "learning_rate": 4.925739315689991e-06, "logits/chosen": -0.6942230463027954, "logits/rejected": -0.04554635286331177, "logps/chosen": -540.0942993164062, "logps/rejected": -736.609130859375, "loss": 0.1435, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.868269920349121, "rewards/margins": 18.347684860229492, "rewards/rejected": -30.215953826904297, "step": 1700 }, { "epoch": 0.171, "grad_norm": 1.0066810395264331e-13, "learning_rate": 4.923613374737848e-06, "logits/chosen": -1.0682138204574585, "logits/rejected": 0.1531905233860016, "logps/chosen": -301.99566650390625, "logps/rejected": -800.46923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.131755828857422, "rewards/margins": 36.60453796386719, "rewards/rejected": -45.73629379272461, "step": 1710 }, { "epoch": 0.172, "grad_norm": 0.000590948446188122, "learning_rate": 4.921457902821578e-06, "logits/chosen": -1.2912073135375977, "logits/rejected": 0.09540309756994247, "logps/chosen": -261.936767578125, "logps/rejected": -738.4759521484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -6.333407402038574, "rewards/margins": 31.073110580444336, "rewards/rejected": -37.406517028808594, "step": 1720 }, { "epoch": 0.173, "grad_norm": 5.637766364863239e-10, "learning_rate": 4.9192729262049285e-06, "logits/chosen": -0.7073559165000916, "logits/rejected": 0.033843234181404114, "logps/chosen": -345.47894287109375, "logps/rejected": -699.8843994140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -13.881582260131836, "rewards/margins": 29.07110023498535, "rewards/rejected": -42.95268630981445, "step": 1730 }, { "epoch": 0.174, "grad_norm": 2.4128008023104536e-19, "learning_rate": 4.917058471511149e-06, "logits/chosen": -0.7510659694671631, "logits/rejected": -0.06710796803236008, "logps/chosen": -461.6541442871094, "logps/rejected": -844.5467529296875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -11.30162525177002, "rewards/margins": 33.08237838745117, "rewards/rejected": -44.38400650024414, "step": 1740 }, { "epoch": 0.175, "grad_norm": 9.65522123906729e-19, "learning_rate": 4.914814565722671e-06, "logits/chosen": -0.9065462350845337, "logits/rejected": 0.027768870815634727, "logps/chosen": -382.0977478027344, "logps/rejected": -884.6822509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.617523193359375, "rewards/margins": 45.349388122558594, "rewards/rejected": -57.96691131591797, "step": 1750 }, { "epoch": 0.176, "grad_norm": 1.2494966172837962e-09, "learning_rate": 4.912541236180779e-06, "logits/chosen": -0.8603243827819824, "logits/rejected": 0.08707042783498764, "logps/chosen": -439.8019104003906, "logps/rejected": -842.1697998046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.627752304077148, "rewards/margins": 33.403865814208984, "rewards/rejected": -47.0316162109375, "step": 1760 }, { "epoch": 0.177, "grad_norm": 1.0479344451455618e-15, "learning_rate": 4.910238510585275e-06, "logits/chosen": -1.1826056241989136, "logits/rejected": 0.29609158635139465, "logps/chosen": -289.4844665527344, "logps/rejected": -989.2132568359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.005671501159668, "rewards/margins": 54.07866287231445, "rewards/rejected": -67.08433532714844, "step": 1770 }, { "epoch": 0.178, "grad_norm": 3.945892224077596e-10, "learning_rate": 4.907906416994146e-06, "logits/chosen": -0.7862873077392578, "logits/rejected": 0.3076401948928833, "logps/chosen": -392.3423156738281, "logps/rejected": -1059.8935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.646921157836914, "rewards/margins": 50.55992889404297, "rewards/rejected": -64.20684814453125, "step": 1780 }, { "epoch": 0.179, "grad_norm": 1.2721311702071532e-14, "learning_rate": 4.905544983823214e-06, "logits/chosen": -0.8739240765571594, "logits/rejected": 0.2608945965766907, "logps/chosen": -400.95867919921875, "logps/rejected": -937.3480224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.51372718811035, "rewards/margins": 46.725791931152344, "rewards/rejected": -63.23952102661133, "step": 1790 }, { "epoch": 0.18, "grad_norm": 1.5841317382157312e-16, "learning_rate": 4.903154239845798e-06, "logits/chosen": -0.866929829120636, "logits/rejected": 0.0072061000391840935, "logps/chosen": -300.40277099609375, "logps/rejected": -879.0672607421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -12.66291618347168, "rewards/margins": 46.29401397705078, "rewards/rejected": -58.956932067871094, "step": 1800 }, { "epoch": 0.181, "grad_norm": 0.0, "learning_rate": 4.900734214192358e-06, "logits/chosen": -0.9717338681221008, "logits/rejected": 0.19119112193584442, "logps/chosen": -286.30841064453125, "logps/rejected": -806.5407104492188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.181900024414062, "rewards/margins": 40.651390075683594, "rewards/rejected": -50.833290100097656, "step": 1810 }, { "epoch": 0.182, "grad_norm": 0.03354150429368019, "learning_rate": 4.898284936350144e-06, "logits/chosen": -0.608110249042511, "logits/rejected": 0.10701987892389297, "logps/chosen": -460.31756591796875, "logps/rejected": -805.1393432617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.571718215942383, "rewards/margins": 26.98175621032715, "rewards/rejected": -45.55347442626953, "step": 1820 }, { "epoch": 0.183, "grad_norm": 4.13684983868734e-06, "learning_rate": 4.8958064361628334e-06, "logits/chosen": -0.7738394141197205, "logits/rejected": 0.21792516112327576, "logps/chosen": -432.724609375, "logps/rejected": -925.1007690429688, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": -14.735455513000488, "rewards/margins": 44.763633728027344, "rewards/rejected": -59.49909210205078, "step": 1830 }, { "epoch": 0.184, "grad_norm": 7.981440584233542e-16, "learning_rate": 4.893298743830168e-06, "logits/chosen": -0.6500649452209473, "logits/rejected": 0.09442566335201263, "logps/chosen": -341.99603271484375, "logps/rejected": -855.1522216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.761459350585938, "rewards/margins": 49.43480682373047, "rewards/rejected": -60.19626998901367, "step": 1840 }, { "epoch": 0.185, "grad_norm": 7.473514168632178e-11, "learning_rate": 4.890761889907589e-06, "logits/chosen": -0.5870779752731323, "logits/rejected": 0.15625113248825073, "logps/chosen": -409.39178466796875, "logps/rejected": -909.2233276367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.405719757080078, "rewards/margins": 41.44813537597656, "rewards/rejected": -59.853851318359375, "step": 1850 }, { "epoch": 0.186, "grad_norm": 4.363639305579245e-14, "learning_rate": 4.888195905305859e-06, "logits/chosen": -0.59303218126297, "logits/rejected": 0.18804967403411865, "logps/chosen": -403.5731506347656, "logps/rejected": -938.0885009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.057333946228027, "rewards/margins": 49.87173843383789, "rewards/rejected": -58.9290771484375, "step": 1860 }, { "epoch": 0.187, "grad_norm": 4.640702172764577e-06, "learning_rate": 4.885600821290692e-06, "logits/chosen": -0.6436842679977417, "logits/rejected": 0.3434585630893707, "logps/chosen": -349.7669372558594, "logps/rejected": -794.9974975585938, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -12.01045036315918, "rewards/margins": 38.5369987487793, "rewards/rejected": -50.54745101928711, "step": 1870 }, { "epoch": 0.188, "grad_norm": 2.3586930070771187e-14, "learning_rate": 4.882976669482368e-06, "logits/chosen": -1.0222156047821045, "logits/rejected": 0.18538489937782288, "logps/chosen": -431.86614990234375, "logps/rejected": -936.9752197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.006241798400879, "rewards/margins": 40.09410858154297, "rewards/rejected": -50.1003532409668, "step": 1880 }, { "epoch": 0.189, "grad_norm": 3.1524606411897062e-21, "learning_rate": 4.880323481855347e-06, "logits/chosen": -0.9518159031867981, "logits/rejected": 0.019279232248663902, "logps/chosen": -267.8340759277344, "logps/rejected": -740.6573486328125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -10.248370170593262, "rewards/margins": 34.88971710205078, "rewards/rejected": -45.138084411621094, "step": 1890 }, { "epoch": 0.19, "grad_norm": 1.8914584597745732e-19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -1.017465353012085, "logits/rejected": 0.13340437412261963, "logps/chosen": -412.73297119140625, "logps/rejected": -891.4244384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.114110946655273, "rewards/margins": 41.254695892333984, "rewards/rejected": -52.368812561035156, "step": 1900 }, { "epoch": 0.191, "grad_norm": 0.011506685987114906, "learning_rate": 4.874930128811631e-06, "logits/chosen": -1.0823355913162231, "logits/rejected": 0.020175794139504433, "logps/chosen": -399.35394287109375, "logps/rejected": -887.8450927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.699014663696289, "rewards/margins": 38.75952911376953, "rewards/rejected": -53.45853805541992, "step": 1910 }, { "epoch": 0.192, "grad_norm": 0.0, "learning_rate": 4.8721900291112415e-06, "logits/chosen": -0.6110566854476929, "logits/rejected": 0.08111194521188736, "logps/chosen": -354.4425964355469, "logps/rejected": -915.6475830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.294811248779297, "rewards/margins": 48.587364196777344, "rewards/rejected": -60.882171630859375, "step": 1920 }, { "epoch": 0.193, "grad_norm": 5.552347877824371e-22, "learning_rate": 4.869421025023965e-06, "logits/chosen": -1.0768553018569946, "logits/rejected": 0.2780148983001709, "logps/chosen": -283.3021545410156, "logps/rejected": -874.9591064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.546971321105957, "rewards/margins": 46.320335388183594, "rewards/rejected": -59.86730194091797, "step": 1930 }, { "epoch": 0.194, "grad_norm": 2.024776508438119e-17, "learning_rate": 4.866623150289241e-06, "logits/chosen": -1.423117756843567, "logits/rejected": 0.02501138485968113, "logps/chosen": -237.41915893554688, "logps/rejected": -842.1968994140625, "loss": 0.5865, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.444206237792969, "rewards/margins": 47.00324249267578, "rewards/rejected": -56.44744873046875, "step": 1940 }, { "epoch": 0.195, "grad_norm": 7.398041645956255e-08, "learning_rate": 4.863796438998293e-06, "logits/chosen": -0.9332197308540344, "logits/rejected": 0.01928057335317135, "logps/chosen": -153.62330627441406, "logps/rejected": -553.70361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.0888495445251465, "rewards/margins": 29.092769622802734, "rewards/rejected": -35.181617736816406, "step": 1950 }, { "epoch": 0.196, "grad_norm": 0.09112061560153961, "learning_rate": 4.860940925593703e-06, "logits/chosen": -0.7799338102340698, "logits/rejected": -0.06467507779598236, "logps/chosen": -520.5501708984375, "logps/rejected": -877.4957275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.793745040893555, "rewards/margins": 35.70417785644531, "rewards/rejected": -48.497928619384766, "step": 1960 }, { "epoch": 0.197, "grad_norm": 0.011287910863757133, "learning_rate": 4.858056644869002e-06, "logits/chosen": -0.8448853492736816, "logits/rejected": -0.1531025767326355, "logps/chosen": -388.1729431152344, "logps/rejected": -773.3935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.335466384887695, "rewards/margins": 34.20120620727539, "rewards/rejected": -46.53667068481445, "step": 1970 }, { "epoch": 0.198, "grad_norm": 2.031313246360152e-19, "learning_rate": 4.855143631968242e-06, "logits/chosen": -0.9060547947883606, "logits/rejected": 0.058340221643447876, "logps/chosen": -466.2227478027344, "logps/rejected": -1013.80908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.147106170654297, "rewards/margins": 42.48836135864258, "rewards/rejected": -53.635467529296875, "step": 1980 }, { "epoch": 0.199, "grad_norm": 3.3271295874631734e-12, "learning_rate": 4.852201922385564e-06, "logits/chosen": -1.5826104879379272, "logits/rejected": 0.2186201810836792, "logps/chosen": -353.6684875488281, "logps/rejected": -870.4959716796875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -8.283282279968262, "rewards/margins": 34.60364532470703, "rewards/rejected": -42.886932373046875, "step": 1990 }, { "epoch": 0.2, "grad_norm": 4.831719453110865e-16, "learning_rate": 4.849231551964771e-06, "logits/chosen": -0.8290309906005859, "logits/rejected": 0.015381842851638794, "logps/chosen": -291.46063232421875, "logps/rejected": -669.02685546875, "loss": 0.0872, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.124984741210938, "rewards/margins": 33.23019027709961, "rewards/rejected": -44.35517883300781, "step": 2000 }, { "epoch": 0.201, "grad_norm": 0.002687457948923111, "learning_rate": 4.84623255689889e-06, "logits/chosen": -0.6981030702590942, "logits/rejected": 0.045853037387132645, "logps/chosen": -382.01776123046875, "logps/rejected": -782.2008666992188, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.968143463134766, "rewards/margins": 36.116371154785156, "rewards/rejected": -53.08452224731445, "step": 2010 }, { "epoch": 0.202, "grad_norm": 2.2274776711128652e-05, "learning_rate": 4.84320497372973e-06, "logits/chosen": -1.1311920881271362, "logits/rejected": 0.06535493582487106, "logps/chosen": -234.904296875, "logps/rejected": -763.780517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.619631290435791, "rewards/margins": 35.88097381591797, "rewards/rejected": -43.50060272216797, "step": 2020 }, { "epoch": 0.203, "grad_norm": 3.651647127039803e-15, "learning_rate": 4.840148839347434e-06, "logits/chosen": -1.1937439441680908, "logits/rejected": 0.05392221733927727, "logps/chosen": -231.7136688232422, "logps/rejected": -714.2911987304688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -11.07800579071045, "rewards/margins": 35.264305114746094, "rewards/rejected": -46.342308044433594, "step": 2030 }, { "epoch": 0.204, "grad_norm": 5.8405490221957734e-08, "learning_rate": 4.837064190990036e-06, "logits/chosen": -0.9981630444526672, "logits/rejected": 0.05016200616955757, "logps/chosen": -309.5982971191406, "logps/rejected": -757.8245239257812, "loss": 0.1777, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.973468780517578, "rewards/margins": 33.17815017700195, "rewards/rejected": -44.15161895751953, "step": 2040 }, { "epoch": 0.205, "grad_norm": 6.093002491436295e-11, "learning_rate": 4.833951066243004e-06, "logits/chosen": -0.9039271473884583, "logits/rejected": 0.15411342680454254, "logps/chosen": -275.2000427246094, "logps/rejected": -727.4915771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.230191230773926, "rewards/margins": 36.9615364074707, "rewards/rejected": -44.19172668457031, "step": 2050 }, { "epoch": 0.206, "grad_norm": 9.243760677691767e-14, "learning_rate": 4.830809503038781e-06, "logits/chosen": -1.0659441947937012, "logits/rejected": -0.04560966417193413, "logps/chosen": -430.9668884277344, "logps/rejected": -842.630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.900915145874023, "rewards/margins": 39.50886154174805, "rewards/rejected": -52.4097785949707, "step": 2060 }, { "epoch": 0.207, "grad_norm": 3.012393055812877e-14, "learning_rate": 4.8276395396563215e-06, "logits/chosen": -0.707872211933136, "logits/rejected": -0.12487606704235077, "logps/chosen": -333.6909484863281, "logps/rejected": -650.5589599609375, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": -12.892992973327637, "rewards/margins": 29.976953506469727, "rewards/rejected": -42.86994552612305, "step": 2070 }, { "epoch": 0.208, "grad_norm": 0.020325161516666412, "learning_rate": 4.824441214720629e-06, "logits/chosen": -1.0603435039520264, "logits/rejected": -0.13006174564361572, "logps/chosen": -430.6055603027344, "logps/rejected": -714.2709350585938, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -10.49073600769043, "rewards/margins": 26.808090209960938, "rewards/rejected": -37.298828125, "step": 2080 }, { "epoch": 0.209, "grad_norm": 4.8418461080779185e-12, "learning_rate": 4.821214567202284e-06, "logits/chosen": -0.6019529104232788, "logits/rejected": -0.05509559437632561, "logps/chosen": -446.13134765625, "logps/rejected": -764.00830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.890316009521484, "rewards/margins": 34.51970672607422, "rewards/rejected": -44.4100227355957, "step": 2090 }, { "epoch": 0.21, "grad_norm": 0.015145066194236279, "learning_rate": 4.817959636416969e-06, "logits/chosen": -0.5784981846809387, "logits/rejected": -0.10248645395040512, "logps/chosen": -570.4688720703125, "logps/rejected": -822.8307495117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.669044494628906, "rewards/margins": 33.286407470703125, "rewards/rejected": -42.95545196533203, "step": 2100 }, { "epoch": 0.211, "grad_norm": 2.6829666960326293e-15, "learning_rate": 4.814676462024988e-06, "logits/chosen": -0.9595147371292114, "logits/rejected": 0.07999895513057709, "logps/chosen": -278.6533203125, "logps/rejected": -740.11767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.203185081481934, "rewards/margins": 35.281524658203125, "rewards/rejected": -45.48470687866211, "step": 2110 }, { "epoch": 0.212, "grad_norm": 1.7712128943383588e-19, "learning_rate": 4.811365084030784e-06, "logits/chosen": -1.53053879737854, "logits/rejected": 0.17698611319065094, "logps/chosen": -161.68276977539062, "logps/rejected": -739.840087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.296278953552246, "rewards/margins": 40.40182113647461, "rewards/rejected": -47.69810104370117, "step": 2120 }, { "epoch": 0.213, "grad_norm": 5.7248204881482545e-21, "learning_rate": 4.808025542782453e-06, "logits/chosen": -1.048194408416748, "logits/rejected": 0.06344493478536606, "logps/chosen": -280.8482360839844, "logps/rejected": -703.5811767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.806572914123535, "rewards/margins": 31.60160255432129, "rewards/rejected": -43.40817642211914, "step": 2130 }, { "epoch": 0.214, "grad_norm": 1.4793377484237152e-17, "learning_rate": 4.804657878971252e-06, "logits/chosen": -1.3861225843429565, "logits/rejected": 0.1794586479663849, "logps/chosen": -385.51275634765625, "logps/rejected": -1037.645751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.886439323425293, "rewards/margins": 45.353580474853516, "rewards/rejected": -61.240013122558594, "step": 2140 }, { "epoch": 0.215, "grad_norm": 2.505604057567723e-10, "learning_rate": 4.801262133631101e-06, "logits/chosen": -0.9277293086051941, "logits/rejected": -0.0969119742512703, "logps/chosen": -468.418212890625, "logps/rejected": -712.3648681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.228230476379395, "rewards/margins": 27.422122955322266, "rewards/rejected": -39.650352478027344, "step": 2150 }, { "epoch": 0.216, "grad_norm": 0.2817370891571045, "learning_rate": 4.7978383481380865e-06, "logits/chosen": -1.1354224681854248, "logits/rejected": 0.02800583280622959, "logps/chosen": -397.59820556640625, "logps/rejected": -760.2446899414062, "loss": 0.0155, "rewards/accuracies": 1.0, "rewards/chosen": -10.743389129638672, "rewards/margins": 31.957714080810547, "rewards/rejected": -42.701107025146484, "step": 2160 }, { "epoch": 0.217, "grad_norm": 1.5132579434321003e-12, "learning_rate": 4.794386564209953e-06, "logits/chosen": -1.0182678699493408, "logits/rejected": 0.03556183725595474, "logps/chosen": -417.7530212402344, "logps/rejected": -964.2975463867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.372791290283203, "rewards/margins": 37.62430191040039, "rewards/rejected": -53.997093200683594, "step": 2170 }, { "epoch": 0.218, "grad_norm": 5.079949820211189e-18, "learning_rate": 4.790906823905599e-06, "logits/chosen": -1.0524531602859497, "logits/rejected": -0.14839516580104828, "logps/chosen": -273.4908142089844, "logps/rejected": -714.5278930664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.830060958862305, "rewards/margins": 30.115550994873047, "rewards/rejected": -41.94561004638672, "step": 2180 }, { "epoch": 0.219, "grad_norm": 0.0, "learning_rate": 4.787399169624562e-06, "logits/chosen": -1.0520641803741455, "logits/rejected": -0.10660145431756973, "logps/chosen": -434.2923889160156, "logps/rejected": -936.0968017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.649426460266113, "rewards/margins": 35.36643600463867, "rewards/rejected": -49.01586151123047, "step": 2190 }, { "epoch": 0.22, "grad_norm": 2.784252162157941e-09, "learning_rate": 4.783863644106502e-06, "logits/chosen": -0.8814069032669067, "logits/rejected": -0.05634657293558121, "logps/chosen": -431.2822265625, "logps/rejected": -949.1829223632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.263689994812012, "rewards/margins": 38.06841278076172, "rewards/rejected": -51.33210372924805, "step": 2200 }, { "epoch": 0.221, "grad_norm": 6.444813432926466e-11, "learning_rate": 4.780300290430683e-06, "logits/chosen": -1.072237253189087, "logits/rejected": -0.06441137939691544, "logps/chosen": -376.80572509765625, "logps/rejected": -824.3206176757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.14746379852295, "rewards/margins": 35.05595779418945, "rewards/rejected": -48.20341873168945, "step": 2210 }, { "epoch": 0.222, "grad_norm": 2.647671499414526e-19, "learning_rate": 4.776709152015443e-06, "logits/chosen": -0.9050602912902832, "logits/rejected": -0.0204143263399601, "logps/chosen": -308.73077392578125, "logps/rejected": -743.6929931640625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -12.167070388793945, "rewards/margins": 32.204490661621094, "rewards/rejected": -44.371559143066406, "step": 2220 }, { "epoch": 0.223, "grad_norm": 0.0, "learning_rate": 4.773090272617672e-06, "logits/chosen": -1.1379512548446655, "logits/rejected": 0.2818123996257782, "logps/chosen": -341.7289123535156, "logps/rejected": -922.5020751953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -10.875950813293457, "rewards/margins": 41.55849075317383, "rewards/rejected": -52.43444061279297, "step": 2230 }, { "epoch": 0.224, "grad_norm": 5.748112752042268e-16, "learning_rate": 4.769443696332272e-06, "logits/chosen": -1.0871905088424683, "logits/rejected": 0.021607961505651474, "logps/chosen": -422.33416748046875, "logps/rejected": -950.8518676757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.902830123901367, "rewards/margins": 45.880855560302734, "rewards/rejected": -58.78368377685547, "step": 2240 }, { "epoch": 0.225, "grad_norm": 2.0195723493543483e-07, "learning_rate": 4.765769467591626e-06, "logits/chosen": -0.7570234537124634, "logits/rejected": 0.23347148299217224, "logps/chosen": -582.53515625, "logps/rejected": -1147.514892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.481950759887695, "rewards/margins": 51.846229553222656, "rewards/rejected": -75.32818603515625, "step": 2250 }, { "epoch": 0.226, "grad_norm": 2.10311114904509e-14, "learning_rate": 4.762067631165049e-06, "logits/chosen": -1.2956483364105225, "logits/rejected": 0.10478191077709198, "logps/chosen": -331.7798767089844, "logps/rejected": -1047.0546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.469011306762695, "rewards/margins": 46.84741973876953, "rewards/rejected": -62.316429138183594, "step": 2260 }, { "epoch": 0.227, "grad_norm": 1.4005004621286954e-11, "learning_rate": 4.7583382321582525e-06, "logits/chosen": -0.673801600933075, "logits/rejected": -0.1823035627603531, "logps/chosen": -460.52874755859375, "logps/rejected": -798.1029052734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.7214298248291, "rewards/margins": 31.615991592407227, "rewards/rejected": -48.337425231933594, "step": 2270 }, { "epoch": 0.228, "grad_norm": 1.375444922278239e-16, "learning_rate": 4.754581316012785e-06, "logits/chosen": -0.8537979125976562, "logits/rejected": 0.008811051957309246, "logps/chosen": -436.7643127441406, "logps/rejected": -1024.1982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.984895706176758, "rewards/margins": 50.079132080078125, "rewards/rejected": -67.06402587890625, "step": 2280 }, { "epoch": 0.229, "grad_norm": 1.5123860094102626e-15, "learning_rate": 4.750796928505484e-06, "logits/chosen": -0.7344772219657898, "logits/rejected": -0.0585576593875885, "logps/chosen": -478.2090759277344, "logps/rejected": -905.4718627929688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -21.288328170776367, "rewards/margins": 33.283172607421875, "rewards/rejected": -54.571502685546875, "step": 2290 }, { "epoch": 0.23, "grad_norm": 5.140918073187617e-13, "learning_rate": 4.746985115747918e-06, "logits/chosen": -0.9128534197807312, "logits/rejected": 0.12439526617527008, "logps/chosen": -514.3945922851562, "logps/rejected": -916.0233154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.596830368041992, "rewards/margins": 31.888294219970703, "rewards/rejected": -48.48512649536133, "step": 2300 }, { "epoch": 0.231, "grad_norm": 6.90247385077927e-22, "learning_rate": 4.743145924185821e-06, "logits/chosen": -0.7148122787475586, "logits/rejected": -0.020510563626885414, "logps/chosen": -363.60162353515625, "logps/rejected": -751.8236083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.753090858459473, "rewards/margins": 31.371822357177734, "rewards/rejected": -43.124916076660156, "step": 2310 }, { "epoch": 0.232, "grad_norm": 2.671577152210669e-13, "learning_rate": 4.7392794005985324e-06, "logits/chosen": -0.8572785258293152, "logits/rejected": -0.11579354107379913, "logps/chosen": -425.34954833984375, "logps/rejected": -812.0877075195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.852200508117676, "rewards/margins": 35.16447448730469, "rewards/rejected": -50.01667404174805, "step": 2320 }, { "epoch": 0.233, "grad_norm": 7.430420967973477e-22, "learning_rate": 4.735385592098421e-06, "logits/chosen": -1.1626355648040771, "logits/rejected": -0.20799453556537628, "logps/chosen": -299.6154479980469, "logps/rejected": -671.9234008789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.785792350769043, "rewards/margins": 30.775033950805664, "rewards/rejected": -42.56082534790039, "step": 2330 }, { "epoch": 0.234, "grad_norm": 1.9862537974128506e-18, "learning_rate": 4.731464546130315e-06, "logits/chosen": -1.1884291172027588, "logits/rejected": 0.12569603323936462, "logps/chosen": -248.02633666992188, "logps/rejected": -826.1163940429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.018312454223633, "rewards/margins": 39.91654586791992, "rewards/rejected": -49.93485641479492, "step": 2340 }, { "epoch": 0.235, "grad_norm": 0.0, "learning_rate": 4.72751631047092e-06, "logits/chosen": -0.9497979879379272, "logits/rejected": 0.24937982857227325, "logps/chosen": -394.09246826171875, "logps/rejected": -831.47900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.291522026062012, "rewards/margins": 34.22481155395508, "rewards/rejected": -47.516334533691406, "step": 2350 }, { "epoch": 0.236, "grad_norm": 0.0, "learning_rate": 4.723540933228245e-06, "logits/chosen": -0.6847087144851685, "logits/rejected": -0.33126509189605713, "logps/chosen": -548.4906005859375, "logps/rejected": -793.0479736328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.37607192993164, "rewards/margins": 32.81710433959961, "rewards/rejected": -49.193180084228516, "step": 2360 }, { "epoch": 0.237, "grad_norm": 2.270067621580634e-16, "learning_rate": 4.719538462841003e-06, "logits/chosen": -0.1746879518032074, "logits/rejected": 0.21270795166492462, "logps/chosen": -448.89263916015625, "logps/rejected": -816.49609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.152996063232422, "rewards/margins": 38.69011688232422, "rewards/rejected": -59.843109130859375, "step": 2370 }, { "epoch": 0.238, "grad_norm": 3.4636649104413664e-10, "learning_rate": 4.715508948078037e-06, "logits/chosen": -0.8395630717277527, "logits/rejected": 0.0964946523308754, "logps/chosen": -501.96148681640625, "logps/rejected": -984.0113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.775581359863281, "rewards/margins": 43.83132553100586, "rewards/rejected": -58.606910705566406, "step": 2380 }, { "epoch": 0.239, "grad_norm": 7.653852551747775e-11, "learning_rate": 4.71145243803771e-06, "logits/chosen": -1.217986822128296, "logits/rejected": 0.4775959551334381, "logps/chosen": -415.57720947265625, "logps/rejected": -1103.2398681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.806949615478516, "rewards/margins": 50.73517608642578, "rewards/rejected": -69.54212951660156, "step": 2390 }, { "epoch": 0.24, "grad_norm": 1.6787088386038818e-10, "learning_rate": 4.707368982147318e-06, "logits/chosen": -0.7131022214889526, "logits/rejected": 0.4084620475769043, "logps/chosen": -376.76824951171875, "logps/rejected": -944.67236328125, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -20.048091888427734, "rewards/margins": 53.066627502441406, "rewards/rejected": -73.11471557617188, "step": 2400 }, { "epoch": 0.241, "grad_norm": 0.0, "learning_rate": 4.703258630162481e-06, "logits/chosen": -0.9870785474777222, "logits/rejected": 0.13874481618404388, "logps/chosen": -472.11407470703125, "logps/rejected": -1179.2464599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.6230525970459, "rewards/margins": 60.36846160888672, "rewards/rejected": -77.99150848388672, "step": 2410 }, { "epoch": 0.242, "grad_norm": 1.0387123823165894, "learning_rate": 4.699121432166541e-06, "logits/chosen": -0.8836095929145813, "logits/rejected": 0.36422693729400635, "logps/chosen": -379.01629638671875, "logps/rejected": -1056.7281494140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -20.028507232666016, "rewards/margins": 54.816200256347656, "rewards/rejected": -74.84471130371094, "step": 2420 }, { "epoch": 0.243, "grad_norm": 0.0, "learning_rate": 4.6949574385699514e-06, "logits/chosen": -0.4048551917076111, "logits/rejected": 0.24003490805625916, "logps/chosen": -461.79315185546875, "logps/rejected": -1028.343505859375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -19.345182418823242, "rewards/margins": 55.839576721191406, "rewards/rejected": -75.18475341796875, "step": 2430 }, { "epoch": 0.244, "grad_norm": 0.0, "learning_rate": 4.690766700109659e-06, "logits/chosen": -0.5087685585021973, "logits/rejected": 0.617567777633667, "logps/chosen": -413.0282287597656, "logps/rejected": -1033.424560546875, "loss": 0.3753, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.37348747253418, "rewards/margins": 55.75347900390625, "rewards/rejected": -76.12696075439453, "step": 2440 }, { "epoch": 0.245, "grad_norm": 0.0, "learning_rate": 4.68654926784849e-06, "logits/chosen": -0.9591018557548523, "logits/rejected": 0.39723971486091614, "logps/chosen": -504.660400390625, "logps/rejected": -1122.2666015625, "loss": 0.0374, "rewards/accuracies": 1.0, "rewards/chosen": -15.985147476196289, "rewards/margins": 48.736328125, "rewards/rejected": -64.72147369384766, "step": 2450 }, { "epoch": 0.246, "grad_norm": 4.3823277605042146e-21, "learning_rate": 4.682305193174524e-06, "logits/chosen": -0.8720195889472961, "logits/rejected": 0.5519598126411438, "logps/chosen": -364.02899169921875, "logps/rejected": -1117.5640869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.054271697998047, "rewards/margins": 64.96049499511719, "rewards/rejected": -84.0147705078125, "step": 2460 }, { "epoch": 0.247, "grad_norm": 0.0, "learning_rate": 4.6780345278004744e-06, "logits/chosen": -0.40707603096961975, "logits/rejected": 0.39848193526268005, "logps/chosen": -595.8785400390625, "logps/rejected": -996.6804809570312, "loss": 0.1193, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.818096160888672, "rewards/margins": 48.075130462646484, "rewards/rejected": -68.89323425292969, "step": 2470 }, { "epoch": 0.248, "grad_norm": 0.0, "learning_rate": 4.673737323763048e-06, "logits/chosen": -0.6710236668586731, "logits/rejected": 0.3840904235839844, "logps/chosen": -578.5338745117188, "logps/rejected": -1234.1817626953125, "loss": 0.4578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -26.0264949798584, "rewards/margins": 55.99785232543945, "rewards/rejected": -82.02433776855469, "step": 2480 }, { "epoch": 0.249, "grad_norm": 0.0, "learning_rate": 4.669413633422322e-06, "logits/chosen": -0.7051594853401184, "logits/rejected": 0.35642680525779724, "logps/chosen": -423.958984375, "logps/rejected": -1116.8492431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.242204666137695, "rewards/margins": 61.63847732543945, "rewards/rejected": -78.88069152832031, "step": 2490 }, { "epoch": 0.25, "grad_norm": 3.743392066509216e-23, "learning_rate": 4.665063509461098e-06, "logits/chosen": -0.799991250038147, "logits/rejected": 0.2828425168991089, "logps/chosen": -328.34417724609375, "logps/rejected": -939.0185546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.603776931762695, "rewards/margins": 48.95137405395508, "rewards/rejected": -61.555145263671875, "step": 2500 }, { "epoch": 0.251, "grad_norm": 2.941124880411644e-21, "learning_rate": 4.6606870048842626e-06, "logits/chosen": -1.1419764757156372, "logits/rejected": 0.26133501529693604, "logps/chosen": -367.69720458984375, "logps/rejected": -1034.8980712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.324525833129883, "rewards/margins": 53.705535888671875, "rewards/rejected": -71.03005981445312, "step": 2510 }, { "epoch": 0.252, "grad_norm": 0.0, "learning_rate": 4.656284173018144e-06, "logits/chosen": -1.3266280889511108, "logits/rejected": 0.24299263954162598, "logps/chosen": -277.9893798828125, "logps/rejected": -921.3558349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.854076385498047, "rewards/margins": 48.599815368652344, "rewards/rejected": -59.453895568847656, "step": 2520 }, { "epoch": 0.253, "grad_norm": 855.6547241210938, "learning_rate": 4.65185506750986e-06, "logits/chosen": -0.75602787733078, "logits/rejected": -0.04198075085878372, "logps/chosen": -404.5797119140625, "logps/rejected": -844.5486450195312, "loss": 0.3552, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.992825508117676, "rewards/margins": 37.5327033996582, "rewards/rejected": -50.52552795410156, "step": 2530 }, { "epoch": 0.254, "grad_norm": 2.341215069034952e-11, "learning_rate": 4.6473997423266615e-06, "logits/chosen": -1.0960681438446045, "logits/rejected": 0.11165539175271988, "logps/chosen": -309.8856201171875, "logps/rejected": -850.849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.645660400390625, "rewards/margins": 41.12796401977539, "rewards/rejected": -50.77361297607422, "step": 2540 }, { "epoch": 0.255, "grad_norm": 0.0, "learning_rate": 4.642918251755281e-06, "logits/chosen": -1.3027342557907104, "logits/rejected": 0.24640479683876038, "logps/chosen": -373.795654296875, "logps/rejected": -1020.3546752929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.509068489074707, "rewards/margins": 44.65599822998047, "rewards/rejected": -54.165069580078125, "step": 2550 }, { "epoch": 0.256, "grad_norm": 7.732006110927614e-07, "learning_rate": 4.638410650401267e-06, "logits/chosen": -1.2951546907424927, "logits/rejected": 0.06202126666903496, "logps/chosen": -278.08746337890625, "logps/rejected": -755.7003173828125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -8.85218620300293, "rewards/margins": 38.408973693847656, "rewards/rejected": -47.26116180419922, "step": 2560 }, { "epoch": 0.257, "grad_norm": 2.750820075636127e-22, "learning_rate": 4.633876993188319e-06, "logits/chosen": -0.5697834491729736, "logits/rejected": -0.17692281305789948, "logps/chosen": -330.2970886230469, "logps/rejected": -675.277587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.79739761352539, "rewards/margins": 36.20167922973633, "rewards/rejected": -46.99907684326172, "step": 2570 }, { "epoch": 0.258, "grad_norm": 0.0, "learning_rate": 4.62931733535762e-06, "logits/chosen": -0.24441662430763245, "logits/rejected": 0.09682003408670425, "logps/chosen": -453.4608459472656, "logps/rejected": -752.7393188476562, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -10.770639419555664, "rewards/margins": 33.42501449584961, "rewards/rejected": -44.195655822753906, "step": 2580 }, { "epoch": 0.259, "grad_norm": 0.0, "learning_rate": 4.62473173246716e-06, "logits/chosen": -0.736251950263977, "logits/rejected": 0.019975418224930763, "logps/chosen": -448.28643798828125, "logps/rejected": -904.33642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.213622093200684, "rewards/margins": 47.143733978271484, "rewards/rejected": -57.35735321044922, "step": 2590 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 4.620120240391065e-06, "logits/chosen": -0.46649056673049927, "logits/rejected": 0.018019551411271095, "logps/chosen": -447.64166259765625, "logps/rejected": -842.9020385742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.979809761047363, "rewards/margins": 42.84097671508789, "rewards/rejected": -58.8207893371582, "step": 2600 }, { "epoch": 0.261, "grad_norm": 0.0, "learning_rate": 4.6154829153189105e-06, "logits/chosen": -0.7219793796539307, "logits/rejected": 0.47301802039146423, "logps/chosen": -324.66229248046875, "logps/rejected": -1142.9036865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.965510368347168, "rewards/margins": 67.4233627319336, "rewards/rejected": -82.38887023925781, "step": 2610 }, { "epoch": 0.262, "grad_norm": 1256.3182373046875, "learning_rate": 4.610819813755038e-06, "logits/chosen": -0.7228553891181946, "logits/rejected": 0.12379207462072372, "logps/chosen": -552.47900390625, "logps/rejected": -960.6402587890625, "loss": 0.5346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.032733917236328, "rewards/margins": 34.79325485229492, "rewards/rejected": -58.82598876953125, "step": 2620 }, { "epoch": 0.263, "grad_norm": 30.419069290161133, "learning_rate": 4.60613099251787e-06, "logits/chosen": -1.010801076889038, "logits/rejected": 0.1571781188249588, "logps/chosen": -313.34747314453125, "logps/rejected": -845.5224609375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -10.521732330322266, "rewards/margins": 42.821800231933594, "rewards/rejected": -53.343536376953125, "step": 2630 }, { "epoch": 0.264, "grad_norm": 4.838548232731629e-15, "learning_rate": 4.601416508739211e-06, "logits/chosen": -1.073610544204712, "logits/rejected": 0.5915216207504272, "logps/chosen": -371.45989990234375, "logps/rejected": -1123.2381591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.641399383544922, "rewards/margins": 60.57389450073242, "rewards/rejected": -81.21529388427734, "step": 2640 }, { "epoch": 0.265, "grad_norm": 2.1994967028149404e-05, "learning_rate": 4.596676419863561e-06, "logits/chosen": -0.4245632290840149, "logits/rejected": -0.18038161098957062, "logps/chosen": -603.9671630859375, "logps/rejected": -949.8527221679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.18203353881836, "rewards/margins": 47.4343376159668, "rewards/rejected": -66.61637878417969, "step": 2650 }, { "epoch": 0.266, "grad_norm": 0.0, "learning_rate": 4.591910783647405e-06, "logits/chosen": -0.6803088784217834, "logits/rejected": 0.3073822855949402, "logps/chosen": -396.9436340332031, "logps/rejected": -895.3616333007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.408302307128906, "rewards/margins": 48.797462463378906, "rewards/rejected": -64.20576477050781, "step": 2660 }, { "epoch": 0.267, "grad_norm": 0.0, "learning_rate": 4.587119658158517e-06, "logits/chosen": -0.844752311706543, "logits/rejected": 1.0756970643997192, "logps/chosen": -336.9431457519531, "logps/rejected": -1189.891357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.892147064208984, "rewards/margins": 72.17032623291016, "rewards/rejected": -93.06246948242188, "step": 2670 }, { "epoch": 0.268, "grad_norm": 1.235100492332914e-18, "learning_rate": 4.582303101775249e-06, "logits/chosen": -0.6267115473747253, "logits/rejected": 0.35415276885032654, "logps/chosen": -635.7947998046875, "logps/rejected": -1453.327880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.361858367919922, "rewards/margins": 78.53101348876953, "rewards/rejected": -98.89286804199219, "step": 2680 }, { "epoch": 0.269, "grad_norm": 2.9888548233603096e-13, "learning_rate": 4.577461173185821e-06, "logits/chosen": -0.8019062280654907, "logits/rejected": 0.57016521692276, "logps/chosen": -379.5988464355469, "logps/rejected": -1071.9979248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.178808212280273, "rewards/margins": 59.92768478393555, "rewards/rejected": -77.10649108886719, "step": 2690 }, { "epoch": 0.27, "grad_norm": 4.660129422205473e-20, "learning_rate": 4.572593931387604e-06, "logits/chosen": -0.770391583442688, "logits/rejected": 0.47237473726272583, "logps/chosen": -390.8031311035156, "logps/rejected": -1198.1607666015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.715108871459961, "rewards/margins": 70.4637451171875, "rewards/rejected": -85.1788558959961, "step": 2700 }, { "epoch": 0.271, "grad_norm": 0.0, "learning_rate": 4.567701435686405e-06, "logits/chosen": -1.1127533912658691, "logits/rejected": 0.7476900815963745, "logps/chosen": -383.3504943847656, "logps/rejected": -1261.5062255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.624284744262695, "rewards/margins": 68.47001647949219, "rewards/rejected": -87.09429931640625, "step": 2710 }, { "epoch": 0.272, "grad_norm": 0.0, "learning_rate": 4.562783745695738e-06, "logits/chosen": -0.4293700158596039, "logits/rejected": -0.32799363136291504, "logps/chosen": -723.4006958007812, "logps/rejected": -1026.2906494140625, "loss": 1.5142, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.579858779907227, "rewards/margins": 37.50405502319336, "rewards/rejected": -59.08391571044922, "step": 2720 }, { "epoch": 0.273, "grad_norm": 8.267878001788631e-06, "learning_rate": 4.5578409213361055e-06, "logits/chosen": -0.38455820083618164, "logits/rejected": -0.13795824348926544, "logps/chosen": -422.9434509277344, "logps/rejected": -579.631591796875, "loss": 1.4694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.918153762817383, "rewards/margins": 20.949848175048828, "rewards/rejected": -30.868000030517578, "step": 2730 }, { "epoch": 0.274, "grad_norm": 3.323042983538471e-05, "learning_rate": 4.55287302283426e-06, "logits/chosen": -1.2258391380310059, "logits/rejected": 0.2508848309516907, "logps/chosen": -230.49356079101562, "logps/rejected": -710.0572509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.474408149719238, "rewards/margins": 33.696800231933594, "rewards/rejected": -40.17120361328125, "step": 2740 }, { "epoch": 0.275, "grad_norm": 2.5122176339209545e-07, "learning_rate": 4.54788011072248e-06, "logits/chosen": -1.135868787765503, "logits/rejected": -0.020803770050406456, "logps/chosen": -258.2652282714844, "logps/rejected": -616.9526977539062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.1399359703063965, "rewards/margins": 25.91520118713379, "rewards/rejected": -33.055137634277344, "step": 2750 }, { "epoch": 0.276, "grad_norm": 4.9381639660099445e-15, "learning_rate": 4.542862245837821e-06, "logits/chosen": -0.38430148363113403, "logits/rejected": -0.3277333378791809, "logps/chosen": -369.1678771972656, "logps/rejected": -599.8237915039062, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -5.9210052490234375, "rewards/margins": 25.239404678344727, "rewards/rejected": -31.160409927368164, "step": 2760 }, { "epoch": 0.277, "grad_norm": 2.8379170894622803, "learning_rate": 4.537819489321385e-06, "logits/chosen": -1.19253408908844, "logits/rejected": -0.09504680335521698, "logps/chosen": -227.212646484375, "logps/rejected": -592.4996948242188, "loss": 0.0937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -6.070375919342041, "rewards/margins": 26.809539794921875, "rewards/rejected": -32.87991714477539, "step": 2770 }, { "epoch": 0.278, "grad_norm": 0.0, "learning_rate": 4.5327519026175694e-06, "logits/chosen": -1.2971299886703491, "logits/rejected": 0.0600772388279438, "logps/chosen": -272.6226501464844, "logps/rejected": -866.8453979492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.302969932556152, "rewards/margins": 43.80091094970703, "rewards/rejected": -50.1038818359375, "step": 2780 }, { "epoch": 0.279, "grad_norm": 0.0, "learning_rate": 4.527659547473317e-06, "logits/chosen": -0.9313802719116211, "logits/rejected": -0.12935884296894073, "logps/chosen": -352.4999694824219, "logps/rejected": -728.7339477539062, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -7.947843074798584, "rewards/margins": 34.59426498413086, "rewards/rejected": -42.54210662841797, "step": 2790 }, { "epoch": 0.28, "grad_norm": 4.836014401432553e-13, "learning_rate": 4.522542485937369e-06, "logits/chosen": -1.0397319793701172, "logits/rejected": -0.26670709252357483, "logps/chosen": -367.3852233886719, "logps/rejected": -756.3072509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.932214736938477, "rewards/margins": 33.279319763183594, "rewards/rejected": -43.2115364074707, "step": 2800 }, { "epoch": 0.281, "grad_norm": 5.379221512669119e-10, "learning_rate": 4.517400780359505e-06, "logits/chosen": -0.9027656316757202, "logits/rejected": -0.13679035007953644, "logps/chosen": -492.4688415527344, "logps/rejected": -851.7306518554688, "loss": 0.2074, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.402759552001953, "rewards/margins": 34.22053527832031, "rewards/rejected": -46.62329864501953, "step": 2810 }, { "epoch": 0.282, "grad_norm": 0.0, "learning_rate": 4.512234493389785e-06, "logits/chosen": -1.3217861652374268, "logits/rejected": 0.2540954053401947, "logps/chosen": -446.2145080566406, "logps/rejected": -1161.365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.036980628967285, "rewards/margins": 58.81218338012695, "rewards/rejected": -68.84915924072266, "step": 2820 }, { "epoch": 0.283, "grad_norm": 5.972985661628627e-08, "learning_rate": 4.507043687977787e-06, "logits/chosen": -0.5777202844619751, "logits/rejected": 0.18531468510627747, "logps/chosen": -338.59832763671875, "logps/rejected": -736.2530517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.795352935791016, "rewards/margins": 31.938003540039062, "rewards/rejected": -49.73335647583008, "step": 2830 }, { "epoch": 0.284, "grad_norm": 0.004476075526326895, "learning_rate": 4.501828427371834e-06, "logits/chosen": -0.9521886110305786, "logits/rejected": 0.17324507236480713, "logps/chosen": -289.2124328613281, "logps/rejected": -889.6448974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.43809700012207, "rewards/margins": 41.17052459716797, "rewards/rejected": -51.60862350463867, "step": 2840 }, { "epoch": 0.285, "grad_norm": 1.6598479533058708e-06, "learning_rate": 4.496588775118232e-06, "logits/chosen": -1.029706358909607, "logits/rejected": 0.3757559061050415, "logps/chosen": -307.02191162109375, "logps/rejected": -935.701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.742807388305664, "rewards/margins": 45.651023864746094, "rewards/rejected": -58.393836975097656, "step": 2850 }, { "epoch": 0.286, "grad_norm": 0.0, "learning_rate": 4.491324795060491e-06, "logits/chosen": -0.9051336050033569, "logits/rejected": 0.1347285807132721, "logps/chosen": -230.3036651611328, "logps/rejected": -741.8186645507812, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -11.721261978149414, "rewards/margins": 39.301753997802734, "rewards/rejected": -51.023014068603516, "step": 2860 }, { "epoch": 0.287, "grad_norm": 2.454706430494147e-22, "learning_rate": 4.4860365513385456e-06, "logits/chosen": -0.9631183743476868, "logits/rejected": 0.08518068492412567, "logps/chosen": -438.61004638671875, "logps/rejected": -948.8082275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.66820240020752, "rewards/margins": 44.123191833496094, "rewards/rejected": -59.7913932800293, "step": 2870 }, { "epoch": 0.288, "grad_norm": 1.4058292646456716e-15, "learning_rate": 4.4807241083879774e-06, "logits/chosen": -0.9526158571243286, "logits/rejected": 0.23263370990753174, "logps/chosen": -412.12567138671875, "logps/rejected": -976.13916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.246286392211914, "rewards/margins": 44.95268630981445, "rewards/rejected": -62.19896697998047, "step": 2880 }, { "epoch": 0.289, "grad_norm": 3.208458630850027e-16, "learning_rate": 4.475387530939226e-06, "logits/chosen": -0.9484020471572876, "logits/rejected": 0.1921675205230713, "logps/chosen": -313.9803466796875, "logps/rejected": -839.4591064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.41915225982666, "rewards/margins": 41.73972702026367, "rewards/rejected": -53.15887451171875, "step": 2890 }, { "epoch": 0.29, "grad_norm": 1.1192635156476172e-06, "learning_rate": 4.470026884016805e-06, "logits/chosen": -1.1647913455963135, "logits/rejected": 0.28769174218177795, "logps/chosen": -178.21139526367188, "logps/rejected": -632.3584594726562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.1957526206970215, "rewards/margins": 32.56772994995117, "rewards/rejected": -39.76348114013672, "step": 2900 }, { "epoch": 0.291, "grad_norm": 3.7680575104559466e-08, "learning_rate": 4.464642232938505e-06, "logits/chosen": -0.1608423888683319, "logits/rejected": -0.09430716931819916, "logps/chosen": -674.2434692382812, "logps/rejected": -874.7601318359375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -26.605106353759766, "rewards/margins": 29.54248046875, "rewards/rejected": -56.14759063720703, "step": 2910 }, { "epoch": 0.292, "grad_norm": 0.0, "learning_rate": 4.4592336433146e-06, "logits/chosen": -0.9661940336227417, "logits/rejected": 0.5986965894699097, "logps/chosen": -401.9140930175781, "logps/rejected": -1143.014892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.071043014526367, "rewards/margins": 57.54827880859375, "rewards/rejected": -72.61932373046875, "step": 2920 }, { "epoch": 0.293, "grad_norm": 3.743392066509216e-23, "learning_rate": 4.453801181047047e-06, "logits/chosen": -0.7017570734024048, "logits/rejected": 0.8444482088088989, "logps/chosen": -495.0025939941406, "logps/rejected": -1380.618896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -26.271066665649414, "rewards/margins": 69.16044616699219, "rewards/rejected": -95.4314956665039, "step": 2930 }, { "epoch": 0.294, "grad_norm": 0.0, "learning_rate": 4.448344912328686e-06, "logits/chosen": -0.6854699850082397, "logits/rejected": 0.7347670197486877, "logps/chosen": -314.5595397949219, "logps/rejected": -929.7980346679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.546504974365234, "rewards/margins": 46.76772689819336, "rewards/rejected": -63.314231872558594, "step": 2940 }, { "epoch": 0.295, "grad_norm": 0.0, "learning_rate": 4.442864903642428e-06, "logits/chosen": -1.2334846258163452, "logits/rejected": 0.5767850875854492, "logps/chosen": -262.81158447265625, "logps/rejected": -999.1419067382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.631780624389648, "rewards/margins": 57.38239288330078, "rewards/rejected": -73.01416778564453, "step": 2950 }, { "epoch": 0.296, "grad_norm": 0.0, "learning_rate": 4.437361221760449e-06, "logits/chosen": -0.767090916633606, "logits/rejected": 0.47413578629493713, "logps/chosen": -399.1095275878906, "logps/rejected": -1085.1121826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.238618850708008, "rewards/margins": 58.78046417236328, "rewards/rejected": -74.01908111572266, "step": 2960 }, { "epoch": 0.297, "grad_norm": 0.0, "learning_rate": 4.431833933743378e-06, "logits/chosen": -0.8993236422538757, "logits/rejected": -0.05611775070428848, "logps/chosen": -527.3430786132812, "logps/rejected": -1073.151611328125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -28.32375144958496, "rewards/margins": 49.06525802612305, "rewards/rejected": -77.38900756835938, "step": 2970 }, { "epoch": 0.298, "grad_norm": 3.127561820637226e-10, "learning_rate": 4.426283106939474e-06, "logits/chosen": -0.27920812368392944, "logits/rejected": 0.6835567951202393, "logps/chosen": -497.9610290527344, "logps/rejected": -932.0389404296875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -22.19385528564453, "rewards/margins": 40.395111083984375, "rewards/rejected": -62.588966369628906, "step": 2980 }, { "epoch": 0.299, "grad_norm": 2.7995953999493395e-08, "learning_rate": 4.420708808983809e-06, "logits/chosen": -0.8772599101066589, "logits/rejected": 0.5049712061882019, "logps/chosen": -325.65692138671875, "logps/rejected": -930.7125244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.707733154296875, "rewards/margins": 54.16173553466797, "rewards/rejected": -67.86946868896484, "step": 2990 }, { "epoch": 0.3, "grad_norm": 0.023355349898338318, "learning_rate": 4.415111107797445e-06, "logits/chosen": -0.9795050621032715, "logits/rejected": 0.5347188115119934, "logps/chosen": -458.56268310546875, "logps/rejected": -967.0360107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.780598640441895, "rewards/margins": 42.03343200683594, "rewards/rejected": -51.81402587890625, "step": 3000 }, { "epoch": 0.301, "grad_norm": 7.46494047132451e-11, "learning_rate": 4.409490071586606e-06, "logits/chosen": -0.8965972065925598, "logits/rejected": 0.2855593264102936, "logps/chosen": -317.4532775878906, "logps/rejected": -826.0565185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.005922794342041, "rewards/margins": 46.22938537597656, "rewards/rejected": -53.23530960083008, "step": 3010 }, { "epoch": 0.302, "grad_norm": 8.980111374512489e-07, "learning_rate": 4.403845768841842e-06, "logits/chosen": -0.7695995569229126, "logits/rejected": 0.45763593912124634, "logps/chosen": -346.1946716308594, "logps/rejected": -922.2404174804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.814911842346191, "rewards/margins": 51.58049774169922, "rewards/rejected": -61.395408630371094, "step": 3020 }, { "epoch": 0.303, "grad_norm": 1.9820722507823787e-20, "learning_rate": 4.398178268337203e-06, "logits/chosen": -0.7960144877433777, "logits/rejected": 0.06469273567199707, "logps/chosen": -330.5350341796875, "logps/rejected": -910.6071166992188, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -12.916448593139648, "rewards/margins": 42.928802490234375, "rewards/rejected": -55.84525680541992, "step": 3030 }, { "epoch": 0.304, "grad_norm": 9.17138249002274e-20, "learning_rate": 4.3924876391293915e-06, "logits/chosen": -0.8251537084579468, "logits/rejected": 0.2735294699668884, "logps/chosen": -439.69549560546875, "logps/rejected": -902.1564331054688, "loss": 0.646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.860895156860352, "rewards/margins": 38.2791748046875, "rewards/rejected": -50.140071868896484, "step": 3040 }, { "epoch": 0.305, "grad_norm": 0.001080155256204307, "learning_rate": 4.386773950556931e-06, "logits/chosen": -0.98908531665802, "logits/rejected": 0.057259947061538696, "logps/chosen": -372.20709228515625, "logps/rejected": -811.25732421875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -8.590067863464355, "rewards/margins": 33.14325714111328, "rewards/rejected": -41.73332595825195, "step": 3050 }, { "epoch": 0.306, "grad_norm": 4.190588143160312e-08, "learning_rate": 4.381037272239311e-06, "logits/chosen": -1.0001757144927979, "logits/rejected": -0.20020675659179688, "logps/chosen": -442.28668212890625, "logps/rejected": -700.0807495117188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -7.053180694580078, "rewards/margins": 25.24466896057129, "rewards/rejected": -32.2978515625, "step": 3060 }, { "epoch": 0.307, "grad_norm": 9.719526133267209e-06, "learning_rate": 4.3752776740761495e-06, "logits/chosen": -0.9625173807144165, "logits/rejected": -0.23919124901294708, "logps/chosen": -319.67620849609375, "logps/rejected": -535.2568969726562, "loss": 0.0351, "rewards/accuracies": 1.0, "rewards/chosen": -7.780137538909912, "rewards/margins": 20.768360137939453, "rewards/rejected": -28.54849624633789, "step": 3070 }, { "epoch": 0.308, "grad_norm": 1.6489254852588456e-19, "learning_rate": 4.36949522624633e-06, "logits/chosen": -0.850079357624054, "logits/rejected": -0.02385178580880165, "logps/chosen": -372.00897216796875, "logps/rejected": -700.4833374023438, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -7.652767181396484, "rewards/margins": 29.697641372680664, "rewards/rejected": -37.35040283203125, "step": 3080 }, { "epoch": 0.309, "grad_norm": 0.0, "learning_rate": 4.3636899992071555e-06, "logits/chosen": -1.0359759330749512, "logits/rejected": 0.22866709530353546, "logps/chosen": -348.95001220703125, "logps/rejected": -818.4954833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.694208145141602, "rewards/margins": 37.28623580932617, "rewards/rejected": -46.980445861816406, "step": 3090 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 4.357862063693486e-06, "logits/chosen": -1.073272943496704, "logits/rejected": -0.13734038174152374, "logps/chosen": -310.79510498046875, "logps/rejected": -881.04638671875, "loss": 0.1594, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.433976173400879, "rewards/margins": 35.776695251464844, "rewards/rejected": -44.210670471191406, "step": 3100 }, { "epoch": 0.311, "grad_norm": 1.0155697793834406e-07, "learning_rate": 4.352011490716875e-06, "logits/chosen": -0.9790974855422974, "logits/rejected": 0.12780261039733887, "logps/chosen": -317.59039306640625, "logps/rejected": -703.695068359375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -9.203834533691406, "rewards/margins": 30.016159057617188, "rewards/rejected": -39.219993591308594, "step": 3110 }, { "epoch": 0.312, "grad_norm": 9.904084925088661e-23, "learning_rate": 4.346138351564711e-06, "logits/chosen": -1.1815786361694336, "logits/rejected": 0.22094345092773438, "logps/chosen": -266.4033203125, "logps/rejected": -817.2267456054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.6234712600708, "rewards/margins": 35.64247512817383, "rewards/rejected": -45.26594161987305, "step": 3120 }, { "epoch": 0.313, "grad_norm": 5.889712003694092e-13, "learning_rate": 4.340242717799337e-06, "logits/chosen": -1.1714904308319092, "logits/rejected": 0.30348244309425354, "logps/chosen": -227.6770782470703, "logps/rejected": -735.3243408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.901235103607178, "rewards/margins": 38.13142776489258, "rewards/rejected": -44.03266525268555, "step": 3130 }, { "epoch": 0.314, "grad_norm": 0.00010150240268558264, "learning_rate": 4.334324661257191e-06, "logits/chosen": -0.5792279839515686, "logits/rejected": -0.24451354146003723, "logps/chosen": -522.0756225585938, "logps/rejected": -817.6600341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.947957992553711, "rewards/margins": 35.51698303222656, "rewards/rejected": -44.46493911743164, "step": 3140 }, { "epoch": 0.315, "grad_norm": 0.0, "learning_rate": 4.328384254047927e-06, "logits/chosen": -0.7550392746925354, "logits/rejected": -0.0565187931060791, "logps/chosen": -419.688720703125, "logps/rejected": -683.5443115234375, "loss": 1.1448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.32347297668457, "rewards/margins": 25.387094497680664, "rewards/rejected": -38.7105712890625, "step": 3150 }, { "epoch": 0.316, "grad_norm": 4.217108611345691e-20, "learning_rate": 4.322421568553529e-06, "logits/chosen": -0.8440915942192078, "logits/rejected": 0.08169057220220566, "logps/chosen": -244.05636596679688, "logps/rejected": -744.2340087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.935759544372559, "rewards/margins": 36.457157135009766, "rewards/rejected": -41.392913818359375, "step": 3160 }, { "epoch": 0.317, "grad_norm": 7.900644760638897e-08, "learning_rate": 4.316436677427441e-06, "logits/chosen": -0.670063853263855, "logits/rejected": 0.19559910893440247, "logps/chosen": -433.0946350097656, "logps/rejected": -748.577392578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.119277000427246, "rewards/margins": 27.639232635498047, "rewards/rejected": -34.758506774902344, "step": 3170 }, { "epoch": 0.318, "grad_norm": 4.573404710495055e-10, "learning_rate": 4.3104296535936695e-06, "logits/chosen": -0.7517415285110474, "logits/rejected": -0.16787463426589966, "logps/chosen": -229.44692993164062, "logps/rejected": -462.67431640625, "loss": 0.09, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -7.413804531097412, "rewards/margins": 22.188800811767578, "rewards/rejected": -29.602609634399414, "step": 3180 }, { "epoch": 0.319, "grad_norm": 7.644340000072052e-19, "learning_rate": 4.3044005702459055e-06, "logits/chosen": -1.4360209703445435, "logits/rejected": 0.1630072295665741, "logps/chosen": -207.827880859375, "logps/rejected": -807.9542846679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.8019585609436035, "rewards/margins": 40.77233123779297, "rewards/rejected": -47.57428741455078, "step": 3190 }, { "epoch": 0.32, "grad_norm": 1.3456431064914898e-12, "learning_rate": 4.2983495008466285e-06, "logits/chosen": -0.7747661471366882, "logits/rejected": 0.05215495079755783, "logps/chosen": -299.6119079589844, "logps/rejected": -676.51806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.976496696472168, "rewards/margins": 34.04780578613281, "rewards/rejected": -42.0243034362793, "step": 3200 }, { "epoch": 0.321, "grad_norm": 16.59024429321289, "learning_rate": 4.2922765191262075e-06, "logits/chosen": -1.094308614730835, "logits/rejected": 0.11995135247707367, "logps/chosen": -300.5848693847656, "logps/rejected": -866.1476440429688, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -8.33760929107666, "rewards/margins": 39.02849197387695, "rewards/rejected": -47.36610412597656, "step": 3210 }, { "epoch": 0.322, "grad_norm": 0.0, "learning_rate": 4.286181699082008e-06, "logits/chosen": -1.0322264432907104, "logits/rejected": 0.15850770473480225, "logps/chosen": -371.9905700683594, "logps/rejected": -892.07568359375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -13.305109977722168, "rewards/margins": 43.74433135986328, "rewards/rejected": -57.0494384765625, "step": 3220 }, { "epoch": 0.323, "grad_norm": 1.3465262645469115e-15, "learning_rate": 4.280065114977492e-06, "logits/chosen": -1.3070619106292725, "logits/rejected": 0.4935874044895172, "logps/chosen": -305.3742370605469, "logps/rejected": -1267.24609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.339349746704102, "rewards/margins": 78.47554016113281, "rewards/rejected": -91.81489562988281, "step": 3230 }, { "epoch": 0.324, "grad_norm": 7.401005694337914e-19, "learning_rate": 4.273926841341303e-06, "logits/chosen": -0.48427170515060425, "logits/rejected": 0.372748464345932, "logps/chosen": -363.49005126953125, "logps/rejected": -985.8108520507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.015512466430664, "rewards/margins": 58.766265869140625, "rewards/rejected": -74.78177642822266, "step": 3240 }, { "epoch": 0.325, "grad_norm": 0.0, "learning_rate": 4.267766952966369e-06, "logits/chosen": -0.9825299382209778, "logits/rejected": 0.47985076904296875, "logps/chosen": -534.361328125, "logps/rejected": -1301.0528564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.222763061523438, "rewards/margins": 67.33207702636719, "rewards/rejected": -86.55484008789062, "step": 3250 }, { "epoch": 0.326, "grad_norm": 0.0, "learning_rate": 4.261585524908987e-06, "logits/chosen": -0.5122109651565552, "logits/rejected": 0.4423252046108246, "logps/chosen": -444.8743591308594, "logps/rejected": -1195.684814453125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -19.829586029052734, "rewards/margins": 67.60523986816406, "rewards/rejected": -87.43482208251953, "step": 3260 }, { "epoch": 0.327, "grad_norm": 0.0, "learning_rate": 4.255382632487907e-06, "logits/chosen": -1.0329724550247192, "logits/rejected": 0.7542210221290588, "logps/chosen": -447.3409729003906, "logps/rejected": -1327.537841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.9887638092041, "rewards/margins": 72.96781921386719, "rewards/rejected": -98.95658111572266, "step": 3270 }, { "epoch": 0.328, "grad_norm": 0.0, "learning_rate": 4.249158351283414e-06, "logits/chosen": -0.4629250168800354, "logits/rejected": 1.0543975830078125, "logps/chosen": -495.45379638671875, "logps/rejected": -1429.62744140625, "loss": 0.0993, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -27.047439575195312, "rewards/margins": 84.02159881591797, "rewards/rejected": -111.06903076171875, "step": 3280 }, { "epoch": 0.329, "grad_norm": 5.326713596085457e-17, "learning_rate": 4.242912757136412e-06, "logits/chosen": -0.8477737307548523, "logits/rejected": 0.3507903814315796, "logps/chosen": -372.31951904296875, "logps/rejected": -992.7218627929688, "loss": 1.0371, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.680524826049805, "rewards/margins": 61.86509323120117, "rewards/rejected": -75.54561614990234, "step": 3290 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 4.236645926147493e-06, "logits/chosen": -0.4431152939796448, "logits/rejected": 0.42956599593162537, "logps/chosen": -286.32318115234375, "logps/rejected": -851.01611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.283034324645996, "rewards/margins": 55.66416549682617, "rewards/rejected": -65.94720458984375, "step": 3300 }, { "epoch": 0.331, "grad_norm": 7.99737845599944e-21, "learning_rate": 4.230357934676017e-06, "logits/chosen": -0.689271867275238, "logits/rejected": 0.30687215924263, "logps/chosen": -594.4046020507812, "logps/rejected": -1044.365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.995326042175293, "rewards/margins": 45.54576873779297, "rewards/rejected": -60.54109573364258, "step": 3310 }, { "epoch": 0.332, "grad_norm": 76.74764251708984, "learning_rate": 4.224048859339175e-06, "logits/chosen": -0.7327234148979187, "logits/rejected": 0.20926149189472198, "logps/chosen": -365.96612548828125, "logps/rejected": -895.5828247070312, "loss": 0.0936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.642107009887695, "rewards/margins": 46.72553634643555, "rewards/rejected": -55.367645263671875, "step": 3320 }, { "epoch": 0.333, "grad_norm": 4.214549327455149e-19, "learning_rate": 4.217718777011058e-06, "logits/chosen": -0.9751136898994446, "logits/rejected": 0.4269269108772278, "logps/chosen": -265.28314208984375, "logps/rejected": -883.4361572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.531183242797852, "rewards/margins": 47.17866134643555, "rewards/rejected": -57.7098503112793, "step": 3330 }, { "epoch": 0.334, "grad_norm": 9.039357564688544e-07, "learning_rate": 4.211367764821722e-06, "logits/chosen": -1.0815322399139404, "logits/rejected": 0.3311161398887634, "logps/chosen": -205.88211059570312, "logps/rejected": -701.0584716796875, "loss": 0.8769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.06307315826416, "rewards/margins": 33.633060455322266, "rewards/rejected": -42.696136474609375, "step": 3340 }, { "epoch": 0.335, "grad_norm": 1.3225919914816586e-08, "learning_rate": 4.204995900156247e-06, "logits/chosen": -0.7970871925354004, "logits/rejected": -0.21015918254852295, "logps/chosen": -584.626220703125, "logps/rejected": -836.6129760742188, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -9.297735214233398, "rewards/margins": 29.705364227294922, "rewards/rejected": -39.00310134887695, "step": 3350 }, { "epoch": 0.336, "grad_norm": 4.926764821598923e-17, "learning_rate": 4.198603260653792e-06, "logits/chosen": -0.6659067869186401, "logits/rejected": -0.021963249891996384, "logps/chosen": -236.69851684570312, "logps/rejected": -597.7718505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.031260013580322, "rewards/margins": 31.933868408203125, "rewards/rejected": -37.965126037597656, "step": 3360 }, { "epoch": 0.337, "grad_norm": 0.04632464796304703, "learning_rate": 4.192189924206652e-06, "logits/chosen": -0.6960038542747498, "logits/rejected": 0.19569489359855652, "logps/chosen": -214.2620086669922, "logps/rejected": -617.9694213867188, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -7.799983024597168, "rewards/margins": 27.065723419189453, "rewards/rejected": -34.86570358276367, "step": 3370 }, { "epoch": 0.338, "grad_norm": 1.2611774125037556e-10, "learning_rate": 4.185755968959308e-06, "logits/chosen": -0.9139487147331238, "logits/rejected": 0.12455719709396362, "logps/chosen": -476.13140869140625, "logps/rejected": -759.7525634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.588300704956055, "rewards/margins": 31.280990600585938, "rewards/rejected": -41.869293212890625, "step": 3380 }, { "epoch": 0.339, "grad_norm": 6.151973502710462e-05, "learning_rate": 4.179301473307476e-06, "logits/chosen": -0.8834859132766724, "logits/rejected": -0.13724976778030396, "logps/chosen": -228.32272338867188, "logps/rejected": -701.5811767578125, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": -7.848291873931885, "rewards/margins": 34.67478561401367, "rewards/rejected": -42.52307891845703, "step": 3390 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 4.172826515897146e-06, "logits/chosen": -1.3233671188354492, "logits/rejected": 0.1761193573474884, "logps/chosen": -345.5697326660156, "logps/rejected": -839.4622802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.214780807495117, "rewards/margins": 35.262630462646484, "rewards/rejected": -41.47740936279297, "step": 3400 }, { "epoch": 0.341, "grad_norm": 8.89336535423426e-18, "learning_rate": 4.166331175623631e-06, "logits/chosen": -0.9410387277603149, "logits/rejected": 0.25650379061698914, "logps/chosen": -382.94012451171875, "logps/rejected": -868.2106323242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.65563440322876, "rewards/margins": 41.03944778442383, "rewards/rejected": -47.69507598876953, "step": 3410 }, { "epoch": 0.342, "grad_norm": 6.152333132706959e-19, "learning_rate": 4.159815531630604e-06, "logits/chosen": -0.9895069003105164, "logits/rejected": -0.024330515414476395, "logps/chosen": -419.55145263671875, "logps/rejected": -816.4887084960938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.520910739898682, "rewards/margins": 36.27510070800781, "rewards/rejected": -42.79601287841797, "step": 3420 }, { "epoch": 0.343, "grad_norm": 1.754929014607942e-11, "learning_rate": 4.15327966330913e-06, "logits/chosen": -0.9556158781051636, "logits/rejected": 0.02099316194653511, "logps/chosen": -331.62164306640625, "logps/rejected": -952.9747924804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.674660682678223, "rewards/margins": 45.470130920410156, "rewards/rejected": -54.1447868347168, "step": 3430 }, { "epoch": 0.344, "grad_norm": 4.288543058541573e-15, "learning_rate": 4.146723650296701e-06, "logits/chosen": -0.505741536617279, "logits/rejected": 0.18241751194000244, "logps/chosen": -326.7845458984375, "logps/rejected": -704.0743408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.242865562438965, "rewards/margins": 33.074039459228516, "rewards/rejected": -41.31690216064453, "step": 3440 }, { "epoch": 0.345, "grad_norm": 5.038234667154029e-05, "learning_rate": 4.140147572476269e-06, "logits/chosen": -0.7677274942398071, "logits/rejected": -0.07271343469619751, "logps/chosen": -296.08465576171875, "logps/rejected": -587.8889770507812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.46233081817627, "rewards/margins": 25.770553588867188, "rewards/rejected": -34.23288345336914, "step": 3450 }, { "epoch": 0.346, "grad_norm": 1.3172983603804974e-17, "learning_rate": 4.133551509975264e-06, "logits/chosen": -1.312073826789856, "logits/rejected": 0.2874998450279236, "logps/chosen": -234.2183380126953, "logps/rejected": -836.1720581054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.3478875160217285, "rewards/margins": 40.78498077392578, "rewards/rejected": -46.13286590576172, "step": 3460 }, { "epoch": 0.347, "grad_norm": 3.578258837236975e-13, "learning_rate": 4.126935543164628e-06, "logits/chosen": -0.6281191110610962, "logits/rejected": 0.1928253471851349, "logps/chosen": -385.08843994140625, "logps/rejected": -773.7591552734375, "loss": 1.2362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -11.442838668823242, "rewards/margins": 35.947547912597656, "rewards/rejected": -47.39038848876953, "step": 3470 }, { "epoch": 0.348, "grad_norm": 0.02128647267818451, "learning_rate": 4.120299752657828e-06, "logits/chosen": -1.0916345119476318, "logits/rejected": -0.13186690211296082, "logps/chosen": -405.6370849609375, "logps/rejected": -581.0313720703125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.9594109058380127, "rewards/margins": 23.054386138916016, "rewards/rejected": -27.013797760009766, "step": 3480 }, { "epoch": 0.349, "grad_norm": 0.692663311958313, "learning_rate": 4.113644219309877e-06, "logits/chosen": -0.8682696223258972, "logits/rejected": -0.07819642126560211, "logps/chosen": -241.39566040039062, "logps/rejected": -459.0397033691406, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.898946762084961, "rewards/margins": 18.163551330566406, "rewards/rejected": -21.062496185302734, "step": 3490 }, { "epoch": 0.35, "grad_norm": 4.095468061904306e-11, "learning_rate": 4.106969024216348e-06, "logits/chosen": -1.165206789970398, "logits/rejected": 0.32706567645072937, "logps/chosen": -347.5617370605469, "logps/rejected": -740.993896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.557514190673828, "rewards/margins": 27.075342178344727, "rewards/rejected": -32.63285446166992, "step": 3500 }, { "epoch": 0.351, "grad_norm": 1.331050469674763e-16, "learning_rate": 4.1002742487123896e-06, "logits/chosen": -0.9754883050918579, "logits/rejected": 0.30378806591033936, "logps/chosen": -375.8541259765625, "logps/rejected": -662.5955810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.981966495513916, "rewards/margins": 23.170042037963867, "rewards/rejected": -30.152008056640625, "step": 3510 }, { "epoch": 0.352, "grad_norm": 4.5469066098300764e-17, "learning_rate": 4.093559974371725e-06, "logits/chosen": -0.9044982194900513, "logits/rejected": 0.12653622031211853, "logps/chosen": -287.69097900390625, "logps/rejected": -752.1959228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.752053260803223, "rewards/margins": 33.69719696044922, "rewards/rejected": -42.449249267578125, "step": 3520 }, { "epoch": 0.353, "grad_norm": 1.1403776073152431e-20, "learning_rate": 4.086826283005669e-06, "logits/chosen": -0.9870929718017578, "logits/rejected": 0.21940307319164276, "logps/chosen": -317.22210693359375, "logps/rejected": -676.0667114257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.004629611968994, "rewards/margins": 28.83846664428711, "rewards/rejected": -35.84309768676758, "step": 3530 }, { "epoch": 0.354, "grad_norm": 2.823813923990982e-19, "learning_rate": 4.080073256662128e-06, "logits/chosen": -0.7896633744239807, "logits/rejected": 0.13053257763385773, "logps/chosen": -226.0159149169922, "logps/rejected": -630.7973022460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.601428031921387, "rewards/margins": 30.512859344482422, "rewards/rejected": -38.114288330078125, "step": 3540 }, { "epoch": 0.355, "grad_norm": 122.45453643798828, "learning_rate": 4.073300977624594e-06, "logits/chosen": -0.7716548442840576, "logits/rejected": 0.1828223615884781, "logps/chosen": -394.8473205566406, "logps/rejected": -690.2156372070312, "loss": 0.0363, "rewards/accuracies": 1.0, "rewards/chosen": -5.993786334991455, "rewards/margins": 25.322904586791992, "rewards/rejected": -31.316692352294922, "step": 3550 }, { "epoch": 0.356, "grad_norm": 2.582539650880511e-12, "learning_rate": 4.066509528411151e-06, "logits/chosen": -0.7894098162651062, "logits/rejected": 0.20214462280273438, "logps/chosen": -178.9608612060547, "logps/rejected": -506.6351623535156, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -6.481031894683838, "rewards/margins": 23.32442855834961, "rewards/rejected": -29.805461883544922, "step": 3560 }, { "epoch": 0.357, "grad_norm": 3.1474918671392516e-08, "learning_rate": 4.059698991773466e-06, "logits/chosen": -0.5627990961074829, "logits/rejected": -0.15371878445148468, "logps/chosen": -318.6998596191406, "logps/rejected": -579.3805541992188, "loss": 0.1134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.315168380737305, "rewards/margins": 27.846643447875977, "rewards/rejected": -38.16181182861328, "step": 3570 }, { "epoch": 0.358, "grad_norm": 8.990660717245191e-06, "learning_rate": 4.052869450695776e-06, "logits/chosen": -1.002862811088562, "logits/rejected": 0.23829932510852814, "logps/chosen": -290.24456787109375, "logps/rejected": -817.5097045898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.639854431152344, "rewards/margins": 39.795753479003906, "rewards/rejected": -48.435604095458984, "step": 3580 }, { "epoch": 0.359, "grad_norm": 0.005798167083412409, "learning_rate": 4.046020988393886e-06, "logits/chosen": -0.8494859933853149, "logits/rejected": 0.033515565097332, "logps/chosen": -402.35382080078125, "logps/rejected": -763.16552734375, "loss": 0.015, "rewards/accuracies": 1.0, "rewards/chosen": -9.38691234588623, "rewards/margins": 29.28324317932129, "rewards/rejected": -38.67015838623047, "step": 3590 }, { "epoch": 0.36, "grad_norm": 6.382561840156953e-11, "learning_rate": 4.039153688314146e-06, "logits/chosen": -1.2625701427459717, "logits/rejected": 0.5352746248245239, "logps/chosen": -263.8447265625, "logps/rejected": -885.6497192382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.839212417602539, "rewards/margins": 43.649513244628906, "rewards/rejected": -52.48872756958008, "step": 3600 }, { "epoch": 0.361, "grad_norm": 1.465044301375816e-20, "learning_rate": 4.032267634132442e-06, "logits/chosen": -0.5378260016441345, "logits/rejected": 0.22525055706501007, "logps/chosen": -348.68145751953125, "logps/rejected": -841.7462158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.665252685546875, "rewards/margins": 47.320892333984375, "rewards/rejected": -55.98614501953125, "step": 3610 }, { "epoch": 0.362, "grad_norm": 0.0037902365438640118, "learning_rate": 4.02536290975317e-06, "logits/chosen": -0.46131354570388794, "logits/rejected": 0.11978636682033539, "logps/chosen": -471.6434631347656, "logps/rejected": -775.079833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.731196403503418, "rewards/margins": 34.168792724609375, "rewards/rejected": -44.899986267089844, "step": 3620 }, { "epoch": 0.363, "grad_norm": 1.2722707273555428e-13, "learning_rate": 4.018439599308217e-06, "logits/chosen": -0.8846076130867004, "logits/rejected": 0.47798728942871094, "logps/chosen": -317.46466064453125, "logps/rejected": -997.0470581054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.721329689025879, "rewards/margins": 53.88054275512695, "rewards/rejected": -64.60187530517578, "step": 3630 }, { "epoch": 0.364, "grad_norm": 1.3620032538678645e-17, "learning_rate": 4.011497787155938e-06, "logits/chosen": -0.5411085486412048, "logits/rejected": 0.30354979634284973, "logps/chosen": -375.63720703125, "logps/rejected": -837.1346435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.404248237609863, "rewards/margins": 42.27107620239258, "rewards/rejected": -52.675315856933594, "step": 3640 }, { "epoch": 0.365, "grad_norm": 0.0, "learning_rate": 4.0045375578801216e-06, "logits/chosen": -0.9094980955123901, "logits/rejected": 0.5757162570953369, "logps/chosen": -241.68801879882812, "logps/rejected": -861.6231689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.014677047729492, "rewards/margins": 42.840049743652344, "rewards/rejected": -52.8547248840332, "step": 3650 }, { "epoch": 0.366, "grad_norm": 7.864214921632362e-13, "learning_rate": 3.997558996288965e-06, "logits/chosen": -0.6316531896591187, "logits/rejected": 0.5172852873802185, "logps/chosen": -322.84295654296875, "logps/rejected": -878.1337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.732378959655762, "rewards/margins": 43.56822204589844, "rewards/rejected": -53.30059814453125, "step": 3660 }, { "epoch": 0.367, "grad_norm": 1.152663465690726e-18, "learning_rate": 3.9905621874140396e-06, "logits/chosen": -1.038159966468811, "logits/rejected": 0.21830615401268005, "logps/chosen": -243.6871337890625, "logps/rejected": -831.5692138671875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -7.551340579986572, "rewards/margins": 44.354488372802734, "rewards/rejected": -51.90583038330078, "step": 3670 }, { "epoch": 0.368, "grad_norm": 1.1498552876775108e-20, "learning_rate": 3.983547216509254e-06, "logits/chosen": -0.6559784412384033, "logits/rejected": 0.5230801105499268, "logps/chosen": -262.62261962890625, "logps/rejected": -746.4537963867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.013997077941895, "rewards/margins": 36.55512237548828, "rewards/rejected": -46.56912612915039, "step": 3680 }, { "epoch": 0.369, "grad_norm": 7.899832930232531e-18, "learning_rate": 3.976514169049814e-06, "logits/chosen": -0.8977417945861816, "logits/rejected": 0.6272139549255371, "logps/chosen": -352.23822021484375, "logps/rejected": -923.0206298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.092788696289062, "rewards/margins": 47.015174865722656, "rewards/rejected": -60.10795974731445, "step": 3690 }, { "epoch": 0.37, "grad_norm": 4.87312933061812e-11, "learning_rate": 3.969463130731183e-06, "logits/chosen": -0.737012505531311, "logits/rejected": 0.5239855051040649, "logps/chosen": -440.50286865234375, "logps/rejected": -1080.082275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.673296928405762, "rewards/margins": 50.824745178222656, "rewards/rejected": -64.498046875, "step": 3700 }, { "epoch": 0.371, "grad_norm": 0.0011436374625191092, "learning_rate": 3.962394187468039e-06, "logits/chosen": -0.7259347438812256, "logits/rejected": 0.4603849947452545, "logps/chosen": -348.16156005859375, "logps/rejected": -792.4857788085938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.061820983886719, "rewards/margins": 41.306678771972656, "rewards/rejected": -53.368507385253906, "step": 3710 }, { "epoch": 0.372, "grad_norm": 3.8318094357225885e-12, "learning_rate": 3.955307425393224e-06, "logits/chosen": -0.9659935832023621, "logits/rejected": 0.44564709067344666, "logps/chosen": -257.97076416015625, "logps/rejected": -842.8034057617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.789884567260742, "rewards/margins": 40.528785705566406, "rewards/rejected": -50.31867218017578, "step": 3720 }, { "epoch": 0.373, "grad_norm": 0.0, "learning_rate": 3.948202930856697e-06, "logits/chosen": -0.8581470251083374, "logits/rejected": 0.4110802114009857, "logps/chosen": -404.4557800292969, "logps/rejected": -1082.4560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.478370666503906, "rewards/margins": 45.96970748901367, "rewards/rejected": -59.44807052612305, "step": 3730 }, { "epoch": 0.374, "grad_norm": 2.034799845979096e-12, "learning_rate": 3.941080790424483e-06, "logits/chosen": -0.8508933186531067, "logits/rejected": 0.5776023268699646, "logps/chosen": -221.8153076171875, "logps/rejected": -761.3008422851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.045284271240234, "rewards/margins": 43.52240753173828, "rewards/rejected": -50.56769561767578, "step": 3740 }, { "epoch": 0.375, "grad_norm": 1.2156111672823045e-09, "learning_rate": 3.933941090877615e-06, "logits/chosen": -0.8493801951408386, "logits/rejected": 0.4712657332420349, "logps/chosen": -307.35791015625, "logps/rejected": -789.6890869140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -8.67790412902832, "rewards/margins": 38.20088577270508, "rewards/rejected": -46.87879180908203, "step": 3750 }, { "epoch": 0.376, "grad_norm": 0.0, "learning_rate": 3.92678391921108e-06, "logits/chosen": -0.7485553622245789, "logits/rejected": 0.5399482250213623, "logps/chosen": -329.4286193847656, "logps/rejected": -833.7262573242188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -13.084185600280762, "rewards/margins": 44.51618194580078, "rewards/rejected": -57.600372314453125, "step": 3760 }, { "epoch": 0.377, "grad_norm": 9.002846889207342e-14, "learning_rate": 3.9196093626327535e-06, "logits/chosen": -0.64151531457901, "logits/rejected": 0.6924604177474976, "logps/chosen": -387.8374328613281, "logps/rejected": -1071.4794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.649490356445312, "rewards/margins": 58.88508987426758, "rewards/rejected": -74.53457641601562, "step": 3770 }, { "epoch": 0.378, "grad_norm": 0.0, "learning_rate": 3.912417508562345e-06, "logits/chosen": -0.8945374488830566, "logits/rejected": 0.7975891828536987, "logps/chosen": -276.60089111328125, "logps/rejected": -1138.664794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.599028587341309, "rewards/margins": 64.93830871582031, "rewards/rejected": -77.53733825683594, "step": 3780 }, { "epoch": 0.379, "grad_norm": 2.029291677120462e-15, "learning_rate": 3.905208444630326e-06, "logits/chosen": -0.4724903106689453, "logits/rejected": 0.20428940653800964, "logps/chosen": -431.5040588378906, "logps/rejected": -842.90673828125, "loss": 2.9491, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.42316246032715, "rewards/margins": 38.17705154418945, "rewards/rejected": -57.60021209716797, "step": 3790 }, { "epoch": 0.38, "grad_norm": 3.7381135605496125e-18, "learning_rate": 3.897982258676867e-06, "logits/chosen": -0.48396188020706177, "logits/rejected": 0.40527671575546265, "logps/chosen": -599.632080078125, "logps/rejected": -958.6070556640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.483598709106445, "rewards/margins": 35.09453582763672, "rewards/rejected": -50.57814025878906, "step": 3800 }, { "epoch": 0.381, "grad_norm": 1.3835478346662982e-17, "learning_rate": 3.890739038750763e-06, "logits/chosen": -0.07472027093172073, "logits/rejected": 0.3322257399559021, "logps/chosen": -547.6071166992188, "logps/rejected": -853.6226806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.650531768798828, "rewards/margins": 36.77111053466797, "rewards/rejected": -55.4216423034668, "step": 3810 }, { "epoch": 0.382, "grad_norm": 0.0, "learning_rate": 3.88347887310836e-06, "logits/chosen": -0.4241139888763428, "logits/rejected": 0.4764328896999359, "logps/chosen": -441.03485107421875, "logps/rejected": -940.2620239257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.40770149230957, "rewards/margins": 46.130409240722656, "rewards/rejected": -61.538108825683594, "step": 3820 }, { "epoch": 0.383, "grad_norm": 0.0, "learning_rate": 3.876201850212489e-06, "logits/chosen": -0.9169275164604187, "logits/rejected": 0.6456999778747559, "logps/chosen": -374.56854248046875, "logps/rejected": -972.72314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.147981643676758, "rewards/margins": 49.80410385131836, "rewards/rejected": -65.95207977294922, "step": 3830 }, { "epoch": 0.384, "grad_norm": 1.0824854061080094e-13, "learning_rate": 3.868908058731376e-06, "logits/chosen": -0.5485990047454834, "logits/rejected": 0.7526308298110962, "logps/chosen": -478.01312255859375, "logps/rejected": -938.0384521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.822885513305664, "rewards/margins": 40.522560119628906, "rewards/rejected": -53.3454475402832, "step": 3840 }, { "epoch": 0.385, "grad_norm": 0.0, "learning_rate": 3.861597587537568e-06, "logits/chosen": -0.8074597120285034, "logits/rejected": 0.5264551639556885, "logps/chosen": -325.8951721191406, "logps/rejected": -919.2064208984375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -12.664430618286133, "rewards/margins": 47.888641357421875, "rewards/rejected": -60.553077697753906, "step": 3850 }, { "epoch": 0.386, "grad_norm": 9.141786717338023e-18, "learning_rate": 3.85427052570685e-06, "logits/chosen": -0.5131690502166748, "logits/rejected": 0.6525561809539795, "logps/chosen": -399.3219909667969, "logps/rejected": -844.6796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.690951347351074, "rewards/margins": 45.79986572265625, "rewards/rejected": -61.49082565307617, "step": 3860 }, { "epoch": 0.387, "grad_norm": 0.0, "learning_rate": 3.846926962517158e-06, "logits/chosen": -0.5702225565910339, "logits/rejected": 0.5192240476608276, "logps/chosen": -446.7099609375, "logps/rejected": -1046.5726318359375, "loss": 0.0117, "rewards/accuracies": 1.0, "rewards/chosen": -14.955255508422852, "rewards/margins": 48.32957077026367, "rewards/rejected": -63.284828186035156, "step": 3870 }, { "epoch": 0.388, "grad_norm": 0.0, "learning_rate": 3.839566987447492e-06, "logits/chosen": -0.44374722242355347, "logits/rejected": 0.702018141746521, "logps/chosen": -257.0533752441406, "logps/rejected": -927.4334716796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.47410774230957, "rewards/margins": 53.65742874145508, "rewards/rejected": -66.13153076171875, "step": 3880 }, { "epoch": 0.389, "grad_norm": 1.449809592382247e-22, "learning_rate": 3.832190690176825e-06, "logits/chosen": -0.4135567545890808, "logits/rejected": 0.45063215494155884, "logps/chosen": -438.75189208984375, "logps/rejected": -829.2301635742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.356510162353516, "rewards/margins": 36.14095687866211, "rewards/rejected": -56.497467041015625, "step": 3890 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 3.824798160583012e-06, "logits/chosen": -0.7455593347549438, "logits/rejected": 0.6117419004440308, "logps/chosen": -559.6799926757812, "logps/rejected": -1304.4791259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.246612548828125, "rewards/margins": 63.397003173828125, "rewards/rejected": -85.64362335205078, "step": 3900 }, { "epoch": 0.391, "grad_norm": 0.00011770037235692143, "learning_rate": 3.817389488741694e-06, "logits/chosen": -0.7981809973716736, "logits/rejected": 0.6145745515823364, "logps/chosen": -287.59698486328125, "logps/rejected": -947.4500732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.564549446105957, "rewards/margins": 51.583091735839844, "rewards/rejected": -66.14763641357422, "step": 3910 }, { "epoch": 0.392, "grad_norm": 8.74399904153729e-14, "learning_rate": 3.8099647649251984e-06, "logits/chosen": -0.3144210875034332, "logits/rejected": 0.36411142349243164, "logps/chosen": -704.8829345703125, "logps/rejected": -1009.3225708007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -26.28389549255371, "rewards/margins": 38.85230255126953, "rewards/rejected": -65.13619232177734, "step": 3920 }, { "epoch": 0.393, "grad_norm": 0.0, "learning_rate": 3.802524079601442e-06, "logits/chosen": -0.5111797451972961, "logits/rejected": 0.7962819337844849, "logps/chosen": -282.1951904296875, "logps/rejected": -826.8674926757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.543550491333008, "rewards/margins": 45.35955047607422, "rewards/rejected": -61.903106689453125, "step": 3930 }, { "epoch": 0.394, "grad_norm": 1.9319190344702086e-12, "learning_rate": 3.795067523432826e-06, "logits/chosen": -0.7026056051254272, "logits/rejected": 0.7360762357711792, "logps/chosen": -218.7373809814453, "logps/rejected": -990.8726806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.464958190917969, "rewards/margins": 58.12116622924805, "rewards/rejected": -70.58612823486328, "step": 3940 }, { "epoch": 0.395, "grad_norm": 1.2454868176733741e-21, "learning_rate": 3.787595187275136e-06, "logits/chosen": -0.38544806838035583, "logits/rejected": 0.5606005191802979, "logps/chosen": -564.4012451171875, "logps/rejected": -1214.3851318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.259593963623047, "rewards/margins": 62.833946228027344, "rewards/rejected": -87.09355163574219, "step": 3950 }, { "epoch": 0.396, "grad_norm": 398.5549621582031, "learning_rate": 3.780107162176429e-06, "logits/chosen": -0.12771472334861755, "logits/rejected": 0.23889890313148499, "logps/chosen": -632.0416870117188, "logps/rejected": -1030.476806640625, "loss": 0.053, "rewards/accuracies": 1.0, "rewards/chosen": -31.639551162719727, "rewards/margins": 40.21255874633789, "rewards/rejected": -71.85210418701172, "step": 3960 }, { "epoch": 0.397, "grad_norm": 0.0, "learning_rate": 3.772603539375929e-06, "logits/chosen": -0.5894684195518494, "logits/rejected": 0.7861677408218384, "logps/chosen": -346.8887634277344, "logps/rejected": -944.8414306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.525707244873047, "rewards/margins": 49.367393493652344, "rewards/rejected": -66.89309692382812, "step": 3970 }, { "epoch": 0.398, "grad_norm": 5.1069120672764257e-05, "learning_rate": 3.7650844103029093e-06, "logits/chosen": -0.4754267632961273, "logits/rejected": 0.25245314836502075, "logps/chosen": -240.3736114501953, "logps/rejected": -768.532470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.96320915222168, "rewards/margins": 42.275733947753906, "rewards/rejected": -54.23894500732422, "step": 3980 }, { "epoch": 0.399, "grad_norm": 1.6205316821047843e-15, "learning_rate": 3.7575498665755884e-06, "logits/chosen": -0.3462293744087219, "logits/rejected": 0.3235887885093689, "logps/chosen": -371.97552490234375, "logps/rejected": -866.0631103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.929563522338867, "rewards/margins": 45.36902618408203, "rewards/rejected": -59.29859161376953, "step": 3990 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.8836024403572083, "logits/rejected": 0.7240092754364014, "logps/chosen": -441.99041748046875, "logps/rejected": -1129.660888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.415254592895508, "rewards/margins": 50.131492614746094, "rewards/rejected": -70.5467529296875, "step": 4000 }, { "epoch": 0.401, "grad_norm": 4.55244343114814e-20, "learning_rate": 3.742434902568889e-06, "logits/chosen": -0.3526178002357483, "logits/rejected": 0.26132869720458984, "logps/chosen": -562.0494384765625, "logps/rejected": -974.6246337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.889423370361328, "rewards/margins": 43.921607971191406, "rewards/rejected": -63.81103515625, "step": 4010 }, { "epoch": 0.402, "grad_norm": 341.7988586425781, "learning_rate": 3.7348546664605777e-06, "logits/chosen": -1.0505037307739258, "logits/rejected": 0.5458782911300659, "logps/chosen": -359.09869384765625, "logps/rejected": -1040.2222900390625, "loss": 0.1245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.71851921081543, "rewards/margins": 46.79050827026367, "rewards/rejected": -63.5090217590332, "step": 4020 }, { "epoch": 0.403, "grad_norm": 3.106318462903774e-20, "learning_rate": 3.7272593840378526e-06, "logits/chosen": 0.07517627626657486, "logits/rejected": 0.17533142864704132, "logps/chosen": -382.37725830078125, "logps/rejected": -732.4088745117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.196589469909668, "rewards/margins": 34.664756774902344, "rewards/rejected": -49.86134338378906, "step": 4030 }, { "epoch": 0.404, "grad_norm": 1.848615184540936e-11, "learning_rate": 3.7196491478468322e-06, "logits/chosen": -0.8824928402900696, "logits/rejected": 0.4116589426994324, "logps/chosen": -323.4385681152344, "logps/rejected": -935.02294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.299783706665039, "rewards/margins": 46.94359588623047, "rewards/rejected": -59.243377685546875, "step": 4040 }, { "epoch": 0.405, "grad_norm": 2.899619184764494e-22, "learning_rate": 3.7120240506158433e-06, "logits/chosen": -0.8381205797195435, "logits/rejected": 0.7291172742843628, "logps/chosen": -309.32269287109375, "logps/rejected": -971.3074951171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.42654800415039, "rewards/margins": 52.61445236206055, "rewards/rejected": -64.04100036621094, "step": 4050 }, { "epoch": 0.406, "grad_norm": 6.420053136489467e-17, "learning_rate": 3.7043841852542884e-06, "logits/chosen": -0.557745099067688, "logits/rejected": 0.6661397218704224, "logps/chosen": -288.09661865234375, "logps/rejected": -966.2178955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.155458450317383, "rewards/margins": 58.315834045410156, "rewards/rejected": -70.4712905883789, "step": 4060 }, { "epoch": 0.407, "grad_norm": 0.0, "learning_rate": 3.6967296448515176e-06, "logits/chosen": -0.6943304538726807, "logits/rejected": 0.5316852331161499, "logps/chosen": -434.56268310546875, "logps/rejected": -1232.92626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.114227294921875, "rewards/margins": 66.32272338867188, "rewards/rejected": -82.43694305419922, "step": 4070 }, { "epoch": 0.408, "grad_norm": 4.842999699189443e-12, "learning_rate": 3.689060522675689e-06, "logits/chosen": -0.53111332654953, "logits/rejected": 0.36400312185287476, "logps/chosen": -230.18508911132812, "logps/rejected": -699.6070556640625, "loss": 0.4617, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.08073616027832, "rewards/margins": 40.68952560424805, "rewards/rejected": -49.770259857177734, "step": 4080 }, { "epoch": 0.409, "grad_norm": 3.8501983823380215e-08, "learning_rate": 3.6813769121726356e-06, "logits/chosen": -1.0677297115325928, "logits/rejected": 0.30235835909843445, "logps/chosen": -262.8753662109375, "logps/rejected": -824.7024536132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.651956558227539, "rewards/margins": 38.70851516723633, "rewards/rejected": -47.360477447509766, "step": 4090 }, { "epoch": 0.41, "grad_norm": 2.6643040191057753e-16, "learning_rate": 3.6736789069647273e-06, "logits/chosen": -0.6208855509757996, "logits/rejected": -0.03054434061050415, "logps/chosen": -350.40325927734375, "logps/rejected": -694.046630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.904715061187744, "rewards/margins": 29.830276489257812, "rewards/rejected": -37.73499298095703, "step": 4100 }, { "epoch": 0.411, "grad_norm": 0.0013503417139872909, "learning_rate": 3.6659666008497287e-06, "logits/chosen": -0.6803125143051147, "logits/rejected": 0.3102254867553711, "logps/chosen": -315.3930969238281, "logps/rejected": -651.2322998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.613018989562988, "rewards/margins": 29.166412353515625, "rewards/rejected": -35.7794303894043, "step": 4110 }, { "epoch": 0.412, "grad_norm": 1.428150566556985e-12, "learning_rate": 3.658240087799655e-06, "logits/chosen": -0.40848007798194885, "logits/rejected": -0.06087536737322807, "logps/chosen": -300.522705078125, "logps/rejected": -697.1531372070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.41520881652832, "rewards/margins": 36.236854553222656, "rewards/rejected": -45.652061462402344, "step": 4120 }, { "epoch": 0.413, "grad_norm": 0.00011740612535504624, "learning_rate": 3.6504994619596295e-06, "logits/chosen": -0.5807197690010071, "logits/rejected": 0.24162797629833221, "logps/chosen": -490.38043212890625, "logps/rejected": -848.3361206054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.073907852172852, "rewards/margins": 37.16918182373047, "rewards/rejected": -47.24308776855469, "step": 4130 }, { "epoch": 0.414, "grad_norm": 7.407800755595368e-17, "learning_rate": 3.642744817646736e-06, "logits/chosen": -0.9697567224502563, "logits/rejected": 0.26281073689460754, "logps/chosen": -361.99359130859375, "logps/rejected": -823.3468627929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.846258640289307, "rewards/margins": 30.67578125, "rewards/rejected": -38.52204513549805, "step": 4140 }, { "epoch": 0.415, "grad_norm": 2.6733165742909525e-22, "learning_rate": 3.634976249348867e-06, "logits/chosen": -0.7604053616523743, "logits/rejected": 0.2656019628047943, "logps/chosen": -296.73431396484375, "logps/rejected": -780.6051635742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.004801750183105, "rewards/margins": 36.96376419067383, "rewards/rejected": -44.96856689453125, "step": 4150 }, { "epoch": 0.416, "grad_norm": 1.125780147481541e-19, "learning_rate": 3.627193851723577e-06, "logits/chosen": -0.502686619758606, "logits/rejected": 0.29140302538871765, "logps/chosen": -234.8794403076172, "logps/rejected": -619.0558471679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.264492988586426, "rewards/margins": 30.473552703857422, "rewards/rejected": -40.73804473876953, "step": 4160 }, { "epoch": 0.417, "grad_norm": 1.4964207200773406e-21, "learning_rate": 3.6193977195969243e-06, "logits/chosen": -0.7825326919555664, "logits/rejected": 0.1626361906528473, "logps/chosen": -405.54864501953125, "logps/rejected": -663.46484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.345398902893066, "rewards/margins": 32.6074333190918, "rewards/rejected": -37.95283126831055, "step": 4170 }, { "epoch": 0.418, "grad_norm": 0.0008746925159357488, "learning_rate": 3.611587947962319e-06, "logits/chosen": -0.8463672399520874, "logits/rejected": 0.26045385003089905, "logps/chosen": -236.1060028076172, "logps/rejected": -748.9749755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.019121170043945, "rewards/margins": 37.229774475097656, "rewards/rejected": -45.24889373779297, "step": 4180 }, { "epoch": 0.419, "grad_norm": 2.3645246871595305e-10, "learning_rate": 3.6037646319793635e-06, "logits/chosen": -1.3852901458740234, "logits/rejected": 0.21314740180969238, "logps/chosen": -202.6114044189453, "logps/rejected": -827.9249267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.685309410095215, "rewards/margins": 42.572757720947266, "rewards/rejected": -48.25806427001953, "step": 4190 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 3.595927866972694e-06, "logits/chosen": -0.7606481313705444, "logits/rejected": 0.3610491454601288, "logps/chosen": -369.33905029296875, "logps/rejected": -958.4986572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.40889835357666, "rewards/margins": 44.8622932434082, "rewards/rejected": -54.27119064331055, "step": 4200 }, { "epoch": 0.421, "grad_norm": 3.889532672474161e-05, "learning_rate": 3.5880777484308193e-06, "logits/chosen": -0.8923002481460571, "logits/rejected": 0.17552146315574646, "logps/chosen": -297.636474609375, "logps/rejected": -713.8099365234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.230313301086426, "rewards/margins": 31.368526458740234, "rewards/rejected": -40.598838806152344, "step": 4210 }, { "epoch": 0.422, "grad_norm": 6.331940527459778e-10, "learning_rate": 3.5802143720049565e-06, "logits/chosen": -0.9664111137390137, "logits/rejected": 0.3709142804145813, "logps/chosen": -392.75384521484375, "logps/rejected": -856.77685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.62370777130127, "rewards/margins": 36.612483978271484, "rewards/rejected": -45.23619079589844, "step": 4220 }, { "epoch": 0.423, "grad_norm": 0.0, "learning_rate": 3.5723378335078653e-06, "logits/chosen": -0.5414325594902039, "logits/rejected": 0.0008636951679363847, "logps/chosen": -414.93902587890625, "logps/rejected": -768.058349609375, "loss": 0.292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.064050674438477, "rewards/margins": 29.415279388427734, "rewards/rejected": -43.47932815551758, "step": 4230 }, { "epoch": 0.424, "grad_norm": 5.148122454556869e-06, "learning_rate": 3.564448228912682e-06, "logits/chosen": -0.6183091402053833, "logits/rejected": 0.3114756643772125, "logps/chosen": -337.8242492675781, "logps/rejected": -824.0460815429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.286993026733398, "rewards/margins": 39.22076416015625, "rewards/rejected": -48.507755279541016, "step": 4240 }, { "epoch": 0.425, "grad_norm": 0.2487691193819046, "learning_rate": 3.556545654351749e-06, "logits/chosen": -0.6846747398376465, "logits/rejected": 0.16172091662883759, "logps/chosen": -416.21160888671875, "logps/rejected": -663.5516357421875, "loss": 0.2262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -11.881389617919922, "rewards/margins": 25.58749771118164, "rewards/rejected": -37.46889114379883, "step": 4250 }, { "epoch": 0.426, "grad_norm": 1.436825769474126e-09, "learning_rate": 3.5486302061154433e-06, "logits/chosen": -0.6829143166542053, "logits/rejected": 0.5160936713218689, "logps/chosen": -241.07302856445312, "logps/rejected": -719.6329956054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.42330265045166, "rewards/margins": 37.69516372680664, "rewards/rejected": -46.11846160888672, "step": 4260 }, { "epoch": 0.427, "grad_norm": 0.0, "learning_rate": 3.5407019806510035e-06, "logits/chosen": -0.4164047837257385, "logits/rejected": 0.5037349462509155, "logps/chosen": -414.34600830078125, "logps/rejected": -780.9544067382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.40142822265625, "rewards/margins": 32.7313346862793, "rewards/rejected": -43.13277053833008, "step": 4270 }, { "epoch": 0.428, "grad_norm": 1.8435288018370244e-12, "learning_rate": 3.532761074561355e-06, "logits/chosen": -0.7431804537773132, "logits/rejected": 0.388200581073761, "logps/chosen": -275.937744140625, "logps/rejected": -827.150390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.089512825012207, "rewards/margins": 42.278560638427734, "rewards/rejected": -52.368072509765625, "step": 4280 }, { "epoch": 0.429, "grad_norm": 0.008692407049238682, "learning_rate": 3.524807584603932e-06, "logits/chosen": -0.6186565160751343, "logits/rejected": 0.3314044177532196, "logps/chosen": -253.70889282226562, "logps/rejected": -659.0180053710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.876142978668213, "rewards/margins": 37.75662612915039, "rewards/rejected": -44.63277053833008, "step": 4290 }, { "epoch": 0.43, "grad_norm": 1.1581534427367252e-15, "learning_rate": 3.516841607689501e-06, "logits/chosen": -1.0289478302001953, "logits/rejected": 0.5168917179107666, "logps/chosen": -458.88623046875, "logps/rejected": -1042.71484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.337991714477539, "rewards/margins": 48.008358001708984, "rewards/rejected": -58.346351623535156, "step": 4300 }, { "epoch": 0.431, "grad_norm": 2.3897616524548582e-11, "learning_rate": 3.5088632408809757e-06, "logits/chosen": -0.8067126274108887, "logits/rejected": 0.3740237355232239, "logps/chosen": -226.9942169189453, "logps/rejected": -701.123291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.234461784362793, "rewards/margins": 38.51856231689453, "rewards/rejected": -45.753013610839844, "step": 4310 }, { "epoch": 0.432, "grad_norm": 4.3961008444615146e-11, "learning_rate": 3.5008725813922383e-06, "logits/chosen": -1.3616108894348145, "logits/rejected": 0.6536494493484497, "logps/chosen": -339.8933410644531, "logps/rejected": -918.8294677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.569557189941406, "rewards/margins": 45.761993408203125, "rewards/rejected": -52.33155059814453, "step": 4320 }, { "epoch": 0.433, "grad_norm": 2.4098374662960553e-13, "learning_rate": 3.4928697265869516e-06, "logits/chosen": -0.5547953248023987, "logits/rejected": 0.3869093954563141, "logps/chosen": -310.59234619140625, "logps/rejected": -775.0176391601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.434072494506836, "rewards/margins": 42.483253479003906, "rewards/rejected": -52.917320251464844, "step": 4330 }, { "epoch": 0.434, "grad_norm": 3.553122372197551e-15, "learning_rate": 3.4848547739773782e-06, "logits/chosen": -0.2578112483024597, "logits/rejected": 0.28735774755477905, "logps/chosen": -309.19537353515625, "logps/rejected": -687.3353271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.1480131149292, "rewards/margins": 32.68153381347656, "rewards/rejected": -45.82954788208008, "step": 4340 }, { "epoch": 0.435, "grad_norm": 2.7313951165776243e-16, "learning_rate": 3.476827821223184e-06, "logits/chosen": -0.5213783383369446, "logits/rejected": 0.43540406227111816, "logps/chosen": -269.1460876464844, "logps/rejected": -684.8394775390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.5526556968688965, "rewards/margins": 39.1630859375, "rewards/rejected": -46.71574020385742, "step": 4350 }, { "epoch": 0.436, "grad_norm": 7.970552360347938e-07, "learning_rate": 3.4687889661302577e-06, "logits/chosen": -0.5087043046951294, "logits/rejected": 0.5694769024848938, "logps/chosen": -435.17156982421875, "logps/rejected": -951.3097534179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.336833953857422, "rewards/margins": 45.699851989746094, "rewards/rejected": -62.03668975830078, "step": 4360 }, { "epoch": 0.437, "grad_norm": 9.015243307430865e-08, "learning_rate": 3.460738306649509e-06, "logits/chosen": -0.6573070287704468, "logits/rejected": 0.7821485996246338, "logps/chosen": -173.24493408203125, "logps/rejected": -626.5826416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.188490390777588, "rewards/margins": 35.53300476074219, "rewards/rejected": -42.721492767333984, "step": 4370 }, { "epoch": 0.438, "grad_norm": 7.699915783887652e-18, "learning_rate": 3.452675940875686e-06, "logits/chosen": -0.7135838270187378, "logits/rejected": 0.1635311394929886, "logps/chosen": -472.3910217285156, "logps/rejected": -774.1685791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.920271873474121, "rewards/margins": 31.03672218322754, "rewards/rejected": -41.956993103027344, "step": 4380 }, { "epoch": 0.439, "grad_norm": 0.0, "learning_rate": 3.4446019670461684e-06, "logits/chosen": -1.4899814128875732, "logits/rejected": 0.6516152024269104, "logps/chosen": -200.21481323242188, "logps/rejected": -914.6165161132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.7514543533325195, "rewards/margins": 51.47700881958008, "rewards/rejected": -57.22846603393555, "step": 4390 }, { "epoch": 0.44, "grad_norm": 3.991723588114837e-06, "learning_rate": 3.436516483539781e-06, "logits/chosen": -0.6385399103164673, "logits/rejected": 0.32198888063430786, "logps/chosen": -381.22479248046875, "logps/rejected": -689.6920776367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.779531955718994, "rewards/margins": 30.917633056640625, "rewards/rejected": -38.697166442871094, "step": 4400 }, { "epoch": 0.441, "grad_norm": 6.0199471275945715e-12, "learning_rate": 3.4284195888755877e-06, "logits/chosen": -0.6760299205780029, "logits/rejected": 0.44728073477745056, "logps/chosen": -283.923583984375, "logps/rejected": -700.1207275390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.251309394836426, "rewards/margins": 32.73466491699219, "rewards/rejected": -41.9859733581543, "step": 4410 }, { "epoch": 0.442, "grad_norm": 1.9233450943012542e-10, "learning_rate": 3.4203113817116955e-06, "logits/chosen": -0.8296216726303101, "logits/rejected": 0.1775527447462082, "logps/chosen": -386.5934143066406, "logps/rejected": -863.5372924804688, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -14.466753959655762, "rewards/margins": 36.731040954589844, "rewards/rejected": -51.197792053222656, "step": 4420 }, { "epoch": 0.443, "grad_norm": 0.0, "learning_rate": 3.412191960844049e-06, "logits/chosen": -0.366151362657547, "logits/rejected": 0.12416459619998932, "logps/chosen": -387.76910400390625, "logps/rejected": -773.8753662109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -9.397980690002441, "rewards/margins": 43.43345260620117, "rewards/rejected": -52.8314323425293, "step": 4430 }, { "epoch": 0.444, "grad_norm": 0.020295394584536552, "learning_rate": 3.4040614252052305e-06, "logits/chosen": -0.7474874258041382, "logits/rejected": 0.4018009305000305, "logps/chosen": -328.6078796386719, "logps/rejected": -1040.136962890625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -11.811203002929688, "rewards/margins": 60.68671417236328, "rewards/rejected": -72.49790954589844, "step": 4440 }, { "epoch": 0.445, "grad_norm": 0.0, "learning_rate": 3.39591987386325e-06, "logits/chosen": -0.22013449668884277, "logits/rejected": 0.8452490568161011, "logps/chosen": -458.42352294921875, "logps/rejected": -1054.8280029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.357414245605469, "rewards/margins": 60.5202522277832, "rewards/rejected": -75.87767028808594, "step": 4450 }, { "epoch": 0.446, "grad_norm": 5.777521206575345e-16, "learning_rate": 3.387767406020343e-06, "logits/chosen": -0.4100729823112488, "logits/rejected": 0.9636079668998718, "logps/chosen": -517.3120727539062, "logps/rejected": -1221.5855712890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.92448616027832, "rewards/margins": 66.79444122314453, "rewards/rejected": -80.71891784667969, "step": 4460 }, { "epoch": 0.447, "grad_norm": 200.02029418945312, "learning_rate": 3.3796041210117545e-06, "logits/chosen": -0.5789368152618408, "logits/rejected": 1.0708125829696655, "logps/chosen": -411.73980712890625, "logps/rejected": -1207.94091796875, "loss": 0.1911, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.856058120727539, "rewards/margins": 72.57444763183594, "rewards/rejected": -86.43049621582031, "step": 4470 }, { "epoch": 0.448, "grad_norm": 3.0700267127912767e-16, "learning_rate": 3.3714301183045382e-06, "logits/chosen": -0.26980918645858765, "logits/rejected": 0.574004590511322, "logps/chosen": -414.5419921875, "logps/rejected": -1148.537353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.525867462158203, "rewards/margins": 74.47563171386719, "rewards/rejected": -91.00149536132812, "step": 4480 }, { "epoch": 0.449, "grad_norm": 7.729781224818932e-12, "learning_rate": 3.3632454974963368e-06, "logits/chosen": -0.4832285940647125, "logits/rejected": 0.8683622479438782, "logps/chosen": -475.9676208496094, "logps/rejected": -1218.5394287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.89743995666504, "rewards/margins": 69.6901626586914, "rewards/rejected": -90.58760070800781, "step": 4490 }, { "epoch": 0.45, "grad_norm": 5.4706279506433475e-09, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -0.607457160949707, "logits/rejected": 1.0746209621429443, "logps/chosen": -425.61590576171875, "logps/rejected": -996.9503784179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.851272583007812, "rewards/margins": 54.49498748779297, "rewards/rejected": -74.34626770019531, "step": 4500 }, { "epoch": 0.451, "grad_norm": 3.78396756474126e-14, "learning_rate": 3.346844800613229e-06, "logits/chosen": -0.2813403606414795, "logits/rejected": 1.1134769916534424, "logps/chosen": -579.0421142578125, "logps/rejected": -1489.37060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.425365447998047, "rewards/margins": 87.05371856689453, "rewards/rejected": -112.47908020019531, "step": 4510 }, { "epoch": 0.452, "grad_norm": 6.463025947756051e-11, "learning_rate": 3.338628924375638e-06, "logits/chosen": -0.7063466310501099, "logits/rejected": 0.9095737338066101, "logps/chosen": -304.01470947265625, "logps/rejected": -1101.2586669921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -16.164072036743164, "rewards/margins": 64.8074722290039, "rewards/rejected": -80.97154235839844, "step": 4520 }, { "epoch": 0.453, "grad_norm": 0.0, "learning_rate": 3.3304028297092583e-06, "logits/chosen": -0.31411752104759216, "logits/rejected": 0.9088078737258911, "logps/chosen": -433.57940673828125, "logps/rejected": -1088.935302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.69645118713379, "rewards/margins": 62.624244689941406, "rewards/rejected": -80.32068634033203, "step": 4530 }, { "epoch": 0.454, "grad_norm": 0.0, "learning_rate": 3.3221666168464584e-06, "logits/chosen": -0.5670292377471924, "logits/rejected": 0.8151572942733765, "logps/chosen": -518.0890502929688, "logps/rejected": -1576.5970458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.283605575561523, "rewards/margins": 96.97727966308594, "rewards/rejected": -118.26087951660156, "step": 4540 }, { "epoch": 0.455, "grad_norm": 3.6856651064454127e-10, "learning_rate": 3.313920386142892e-06, "logits/chosen": -0.3371369242668152, "logits/rejected": 0.8347536325454712, "logps/chosen": -359.16168212890625, "logps/rejected": -1016.52685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.546493530273438, "rewards/margins": 59.38935470581055, "rewards/rejected": -76.93585205078125, "step": 4550 }, { "epoch": 0.456, "grad_norm": 0.0, "learning_rate": 3.3056642380762783e-06, "logits/chosen": -0.4648679792881012, "logits/rejected": 1.0613789558410645, "logps/chosen": -707.9065551757812, "logps/rejected": -1881.05078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.829044342041016, "rewards/margins": 105.40827941894531, "rewards/rejected": -137.2373046875, "step": 4560 }, { "epoch": 0.457, "grad_norm": 0.0, "learning_rate": 3.2973982732451753e-06, "logits/chosen": -0.9866671562194824, "logits/rejected": 1.4066946506500244, "logps/chosen": -392.55731201171875, "logps/rejected": -1470.923095703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.702869415283203, "rewards/margins": 90.46757507324219, "rewards/rejected": -110.17044830322266, "step": 4570 }, { "epoch": 0.458, "grad_norm": 0.0, "learning_rate": 3.2891225923677565e-06, "logits/chosen": -0.5084947943687439, "logits/rejected": 0.9717354774475098, "logps/chosen": -396.46392822265625, "logps/rejected": -1423.1839599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.467742919921875, "rewards/margins": 92.16461944580078, "rewards/rejected": -106.63236236572266, "step": 4580 }, { "epoch": 0.459, "grad_norm": 5.6872564788067e-17, "learning_rate": 3.280837296280582e-06, "logits/chosen": -0.8941437005996704, "logits/rejected": 1.4038536548614502, "logps/chosen": -263.62640380859375, "logps/rejected": -1279.1226806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.20585823059082, "rewards/margins": 80.79095458984375, "rewards/rejected": -94.9968032836914, "step": 4590 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 3.272542485937369e-06, "logits/chosen": -0.31348443031311035, "logits/rejected": 0.8007584810256958, "logps/chosen": -546.2010498046875, "logps/rejected": -1243.5308837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.36667251586914, "rewards/margins": 70.93089294433594, "rewards/rejected": -89.29756927490234, "step": 4600 }, { "epoch": 0.461, "grad_norm": 0.0, "learning_rate": 3.2642382624077647e-06, "logits/chosen": -0.5122408270835876, "logits/rejected": 0.698052704334259, "logps/chosen": -435.46142578125, "logps/rejected": -1183.85986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.03237533569336, "rewards/margins": 63.591339111328125, "rewards/rejected": -79.62370300292969, "step": 4610 }, { "epoch": 0.462, "grad_norm": 6.614533741655918e-19, "learning_rate": 3.2559247268761117e-06, "logits/chosen": -0.789227306842804, "logits/rejected": 1.2054466009140015, "logps/chosen": -334.6946105957031, "logps/rejected": -1168.644775390625, "loss": 0.1734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.957490921020508, "rewards/margins": 64.99995422363281, "rewards/rejected": -84.95745086669922, "step": 4620 }, { "epoch": 0.463, "grad_norm": 0.0, "learning_rate": 3.247601980640217e-06, "logits/chosen": -0.46238309144973755, "logits/rejected": 0.8354955911636353, "logps/chosen": -530.7396850585938, "logps/rejected": -1429.529541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.647016525268555, "rewards/margins": 84.20267486572266, "rewards/rejected": -101.84969329833984, "step": 4630 }, { "epoch": 0.464, "grad_norm": 0.0, "learning_rate": 3.2392701251101172e-06, "logits/chosen": -0.7151135206222534, "logits/rejected": 0.6666629910469055, "logps/chosen": -438.103515625, "logps/rejected": -1222.823974609375, "loss": 0.0251, "rewards/accuracies": 1.0, "rewards/chosen": -12.228551864624023, "rewards/margins": 80.06420135498047, "rewards/rejected": -92.29276275634766, "step": 4640 }, { "epoch": 0.465, "grad_norm": 0.0, "learning_rate": 3.230929261806842e-06, "logits/chosen": -0.47254037857055664, "logits/rejected": 0.9429537057876587, "logps/chosen": -334.66949462890625, "logps/rejected": -1285.926025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.019699096679688, "rewards/margins": 82.53520202636719, "rewards/rejected": -101.55490112304688, "step": 4650 }, { "epoch": 0.466, "grad_norm": 0.0, "learning_rate": 3.222579492361179e-06, "logits/chosen": -0.31999364495277405, "logits/rejected": 1.0213924646377563, "logps/chosen": -595.5615234375, "logps/rejected": -1488.1405029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.56376838684082, "rewards/margins": 82.70274353027344, "rewards/rejected": -102.26651000976562, "step": 4660 }, { "epoch": 0.467, "grad_norm": 0.0, "learning_rate": 3.214220918512434e-06, "logits/chosen": -0.6097769737243652, "logits/rejected": 0.9890214800834656, "logps/chosen": -217.25765991210938, "logps/rejected": -1152.97607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.308805465698242, "rewards/margins": 80.55377960205078, "rewards/rejected": -92.86257934570312, "step": 4670 }, { "epoch": 0.468, "grad_norm": 0.0, "learning_rate": 3.205853642107192e-06, "logits/chosen": -0.7291964292526245, "logits/rejected": 0.6837356090545654, "logps/chosen": -446.716796875, "logps/rejected": -1542.17431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.69511604309082, "rewards/margins": 97.58460998535156, "rewards/rejected": -117.27970886230469, "step": 4680 }, { "epoch": 0.469, "grad_norm": 1.9307333374218513e-14, "learning_rate": 3.1974777650980737e-06, "logits/chosen": -0.882508397102356, "logits/rejected": 0.8033136129379272, "logps/chosen": -353.4367980957031, "logps/rejected": -1384.420654296875, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -14.956860542297363, "rewards/margins": 85.45812225341797, "rewards/rejected": -100.41497802734375, "step": 4690 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 3.189093389542498e-06, "logits/chosen": -0.5763040781021118, "logits/rejected": 1.2356733083724976, "logps/chosen": -531.9615478515625, "logps/rejected": -1474.219970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.269664764404297, "rewards/margins": 83.10777282714844, "rewards/rejected": -110.3774185180664, "step": 4700 }, { "epoch": 0.471, "grad_norm": 0.0, "learning_rate": 3.180700617601436e-06, "logits/chosen": -0.5389689207077026, "logits/rejected": 0.7355275750160217, "logps/chosen": -611.3015747070312, "logps/rejected": -1604.859619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.51515197753906, "rewards/margins": 84.79194641113281, "rewards/rejected": -122.3071060180664, "step": 4710 }, { "epoch": 0.472, "grad_norm": 5.714040351989524e-13, "learning_rate": 3.1722995515381644e-06, "logits/chosen": -0.3022814393043518, "logits/rejected": 1.2010146379470825, "logps/chosen": -409.2784423828125, "logps/rejected": -1302.8153076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.96738052368164, "rewards/margins": 79.1131362915039, "rewards/rejected": -98.08050537109375, "step": 4720 }, { "epoch": 0.473, "grad_norm": 1.995590777248055e-15, "learning_rate": 3.1638902937170224e-06, "logits/chosen": -0.5371532440185547, "logits/rejected": 1.079555630683899, "logps/chosen": -493.83477783203125, "logps/rejected": -1238.79150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.327775955200195, "rewards/margins": 65.20509338378906, "rewards/rejected": -85.53287506103516, "step": 4730 }, { "epoch": 0.474, "grad_norm": 0.0, "learning_rate": 3.155472946602162e-06, "logits/chosen": 0.05416768044233322, "logits/rejected": 0.6128655672073364, "logps/chosen": -610.4722900390625, "logps/rejected": -1401.691162109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.495906829833984, "rewards/margins": 79.49934387207031, "rewards/rejected": -112.99525451660156, "step": 4740 }, { "epoch": 0.475, "grad_norm": 0.0, "learning_rate": 3.147047612756302e-06, "logits/chosen": -0.7330142259597778, "logits/rejected": 1.4546329975128174, "logps/chosen": -310.0386657714844, "logps/rejected": -1607.7252197265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.920520782470703, "rewards/margins": 114.64827728271484, "rewards/rejected": -133.56878662109375, "step": 4750 }, { "epoch": 0.476, "grad_norm": 0.0, "learning_rate": 3.1386143948394764e-06, "logits/chosen": -1.1134151220321655, "logits/rejected": 1.2869594097137451, "logps/chosen": -354.29571533203125, "logps/rejected": -1476.500732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.353538513183594, "rewards/margins": 91.77238464355469, "rewards/rejected": -116.12593841552734, "step": 4760 }, { "epoch": 0.477, "grad_norm": 0.0, "learning_rate": 3.130173395607785e-06, "logits/chosen": -0.9510402679443359, "logits/rejected": 1.2608510255813599, "logps/chosen": -573.7791137695312, "logps/rejected": -1927.2662353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.785295486450195, "rewards/margins": 120.95957946777344, "rewards/rejected": -144.744873046875, "step": 4770 }, { "epoch": 0.478, "grad_norm": 0.0, "learning_rate": 3.121724717912138e-06, "logits/chosen": -0.5686971545219421, "logits/rejected": 1.2843066453933716, "logps/chosen": -423.05084228515625, "logps/rejected": -1424.4683837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.960468292236328, "rewards/margins": 82.43710327148438, "rewards/rejected": -108.3975830078125, "step": 4780 }, { "epoch": 0.479, "grad_norm": 0.0, "learning_rate": 3.1132684646970068e-06, "logits/chosen": -0.25530606508255005, "logits/rejected": 0.879818320274353, "logps/chosen": -614.8173217773438, "logps/rejected": -1260.456787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.6771183013916, "rewards/margins": 61.40053176879883, "rewards/rejected": -89.07764434814453, "step": 4790 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 3.1048047389991693e-06, "logits/chosen": -0.5389418601989746, "logits/rejected": 1.1465704441070557, "logps/chosen": -342.2206115722656, "logps/rejected": -1164.1463623046875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -18.74749755859375, "rewards/margins": 69.92078399658203, "rewards/rejected": -88.66828918457031, "step": 4800 }, { "epoch": 0.481, "grad_norm": 0.0, "learning_rate": 3.0963336439464527e-06, "logits/chosen": -1.0253163576126099, "logits/rejected": 0.9419560432434082, "logps/chosen": -423.51068115234375, "logps/rejected": -1376.6455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.159902572631836, "rewards/margins": 77.23731994628906, "rewards/rejected": -95.39723205566406, "step": 4810 }, { "epoch": 0.482, "grad_norm": 0.0, "learning_rate": 3.087855282756475e-06, "logits/chosen": -0.06365472078323364, "logits/rejected": 0.9617937803268433, "logps/chosen": -586.2677001953125, "logps/rejected": -1372.3779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.02089309692383, "rewards/margins": 73.61542510986328, "rewards/rejected": -106.63630676269531, "step": 4820 }, { "epoch": 0.483, "grad_norm": 2.512538492409947e-21, "learning_rate": 3.079369758735393e-06, "logits/chosen": -0.1162848025560379, "logits/rejected": 1.0244472026824951, "logps/chosen": -283.7181091308594, "logps/rejected": -922.8565673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.87861442565918, "rewards/margins": 58.47466278076172, "rewards/rejected": -74.35327911376953, "step": 4830 }, { "epoch": 0.484, "grad_norm": 2.2687768005437425e-13, "learning_rate": 3.0708771752766397e-06, "logits/chosen": -0.730124294757843, "logits/rejected": 1.3653849363327026, "logps/chosen": -306.2144470214844, "logps/rejected": -1348.135009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.26485824584961, "rewards/margins": 90.71617126464844, "rewards/rejected": -106.98104095458984, "step": 4840 }, { "epoch": 0.485, "grad_norm": 3.276530235132772e-18, "learning_rate": 3.062377635859663e-06, "logits/chosen": -0.5650082230567932, "logits/rejected": 0.9500142931938171, "logps/chosen": -571.1583862304688, "logps/rejected": -1898.913818359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.382946014404297, "rewards/margins": 123.2589111328125, "rewards/rejected": -147.641845703125, "step": 4850 }, { "epoch": 0.486, "grad_norm": 0.0, "learning_rate": 3.053871244048669e-06, "logits/chosen": -0.1732010543346405, "logits/rejected": 0.6027408242225647, "logps/chosen": -812.1961669921875, "logps/rejected": -1584.22314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.407480239868164, "rewards/margins": 88.35913848876953, "rewards/rejected": -112.7666015625, "step": 4860 }, { "epoch": 0.487, "grad_norm": 0.0, "learning_rate": 3.045358103491357e-06, "logits/chosen": -0.3686712086200714, "logits/rejected": 1.344543695449829, "logps/chosen": -415.87420654296875, "logps/rejected": -1310.935791015625, "loss": 0.1168, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -24.52159309387207, "rewards/margins": 77.90599060058594, "rewards/rejected": -102.42757415771484, "step": 4870 }, { "epoch": 0.488, "grad_norm": 0.0, "learning_rate": 3.0368383179176584e-06, "logits/chosen": -0.8000626564025879, "logits/rejected": 1.0974117517471313, "logps/chosen": -320.93768310546875, "logps/rejected": -1404.354736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.79820442199707, "rewards/margins": 94.18359375, "rewards/rejected": -115.98179626464844, "step": 4880 }, { "epoch": 0.489, "grad_norm": 0.0, "learning_rate": 3.0283119911384724e-06, "logits/chosen": -0.8725460171699524, "logits/rejected": 1.4595402479171753, "logps/chosen": -270.36187744140625, "logps/rejected": -1621.961669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.967273712158203, "rewards/margins": 112.51634216308594, "rewards/rejected": -129.48361206054688, "step": 4890 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 3.019779227044398e-06, "logits/chosen": -0.5901892781257629, "logits/rejected": 1.4911832809448242, "logps/chosen": -424.73095703125, "logps/rejected": -1390.6676025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.552160263061523, "rewards/margins": 86.83820343017578, "rewards/rejected": -109.39036560058594, "step": 4900 }, { "epoch": 0.491, "grad_norm": 0.0, "learning_rate": 3.0112401296044756e-06, "logits/chosen": -0.31066763401031494, "logits/rejected": 1.3882120847702026, "logps/chosen": -620.4592895507812, "logps/rejected": -1867.9720458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.121437072753906, "rewards/margins": 115.75643157958984, "rewards/rejected": -147.8778533935547, "step": 4910 }, { "epoch": 0.492, "grad_norm": 0.0, "learning_rate": 3.002694802864912e-06, "logits/chosen": -0.33419251441955566, "logits/rejected": 1.1376793384552002, "logps/chosen": -537.1612548828125, "logps/rejected": -1480.853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.045330047607422, "rewards/margins": 86.95547485351562, "rewards/rejected": -118.00080871582031, "step": 4920 }, { "epoch": 0.493, "grad_norm": 0.0, "learning_rate": 2.9941433509478157e-06, "logits/chosen": -0.39663586020469666, "logits/rejected": 1.4704856872558594, "logps/chosen": -459.516845703125, "logps/rejected": -1432.1171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.825571060180664, "rewards/margins": 90.44802856445312, "rewards/rejected": -114.27359771728516, "step": 4930 }, { "epoch": 0.494, "grad_norm": 0.0, "learning_rate": 2.98558587804993e-06, "logits/chosen": -0.7232345342636108, "logits/rejected": 1.2291861772537231, "logps/chosen": -293.2331237792969, "logps/rejected": -1354.670654296875, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -17.588459014892578, "rewards/margins": 89.76078796386719, "rewards/rejected": -107.3492431640625, "step": 4940 }, { "epoch": 0.495, "grad_norm": 0.0, "learning_rate": 2.9770224884413625e-06, "logits/chosen": -0.5555292963981628, "logits/rejected": 1.3182722330093384, "logps/chosen": -564.3573608398438, "logps/rejected": -2021.4749755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.60253143310547, "rewards/margins": 131.98068237304688, "rewards/rejected": -165.58322143554688, "step": 4950 }, { "epoch": 0.496, "grad_norm": 0.0, "learning_rate": 2.9684532864643123e-06, "logits/chosen": 0.037823986262083054, "logits/rejected": 1.2553424835205078, "logps/chosen": -715.589111328125, "logps/rejected": -1661.09765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -40.89625549316406, "rewards/margins": 89.43692779541016, "rewards/rejected": -130.33316040039062, "step": 4960 }, { "epoch": 0.497, "grad_norm": 0.0, "learning_rate": 2.9598783765318005e-06, "logits/chosen": -0.36400288343429565, "logits/rejected": 1.1979119777679443, "logps/chosen": -501.673583984375, "logps/rejected": -1469.694580078125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -27.255168914794922, "rewards/margins": 88.84143829345703, "rewards/rejected": -116.09659576416016, "step": 4970 }, { "epoch": 0.498, "grad_norm": 0.0, "learning_rate": 2.9512978631264006e-06, "logits/chosen": -0.011648990213871002, "logits/rejected": 1.1334331035614014, "logps/chosen": -927.916015625, "logps/rejected": -2148.081298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -45.16447067260742, "rewards/margins": 127.28688049316406, "rewards/rejected": -172.4513397216797, "step": 4980 }, { "epoch": 0.499, "grad_norm": 0.0, "learning_rate": 2.942711850798959e-06, "logits/chosen": -0.4471127986907959, "logits/rejected": 1.7075560092926025, "logps/chosen": -536.8842163085938, "logps/rejected": -1839.9417724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.184226989746094, "rewards/margins": 116.0263900756836, "rewards/rejected": -152.2106170654297, "step": 4990 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -0.24953731894493103, "logits/rejected": 1.3373390436172485, "logps/chosen": -591.9629516601562, "logps/rejected": -1578.786865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.433740615844727, "rewards/margins": 102.1045913696289, "rewards/rejected": -130.538330078125, "step": 5000 }, { "epoch": 0.501, "grad_norm": 1.4258596039984361e-11, "learning_rate": 2.9255237479150815e-06, "logits/chosen": -0.03982694074511528, "logits/rejected": 1.6737682819366455, "logps/chosen": -937.9222412109375, "logps/rejected": -2133.9873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -60.917579650878906, "rewards/margins": 112.63969421386719, "rewards/rejected": -173.55728149414062, "step": 5010 }, { "epoch": 0.502, "grad_norm": 0.0, "learning_rate": 2.9169218667902562e-06, "logits/chosen": -0.39063578844070435, "logits/rejected": 1.115846872329712, "logps/chosen": -773.4056396484375, "logps/rejected": -2109.5830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -41.8221321105957, "rewards/margins": 122.40623474121094, "rewards/rejected": -164.22836303710938, "step": 5020 }, { "epoch": 0.503, "grad_norm": 0.0, "learning_rate": 2.908314905604056e-06, "logits/chosen": -0.6067632436752319, "logits/rejected": 1.213060975074768, "logps/chosen": -385.57440185546875, "logps/rejected": -1478.2047119140625, "loss": 0.031, "rewards/accuracies": 1.0, "rewards/chosen": -21.669397354125977, "rewards/margins": 100.73478698730469, "rewards/rejected": -122.4041976928711, "step": 5030 }, { "epoch": 0.504, "grad_norm": 0.0, "learning_rate": 2.8997029692295875e-06, "logits/chosen": -0.15307000279426575, "logits/rejected": 1.7413864135742188, "logps/chosen": -553.080322265625, "logps/rejected": -1663.1246337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.34833526611328, "rewards/margins": 105.29701232910156, "rewards/rejected": -129.6453399658203, "step": 5040 }, { "epoch": 0.505, "grad_norm": 0.0, "learning_rate": 2.8910861626005774e-06, "logits/chosen": -0.13748657703399658, "logits/rejected": 1.384549617767334, "logps/chosen": -447.5091247558594, "logps/rejected": -1429.4466552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.847366333007812, "rewards/margins": 87.37092590332031, "rewards/rejected": -113.21829986572266, "step": 5050 }, { "epoch": 0.506, "grad_norm": 2.4198081518989056e-05, "learning_rate": 2.8824645907100957e-06, "logits/chosen": -0.11051235347986221, "logits/rejected": 0.8797086477279663, "logps/chosen": -620.237548828125, "logps/rejected": -1389.697021484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -25.004684448242188, "rewards/margins": 79.93074035644531, "rewards/rejected": -104.9354248046875, "step": 5060 }, { "epoch": 0.507, "grad_norm": 7.882509645007474e-10, "learning_rate": 2.8738383586092745e-06, "logits/chosen": 0.07782775163650513, "logits/rejected": 1.0023387670516968, "logps/chosen": -517.6043090820312, "logps/rejected": -1478.5277099609375, "loss": 0.3021, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -30.25027847290039, "rewards/margins": 93.49092102050781, "rewards/rejected": -123.7411880493164, "step": 5070 }, { "epoch": 0.508, "grad_norm": 6.824987797138249e-11, "learning_rate": 2.8652075714060296e-06, "logits/chosen": -1.0027350187301636, "logits/rejected": 0.8647255897521973, "logps/chosen": -311.21234130859375, "logps/rejected": -1197.444091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.196630477905273, "rewards/margins": 74.7915267944336, "rewards/rejected": -92.98814392089844, "step": 5080 }, { "epoch": 0.509, "grad_norm": 2.4030850490281355e-19, "learning_rate": 2.8565723342637797e-06, "logits/chosen": -0.7676445841789246, "logits/rejected": 0.6487428545951843, "logps/chosen": -506.98919677734375, "logps/rejected": -1381.9840087890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.320411682128906, "rewards/margins": 81.42391204833984, "rewards/rejected": -96.74433898925781, "step": 5090 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 2.847932752400164e-06, "logits/chosen": -0.817142128944397, "logits/rejected": 0.9467900991439819, "logps/chosen": -381.5802307128906, "logps/rejected": -1384.46142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.012475967407227, "rewards/margins": 84.2886962890625, "rewards/rejected": -103.30118560791016, "step": 5100 }, { "epoch": 0.511, "grad_norm": 0.0, "learning_rate": 2.8392889310857615e-06, "logits/chosen": -0.8176937103271484, "logits/rejected": 0.7632077932357788, "logps/chosen": -248.266357421875, "logps/rejected": -954.0487060546875, "loss": 0.0361, "rewards/accuracies": 1.0, "rewards/chosen": -8.92024040222168, "rewards/margins": 60.94586181640625, "rewards/rejected": -69.86610412597656, "step": 5110 }, { "epoch": 0.512, "grad_norm": 1.8101504364408225e-17, "learning_rate": 2.8306409756428067e-06, "logits/chosen": -0.6258947253227234, "logits/rejected": 0.6817538738250732, "logps/chosen": -364.7867126464844, "logps/rejected": -1302.393798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.83382225036621, "rewards/margins": 77.67640686035156, "rewards/rejected": -98.51023864746094, "step": 5120 }, { "epoch": 0.513, "grad_norm": 0.0, "learning_rate": 2.8219889914439073e-06, "logits/chosen": -1.2403560876846313, "logits/rejected": 0.8351043462753296, "logps/chosen": -279.90972900390625, "logps/rejected": -1319.77099609375, "loss": 0.1724, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.439977645874023, "rewards/margins": 79.86945343017578, "rewards/rejected": -92.30941772460938, "step": 5130 }, { "epoch": 0.514, "grad_norm": 8.39083418450239e-20, "learning_rate": 2.813333083910761e-06, "logits/chosen": -0.9665325284004211, "logits/rejected": 0.5111045241355896, "logps/chosen": -145.3264923095703, "logps/rejected": -1021.9627685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.8399481773376465, "rewards/margins": 69.44557189941406, "rewards/rejected": -76.2855224609375, "step": 5140 }, { "epoch": 0.515, "grad_norm": 0.0, "learning_rate": 2.804673358512869e-06, "logits/chosen": -0.45989829301834106, "logits/rejected": 0.41274410486221313, "logps/chosen": -561.388916015625, "logps/rejected": -1412.365234375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -20.812267303466797, "rewards/margins": 80.39027404785156, "rewards/rejected": -101.2025375366211, "step": 5150 }, { "epoch": 0.516, "grad_norm": 0.0, "learning_rate": 2.7960099207662535e-06, "logits/chosen": 0.02474859170615673, "logits/rejected": 0.44673410058021545, "logps/chosen": -635.7348022460938, "logps/rejected": -1145.3740234375, "loss": 0.9996, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -22.584806442260742, "rewards/margins": 56.94877243041992, "rewards/rejected": -79.53358459472656, "step": 5160 }, { "epoch": 0.517, "grad_norm": 0.0, "learning_rate": 2.7873428762321667e-06, "logits/chosen": -0.7794687151908875, "logits/rejected": 0.7619781494140625, "logps/chosen": -375.510986328125, "logps/rejected": -1299.2457275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.562225341796875, "rewards/margins": 83.94371032714844, "rewards/rejected": -101.50593566894531, "step": 5170 }, { "epoch": 0.518, "grad_norm": 0.0, "learning_rate": 2.778672330515814e-06, "logits/chosen": -0.6919055581092834, "logits/rejected": 0.6613712310791016, "logps/chosen": -451.2239685058594, "logps/rejected": -1180.0167236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.567163467407227, "rewards/margins": 60.3519172668457, "rewards/rejected": -82.91908264160156, "step": 5180 }, { "epoch": 0.519, "grad_norm": 0.0, "learning_rate": 2.769998389265057e-06, "logits/chosen": -0.21784038841724396, "logits/rejected": 0.3405402600765228, "logps/chosen": -757.1791381835938, "logps/rejected": -1300.2464599609375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -23.873455047607422, "rewards/margins": 60.86193084716797, "rewards/rejected": -84.7353744506836, "step": 5190 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 2.761321158169134e-06, "logits/chosen": -0.7988853454589844, "logits/rejected": 0.634006917476654, "logps/chosen": -276.1188049316406, "logps/rejected": -1188.635009765625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -14.00566291809082, "rewards/margins": 72.88716125488281, "rewards/rejected": -86.89281463623047, "step": 5200 }, { "epoch": 0.521, "grad_norm": 0.0, "learning_rate": 2.752640742957366e-06, "logits/chosen": -1.1302502155303955, "logits/rejected": 0.5086938738822937, "logps/chosen": -423.6642150878906, "logps/rejected": -1387.6676025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.596208572387695, "rewards/margins": 78.29840850830078, "rewards/rejected": -95.89461517333984, "step": 5210 }, { "epoch": 0.522, "grad_norm": 0.0, "learning_rate": 2.743957249397874e-06, "logits/chosen": -1.0586451292037964, "logits/rejected": 0.7228761315345764, "logps/chosen": -252.70962524414062, "logps/rejected": -1112.5032958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.525865077972412, "rewards/margins": 71.3156509399414, "rewards/rejected": -78.84151458740234, "step": 5220 }, { "epoch": 0.523, "grad_norm": 0.0, "learning_rate": 2.7352707832962865e-06, "logits/chosen": -0.5234454274177551, "logits/rejected": 0.3423479497432709, "logps/chosen": -307.30731201171875, "logps/rejected": -1012.4879760742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.181329727172852, "rewards/margins": 61.77275848388672, "rewards/rejected": -74.95409393310547, "step": 5230 }, { "epoch": 0.524, "grad_norm": 1.0060982570359205e-16, "learning_rate": 2.726581450494451e-06, "logits/chosen": -0.9201523065567017, "logits/rejected": 0.5635375380516052, "logps/chosen": -228.01455688476562, "logps/rejected": -932.2889404296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.266953468322754, "rewards/margins": 57.03753662109375, "rewards/rejected": -64.30448913574219, "step": 5240 }, { "epoch": 0.525, "grad_norm": 0.0, "learning_rate": 2.717889356869146e-06, "logits/chosen": -0.9244860410690308, "logits/rejected": 0.3733980357646942, "logps/chosen": -389.0986328125, "logps/rejected": -1089.4500732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.316126823425293, "rewards/margins": 57.2689323425293, "rewards/rejected": -69.58506774902344, "step": 5250 }, { "epoch": 0.526, "grad_norm": 0.0, "learning_rate": 2.70919460833079e-06, "logits/chosen": -1.0785424709320068, "logits/rejected": 0.5775918960571289, "logps/chosen": -389.9350891113281, "logps/rejected": -1264.129638671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.48707103729248, "rewards/margins": 67.63380432128906, "rewards/rejected": -81.1208724975586, "step": 5260 }, { "epoch": 0.527, "grad_norm": 7.135824320759349e-14, "learning_rate": 2.700497310822147e-06, "logits/chosen": -0.6644527316093445, "logits/rejected": 0.04605517536401749, "logps/chosen": -502.5401306152344, "logps/rejected": -885.4474487304688, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": -9.509904861450195, "rewards/margins": 48.577606201171875, "rewards/rejected": -58.0875129699707, "step": 5270 }, { "epoch": 0.528, "grad_norm": 0.0, "learning_rate": 2.6917975703170466e-06, "logits/chosen": -1.0135033130645752, "logits/rejected": 0.859094500541687, "logps/chosen": -378.8349304199219, "logps/rejected": -1455.221923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.651538848876953, "rewards/margins": 89.67914581298828, "rewards/rejected": -106.33067321777344, "step": 5280 }, { "epoch": 0.529, "grad_norm": 7.46895166230388e-05, "learning_rate": 2.6830954928190795e-06, "logits/chosen": -0.7778174877166748, "logits/rejected": 0.6204961538314819, "logps/chosen": -495.7356872558594, "logps/rejected": -1281.3050537109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.955945014953613, "rewards/margins": 75.40312957763672, "rewards/rejected": -90.35907745361328, "step": 5290 }, { "epoch": 0.53, "grad_norm": 1.8743726564813783e-18, "learning_rate": 2.6743911843603134e-06, "logits/chosen": -0.44399577379226685, "logits/rejected": 0.5348590612411499, "logps/chosen": -536.5358276367188, "logps/rejected": -1139.5767822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.537757873535156, "rewards/margins": 64.67687225341797, "rewards/rejected": -82.21463012695312, "step": 5300 }, { "epoch": 0.531, "grad_norm": 0.0, "learning_rate": 2.6656847510000013e-06, "logits/chosen": -1.0946407318115234, "logits/rejected": 0.8582640886306763, "logps/chosen": -297.0832824707031, "logps/rejected": -1163.2041015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.352200508117676, "rewards/margins": 73.29154968261719, "rewards/rejected": -86.64375305175781, "step": 5310 }, { "epoch": 0.532, "grad_norm": 0.0, "learning_rate": 2.6569762988232838e-06, "logits/chosen": -0.6373372077941895, "logits/rejected": 0.6762363314628601, "logps/chosen": -314.57879638671875, "logps/rejected": -1196.302978515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.09781265258789, "rewards/margins": 71.00390625, "rewards/rejected": -87.10172271728516, "step": 5320 }, { "epoch": 0.533, "grad_norm": 0.0, "learning_rate": 2.6482659339399047e-06, "logits/chosen": -0.6710726022720337, "logits/rejected": 0.7335922122001648, "logps/chosen": -502.332763671875, "logps/rejected": -1351.554931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.287044525146484, "rewards/margins": 76.317138671875, "rewards/rejected": -97.60417938232422, "step": 5330 }, { "epoch": 0.534, "grad_norm": 0.0, "learning_rate": 2.63955376248291e-06, "logits/chosen": -0.6070116758346558, "logits/rejected": 0.26677125692367554, "logps/chosen": -332.3114929199219, "logps/rejected": -861.4298706054688, "loss": 0.2173, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.19046688079834, "rewards/margins": 51.89105987548828, "rewards/rejected": -62.0815315246582, "step": 5340 }, { "epoch": 0.535, "grad_norm": 0.0, "learning_rate": 2.6308398906073603e-06, "logits/chosen": -0.7868450880050659, "logits/rejected": 0.4568649232387543, "logps/chosen": -312.804931640625, "logps/rejected": -983.685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.068277359008789, "rewards/margins": 56.0164909362793, "rewards/rejected": -67.08477020263672, "step": 5350 }, { "epoch": 0.536, "grad_norm": 0.0, "learning_rate": 2.6221244244890336e-06, "logits/chosen": -0.29687172174453735, "logits/rejected": 0.35419854521751404, "logps/chosen": -341.05596923828125, "logps/rejected": -851.9397583007812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -10.857192993164062, "rewards/margins": 52.79615020751953, "rewards/rejected": -63.653350830078125, "step": 5360 }, { "epoch": 0.537, "grad_norm": 0.0, "learning_rate": 2.613407470323134e-06, "logits/chosen": -0.5562046766281128, "logits/rejected": 0.6017817258834839, "logps/chosen": -576.7545776367188, "logps/rejected": -1128.655517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.32354736328125, "rewards/margins": 50.09172439575195, "rewards/rejected": -66.41526794433594, "step": 5370 }, { "epoch": 0.538, "grad_norm": 2.7073644212871053e-15, "learning_rate": 2.604689134322999e-06, "logits/chosen": -0.7671887278556824, "logits/rejected": 0.18423447012901306, "logps/chosen": -495.18218994140625, "logps/rejected": -990.7804565429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.015054702758789, "rewards/margins": 44.199180603027344, "rewards/rejected": -55.2142333984375, "step": 5380 }, { "epoch": 0.539, "grad_norm": 0.0, "learning_rate": 2.5959695227188e-06, "logits/chosen": -0.8589827418327332, "logits/rejected": 0.6271054148674011, "logps/chosen": -410.236572265625, "logps/rejected": -1313.5830078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.382259368896484, "rewards/margins": 74.65422058105469, "rewards/rejected": -92.0364761352539, "step": 5390 }, { "epoch": 0.54, "grad_norm": 9.483873873250559e-05, "learning_rate": 2.587248741756253e-06, "logits/chosen": -0.36108261346817017, "logits/rejected": 0.16383466124534607, "logps/chosen": -565.5712890625, "logps/rejected": -1062.492919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.465934753417969, "rewards/margins": 53.61336135864258, "rewards/rejected": -65.07929229736328, "step": 5400 }, { "epoch": 0.541, "grad_norm": 2.3672402471412723e-13, "learning_rate": 2.578526897695321e-06, "logits/chosen": -0.5892329216003418, "logits/rejected": 0.7394314408302307, "logps/chosen": -379.57183837890625, "logps/rejected": -951.5711669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.3280029296875, "rewards/margins": 51.3201904296875, "rewards/rejected": -65.648193359375, "step": 5410 }, { "epoch": 0.542, "grad_norm": 0.0, "learning_rate": 2.569804096808923e-06, "logits/chosen": -0.6709790229797363, "logits/rejected": 0.42003265023231506, "logps/chosen": -464.9453125, "logps/rejected": -999.1988525390625, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -9.656137466430664, "rewards/margins": 50.31475830078125, "rewards/rejected": -59.97089385986328, "step": 5420 }, { "epoch": 0.543, "grad_norm": 5.099034326824965e-14, "learning_rate": 2.5610804453816333e-06, "logits/chosen": -0.7774316072463989, "logits/rejected": 0.4784523844718933, "logps/chosen": -458.3973693847656, "logps/rejected": -1123.7091064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.050821304321289, "rewards/margins": 60.913475036621094, "rewards/rejected": -75.96430206298828, "step": 5430 }, { "epoch": 0.544, "grad_norm": 0.0, "learning_rate": 2.5523560497083927e-06, "logits/chosen": -0.5590888261795044, "logits/rejected": 0.5929467082023621, "logps/chosen": -469.6875915527344, "logps/rejected": -1505.582763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.421056747436523, "rewards/margins": 93.0894775390625, "rewards/rejected": -109.51053619384766, "step": 5440 }, { "epoch": 0.545, "grad_norm": 6.712893271306715e-18, "learning_rate": 2.543631016093209e-06, "logits/chosen": -0.6673922538757324, "logits/rejected": 0.9135753512382507, "logps/chosen": -501.02728271484375, "logps/rejected": -1274.625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -18.747516632080078, "rewards/margins": 70.78562927246094, "rewards/rejected": -89.53314971923828, "step": 5450 }, { "epoch": 0.546, "grad_norm": 0.0, "learning_rate": 2.5349054508478636e-06, "logits/chosen": -0.7824907898902893, "logits/rejected": 0.8109520077705383, "logps/chosen": -499.3113708496094, "logps/rejected": -1379.143798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.124048233032227, "rewards/margins": 87.3325424194336, "rewards/rejected": -106.45658874511719, "step": 5460 }, { "epoch": 0.547, "grad_norm": 4.859037795943949e-19, "learning_rate": 2.526179460290615e-06, "logits/chosen": -0.8277426958084106, "logits/rejected": 1.123626470565796, "logps/chosen": -290.3539733886719, "logps/rejected": -1319.580322265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.491305351257324, "rewards/margins": 89.3653335571289, "rewards/rejected": -104.85664367675781, "step": 5470 }, { "epoch": 0.548, "grad_norm": 0.16051889955997467, "learning_rate": 2.517453150744904e-06, "logits/chosen": -0.36050155758857727, "logits/rejected": 1.1666429042816162, "logps/chosen": -409.23846435546875, "logps/rejected": -1217.471923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.654441833496094, "rewards/margins": 78.6314468383789, "rewards/rejected": -94.28587341308594, "step": 5480 }, { "epoch": 0.549, "grad_norm": 1.2967491197527487e-22, "learning_rate": 2.5087266285380597e-06, "logits/chosen": -0.4092417359352112, "logits/rejected": 0.9499204754829407, "logps/chosen": -409.16278076171875, "logps/rejected": -1338.765869140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.687070846557617, "rewards/margins": 84.14730834960938, "rewards/rejected": -98.83438110351562, "step": 5490 }, { "epoch": 0.55, "grad_norm": 1.0176245092833531e-21, "learning_rate": 2.5e-06, "logits/chosen": -0.47044605016708374, "logits/rejected": 0.5489223599433899, "logps/chosen": -455.0294494628906, "logps/rejected": -1252.549072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.840413093566895, "rewards/margins": 80.813720703125, "rewards/rejected": -95.65412902832031, "step": 5500 }, { "epoch": 0.551, "grad_norm": 0.0, "learning_rate": 2.4912733714619415e-06, "logits/chosen": -1.1358522176742554, "logits/rejected": 0.8391151428222656, "logps/chosen": -336.40625, "logps/rejected": -1572.833251953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -13.205284118652344, "rewards/margins": 105.4991455078125, "rewards/rejected": -118.70442962646484, "step": 5510 }, { "epoch": 0.552, "grad_norm": 0.0, "learning_rate": 2.482546849255096e-06, "logits/chosen": -0.43424397706985474, "logits/rejected": 1.4264408349990845, "logps/chosen": -546.3912353515625, "logps/rejected": -1812.598388671875, "loss": 0.59, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -33.54061508178711, "rewards/margins": 116.97703552246094, "rewards/rejected": -150.51763916015625, "step": 5520 }, { "epoch": 0.553, "grad_norm": 0.0, "learning_rate": 2.4738205397093863e-06, "logits/chosen": 0.08370640128850937, "logits/rejected": 1.2765973806381226, "logps/chosen": -435.69342041015625, "logps/rejected": -1340.457275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.692970275878906, "rewards/margins": 87.23201751708984, "rewards/rejected": -110.92498779296875, "step": 5530 }, { "epoch": 0.554, "grad_norm": 0.0, "learning_rate": 2.4650945491521372e-06, "logits/chosen": -0.7669180035591125, "logits/rejected": 0.8787969350814819, "logps/chosen": -640.8975830078125, "logps/rejected": -1826.0234375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -28.16714096069336, "rewards/margins": 105.84178161621094, "rewards/rejected": -134.00892639160156, "step": 5540 }, { "epoch": 0.555, "grad_norm": 1.2562163765814094e-12, "learning_rate": 2.4563689839067913e-06, "logits/chosen": -0.3882649838924408, "logits/rejected": 0.9912908673286438, "logps/chosen": -418.70263671875, "logps/rejected": -1501.5875244140625, "loss": 0.1537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -23.708744049072266, "rewards/margins": 99.52375793457031, "rewards/rejected": -123.23250579833984, "step": 5550 }, { "epoch": 0.556, "grad_norm": 0.0, "learning_rate": 2.447643950291608e-06, "logits/chosen": -0.444000244140625, "logits/rejected": 1.3806891441345215, "logps/chosen": -306.3438415527344, "logps/rejected": -1433.14501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.685264587402344, "rewards/margins": 95.08927154541016, "rewards/rejected": -115.7745361328125, "step": 5560 }, { "epoch": 0.557, "grad_norm": 2.19682027375889e-18, "learning_rate": 2.4389195546183676e-06, "logits/chosen": -0.7899306416511536, "logits/rejected": 1.2382338047027588, "logps/chosen": -407.1981506347656, "logps/rejected": -1564.7291259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.620243072509766, "rewards/margins": 104.19229888916016, "rewards/rejected": -121.81254577636719, "step": 5570 }, { "epoch": 0.558, "grad_norm": 0.0, "learning_rate": 2.4301959031910785e-06, "logits/chosen": -0.41182246804237366, "logits/rejected": 1.5325813293457031, "logps/chosen": -453.8590393066406, "logps/rejected": -1610.720458984375, "loss": 0.0618, "rewards/accuracies": 1.0, "rewards/chosen": -25.81329917907715, "rewards/margins": 103.739990234375, "rewards/rejected": -129.55328369140625, "step": 5580 }, { "epoch": 0.559, "grad_norm": 0.0, "learning_rate": 2.4214731023046795e-06, "logits/chosen": -0.6320607662200928, "logits/rejected": 0.9761055707931519, "logps/chosen": -472.55517578125, "logps/rejected": -1443.104248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.019092559814453, "rewards/margins": 86.91942596435547, "rewards/rejected": -109.93852233886719, "step": 5590 }, { "epoch": 0.56, "grad_norm": 5.9338226318359375, "learning_rate": 2.4127512582437486e-06, "logits/chosen": -0.35688918828964233, "logits/rejected": 1.3163071870803833, "logps/chosen": -589.3690185546875, "logps/rejected": -1790.47265625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -34.09458541870117, "rewards/margins": 107.3012466430664, "rewards/rejected": -141.39584350585938, "step": 5600 }, { "epoch": 0.561, "grad_norm": 0.0, "learning_rate": 2.4040304772812002e-06, "logits/chosen": -0.5669654607772827, "logits/rejected": 0.7861363887786865, "logps/chosen": -357.8940734863281, "logps/rejected": -989.2740478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.1118221282959, "rewards/margins": 51.08353805541992, "rewards/rejected": -71.19535827636719, "step": 5610 }, { "epoch": 0.562, "grad_norm": 0.0, "learning_rate": 2.3953108656770018e-06, "logits/chosen": -0.6459758877754211, "logits/rejected": 0.6650265455245972, "logps/chosen": -463.2652282714844, "logps/rejected": -1128.4453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.661130905151367, "rewards/margins": 61.90986251831055, "rewards/rejected": -77.57099914550781, "step": 5620 }, { "epoch": 0.563, "grad_norm": 5.690169564331882e-05, "learning_rate": 2.3865925296768658e-06, "logits/chosen": -0.06742945313453674, "logits/rejected": 0.5442739725112915, "logps/chosen": -351.1674499511719, "logps/rejected": -1076.4241943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.4465274810791, "rewards/margins": 67.57926177978516, "rewards/rejected": -85.02578735351562, "step": 5630 }, { "epoch": 0.564, "grad_norm": 9.043148738013525e-18, "learning_rate": 2.377875575510967e-06, "logits/chosen": -0.6333662271499634, "logits/rejected": 0.7028582096099854, "logps/chosen": -295.8404846191406, "logps/rejected": -1131.8314208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.302228927612305, "rewards/margins": 71.73029327392578, "rewards/rejected": -83.03252410888672, "step": 5640 }, { "epoch": 0.565, "grad_norm": 2.721798864513403e-06, "learning_rate": 2.3691601093926406e-06, "logits/chosen": -0.7991350293159485, "logits/rejected": 0.430896133184433, "logps/chosen": -292.8297119140625, "logps/rejected": -885.7141723632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.243639945983887, "rewards/margins": 50.072479248046875, "rewards/rejected": -63.31612014770508, "step": 5650 }, { "epoch": 0.566, "grad_norm": 0.0, "learning_rate": 2.3604462375170905e-06, "logits/chosen": -0.8914593458175659, "logits/rejected": 0.5920891761779785, "logps/chosen": -471.0685119628906, "logps/rejected": -1146.7142333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.243207931518555, "rewards/margins": 60.8463134765625, "rewards/rejected": -75.08952331542969, "step": 5660 }, { "epoch": 0.567, "grad_norm": 0.0, "learning_rate": 2.3517340660600965e-06, "logits/chosen": -0.5996074676513672, "logits/rejected": 0.47150737047195435, "logps/chosen": -497.8310546875, "logps/rejected": -1160.7313232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.251384735107422, "rewards/margins": 62.97553634643555, "rewards/rejected": -79.22691345214844, "step": 5670 }, { "epoch": 0.568, "grad_norm": 0.0, "learning_rate": 2.3430237011767166e-06, "logits/chosen": -0.8915193676948547, "logits/rejected": 0.988507091999054, "logps/chosen": -258.1877746582031, "logps/rejected": -1035.653076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.907968521118164, "rewards/margins": 68.4066162109375, "rewards/rejected": -82.31459045410156, "step": 5680 }, { "epoch": 0.569, "grad_norm": 2.1966523043957533e-19, "learning_rate": 2.3343152490000004e-06, "logits/chosen": -0.6327385902404785, "logits/rejected": 0.5575627088546753, "logps/chosen": -420.05474853515625, "logps/rejected": -978.6383056640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.766349792480469, "rewards/margins": 51.23418426513672, "rewards/rejected": -65.00053405761719, "step": 5690 }, { "epoch": 0.57, "grad_norm": 153.5894012451172, "learning_rate": 2.325608815639687e-06, "logits/chosen": -0.7398956418037415, "logits/rejected": 0.3631681799888611, "logps/chosen": -445.54754638671875, "logps/rejected": -1215.155517578125, "loss": 0.0887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.032880783081055, "rewards/margins": 69.30997467041016, "rewards/rejected": -82.34285736083984, "step": 5700 }, { "epoch": 0.571, "grad_norm": 9.302183912041073e-09, "learning_rate": 2.3169045071809217e-06, "logits/chosen": -0.7493211627006531, "logits/rejected": 0.3405657708644867, "logps/chosen": -409.96435546875, "logps/rejected": -1141.1806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.728360176086426, "rewards/margins": 65.49519348144531, "rewards/rejected": -77.22355651855469, "step": 5710 }, { "epoch": 0.572, "grad_norm": 1.2539364888652926e-06, "learning_rate": 2.3082024296829538e-06, "logits/chosen": -0.5696662664413452, "logits/rejected": 0.18319830298423767, "logps/chosen": -483.90655517578125, "logps/rejected": -1233.9267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.041364669799805, "rewards/margins": 65.56819152832031, "rewards/rejected": -77.60954284667969, "step": 5720 }, { "epoch": 0.573, "grad_norm": 0.0, "learning_rate": 2.2995026891778533e-06, "logits/chosen": -0.6082257032394409, "logits/rejected": 0.2994880676269531, "logps/chosen": -288.6447448730469, "logps/rejected": -933.8486328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.254521369934082, "rewards/margins": 59.925621032714844, "rewards/rejected": -69.18013763427734, "step": 5730 }, { "epoch": 0.574, "grad_norm": 3.743392066509216e-23, "learning_rate": 2.290805391669212e-06, "logits/chosen": -1.0448625087738037, "logits/rejected": 0.6118916273117065, "logps/chosen": -322.68231201171875, "logps/rejected": -1163.752685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.992044448852539, "rewards/margins": 70.45680236816406, "rewards/rejected": -81.4488525390625, "step": 5740 }, { "epoch": 0.575, "grad_norm": 9.589562413097884e-17, "learning_rate": 2.2821106431308546e-06, "logits/chosen": -0.5972896814346313, "logits/rejected": -0.02794502303004265, "logps/chosen": -602.9471435546875, "logps/rejected": -1100.9271240234375, "loss": 0.1444, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.757568359375, "rewards/margins": 58.230491638183594, "rewards/rejected": -71.9880599975586, "step": 5750 }, { "epoch": 0.576, "grad_norm": 0.0, "learning_rate": 2.2734185495055503e-06, "logits/chosen": -0.9066916704177856, "logits/rejected": 0.5560423135757446, "logps/chosen": -319.40240478515625, "logps/rejected": -1094.9283447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.93016529083252, "rewards/margins": 62.95295333862305, "rewards/rejected": -74.88311004638672, "step": 5760 }, { "epoch": 0.577, "grad_norm": 3.373019552554979e-08, "learning_rate": 2.2647292167037143e-06, "logits/chosen": -0.981410801410675, "logits/rejected": 0.6274434328079224, "logps/chosen": -214.78018188476562, "logps/rejected": -852.8345947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.489265441894531, "rewards/margins": 49.72411346435547, "rewards/rejected": -59.21337890625, "step": 5770 }, { "epoch": 0.578, "grad_norm": 1.8989554303106886e-19, "learning_rate": 2.256042750602127e-06, "logits/chosen": -0.3405448794364929, "logits/rejected": 0.5417619943618774, "logps/chosen": -321.7869567871094, "logps/rejected": -918.9954833984375, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -9.773573875427246, "rewards/margins": 49.88673782348633, "rewards/rejected": -59.660308837890625, "step": 5780 }, { "epoch": 0.579, "grad_norm": 2.0045403156105073e-15, "learning_rate": 2.2473592570426343e-06, "logits/chosen": -0.8019183874130249, "logits/rejected": 0.17649047076702118, "logps/chosen": -328.5504455566406, "logps/rejected": -991.7091674804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.611047744750977, "rewards/margins": 56.48290252685547, "rewards/rejected": -70.09394836425781, "step": 5790 }, { "epoch": 0.58, "grad_norm": 1.5212917503504286e-07, "learning_rate": 2.238678841830867e-06, "logits/chosen": -0.969714343547821, "logits/rejected": 0.5986171364784241, "logps/chosen": -305.9940185546875, "logps/rejected": -1073.6085205078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.390681266784668, "rewards/margins": 63.28633499145508, "rewards/rejected": -76.67700958251953, "step": 5800 }, { "epoch": 0.581, "grad_norm": 0.0, "learning_rate": 2.230001610734943e-06, "logits/chosen": -0.4790991246700287, "logits/rejected": 0.2994881570339203, "logps/chosen": -428.73291015625, "logps/rejected": -1098.240478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.178543090820312, "rewards/margins": 66.76612091064453, "rewards/rejected": -80.94465637207031, "step": 5810 }, { "epoch": 0.582, "grad_norm": 0.0, "learning_rate": 2.2213276694841866e-06, "logits/chosen": -0.9775009155273438, "logits/rejected": 0.6576396226882935, "logps/chosen": -301.51812744140625, "logps/rejected": -1096.399169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.869229316711426, "rewards/margins": 61.690216064453125, "rewards/rejected": -76.5594482421875, "step": 5820 }, { "epoch": 0.583, "grad_norm": 0.0, "learning_rate": 2.212657123767834e-06, "logits/chosen": -0.31216496229171753, "logits/rejected": 0.3422687351703644, "logps/chosen": -367.2973937988281, "logps/rejected": -886.2732543945312, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -16.941743850708008, "rewards/margins": 53.467002868652344, "rewards/rejected": -70.40875244140625, "step": 5830 }, { "epoch": 0.584, "grad_norm": 9.648000883725073e-15, "learning_rate": 2.2039900792337477e-06, "logits/chosen": -0.6409394145011902, "logits/rejected": 0.2590024769306183, "logps/chosen": -540.4464111328125, "logps/rejected": -1060.7490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.835675239562988, "rewards/margins": 53.53327178955078, "rewards/rejected": -64.36894226074219, "step": 5840 }, { "epoch": 0.585, "grad_norm": 0.0, "learning_rate": 2.195326641487132e-06, "logits/chosen": -0.343078076839447, "logits/rejected": 0.4754057824611664, "logps/chosen": -322.4148864746094, "logps/rejected": -1100.409423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.861885070800781, "rewards/margins": 70.69842529296875, "rewards/rejected": -83.56031799316406, "step": 5850 }, { "epoch": 0.586, "grad_norm": 2.893859733358337e-18, "learning_rate": 2.186666916089239e-06, "logits/chosen": -0.8028178215026855, "logits/rejected": 0.560520350933075, "logps/chosen": -427.0816345214844, "logps/rejected": -1150.7421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.319122314453125, "rewards/margins": 59.44419479370117, "rewards/rejected": -76.76332092285156, "step": 5860 }, { "epoch": 0.587, "grad_norm": 1.4526141574322526e-11, "learning_rate": 2.1780110085560935e-06, "logits/chosen": -0.47322821617126465, "logits/rejected": 0.35947781801223755, "logps/chosen": -366.51885986328125, "logps/rejected": -961.4563598632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.354814529418945, "rewards/margins": 61.39038848876953, "rewards/rejected": -71.74519348144531, "step": 5870 }, { "epoch": 0.588, "grad_norm": 0.0, "learning_rate": 2.1693590243571937e-06, "logits/chosen": -0.7379357218742371, "logits/rejected": 0.541722297668457, "logps/chosen": -333.3455810546875, "logps/rejected": -952.2578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.706256866455078, "rewards/margins": 52.303131103515625, "rewards/rejected": -65.00939178466797, "step": 5880 }, { "epoch": 0.589, "grad_norm": 2.424898097498228e-16, "learning_rate": 2.1607110689142393e-06, "logits/chosen": -0.7496501207351685, "logits/rejected": 0.1588805913925171, "logps/chosen": -266.98760986328125, "logps/rejected": -715.7047119140625, "loss": 0.1048, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.766366958618164, "rewards/margins": 40.335838317871094, "rewards/rejected": -51.10220718383789, "step": 5890 }, { "epoch": 0.59, "grad_norm": 1.9521148207028022e-14, "learning_rate": 2.1520672475998374e-06, "logits/chosen": -0.46376457810401917, "logits/rejected": 0.2426864206790924, "logps/chosen": -598.9649047851562, "logps/rejected": -1084.9559326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.458829879760742, "rewards/margins": 56.31378936767578, "rewards/rejected": -73.77261352539062, "step": 5900 }, { "epoch": 0.591, "grad_norm": 0.0, "learning_rate": 2.143427665736221e-06, "logits/chosen": -1.3513346910476685, "logits/rejected": 0.6467947363853455, "logps/chosen": -178.78578186035156, "logps/rejected": -986.7268676757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.031607627868652, "rewards/margins": 62.00954055786133, "rewards/rejected": -71.04115295410156, "step": 5910 }, { "epoch": 0.592, "grad_norm": 0.0, "learning_rate": 2.134792428593971e-06, "logits/chosen": -0.9142158627510071, "logits/rejected": 0.142775759100914, "logps/chosen": -418.8714294433594, "logps/rejected": -1166.554931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.114670753479004, "rewards/margins": 68.04464721679688, "rewards/rejected": -78.15931701660156, "step": 5920 }, { "epoch": 0.593, "grad_norm": 4.327770424878341e-15, "learning_rate": 2.1261616413907267e-06, "logits/chosen": -0.7585387825965881, "logits/rejected": 0.018490100279450417, "logps/chosen": -398.62091064453125, "logps/rejected": -756.716552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.56114387512207, "rewards/margins": 40.72282409667969, "rewards/rejected": -51.283966064453125, "step": 5930 }, { "epoch": 0.594, "grad_norm": 0.29131418466567993, "learning_rate": 2.117535409289905e-06, "logits/chosen": -0.8369476199150085, "logits/rejected": 0.48800697922706604, "logps/chosen": -322.8172302246094, "logps/rejected": -1040.8592529296875, "loss": 0.1809, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.917299270629883, "rewards/margins": 56.43064498901367, "rewards/rejected": -71.34794616699219, "step": 5940 }, { "epoch": 0.595, "grad_norm": 6.851609413160986e-08, "learning_rate": 2.1089138373994226e-06, "logits/chosen": -0.4474567770957947, "logits/rejected": -0.12609001994132996, "logps/chosen": -425.7982482910156, "logps/rejected": -696.9923706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.972140312194824, "rewards/margins": 30.715478897094727, "rewards/rejected": -43.6876220703125, "step": 5950 }, { "epoch": 0.596, "grad_norm": 3.2781477784737945e-05, "learning_rate": 2.1002970307704134e-06, "logits/chosen": -0.7989672422409058, "logits/rejected": 0.2822956442832947, "logps/chosen": -343.94207763671875, "logps/rejected": -1046.2325439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.518392562866211, "rewards/margins": 62.300148010253906, "rewards/rejected": -74.81852722167969, "step": 5960 }, { "epoch": 0.597, "grad_norm": 0.0, "learning_rate": 2.0916850943959453e-06, "logits/chosen": -0.9798108339309692, "logits/rejected": 0.031129514798521996, "logps/chosen": -349.68011474609375, "logps/rejected": -926.5896606445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.097526550292969, "rewards/margins": 52.179046630859375, "rewards/rejected": -62.27656936645508, "step": 5970 }, { "epoch": 0.598, "grad_norm": 2.6305224309908226e-06, "learning_rate": 2.0830781332097446e-06, "logits/chosen": -1.1097701787948608, "logits/rejected": 0.24544647336006165, "logps/chosen": -307.0548095703125, "logps/rejected": -812.1460571289062, "loss": 0.2396, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.199498176574707, "rewards/margins": 41.038108825683594, "rewards/rejected": -50.23760223388672, "step": 5980 }, { "epoch": 0.599, "grad_norm": 3.837552151053636e-11, "learning_rate": 2.0744762520849193e-06, "logits/chosen": -0.8115105628967285, "logits/rejected": 0.38198933005332947, "logps/chosen": -315.76873779296875, "logps/rejected": -777.8378295898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.973611354827881, "rewards/margins": 42.35847854614258, "rewards/rejected": -50.33209228515625, "step": 5990 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 2.0658795558326745e-06, "logits/chosen": -1.053982138633728, "logits/rejected": 0.04489628225564957, "logps/chosen": -420.40087890625, "logps/rejected": -1016.2745971679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.601455688476562, "rewards/margins": 55.01713180541992, "rewards/rejected": -66.61858367919922, "step": 6000 }, { "epoch": 0.601, "grad_norm": 0.0, "learning_rate": 2.0572881492010423e-06, "logits/chosen": -0.9520200490951538, "logits/rejected": 0.4393930435180664, "logps/chosen": -258.37884521484375, "logps/rejected": -915.1923828125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -8.035299301147461, "rewards/margins": 54.08031463623047, "rewards/rejected": -62.115623474121094, "step": 6010 }, { "epoch": 0.602, "grad_norm": 0.0, "learning_rate": 2.0487021368736002e-06, "logits/chosen": -0.727254331111908, "logits/rejected": 0.055363964289426804, "logps/chosen": -608.6199951171875, "logps/rejected": -1106.275146484375, "loss": 1.1745, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.362531661987305, "rewards/margins": 57.39153289794922, "rewards/rejected": -71.75406646728516, "step": 6020 }, { "epoch": 0.603, "grad_norm": 8.775121294461786e-21, "learning_rate": 2.0401216234682e-06, "logits/chosen": -1.1106388568878174, "logits/rejected": 0.28149712085723877, "logps/chosen": -506.61077880859375, "logps/rejected": -1085.811279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.73906135559082, "rewards/margins": 53.52830123901367, "rewards/rejected": -64.26737213134766, "step": 6030 }, { "epoch": 0.604, "grad_norm": 0.0, "learning_rate": 2.031546713535688e-06, "logits/chosen": -0.8970220685005188, "logits/rejected": 0.5340319275856018, "logps/chosen": -187.3031463623047, "logps/rejected": -888.166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.198158264160156, "rewards/margins": 56.56108856201172, "rewards/rejected": -65.7592544555664, "step": 6040 }, { "epoch": 0.605, "grad_norm": 0.0, "learning_rate": 2.022977511558638e-06, "logits/chosen": -0.49822598695755005, "logits/rejected": -0.051455218344926834, "logps/chosen": -539.9725341796875, "logps/rejected": -912.2399291992188, "loss": 0.0528, "rewards/accuracies": 1.0, "rewards/chosen": -12.603445053100586, "rewards/margins": 41.82299041748047, "rewards/rejected": -54.42643356323242, "step": 6050 }, { "epoch": 0.606, "grad_norm": 3.051496014185517e-16, "learning_rate": 2.0144141219500707e-06, "logits/chosen": -0.5907710790634155, "logits/rejected": 0.0812007263302803, "logps/chosen": -668.8988037109375, "logps/rejected": -1045.5540771484375, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -14.167119979858398, "rewards/margins": 42.041015625, "rewards/rejected": -56.20813751220703, "step": 6060 }, { "epoch": 0.607, "grad_norm": 5.917444961412447e-18, "learning_rate": 2.0058566490521848e-06, "logits/chosen": -0.28065013885498047, "logits/rejected": 0.347043514251709, "logps/chosen": -434.7425842285156, "logps/rejected": -957.3238525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.05737018585205, "rewards/margins": 50.82740020751953, "rewards/rejected": -64.88477325439453, "step": 6070 }, { "epoch": 0.608, "grad_norm": 0.0, "learning_rate": 1.997305197135089e-06, "logits/chosen": -0.7074576616287231, "logits/rejected": 0.348089337348938, "logps/chosen": -308.53753662109375, "logps/rejected": -1001.8761596679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.252487182617188, "rewards/margins": 67.42102813720703, "rewards/rejected": -80.67351531982422, "step": 6080 }, { "epoch": 0.609, "grad_norm": 35.3226432800293, "learning_rate": 1.9887598703955244e-06, "logits/chosen": -0.26775437593460083, "logits/rejected": 0.3113982081413269, "logps/chosen": -366.8798522949219, "logps/rejected": -877.09228515625, "loss": 0.2139, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -19.909229278564453, "rewards/margins": 44.00845718383789, "rewards/rejected": -63.917686462402344, "step": 6090 }, { "epoch": 0.61, "grad_norm": 1.7081393386554248e-19, "learning_rate": 1.9802207729556023e-06, "logits/chosen": -0.3729914128780365, "logits/rejected": 0.7869149446487427, "logps/chosen": -341.86688232421875, "logps/rejected": -942.552734375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -19.829532623291016, "rewards/margins": 51.61570358276367, "rewards/rejected": -71.44523620605469, "step": 6100 }, { "epoch": 0.611, "grad_norm": 0.0, "learning_rate": 1.971688008861529e-06, "logits/chosen": -0.7285288572311401, "logits/rejected": 0.8685183525085449, "logps/chosen": -676.9864501953125, "logps/rejected": -1683.438232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.139759063720703, "rewards/margins": 93.76480865478516, "rewards/rejected": -122.9045639038086, "step": 6110 }, { "epoch": 0.612, "grad_norm": 0.0, "learning_rate": 1.963161682082342e-06, "logits/chosen": -0.40115728974342346, "logits/rejected": 0.9319968223571777, "logps/chosen": -672.72900390625, "logps/rejected": -1428.954833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.64020347595215, "rewards/margins": 77.25892639160156, "rewards/rejected": -105.8991470336914, "step": 6120 }, { "epoch": 0.613, "grad_norm": 5.300155225440156e-18, "learning_rate": 1.9546418965086444e-06, "logits/chosen": -0.3054092228412628, "logits/rejected": 0.3358805775642395, "logps/chosen": -673.1195068359375, "logps/rejected": -1441.669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.96059226989746, "rewards/margins": 78.94953918457031, "rewards/rejected": -106.9101333618164, "step": 6130 }, { "epoch": 0.614, "grad_norm": 0.0, "learning_rate": 1.946128755951332e-06, "logits/chosen": -0.7575784921646118, "logits/rejected": 0.9253544807434082, "logps/chosen": -581.2315063476562, "logps/rejected": -1685.402587890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.97461700439453, "rewards/margins": 97.02174377441406, "rewards/rejected": -129.9963836669922, "step": 6140 }, { "epoch": 0.615, "grad_norm": 0.0, "learning_rate": 1.937622364140338e-06, "logits/chosen": -0.3287307918071747, "logits/rejected": 0.8551700711250305, "logps/chosen": -698.66015625, "logps/rejected": -1677.6588134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.919662475585938, "rewards/margins": 97.02134704589844, "rewards/rejected": -125.94100189208984, "step": 6150 }, { "epoch": 0.616, "grad_norm": 0.0, "learning_rate": 1.9291228247233607e-06, "logits/chosen": -0.10450273752212524, "logits/rejected": 0.12423954159021378, "logps/chosen": -672.6328125, "logps/rejected": -1056.39404296875, "loss": 1.1588, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -29.697463989257812, "rewards/margins": 49.33916091918945, "rewards/rejected": -79.03662109375, "step": 6160 }, { "epoch": 0.617, "grad_norm": 8.932964556152001e-05, "learning_rate": 1.9206302412646074e-06, "logits/chosen": -0.8520253896713257, "logits/rejected": 0.530659019947052, "logps/chosen": -543.1141967773438, "logps/rejected": -1338.0950927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.23099136352539, "rewards/margins": 75.07130432128906, "rewards/rejected": -98.30229187011719, "step": 6170 }, { "epoch": 0.618, "grad_norm": 0.0, "learning_rate": 1.912144717243525e-06, "logits/chosen": -0.46287283301353455, "logits/rejected": 0.8936434984207153, "logps/chosen": -479.36773681640625, "logps/rejected": -1329.708251953125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -18.504770278930664, "rewards/margins": 79.99925231933594, "rewards/rejected": -98.50402069091797, "step": 6180 }, { "epoch": 0.619, "grad_norm": 1153.3616943359375, "learning_rate": 1.9036663560535484e-06, "logits/chosen": -0.21779179573059082, "logits/rejected": 0.7919279336929321, "logps/chosen": -481.73870849609375, "logps/rejected": -1110.656494140625, "loss": 1.2391, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -28.207775115966797, "rewards/margins": 58.3253173828125, "rewards/rejected": -86.53309631347656, "step": 6190 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 1.895195261000831e-06, "logits/chosen": -0.6195026636123657, "logits/rejected": 0.8219470977783203, "logps/chosen": -394.10955810546875, "logps/rejected": -1159.2943115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.551332473754883, "rewards/margins": 62.34423065185547, "rewards/rejected": -82.89556121826172, "step": 6200 }, { "epoch": 0.621, "grad_norm": 1122.32861328125, "learning_rate": 1.8867315353029937e-06, "logits/chosen": -0.8880168199539185, "logits/rejected": 1.2520583868026733, "logps/chosen": -328.0982360839844, "logps/rejected": -1303.611572265625, "loss": 0.3969, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.13575553894043, "rewards/margins": 83.49183654785156, "rewards/rejected": -100.62757873535156, "step": 6210 }, { "epoch": 0.622, "grad_norm": 1.964397094899521e-15, "learning_rate": 1.8782752820878636e-06, "logits/chosen": -0.16132107377052307, "logits/rejected": 0.4336255192756653, "logps/chosen": -609.0550537109375, "logps/rejected": -1116.940673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.98457145690918, "rewards/margins": 60.426025390625, "rewards/rejected": -77.41058349609375, "step": 6220 }, { "epoch": 0.623, "grad_norm": 0.0, "learning_rate": 1.8698266043922159e-06, "logits/chosen": -0.5896113514900208, "logits/rejected": 1.0377198457717896, "logps/chosen": -258.1077575683594, "logps/rejected": -1181.6407470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.312314987182617, "rewards/margins": 80.14900207519531, "rewards/rejected": -93.46131896972656, "step": 6230 }, { "epoch": 0.624, "grad_norm": 88.24200439453125, "learning_rate": 1.8613856051605242e-06, "logits/chosen": -0.3807418942451477, "logits/rejected": 0.5020023584365845, "logps/chosen": -382.841552734375, "logps/rejected": -858.4488525390625, "loss": 0.037, "rewards/accuracies": 1.0, "rewards/chosen": -15.80988597869873, "rewards/margins": 45.08965301513672, "rewards/rejected": -60.8995361328125, "step": 6240 }, { "epoch": 0.625, "grad_norm": 0.0, "learning_rate": 1.852952387243698e-06, "logits/chosen": -0.6610077023506165, "logits/rejected": 0.6249849200248718, "logps/chosen": -316.33349609375, "logps/rejected": -1102.541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.481925964355469, "rewards/margins": 68.57567596435547, "rewards/rejected": -81.05760192871094, "step": 6250 }, { "epoch": 0.626, "grad_norm": 0.0, "learning_rate": 1.8445270533978387e-06, "logits/chosen": -1.2662389278411865, "logits/rejected": 0.8339168429374695, "logps/chosen": -370.40118408203125, "logps/rejected": -1378.05859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.953084945678711, "rewards/margins": 80.45343780517578, "rewards/rejected": -92.4065170288086, "step": 6260 }, { "epoch": 0.627, "grad_norm": 0.0, "learning_rate": 1.836109706282978e-06, "logits/chosen": -0.8804155588150024, "logits/rejected": 0.7382038235664368, "logps/chosen": -221.7541961669922, "logps/rejected": -1014.7086181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.514819145202637, "rewards/margins": 65.65678405761719, "rewards/rejected": -77.1716079711914, "step": 6270 }, { "epoch": 0.628, "grad_norm": 6.518362027918556e-08, "learning_rate": 1.827700448461836e-06, "logits/chosen": -0.8720572590827942, "logits/rejected": 0.5231325626373291, "logps/chosen": -294.4517517089844, "logps/rejected": -1028.52685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.470726013183594, "rewards/margins": 63.9245491027832, "rewards/rejected": -76.39527893066406, "step": 6280 }, { "epoch": 0.629, "grad_norm": 2.3427173562252272e-18, "learning_rate": 1.8192993823985643e-06, "logits/chosen": -0.9169327020645142, "logits/rejected": 0.49997347593307495, "logps/chosen": -261.94268798828125, "logps/rejected": -907.6803588867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.600554466247559, "rewards/margins": 53.24702072143555, "rewards/rejected": -64.84757995605469, "step": 6290 }, { "epoch": 0.63, "grad_norm": 1.7803756079109385e-21, "learning_rate": 1.8109066104575023e-06, "logits/chosen": -0.5586016178131104, "logits/rejected": 0.49877291917800903, "logps/chosen": -348.76483154296875, "logps/rejected": -890.5277099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.734954833984375, "rewards/margins": 52.39110565185547, "rewards/rejected": -65.12606048583984, "step": 6300 }, { "epoch": 0.631, "grad_norm": 8.788940242254739e-09, "learning_rate": 1.8025222349019273e-06, "logits/chosen": -0.5049250721931458, "logits/rejected": 0.47886282205581665, "logps/chosen": -478.8617248535156, "logps/rejected": -1211.92578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.844632148742676, "rewards/margins": 73.86564636230469, "rewards/rejected": -88.71028137207031, "step": 6310 }, { "epoch": 0.632, "grad_norm": 0.0, "learning_rate": 1.7941463578928088e-06, "logits/chosen": -0.4538189470767975, "logits/rejected": 0.46021947264671326, "logps/chosen": -531.1541748046875, "logps/rejected": -1060.1229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.083524703979492, "rewards/margins": 55.92713165283203, "rewards/rejected": -74.01065826416016, "step": 6320 }, { "epoch": 0.633, "grad_norm": 0.0, "learning_rate": 1.7857790814875665e-06, "logits/chosen": -0.7689892053604126, "logits/rejected": 0.6113948225975037, "logps/chosen": -419.75775146484375, "logps/rejected": -1273.57421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.010692596435547, "rewards/margins": 71.44483947753906, "rewards/rejected": -90.45552825927734, "step": 6330 }, { "epoch": 0.634, "grad_norm": 0.0, "learning_rate": 1.7774205076388207e-06, "logits/chosen": -0.762971043586731, "logits/rejected": 0.8176964521408081, "logps/chosen": -239.3108673095703, "logps/rejected": -915.2222900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.808510780334473, "rewards/margins": 54.695709228515625, "rewards/rejected": -66.50421905517578, "step": 6340 }, { "epoch": 0.635, "grad_norm": 1.2885712408206018e-07, "learning_rate": 1.7690707381931585e-06, "logits/chosen": -1.4763569831848145, "logits/rejected": 0.7502704858779907, "logps/chosen": -228.9136962890625, "logps/rejected": -1045.082275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.950324058532715, "rewards/margins": 59.335121154785156, "rewards/rejected": -69.28544616699219, "step": 6350 }, { "epoch": 0.636, "grad_norm": 0.0, "learning_rate": 1.7607298748898844e-06, "logits/chosen": -0.7422378659248352, "logits/rejected": 0.5283973217010498, "logps/chosen": -269.3170166015625, "logps/rejected": -842.0760498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.552061080932617, "rewards/margins": 47.43568801879883, "rewards/rejected": -62.98775100708008, "step": 6360 }, { "epoch": 0.637, "grad_norm": 3.602982634465235e-10, "learning_rate": 1.7523980193597837e-06, "logits/chosen": -0.6213805079460144, "logits/rejected": 0.7540527582168579, "logps/chosen": -347.17779541015625, "logps/rejected": -1026.532958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.154919624328613, "rewards/margins": 59.262908935546875, "rewards/rejected": -71.41783142089844, "step": 6370 }, { "epoch": 0.638, "grad_norm": 0.0022318889386951923, "learning_rate": 1.744075273123889e-06, "logits/chosen": -0.31907743215560913, "logits/rejected": 0.8481258153915405, "logps/chosen": -642.7422485351562, "logps/rejected": -1179.421875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -20.963634490966797, "rewards/margins": 53.08282470703125, "rewards/rejected": -74.04646301269531, "step": 6380 }, { "epoch": 0.639, "grad_norm": 9.515637247343306e-14, "learning_rate": 1.735761737592236e-06, "logits/chosen": -0.6827441453933716, "logits/rejected": 0.2889486253261566, "logps/chosen": -539.0543823242188, "logps/rejected": -1096.6243896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.704416275024414, "rewards/margins": 55.89228439331055, "rewards/rejected": -73.59669494628906, "step": 6390 }, { "epoch": 0.64, "grad_norm": 0.37831369042396545, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -0.37383323907852173, "logits/rejected": 0.634491503238678, "logps/chosen": -344.0820007324219, "logps/rejected": -1041.222900390625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -11.306551933288574, "rewards/margins": 65.38855743408203, "rewards/rejected": -76.69511413574219, "step": 6400 }, { "epoch": 0.641, "grad_norm": 1.802431224022169e-13, "learning_rate": 1.7191627037194187e-06, "logits/chosen": -0.9682666063308716, "logits/rejected": 0.46219348907470703, "logps/chosen": -399.88909912109375, "logps/rejected": -1011.9031982421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.483869552612305, "rewards/margins": 52.6249885559082, "rewards/rejected": -66.10885620117188, "step": 6410 }, { "epoch": 0.642, "grad_norm": 4.1179455407668736e-15, "learning_rate": 1.7108774076322443e-06, "logits/chosen": -0.9478727579116821, "logits/rejected": 0.4891994893550873, "logps/chosen": -316.1852111816406, "logps/rejected": -1141.15185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.764043807983398, "rewards/margins": 72.55058288574219, "rewards/rejected": -83.31462097167969, "step": 6420 }, { "epoch": 0.643, "grad_norm": 575.969482421875, "learning_rate": 1.702601726754825e-06, "logits/chosen": -0.7723701000213623, "logits/rejected": 0.13531820476055145, "logps/chosen": -432.0208435058594, "logps/rejected": -1143.351318359375, "loss": 1.7103, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -18.985231399536133, "rewards/margins": 61.39850997924805, "rewards/rejected": -80.38373565673828, "step": 6430 }, { "epoch": 0.644, "grad_norm": 0.0, "learning_rate": 1.6943357619237227e-06, "logits/chosen": -1.2825745344161987, "logits/rejected": 0.7488567233085632, "logps/chosen": -296.5671081542969, "logps/rejected": -1154.734619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.1475248336792, "rewards/margins": 60.58649826049805, "rewards/rejected": -73.73402404785156, "step": 6440 }, { "epoch": 0.645, "grad_norm": 0.0, "learning_rate": 1.686079613857109e-06, "logits/chosen": -0.5981461405754089, "logits/rejected": 0.599422037601471, "logps/chosen": -424.4049377441406, "logps/rejected": -1025.369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.99231243133545, "rewards/margins": 57.72393035888672, "rewards/rejected": -72.71624755859375, "step": 6450 }, { "epoch": 0.646, "grad_norm": 0.0, "learning_rate": 1.677833383153542e-06, "logits/chosen": -0.33491963148117065, "logits/rejected": 0.2924351096153259, "logps/chosen": -494.74298095703125, "logps/rejected": -979.0546875, "loss": 1.7072, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.02706527709961, "rewards/margins": 47.13035202026367, "rewards/rejected": -64.15740966796875, "step": 6460 }, { "epoch": 0.647, "grad_norm": 0.0, "learning_rate": 1.6695971702907425e-06, "logits/chosen": -0.5291346311569214, "logits/rejected": 0.04112546145915985, "logps/chosen": -389.43548583984375, "logps/rejected": -861.4542846679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.868743896484375, "rewards/margins": 45.284690856933594, "rewards/rejected": -61.1534309387207, "step": 6470 }, { "epoch": 0.648, "grad_norm": 3.206112458853383e-11, "learning_rate": 1.661371075624363e-06, "logits/chosen": -0.820167064666748, "logits/rejected": 0.22958044707775116, "logps/chosen": -268.8128967285156, "logps/rejected": -816.1143188476562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.350539207458496, "rewards/margins": 41.11225128173828, "rewards/rejected": -51.462791442871094, "step": 6480 }, { "epoch": 0.649, "grad_norm": 0.0, "learning_rate": 1.6531551993867717e-06, "logits/chosen": -1.0266730785369873, "logits/rejected": 0.31012818217277527, "logps/chosen": -227.1880340576172, "logps/rejected": -852.8770751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.547816276550293, "rewards/margins": 48.10416793823242, "rewards/rejected": -56.65198516845703, "step": 6490 }, { "epoch": 0.65, "grad_norm": 9.873515782635005e-15, "learning_rate": 1.6449496416858285e-06, "logits/chosen": -1.2130448818206787, "logits/rejected": 0.2985571026802063, "logps/chosen": -328.6929626464844, "logps/rejected": -840.7061767578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.523885726928711, "rewards/margins": 38.90607833862305, "rewards/rejected": -48.429962158203125, "step": 6500 }, { "epoch": 0.651, "grad_norm": 0.0, "learning_rate": 1.6367545025036634e-06, "logits/chosen": -1.709118127822876, "logits/rejected": 0.4337089955806732, "logps/chosen": -177.02171325683594, "logps/rejected": -972.3427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.665238380432129, "rewards/margins": 57.431732177734375, "rewards/rejected": -67.09696960449219, "step": 6510 }, { "epoch": 0.652, "grad_norm": 9.39294147491455, "learning_rate": 1.6285698816954626e-06, "logits/chosen": -0.7160784006118774, "logits/rejected": 0.2711246609687805, "logps/chosen": -312.7411804199219, "logps/rejected": -796.49658203125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -11.27977180480957, "rewards/margins": 36.427284240722656, "rewards/rejected": -47.707054138183594, "step": 6520 }, { "epoch": 0.653, "grad_norm": 5.293955920339377e-23, "learning_rate": 1.6203958789882457e-06, "logits/chosen": -0.5094423294067383, "logits/rejected": 0.31378093361854553, "logps/chosen": -440.42694091796875, "logps/rejected": -1072.122802734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.908445358276367, "rewards/margins": 57.47730255126953, "rewards/rejected": -71.38574981689453, "step": 6530 }, { "epoch": 0.654, "grad_norm": 1.2480342009412587e-13, "learning_rate": 1.612232593979658e-06, "logits/chosen": -0.5389014482498169, "logits/rejected": 0.4068358540534973, "logps/chosen": -410.4971618652344, "logps/rejected": -1073.8428955078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.45536994934082, "rewards/margins": 61.50476837158203, "rewards/rejected": -75.96012878417969, "step": 6540 }, { "epoch": 0.655, "grad_norm": 0.0, "learning_rate": 1.6040801261367494e-06, "logits/chosen": -0.9499231576919556, "logits/rejected": 0.3402765691280365, "logps/chosen": -211.19735717773438, "logps/rejected": -858.4918823242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.90208911895752, "rewards/margins": 53.628746032714844, "rewards/rejected": -62.53083419799805, "step": 6550 }, { "epoch": 0.656, "grad_norm": 3.516597623349375e-21, "learning_rate": 1.5959385747947697e-06, "logits/chosen": -1.0181282758712769, "logits/rejected": 0.5531474351882935, "logps/chosen": -224.2174835205078, "logps/rejected": -1108.608154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.008251190185547, "rewards/margins": 72.00736236572266, "rewards/rejected": -80.01561737060547, "step": 6560 }, { "epoch": 0.657, "grad_norm": 5.693732811618002e-12, "learning_rate": 1.5878080391559507e-06, "logits/chosen": -0.7998801469802856, "logits/rejected": 0.24991516768932343, "logps/chosen": -464.068359375, "logps/rejected": -992.8370361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.504290580749512, "rewards/margins": 48.961421966552734, "rewards/rejected": -62.46571731567383, "step": 6570 }, { "epoch": 0.658, "grad_norm": 3.429549756263497e-19, "learning_rate": 1.5796886182883053e-06, "logits/chosen": -0.6511567831039429, "logits/rejected": 0.4151083827018738, "logps/chosen": -389.8343811035156, "logps/rejected": -1196.2972412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.969945907592773, "rewards/margins": 70.79234313964844, "rewards/rejected": -84.76228332519531, "step": 6580 }, { "epoch": 0.659, "grad_norm": 2.676518306543585e-05, "learning_rate": 1.5715804111244138e-06, "logits/chosen": -1.3965880870819092, "logits/rejected": 0.4531164765357971, "logps/chosen": -166.05572509765625, "logps/rejected": -980.0531005859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.495244026184082, "rewards/margins": 50.93138885498047, "rewards/rejected": -59.42664337158203, "step": 6590 }, { "epoch": 0.66, "grad_norm": 4.1763072999856377e-08, "learning_rate": 1.56348351646022e-06, "logits/chosen": -0.7084104418754578, "logits/rejected": 0.251697301864624, "logps/chosen": -414.48553466796875, "logps/rejected": -966.2922973632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.428361892700195, "rewards/margins": 49.03688430786133, "rewards/rejected": -59.465248107910156, "step": 6600 }, { "epoch": 0.661, "grad_norm": 6.320921164903152e-17, "learning_rate": 1.5553980329538326e-06, "logits/chosen": -0.8995717763900757, "logits/rejected": 0.3077837824821472, "logps/chosen": -229.172607421875, "logps/rejected": -759.6439208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.536977767944336, "rewards/margins": 41.12507247924805, "rewards/rejected": -52.66204833984375, "step": 6610 }, { "epoch": 0.662, "grad_norm": 1.0289334488600144e-11, "learning_rate": 1.547324059124315e-06, "logits/chosen": -0.8084653615951538, "logits/rejected": 0.15149430930614471, "logps/chosen": -355.364013671875, "logps/rejected": -922.4557495117188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.91248607635498, "rewards/margins": 51.052677154541016, "rewards/rejected": -64.96516418457031, "step": 6620 }, { "epoch": 0.663, "grad_norm": 0.0, "learning_rate": 1.539261693350491e-06, "logits/chosen": -0.8480122685432434, "logits/rejected": 0.20509465038776398, "logps/chosen": -326.6919250488281, "logps/rejected": -920.38330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.641644477844238, "rewards/margins": 53.87500762939453, "rewards/rejected": -64.51664733886719, "step": 6630 }, { "epoch": 0.664, "grad_norm": 2.461848956910131e-15, "learning_rate": 1.5312110338697427e-06, "logits/chosen": -0.5872770547866821, "logits/rejected": 0.07159560918807983, "logps/chosen": -389.7482604980469, "logps/rejected": -811.3320922851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.820398330688477, "rewards/margins": 40.80659866333008, "rewards/rejected": -53.626991271972656, "step": 6640 }, { "epoch": 0.665, "grad_norm": 0.0, "learning_rate": 1.5231721787768162e-06, "logits/chosen": -0.7103424072265625, "logits/rejected": 0.08447039872407913, "logps/chosen": -422.61962890625, "logps/rejected": -948.6730346679688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.737173080444336, "rewards/margins": 47.96685028076172, "rewards/rejected": -63.70402145385742, "step": 6650 }, { "epoch": 0.666, "grad_norm": 0.0, "learning_rate": 1.5151452260226224e-06, "logits/chosen": -0.9289888143539429, "logits/rejected": 0.18887189030647278, "logps/chosen": -377.4011535644531, "logps/rejected": -859.4797973632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.963301658630371, "rewards/margins": 46.39201354980469, "rewards/rejected": -54.355316162109375, "step": 6660 }, { "epoch": 0.667, "grad_norm": 0.0, "learning_rate": 1.5071302734130488e-06, "logits/chosen": -0.9944952726364136, "logits/rejected": 0.5345765352249146, "logps/chosen": -303.9234619140625, "logps/rejected": -991.6798706054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.570622444152832, "rewards/margins": 58.970664978027344, "rewards/rejected": -66.54129028320312, "step": 6670 }, { "epoch": 0.668, "grad_norm": 0.0, "learning_rate": 1.4991274186077632e-06, "logits/chosen": -1.0211021900177002, "logits/rejected": 0.1890941858291626, "logps/chosen": -625.1935424804688, "logps/rejected": -1045.278564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.653399467468262, "rewards/margins": 45.128929138183594, "rewards/rejected": -57.78232955932617, "step": 6680 }, { "epoch": 0.669, "grad_norm": 0.0, "learning_rate": 1.491136759119025e-06, "logits/chosen": -1.015205979347229, "logits/rejected": 0.25742340087890625, "logps/chosen": -416.8662109375, "logps/rejected": -1021.6949462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.015602111816406, "rewards/margins": 52.00904083251953, "rewards/rejected": -63.02463912963867, "step": 6690 }, { "epoch": 0.67, "grad_norm": 1.739340937945144e-08, "learning_rate": 1.4831583923104997e-06, "logits/chosen": -0.38597649335861206, "logits/rejected": 0.28067824244499207, "logps/chosen": -511.48779296875, "logps/rejected": -923.99560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.102460861206055, "rewards/margins": 43.810855865478516, "rewards/rejected": -57.9133186340332, "step": 6700 }, { "epoch": 0.671, "grad_norm": 5.359721928721332e-22, "learning_rate": 1.4751924153960681e-06, "logits/chosen": -0.5853025317192078, "logits/rejected": 0.1794218271970749, "logps/chosen": -587.2357177734375, "logps/rejected": -1080.541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.21432876586914, "rewards/margins": 53.93608856201172, "rewards/rejected": -70.15042114257812, "step": 6710 }, { "epoch": 0.672, "grad_norm": 0.0, "learning_rate": 1.467238925438646e-06, "logits/chosen": -1.0530178546905518, "logits/rejected": 0.48365315794944763, "logps/chosen": -196.48049926757812, "logps/rejected": -989.3134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.558575630187988, "rewards/margins": 60.487815856933594, "rewards/rejected": -70.04638671875, "step": 6720 }, { "epoch": 0.673, "grad_norm": 1.5804560438148485e-10, "learning_rate": 1.4592980193489975e-06, "logits/chosen": -0.4733239710330963, "logits/rejected": 0.3845910429954529, "logps/chosen": -347.839599609375, "logps/rejected": -834.8303833007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.455202102661133, "rewards/margins": 39.814720153808594, "rewards/rejected": -49.26992416381836, "step": 6730 }, { "epoch": 0.674, "grad_norm": 2.8186614800618866e-16, "learning_rate": 1.4513697938845571e-06, "logits/chosen": -1.0593464374542236, "logits/rejected": 0.3638271391391754, "logps/chosen": -243.1426544189453, "logps/rejected": -907.37255859375, "loss": 0.0217, "rewards/accuracies": 1.0, "rewards/chosen": -10.289741516113281, "rewards/margins": 49.817100524902344, "rewards/rejected": -60.106834411621094, "step": 6740 }, { "epoch": 0.675, "grad_norm": 2.792390985106863e-09, "learning_rate": 1.443454345648252e-06, "logits/chosen": -0.5483334064483643, "logits/rejected": 0.18954530358314514, "logps/chosen": -512.3756713867188, "logps/rejected": -916.8468627929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.741385459899902, "rewards/margins": 44.06202697753906, "rewards/rejected": -55.80341720581055, "step": 6750 }, { "epoch": 0.676, "grad_norm": 0.0, "learning_rate": 1.4355517710873184e-06, "logits/chosen": -1.1610043048858643, "logits/rejected": 0.23131489753723145, "logps/chosen": -362.11016845703125, "logps/rejected": -1002.7393798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.117647171020508, "rewards/margins": 51.933372497558594, "rewards/rejected": -64.051025390625, "step": 6760 }, { "epoch": 0.677, "grad_norm": 0.0, "learning_rate": 1.4276621664921358e-06, "logits/chosen": -1.1171985864639282, "logits/rejected": 0.08405411243438721, "logps/chosen": -434.3948669433594, "logps/rejected": -875.7274169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.467716217041016, "rewards/margins": 38.010677337646484, "rewards/rejected": -54.4783935546875, "step": 6770 }, { "epoch": 0.678, "grad_norm": 1.0053239233383422e-15, "learning_rate": 1.419785627995044e-06, "logits/chosen": -0.7967459559440613, "logits/rejected": 0.46557193994522095, "logps/chosen": -384.51470947265625, "logps/rejected": -1023.1734619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.550901412963867, "rewards/margins": 46.259498596191406, "rewards/rejected": -60.81040573120117, "step": 6780 }, { "epoch": 0.679, "grad_norm": 3.572149283217892e-18, "learning_rate": 1.4119222515691817e-06, "logits/chosen": -0.819512665271759, "logits/rejected": 0.2668699324131012, "logps/chosen": -322.56097412109375, "logps/rejected": -923.2853393554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.771075248718262, "rewards/margins": 44.104061126708984, "rewards/rejected": -55.8751335144043, "step": 6790 }, { "epoch": 0.68, "grad_norm": 0.00208302098326385, "learning_rate": 1.4040721330273063e-06, "logits/chosen": -0.5939651727676392, "logits/rejected": -0.12722672522068024, "logps/chosen": -430.8482971191406, "logps/rejected": -837.2296752929688, "loss": 0.1543, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -16.117908477783203, "rewards/margins": 43.303470611572266, "rewards/rejected": -59.42137908935547, "step": 6800 }, { "epoch": 0.681, "grad_norm": 1.484518757921549e-11, "learning_rate": 1.3962353680206372e-06, "logits/chosen": -0.5131450295448303, "logits/rejected": -0.08215949684381485, "logps/chosen": -521.3753662109375, "logps/rejected": -982.7488403320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.535924911499023, "rewards/margins": 44.76746368408203, "rewards/rejected": -62.30338668823242, "step": 6810 }, { "epoch": 0.682, "grad_norm": 0.0, "learning_rate": 1.388412052037682e-06, "logits/chosen": -0.686394989490509, "logits/rejected": 0.25932493805885315, "logps/chosen": -337.4996032714844, "logps/rejected": -921.4371948242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.139752388000488, "rewards/margins": 45.83702850341797, "rewards/rejected": -58.976783752441406, "step": 6820 }, { "epoch": 0.683, "grad_norm": 0.0, "learning_rate": 1.380602280403076e-06, "logits/chosen": -0.6851155161857605, "logits/rejected": 0.1985010951757431, "logps/chosen": -406.23663330078125, "logps/rejected": -868.2559814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.177343368530273, "rewards/margins": 48.88190841674805, "rewards/rejected": -59.05924606323242, "step": 6830 }, { "epoch": 0.684, "grad_norm": 0.0, "learning_rate": 1.3728061482764238e-06, "logits/chosen": -0.675434947013855, "logits/rejected": 0.35549676418304443, "logps/chosen": -417.695556640625, "logps/rejected": -1046.248779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.115216255187988, "rewards/margins": 50.35161209106445, "rewards/rejected": -64.46682739257812, "step": 6840 }, { "epoch": 0.685, "grad_norm": 1.7023156495543645e-10, "learning_rate": 1.3650237506511333e-06, "logits/chosen": -0.9004403948783875, "logits/rejected": 0.2808682322502136, "logps/chosen": -341.0504455566406, "logps/rejected": -935.5436401367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.120309829711914, "rewards/margins": 53.2804069519043, "rewards/rejected": -62.40071487426758, "step": 6850 }, { "epoch": 0.686, "grad_norm": 0.0, "learning_rate": 1.3572551823532654e-06, "logits/chosen": -0.49469512701034546, "logits/rejected": 0.2047506868839264, "logps/chosen": -517.6826171875, "logps/rejected": -975.38232421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.508434295654297, "rewards/margins": 47.37997055053711, "rewards/rejected": -64.88841247558594, "step": 6860 }, { "epoch": 0.687, "grad_norm": 2.4329527314898353e-19, "learning_rate": 1.349500538040371e-06, "logits/chosen": -0.7579712271690369, "logits/rejected": 0.23987862467765808, "logps/chosen": -303.2074890136719, "logps/rejected": -856.4743041992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.821787357330322, "rewards/margins": 51.322776794433594, "rewards/rejected": -59.144569396972656, "step": 6870 }, { "epoch": 0.688, "grad_norm": 0.0, "learning_rate": 1.3417599122003464e-06, "logits/chosen": -0.7172698378562927, "logits/rejected": 0.41875559091567993, "logps/chosen": -434.141845703125, "logps/rejected": -1052.8280029296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -10.568477630615234, "rewards/margins": 57.83243942260742, "rewards/rejected": -68.40091705322266, "step": 6880 }, { "epoch": 0.689, "grad_norm": 0.003907814156264067, "learning_rate": 1.3340333991502723e-06, "logits/chosen": -0.24279102683067322, "logits/rejected": 0.33490872383117676, "logps/chosen": -311.4708251953125, "logps/rejected": -821.8943481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.199905395507812, "rewards/margins": 48.722801208496094, "rewards/rejected": -62.922706604003906, "step": 6890 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 1.3263210930352737e-06, "logits/chosen": -0.8461467623710632, "logits/rejected": 0.21782417595386505, "logps/chosen": -332.932373046875, "logps/rejected": -1089.2677001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.197572708129883, "rewards/margins": 68.2454605102539, "rewards/rejected": -77.44303894042969, "step": 6900 }, { "epoch": 0.691, "grad_norm": 0.0, "learning_rate": 1.3186230878273654e-06, "logits/chosen": -1.0278829336166382, "logits/rejected": 0.48746466636657715, "logps/chosen": -175.5904541015625, "logps/rejected": -934.48046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.513206481933594, "rewards/margins": 59.157508850097656, "rewards/rejected": -67.67072296142578, "step": 6910 }, { "epoch": 0.692, "grad_norm": 0.0, "learning_rate": 1.3109394773243117e-06, "logits/chosen": -0.322140634059906, "logits/rejected": 0.26277634501457214, "logps/chosen": -565.261962890625, "logps/rejected": -993.6104736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.960165023803711, "rewards/margins": 47.8547248840332, "rewards/rejected": -59.81489181518555, "step": 6920 }, { "epoch": 0.693, "grad_norm": 0.0, "learning_rate": 1.3032703551484832e-06, "logits/chosen": -0.6050577759742737, "logits/rejected": 0.24701687693595886, "logps/chosen": -320.1561584472656, "logps/rejected": -811.4998779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.287206649780273, "rewards/margins": 47.75560760498047, "rewards/rejected": -57.042808532714844, "step": 6930 }, { "epoch": 0.694, "grad_norm": 4.693359187513124e-06, "learning_rate": 1.2956158147457116e-06, "logits/chosen": -0.7445470094680786, "logits/rejected": 0.4302369952201843, "logps/chosen": -271.83868408203125, "logps/rejected": -832.82177734375, "loss": 0.1174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.641332626342773, "rewards/margins": 47.020240783691406, "rewards/rejected": -56.66157150268555, "step": 6940 }, { "epoch": 0.695, "grad_norm": 5.191566856410645e-07, "learning_rate": 1.2879759493841577e-06, "logits/chosen": -1.0439097881317139, "logits/rejected": 0.6121958494186401, "logps/chosen": -200.1259307861328, "logps/rejected": -844.1336059570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.755642890930176, "rewards/margins": 49.18115997314453, "rewards/rejected": -59.936805725097656, "step": 6950 }, { "epoch": 0.696, "grad_norm": 2.7079682496378155e-08, "learning_rate": 1.280350852153168e-06, "logits/chosen": -0.7683631181716919, "logits/rejected": 0.4195129871368408, "logps/chosen": -410.979736328125, "logps/rejected": -1058.7357177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.451654434204102, "rewards/margins": 56.14714813232422, "rewards/rejected": -67.59880065917969, "step": 6960 }, { "epoch": 0.697, "grad_norm": 0.0, "learning_rate": 1.272740615962148e-06, "logits/chosen": -0.3984186351299286, "logits/rejected": 0.3945949673652649, "logps/chosen": -437.71600341796875, "logps/rejected": -1098.1243896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.609410285949707, "rewards/margins": 65.10707092285156, "rewards/rejected": -79.71647644042969, "step": 6970 }, { "epoch": 0.698, "grad_norm": 1.6409067843652786e-14, "learning_rate": 1.2651453335394232e-06, "logits/chosen": -0.6048688888549805, "logits/rejected": 0.12000073492527008, "logps/chosen": -758.637939453125, "logps/rejected": -1092.565673828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.610742568969727, "rewards/margins": 44.76811599731445, "rewards/rejected": -58.37885665893555, "step": 6980 }, { "epoch": 0.699, "grad_norm": 0.0, "learning_rate": 1.2575650974311118e-06, "logits/chosen": -1.0600610971450806, "logits/rejected": 0.607258141040802, "logps/chosen": -189.1847686767578, "logps/rejected": -864.9268798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.889070510864258, "rewards/margins": 51.53076171875, "rewards/rejected": -61.419837951660156, "step": 6990 }, { "epoch": 0.7, "grad_norm": 0.0004449795524124056, "learning_rate": 1.2500000000000007e-06, "logits/chosen": -0.7913299798965454, "logits/rejected": 0.18162783980369568, "logps/chosen": -394.7737121582031, "logps/rejected": -997.1735229492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.63129997253418, "rewards/margins": 58.36121368408203, "rewards/rejected": -70.99250793457031, "step": 7000 }, { "epoch": 0.701, "grad_norm": 2.3053121911420504e-12, "learning_rate": 1.2424501334244124e-06, "logits/chosen": -0.3424440026283264, "logits/rejected": 0.23116068542003632, "logps/chosen": -497.0257873535156, "logps/rejected": -1031.158447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.78152084350586, "rewards/margins": 51.52674102783203, "rewards/rejected": -70.30825805664062, "step": 7010 }, { "epoch": 0.702, "grad_norm": 0.08845698833465576, "learning_rate": 1.234915589697091e-06, "logits/chosen": -0.4873952269554138, "logits/rejected": 0.16965332627296448, "logps/chosen": -439.7547912597656, "logps/rejected": -884.2762451171875, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -14.531588554382324, "rewards/margins": 38.97401809692383, "rewards/rejected": -53.5056037902832, "step": 7020 }, { "epoch": 0.703, "grad_norm": 0.0, "learning_rate": 1.2273964606240718e-06, "logits/chosen": -1.1222232580184937, "logits/rejected": 0.5668286085128784, "logps/chosen": -311.9913635253906, "logps/rejected": -963.0960083007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.711811065673828, "rewards/margins": 50.60163497924805, "rewards/rejected": -61.313446044921875, "step": 7030 }, { "epoch": 0.704, "grad_norm": 0.0, "learning_rate": 1.2198928378235717e-06, "logits/chosen": -0.9157301187515259, "logits/rejected": 0.8059916496276855, "logps/chosen": -273.55029296875, "logps/rejected": -903.5457763671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.198125839233398, "rewards/margins": 51.39958572387695, "rewards/rejected": -59.59770965576172, "step": 7040 }, { "epoch": 0.705, "grad_norm": 9.500559383255822e-14, "learning_rate": 1.2124048127248644e-06, "logits/chosen": -0.3811189830303192, "logits/rejected": 0.7395201921463013, "logps/chosen": -226.52603149414062, "logps/rejected": -791.3262329101562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.798089981079102, "rewards/margins": 49.31720733642578, "rewards/rejected": -58.11529541015625, "step": 7050 }, { "epoch": 0.706, "grad_norm": 1.8333838884395179e-16, "learning_rate": 1.204932476567175e-06, "logits/chosen": -0.2587122917175293, "logits/rejected": 0.3872125744819641, "logps/chosen": -462.1097717285156, "logps/rejected": -999.1061401367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.840702056884766, "rewards/margins": 56.51240158081055, "rewards/rejected": -76.35310363769531, "step": 7060 }, { "epoch": 0.707, "grad_norm": 0.0, "learning_rate": 1.19747592039856e-06, "logits/chosen": -0.7832753658294678, "logits/rejected": 0.3581964075565338, "logps/chosen": -441.05780029296875, "logps/rejected": -1143.239990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.54179573059082, "rewards/margins": 65.3556137084961, "rewards/rejected": -76.89740753173828, "step": 7070 }, { "epoch": 0.708, "grad_norm": 9.741703427059047e-17, "learning_rate": 1.1900352350748026e-06, "logits/chosen": -0.4372076094150543, "logits/rejected": 0.41365084052085876, "logps/chosen": -311.98260498046875, "logps/rejected": -779.3128662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.149206161499023, "rewards/margins": 42.368614196777344, "rewards/rejected": -55.51781463623047, "step": 7080 }, { "epoch": 0.709, "grad_norm": 7.48366429589864e-13, "learning_rate": 1.1826105112583061e-06, "logits/chosen": -0.7754701972007751, "logits/rejected": 0.3482648730278015, "logps/chosen": -442.53985595703125, "logps/rejected": -1012.1441650390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -9.848672866821289, "rewards/margins": 50.82096862792969, "rewards/rejected": -60.66963577270508, "step": 7090 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 1.1752018394169882e-06, "logits/chosen": -0.7517408132553101, "logits/rejected": 0.5923580527305603, "logps/chosen": -251.5076904296875, "logps/rejected": -1010.1513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.077350616455078, "rewards/margins": 63.697349548339844, "rewards/rejected": -75.77470397949219, "step": 7100 }, { "epoch": 0.711, "grad_norm": 2.079006303960973e-17, "learning_rate": 1.1678093098231748e-06, "logits/chosen": -0.6140622496604919, "logits/rejected": 0.6913038492202759, "logps/chosen": -387.5259704589844, "logps/rejected": -1004.0714111328125, "loss": 0.5563, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.7683744430542, "rewards/margins": 54.26259231567383, "rewards/rejected": -69.03096771240234, "step": 7110 }, { "epoch": 0.712, "grad_norm": 0.0, "learning_rate": 1.160433012552508e-06, "logits/chosen": -0.9175033569335938, "logits/rejected": 0.3144712746143341, "logps/chosen": -406.3265686035156, "logps/rejected": -1089.0546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.071568489074707, "rewards/margins": 62.89362716674805, "rewards/rejected": -70.96519470214844, "step": 7120 }, { "epoch": 0.713, "grad_norm": 0.0, "learning_rate": 1.1530730374828424e-06, "logits/chosen": -0.9688884615898132, "logits/rejected": 0.6511700749397278, "logps/chosen": -416.6983947753906, "logps/rejected": -1149.14794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.32402229309082, "rewards/margins": 55.059349060058594, "rewards/rejected": -67.38337707519531, "step": 7130 }, { "epoch": 0.714, "grad_norm": 0.0, "learning_rate": 1.1457294742931508e-06, "logits/chosen": -0.810795783996582, "logits/rejected": -0.004343023989349604, "logps/chosen": -714.1817016601562, "logps/rejected": -1107.7537841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.2621431350708, "rewards/margins": 53.681312561035156, "rewards/rejected": -67.94345092773438, "step": 7140 }, { "epoch": 0.715, "grad_norm": 8.800793781812907e-21, "learning_rate": 1.1384024124624324e-06, "logits/chosen": -1.0921555757522583, "logits/rejected": 0.08647129684686661, "logps/chosen": -353.8780212402344, "logps/rejected": -922.1368408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.05639934539795, "rewards/margins": 46.51305389404297, "rewards/rejected": -55.56945037841797, "step": 7150 }, { "epoch": 0.716, "grad_norm": 1.2604188919067383, "learning_rate": 1.1310919412686248e-06, "logits/chosen": -0.9001690149307251, "logits/rejected": 0.18475279211997986, "logps/chosen": -418.95562744140625, "logps/rejected": -1113.604248046875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -13.656079292297363, "rewards/margins": 58.632850646972656, "rewards/rejected": -72.28893280029297, "step": 7160 }, { "epoch": 0.717, "grad_norm": 0.11071855574846268, "learning_rate": 1.1237981497875112e-06, "logits/chosen": -0.7475544810295105, "logits/rejected": 0.2684534192085266, "logps/chosen": -382.6466064453125, "logps/rejected": -833.0397338867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.405658721923828, "rewards/margins": 41.787742614746094, "rewards/rejected": -53.19340133666992, "step": 7170 }, { "epoch": 0.718, "grad_norm": 0.0, "learning_rate": 1.11652112689164e-06, "logits/chosen": -1.6065635681152344, "logits/rejected": 0.68021160364151, "logps/chosen": -204.63723754882812, "logps/rejected": -1166.2694091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.474165439605713, "rewards/margins": 67.179443359375, "rewards/rejected": -74.65361022949219, "step": 7180 }, { "epoch": 0.719, "grad_norm": 1.3178431436389193e-11, "learning_rate": 1.109260961249238e-06, "logits/chosen": -1.1102924346923828, "logits/rejected": 0.444924533367157, "logps/chosen": -300.0912780761719, "logps/rejected": -1036.2974853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.845537185668945, "rewards/margins": 54.732566833496094, "rewards/rejected": -67.5781021118164, "step": 7190 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 1.1020177413231334e-06, "logits/chosen": -1.1612586975097656, "logits/rejected": 0.28627079725265503, "logps/chosen": -308.6979064941406, "logps/rejected": -1037.414306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.11671257019043, "rewards/margins": 47.389488220214844, "rewards/rejected": -60.506202697753906, "step": 7200 }, { "epoch": 0.721, "grad_norm": 3.743392066509216e-23, "learning_rate": 1.0947915553696742e-06, "logits/chosen": -0.8225234150886536, "logits/rejected": 0.5830526351928711, "logps/chosen": -275.85394287109375, "logps/rejected": -997.060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.838689804077148, "rewards/margins": 63.3530387878418, "rewards/rejected": -72.19173431396484, "step": 7210 }, { "epoch": 0.722, "grad_norm": 8.84302053805186e-09, "learning_rate": 1.0875824914376555e-06, "logits/chosen": -0.5271292328834534, "logits/rejected": 0.6783641576766968, "logps/chosen": -432.099853515625, "logps/rejected": -1039.996826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.017711639404297, "rewards/margins": 54.85394287109375, "rewards/rejected": -72.87165832519531, "step": 7220 }, { "epoch": 0.723, "grad_norm": 1.3129018952895422e-05, "learning_rate": 1.0803906373672477e-06, "logits/chosen": -0.33937448263168335, "logits/rejected": 0.41331392526626587, "logps/chosen": -365.21868896484375, "logps/rejected": -936.5850830078125, "loss": 0.4843, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.739160537719727, "rewards/margins": 46.56276321411133, "rewards/rejected": -62.30192184448242, "step": 7230 }, { "epoch": 0.724, "grad_norm": 0.0, "learning_rate": 1.073216080788921e-06, "logits/chosen": -0.733219563961029, "logits/rejected": 0.18814750015735626, "logps/chosen": -397.82293701171875, "logps/rejected": -884.7576293945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.14622688293457, "rewards/margins": 47.09293746948242, "rewards/rejected": -58.239173889160156, "step": 7240 }, { "epoch": 0.725, "grad_norm": 1.6266512833904598e-15, "learning_rate": 1.0660589091223854e-06, "logits/chosen": -0.5925036072731018, "logits/rejected": 0.1687505543231964, "logps/chosen": -344.05572509765625, "logps/rejected": -875.701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.012567520141602, "rewards/margins": 50.19481658935547, "rewards/rejected": -60.2073860168457, "step": 7250 }, { "epoch": 0.726, "grad_norm": 0.0, "learning_rate": 1.0589192095755172e-06, "logits/chosen": -0.7121064066886902, "logits/rejected": 0.22998180985450745, "logps/chosen": -286.26458740234375, "logps/rejected": -935.8848876953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -8.205058097839355, "rewards/margins": 54.207542419433594, "rewards/rejected": -62.41259765625, "step": 7260 }, { "epoch": 0.727, "grad_norm": 5.209674054640345e-05, "learning_rate": 1.0517970691433035e-06, "logits/chosen": -1.3975350856781006, "logits/rejected": 0.3783726990222931, "logps/chosen": -350.1981506347656, "logps/rejected": -978.7327880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.47797966003418, "rewards/margins": 48.890647888183594, "rewards/rejected": -57.368629455566406, "step": 7270 }, { "epoch": 0.728, "grad_norm": 1.089237144924482e-09, "learning_rate": 1.0446925746067768e-06, "logits/chosen": -0.7229653000831604, "logits/rejected": 0.2919732630252838, "logps/chosen": -379.4202880859375, "logps/rejected": -898.99853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.882861137390137, "rewards/margins": 43.108375549316406, "rewards/rejected": -54.991233825683594, "step": 7280 }, { "epoch": 0.729, "grad_norm": 7.443757112923777e-06, "learning_rate": 1.0376058125319614e-06, "logits/chosen": -0.6685749292373657, "logits/rejected": 0.09258606284856796, "logps/chosen": -364.90899658203125, "logps/rejected": -830.1349487304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.9243803024292, "rewards/margins": 41.93610763549805, "rewards/rejected": -51.86049270629883, "step": 7290 }, { "epoch": 0.73, "grad_norm": 4.272778311720238e-12, "learning_rate": 1.0305368692688175e-06, "logits/chosen": -0.7117483615875244, "logits/rejected": 0.2935028672218323, "logps/chosen": -331.0298767089844, "logps/rejected": -861.7659301757812, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -8.721847534179688, "rewards/margins": 47.882728576660156, "rewards/rejected": -56.604576110839844, "step": 7300 }, { "epoch": 0.731, "grad_norm": 1.561633285822126e-17, "learning_rate": 1.0234858309501864e-06, "logits/chosen": -0.9831579923629761, "logits/rejected": 0.2982550263404846, "logps/chosen": -528.8619384765625, "logps/rejected": -1077.370361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.656841278076172, "rewards/margins": 53.95241165161133, "rewards/rejected": -62.6092529296875, "step": 7310 }, { "epoch": 0.732, "grad_norm": 2.661149880168437e-17, "learning_rate": 1.0164527834907468e-06, "logits/chosen": -0.7919927835464478, "logits/rejected": 0.5363516211509705, "logps/chosen": -300.376708984375, "logps/rejected": -838.0631103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.472113132476807, "rewards/margins": 43.48785400390625, "rewards/rejected": -50.9599723815918, "step": 7320 }, { "epoch": 0.733, "grad_norm": 3.8212088161642394e-18, "learning_rate": 1.0094378125859602e-06, "logits/chosen": -1.0631580352783203, "logits/rejected": 0.5589848160743713, "logps/chosen": -170.1670379638672, "logps/rejected": -833.8010864257812, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -9.077153205871582, "rewards/margins": 53.96528244018555, "rewards/rejected": -63.04243850708008, "step": 7330 }, { "epoch": 0.734, "grad_norm": 1.1333886134029091e-14, "learning_rate": 1.0024410037110356e-06, "logits/chosen": -1.00998055934906, "logits/rejected": 0.4892210066318512, "logps/chosen": -373.8369445800781, "logps/rejected": -1075.8577880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.881095886230469, "rewards/margins": 54.31298828125, "rewards/rejected": -69.19407653808594, "step": 7340 }, { "epoch": 0.735, "grad_norm": 0.0, "learning_rate": 9.95462442119879e-07, "logits/chosen": -0.7339566946029663, "logits/rejected": 0.4608619809150696, "logps/chosen": -290.23321533203125, "logps/rejected": -881.2384643554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.089037895202637, "rewards/margins": 49.113502502441406, "rewards/rejected": -62.202537536621094, "step": 7350 }, { "epoch": 0.736, "grad_norm": 1.6385482201310323e-18, "learning_rate": 9.88502212844063e-07, "logits/chosen": -0.5577305555343628, "logits/rejected": -0.04514486715197563, "logps/chosen": -494.9710388183594, "logps/rejected": -983.4933471679688, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": -18.71878433227539, "rewards/margins": 48.584686279296875, "rewards/rejected": -67.30347442626953, "step": 7360 }, { "epoch": 0.737, "grad_norm": 0.0, "learning_rate": 9.815604006917839e-07, "logits/chosen": -0.7128852605819702, "logits/rejected": 0.5136991739273071, "logps/chosen": -336.8421325683594, "logps/rejected": -1104.3736572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.214262008666992, "rewards/margins": 62.831642150878906, "rewards/rejected": -76.0458984375, "step": 7370 }, { "epoch": 0.738, "grad_norm": 0.0, "learning_rate": 9.746370902468311e-07, "logits/chosen": -0.5148681998252869, "logits/rejected": 0.2315702885389328, "logps/chosen": -495.29150390625, "logps/rejected": -932.6849365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.574322700500488, "rewards/margins": 42.701656341552734, "rewards/rejected": -57.27598190307617, "step": 7380 }, { "epoch": 0.739, "grad_norm": 5.64483789572412e-13, "learning_rate": 9.677323658675594e-07, "logits/chosen": -0.8095144033432007, "logits/rejected": 0.015424412675201893, "logps/chosen": -367.74444580078125, "logps/rejected": -735.0386962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.414527893066406, "rewards/margins": 37.90756607055664, "rewards/rejected": -50.32209014892578, "step": 7390 }, { "epoch": 0.74, "grad_norm": 0.002067842520773411, "learning_rate": 9.608463116858544e-07, "logits/chosen": -0.8038798570632935, "logits/rejected": 0.47396841645240784, "logps/chosen": -428.75592041015625, "logps/rejected": -1115.0914306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.412130355834961, "rewards/margins": 59.50236892700195, "rewards/rejected": -72.91450500488281, "step": 7400 }, { "epoch": 0.741, "grad_norm": 2.0158783812668233e-22, "learning_rate": 9.53979011606115e-07, "logits/chosen": -0.7873549461364746, "logits/rejected": 0.4778427481651306, "logps/chosen": -234.4102020263672, "logps/rejected": -925.4586791992188, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -8.518302917480469, "rewards/margins": 56.09989547729492, "rewards/rejected": -64.61819458007812, "step": 7410 }, { "epoch": 0.742, "grad_norm": 0.0, "learning_rate": 9.471305493042243e-07, "logits/chosen": -0.8623729944229126, "logits/rejected": 0.5287893414497375, "logps/chosen": -196.85728454589844, "logps/rejected": -967.7962036132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.566311836242676, "rewards/margins": 59.91844940185547, "rewards/rejected": -66.48475646972656, "step": 7420 }, { "epoch": 0.743, "grad_norm": 7.9055621148254245e-22, "learning_rate": 9.403010082265351e-07, "logits/chosen": -0.5145904421806335, "logits/rejected": 0.5459538698196411, "logps/chosen": -408.49102783203125, "logps/rejected": -997.2546997070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.392236709594727, "rewards/margins": 58.52399444580078, "rewards/rejected": -68.91622924804688, "step": 7430 }, { "epoch": 0.744, "grad_norm": 1.038647681790984e-15, "learning_rate": 9.334904715888496e-07, "logits/chosen": -0.9317490458488464, "logits/rejected": 0.5168638229370117, "logps/chosen": -442.51861572265625, "logps/rejected": -1209.481689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.806231498718262, "rewards/margins": 61.112640380859375, "rewards/rejected": -74.91886901855469, "step": 7440 }, { "epoch": 0.745, "grad_norm": 0.0, "learning_rate": 9.266990223754069e-07, "logits/chosen": -0.9047843813896179, "logits/rejected": 0.7249792814254761, "logps/chosen": -312.70367431640625, "logps/rejected": -1056.26220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.898077011108398, "rewards/margins": 59.7227897644043, "rewards/rejected": -69.62086486816406, "step": 7450 }, { "epoch": 0.746, "grad_norm": 0.0, "learning_rate": 9.199267433378728e-07, "logits/chosen": -0.8632529377937317, "logits/rejected": 0.6839910745620728, "logps/chosen": -316.83856201171875, "logps/rejected": -1040.1484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.796337127685547, "rewards/margins": 60.876007080078125, "rewards/rejected": -70.6723403930664, "step": 7460 }, { "epoch": 0.747, "grad_norm": 0.0, "learning_rate": 9.131737169943314e-07, "logits/chosen": -1.0358017683029175, "logits/rejected": 0.4787723422050476, "logps/chosen": -299.0080871582031, "logps/rejected": -1036.6314697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -15.263322830200195, "rewards/margins": 56.86602783203125, "rewards/rejected": -72.1293716430664, "step": 7470 }, { "epoch": 0.748, "grad_norm": 0.0, "learning_rate": 9.064400256282757e-07, "logits/chosen": -0.7976016998291016, "logits/rejected": 0.7549166679382324, "logps/chosen": -206.1776123046875, "logps/rejected": -975.0432739257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.270216941833496, "rewards/margins": 60.506622314453125, "rewards/rejected": -68.77684020996094, "step": 7480 }, { "epoch": 0.749, "grad_norm": 8.384397318299565e-17, "learning_rate": 8.99725751287611e-07, "logits/chosen": -0.6219117045402527, "logits/rejected": 0.44370508193969727, "logps/chosen": -420.1907653808594, "logps/rejected": -1133.765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.35955810546875, "rewards/margins": 68.6363296508789, "rewards/rejected": -81.99589538574219, "step": 7490 }, { "epoch": 0.75, "grad_norm": 2.094380985929404e-18, "learning_rate": 8.930309757836517e-07, "logits/chosen": -0.6963815689086914, "logits/rejected": 0.6372653245925903, "logps/chosen": -433.8370666503906, "logps/rejected": -1238.472900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.223983764648438, "rewards/margins": 69.96954345703125, "rewards/rejected": -87.19352722167969, "step": 7500 }, { "epoch": 0.751, "grad_norm": 0.0, "learning_rate": 8.863557806901233e-07, "logits/chosen": -0.146893709897995, "logits/rejected": 0.3493548035621643, "logps/chosen": -653.1065673828125, "logps/rejected": -1275.319091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.061296463012695, "rewards/margins": 67.54277038574219, "rewards/rejected": -89.60407257080078, "step": 7510 }, { "epoch": 0.752, "grad_norm": 0.0, "learning_rate": 8.797002473421729e-07, "logits/chosen": -0.355530321598053, "logits/rejected": 0.8727057576179504, "logps/chosen": -294.88006591796875, "logps/rejected": -957.97900390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -11.46592903137207, "rewards/margins": 60.088661193847656, "rewards/rejected": -71.5545883178711, "step": 7520 }, { "epoch": 0.753, "grad_norm": 0.004240179434418678, "learning_rate": 8.73064456835373e-07, "logits/chosen": -1.0234708786010742, "logits/rejected": 0.3270181119441986, "logps/chosen": -342.8228759765625, "logps/rejected": -1176.275634765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.1382474899292, "rewards/margins": 66.42398834228516, "rewards/rejected": -81.56224060058594, "step": 7530 }, { "epoch": 0.754, "grad_norm": 2.278529131322614e-14, "learning_rate": 8.664484900247363e-07, "logits/chosen": -0.9598628878593445, "logits/rejected": 0.6580491065979004, "logps/chosen": -283.4799499511719, "logps/rejected": -1075.259033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.367246627807617, "rewards/margins": 62.03153610229492, "rewards/rejected": -75.39878845214844, "step": 7540 }, { "epoch": 0.755, "grad_norm": 0.0, "learning_rate": 8.598524275237321e-07, "logits/chosen": -0.837388813495636, "logits/rejected": 1.1218478679656982, "logps/chosen": -289.286865234375, "logps/rejected": -1147.4429931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.133901596069336, "rewards/margins": 73.49677276611328, "rewards/rejected": -89.63066864013672, "step": 7550 }, { "epoch": 0.756, "grad_norm": 2.807277161829369e-20, "learning_rate": 8.532763497032987e-07, "logits/chosen": -0.38878798484802246, "logits/rejected": 0.5794919729232788, "logps/chosen": -512.4061889648438, "logps/rejected": -1225.756591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.27451515197754, "rewards/margins": 65.82911682128906, "rewards/rejected": -82.10364532470703, "step": 7560 }, { "epoch": 0.757, "grad_norm": 0.0, "learning_rate": 8.467203366908708e-07, "logits/chosen": -0.7684077024459839, "logits/rejected": 0.4302092492580414, "logps/chosen": -290.90289306640625, "logps/rejected": -995.5157470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.721793174743652, "rewards/margins": 61.515533447265625, "rewards/rejected": -71.2373275756836, "step": 7570 }, { "epoch": 0.758, "grad_norm": 0.0, "learning_rate": 8.40184468369396e-07, "logits/chosen": -0.5130096077919006, "logits/rejected": 0.5950255393981934, "logps/chosen": -315.585205078125, "logps/rejected": -1058.7269287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.12911605834961, "rewards/margins": 60.01392364501953, "rewards/rejected": -78.14305114746094, "step": 7580 }, { "epoch": 0.759, "grad_norm": 0.0, "learning_rate": 8.336688243763691e-07, "logits/chosen": -1.0224621295928955, "logits/rejected": 0.558266818523407, "logps/chosen": -430.31793212890625, "logps/rejected": -1285.520263671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.064857482910156, "rewards/margins": 72.1447525024414, "rewards/rejected": -86.20960235595703, "step": 7590 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 8.271734841028553e-07, "logits/chosen": -0.5809733867645264, "logits/rejected": 0.902326762676239, "logps/chosen": -500.68280029296875, "logps/rejected": -1293.6524658203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -20.30832290649414, "rewards/margins": 71.17332458496094, "rewards/rejected": -91.48163604736328, "step": 7600 }, { "epoch": 0.761, "grad_norm": 0.0, "learning_rate": 8.206985266925249e-07, "logits/chosen": -1.140430212020874, "logits/rejected": 1.0953062772750854, "logps/chosen": -450.14691162109375, "logps/rejected": -1547.922607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.620083808898926, "rewards/margins": 95.6705322265625, "rewards/rejected": -110.2906265258789, "step": 7610 }, { "epoch": 0.762, "grad_norm": 1.6068162069854874e-16, "learning_rate": 8.142440310406923e-07, "logits/chosen": -0.014678800478577614, "logits/rejected": 0.8266127705574036, "logps/chosen": -457.71075439453125, "logps/rejected": -953.9225463867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.222761154174805, "rewards/margins": 48.474849700927734, "rewards/rejected": -67.69761657714844, "step": 7620 }, { "epoch": 0.763, "grad_norm": 0.0, "learning_rate": 8.078100757933486e-07, "logits/chosen": -0.9533795118331909, "logits/rejected": 0.5999480485916138, "logps/chosen": -447.9500427246094, "logps/rejected": -1231.653564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.345884323120117, "rewards/margins": 77.30724334716797, "rewards/rejected": -90.65313720703125, "step": 7630 }, { "epoch": 0.764, "grad_norm": 0.0, "learning_rate": 8.013967393462094e-07, "logits/chosen": -0.3110244870185852, "logits/rejected": 0.5950175523757935, "logps/chosen": -463.6453552246094, "logps/rejected": -975.3029174804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.173898696899414, "rewards/margins": 49.042076110839844, "rewards/rejected": -68.21597290039062, "step": 7640 }, { "epoch": 0.765, "grad_norm": 0.0, "learning_rate": 7.950040998437541e-07, "logits/chosen": -1.0438666343688965, "logits/rejected": 0.9437441825866699, "logps/chosen": -346.1753234863281, "logps/rejected": -1227.1187744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.079136848449707, "rewards/margins": 74.47779083251953, "rewards/rejected": -87.55693054199219, "step": 7650 }, { "epoch": 0.766, "grad_norm": 2.848948995975536e-17, "learning_rate": 7.886322351782782e-07, "logits/chosen": -0.37577199935913086, "logits/rejected": 0.2696637511253357, "logps/chosen": -501.11114501953125, "logps/rejected": -861.0324096679688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -12.849464416503906, "rewards/margins": 40.464820861816406, "rewards/rejected": -53.31428909301758, "step": 7660 }, { "epoch": 0.767, "grad_norm": 1.937213681569433e-13, "learning_rate": 7.822812229889429e-07, "logits/chosen": -0.6278411149978638, "logits/rejected": 0.5903183817863464, "logps/chosen": -349.5834045410156, "logps/rejected": -1101.945556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.024063110351562, "rewards/margins": 65.76557922363281, "rewards/rejected": -82.78965759277344, "step": 7670 }, { "epoch": 0.768, "grad_norm": 0.0, "learning_rate": 7.759511406608255e-07, "logits/chosen": -0.25402379035949707, "logits/rejected": 0.7397147417068481, "logps/chosen": -428.45892333984375, "logps/rejected": -1207.371337890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.313592910766602, "rewards/margins": 73.54627990722656, "rewards/rejected": -88.85987091064453, "step": 7680 }, { "epoch": 0.769, "grad_norm": 0.0, "learning_rate": 7.696420653239834e-07, "logits/chosen": -0.5498959422111511, "logits/rejected": 1.0125188827514648, "logps/chosen": -429.74560546875, "logps/rejected": -1378.413330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.181983947753906, "rewards/margins": 90.03840637207031, "rewards/rejected": -106.22039794921875, "step": 7690 }, { "epoch": 0.77, "grad_norm": 2.008980715118014e-07, "learning_rate": 7.633540738525066e-07, "logits/chosen": -0.6189178228378296, "logits/rejected": 0.47738155722618103, "logps/chosen": -677.8727416992188, "logps/rejected": -1215.9979248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -25.010875701904297, "rewards/margins": 50.56502151489258, "rewards/rejected": -75.57588958740234, "step": 7700 }, { "epoch": 0.771, "grad_norm": 8.869028939606787e-16, "learning_rate": 7.57087242863589e-07, "logits/chosen": -0.6059373617172241, "logits/rejected": 0.5790004730224609, "logps/chosen": -365.8356018066406, "logps/rejected": -1020.9505615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.18605613708496, "rewards/margins": 53.259193420410156, "rewards/rejected": -69.44524383544922, "step": 7710 }, { "epoch": 0.772, "grad_norm": 0.0, "learning_rate": 7.508416487165864e-07, "logits/chosen": -0.5776602625846863, "logits/rejected": 0.7393280267715454, "logps/chosen": -298.1356506347656, "logps/rejected": -1209.6595458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.489774703979492, "rewards/margins": 76.7217025756836, "rewards/rejected": -89.21147918701172, "step": 7720 }, { "epoch": 0.773, "grad_norm": 9.048666859879866e-18, "learning_rate": 7.446173675120943e-07, "logits/chosen": -0.45593494176864624, "logits/rejected": 0.6403728127479553, "logps/chosen": -461.8550720214844, "logps/rejected": -1142.08642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.827007293701172, "rewards/margins": 60.17173385620117, "rewards/rejected": -79.99874114990234, "step": 7730 }, { "epoch": 0.774, "grad_norm": 3.2634135614775815e-22, "learning_rate": 7.384144750910133e-07, "logits/chosen": -0.45879751443862915, "logits/rejected": 0.7685213088989258, "logps/chosen": -507.66021728515625, "logps/rejected": -1147.2984619140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.178197860717773, "rewards/margins": 65.04210662841797, "rewards/rejected": -80.22030639648438, "step": 7740 }, { "epoch": 0.775, "grad_norm": 0.0, "learning_rate": 7.322330470336314e-07, "logits/chosen": -0.6126856803894043, "logits/rejected": 0.41923293471336365, "logps/chosen": -483.3583984375, "logps/rejected": -1216.6903076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.74558448791504, "rewards/margins": 65.92410278320312, "rewards/rejected": -82.66969299316406, "step": 7750 }, { "epoch": 0.776, "grad_norm": 0.0, "learning_rate": 7.260731586586983e-07, "logits/chosen": -0.5855390429496765, "logits/rejected": 1.0094908475875854, "logps/chosen": -311.06524658203125, "logps/rejected": -1248.450439453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.760951042175293, "rewards/margins": 84.68740844726562, "rewards/rejected": -94.44835662841797, "step": 7760 }, { "epoch": 0.777, "grad_norm": 0.0, "learning_rate": 7.199348850225091e-07, "logits/chosen": -0.4212276339530945, "logits/rejected": 0.845160961151123, "logps/chosen": -426.73419189453125, "logps/rejected": -1159.7073974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.290374755859375, "rewards/margins": 66.98060607910156, "rewards/rejected": -85.27098083496094, "step": 7770 }, { "epoch": 0.778, "grad_norm": 0.0, "learning_rate": 7.138183009179922e-07, "logits/chosen": -0.6795636415481567, "logits/rejected": 0.7848717570304871, "logps/chosen": -327.3196105957031, "logps/rejected": -966.79541015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.527212142944336, "rewards/margins": 55.81372833251953, "rewards/rejected": -71.34093475341797, "step": 7780 }, { "epoch": 0.779, "grad_norm": 0.0, "learning_rate": 7.077234808737932e-07, "logits/chosen": -0.38788530230522156, "logits/rejected": 0.15900078415870667, "logps/chosen": -410.41839599609375, "logps/rejected": -959.5905151367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.295787811279297, "rewards/margins": 53.696380615234375, "rewards/rejected": -70.9921646118164, "step": 7790 }, { "epoch": 0.78, "grad_norm": 2.8316051218189864e-13, "learning_rate": 7.016504991533727e-07, "logits/chosen": -0.39930716156959534, "logits/rejected": 0.008740996941924095, "logps/chosen": -597.921142578125, "logps/rejected": -835.9523315429688, "loss": 0.4145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -21.053123474121094, "rewards/margins": 34.08021926879883, "rewards/rejected": -55.133338928222656, "step": 7800 }, { "epoch": 0.781, "grad_norm": 0.0, "learning_rate": 6.955994297540947e-07, "logits/chosen": -0.7211162447929382, "logits/rejected": 0.7454463243484497, "logps/chosen": -369.17742919921875, "logps/rejected": -1076.2313232421875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -13.157571792602539, "rewards/margins": 62.682777404785156, "rewards/rejected": -75.84034729003906, "step": 7810 }, { "epoch": 0.782, "grad_norm": 1.4056424946110474e-21, "learning_rate": 6.895703464063319e-07, "logits/chosen": -0.04733237624168396, "logits/rejected": 0.5720465183258057, "logps/chosen": -285.1549987792969, "logps/rejected": -922.0696411132812, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": -16.364412307739258, "rewards/margins": 59.26238250732422, "rewards/rejected": -75.62680053710938, "step": 7820 }, { "epoch": 0.783, "grad_norm": 1.2598473581162394e-18, "learning_rate": 6.835633225725604e-07, "logits/chosen": -0.9407356381416321, "logits/rejected": 0.767217755317688, "logps/chosen": -441.0362854003906, "logps/rejected": -1393.307373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.530881881713867, "rewards/margins": 87.36241149902344, "rewards/rejected": -104.8932876586914, "step": 7830 }, { "epoch": 0.784, "grad_norm": 0.0, "learning_rate": 6.775784314464717e-07, "logits/chosen": -1.044764757156372, "logits/rejected": 1.2633781433105469, "logps/chosen": -280.1641845703125, "logps/rejected": -1321.974365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.51031494140625, "rewards/margins": 84.8749008178711, "rewards/rejected": -104.38520812988281, "step": 7840 }, { "epoch": 0.785, "grad_norm": 0.0, "learning_rate": 6.716157459520739e-07, "logits/chosen": -0.7862663865089417, "logits/rejected": 0.907000720500946, "logps/chosen": -588.1331176757812, "logps/rejected": -1509.89501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.144685745239258, "rewards/margins": 90.22511291503906, "rewards/rejected": -107.36979675292969, "step": 7850 }, { "epoch": 0.786, "grad_norm": 0.0, "learning_rate": 6.656753387428089e-07, "logits/chosen": -0.7701439261436462, "logits/rejected": 1.0488814115524292, "logps/chosen": -291.90594482421875, "logps/rejected": -1312.0511474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.93437385559082, "rewards/margins": 87.22757720947266, "rewards/rejected": -101.16195678710938, "step": 7860 }, { "epoch": 0.787, "grad_norm": 0.0, "learning_rate": 6.597572822006643e-07, "logits/chosen": -1.2947529554367065, "logits/rejected": 1.1008408069610596, "logps/chosen": -359.7368469238281, "logps/rejected": -1567.0120849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.875795364379883, "rewards/margins": 97.81360626220703, "rewards/rejected": -112.68940734863281, "step": 7870 }, { "epoch": 0.788, "grad_norm": 3.3539895861166484e-20, "learning_rate": 6.538616484352902e-07, "logits/chosen": -0.8063453435897827, "logits/rejected": 0.6678211092948914, "logps/chosen": -332.2959289550781, "logps/rejected": -1273.5902099609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.185787200927734, "rewards/margins": 73.93902587890625, "rewards/rejected": -91.12480926513672, "step": 7880 }, { "epoch": 0.789, "grad_norm": 0.0, "learning_rate": 6.479885092831248e-07, "logits/chosen": -0.667913019657135, "logits/rejected": 0.936458945274353, "logps/chosen": -701.1113891601562, "logps/rejected": -1564.383544921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.8167667388916, "rewards/margins": 81.56040954589844, "rewards/rejected": -110.3771743774414, "step": 7890 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 6.421379363065142e-07, "logits/chosen": -0.3396407663822174, "logits/rejected": 0.5185993313789368, "logps/chosen": -385.85736083984375, "logps/rejected": -1270.9588623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.686500549316406, "rewards/margins": 87.16119384765625, "rewards/rejected": -104.84769439697266, "step": 7900 }, { "epoch": 0.791, "grad_norm": 2.754020567152793e-09, "learning_rate": 6.363100007928447e-07, "logits/chosen": -0.6791292428970337, "logits/rejected": 0.6307904124259949, "logps/chosen": -474.0777282714844, "logps/rejected": -1043.2554931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.698204040527344, "rewards/margins": 49.53090286254883, "rewards/rejected": -65.22911071777344, "step": 7910 }, { "epoch": 0.792, "grad_norm": 0.0, "learning_rate": 6.305047737536707e-07, "logits/chosen": -0.8387699127197266, "logits/rejected": 0.6438099145889282, "logps/chosen": -380.71185302734375, "logps/rejected": -1177.8743896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.04198455810547, "rewards/margins": 71.42205047607422, "rewards/rejected": -91.46404266357422, "step": 7920 }, { "epoch": 0.793, "grad_norm": 0.0, "learning_rate": 6.247223259238513e-07, "logits/chosen": -0.8732248544692993, "logits/rejected": 1.2813969850540161, "logps/chosen": -402.7436218261719, "logps/rejected": -1218.497314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.730911254882812, "rewards/margins": 68.49177551269531, "rewards/rejected": -87.2226791381836, "step": 7930 }, { "epoch": 0.794, "grad_norm": 0.0, "learning_rate": 6.189627277606894e-07, "logits/chosen": -0.6855721473693848, "logits/rejected": 0.9724220037460327, "logps/chosen": -359.7814636230469, "logps/rejected": -1313.208251953125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -15.863734245300293, "rewards/margins": 77.40478515625, "rewards/rejected": -93.2685317993164, "step": 7940 }, { "epoch": 0.795, "grad_norm": 0.0, "learning_rate": 6.1322604944307e-07, "logits/chosen": -0.5548166632652283, "logits/rejected": 0.7209320664405823, "logps/chosen": -434.252685546875, "logps/rejected": -1383.366943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.734821319580078, "rewards/margins": 82.97564697265625, "rewards/rejected": -102.7104721069336, "step": 7950 }, { "epoch": 0.796, "grad_norm": 0.0, "learning_rate": 6.075123608706093e-07, "logits/chosen": -0.6931466460227966, "logits/rejected": 0.5858211517333984, "logps/chosen": -304.97662353515625, "logps/rejected": -1122.347900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.999123573303223, "rewards/margins": 73.74942779541016, "rewards/rejected": -85.74855041503906, "step": 7960 }, { "epoch": 0.797, "grad_norm": 0.0, "learning_rate": 6.01821731662798e-07, "logits/chosen": -0.5279142260551453, "logits/rejected": 1.04735267162323, "logps/chosen": -446.77001953125, "logps/rejected": -1314.6702880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.26654052734375, "rewards/margins": 85.6667251586914, "rewards/rejected": -102.93327331542969, "step": 7970 }, { "epoch": 0.798, "grad_norm": 0.0, "learning_rate": 5.961542311581586e-07, "logits/chosen": -0.6521563529968262, "logits/rejected": 0.8276292681694031, "logps/chosen": -385.0362243652344, "logps/rejected": -1391.097412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.053596496582031, "rewards/margins": 85.08977508544922, "rewards/rejected": -100.14338684082031, "step": 7980 }, { "epoch": 0.799, "grad_norm": 0.0, "learning_rate": 5.905099284133953e-07, "logits/chosen": -0.7128938436508179, "logits/rejected": 0.9515337944030762, "logps/chosen": -321.2559814453125, "logps/rejected": -1249.6884765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.714630126953125, "rewards/margins": 77.06490325927734, "rewards/rejected": -91.779541015625, "step": 7990 }, { "epoch": 0.8, "grad_norm": 5.697309621690086e-14, "learning_rate": 5.848888922025553e-07, "logits/chosen": -0.6935831904411316, "logits/rejected": 1.0498263835906982, "logps/chosen": -458.8016662597656, "logps/rejected": -1300.8131103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.941858291625977, "rewards/margins": 80.65689086914062, "rewards/rejected": -99.5987548828125, "step": 8000 }, { "epoch": 0.801, "grad_norm": 0.0, "learning_rate": 5.792911910161922e-07, "logits/chosen": -0.32794898748397827, "logits/rejected": 0.5728577375411987, "logps/chosen": -395.8527526855469, "logps/rejected": -1102.51953125, "loss": 0.3056, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.5410795211792, "rewards/margins": 65.24958801269531, "rewards/rejected": -80.79066467285156, "step": 8010 }, { "epoch": 0.802, "grad_norm": 2.4850881160414627e-19, "learning_rate": 5.737168930605272e-07, "logits/chosen": -0.7031986713409424, "logits/rejected": 0.6816826462745667, "logps/chosen": -258.4869384765625, "logps/rejected": -1057.6505126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.595638275146484, "rewards/margins": 67.13885498046875, "rewards/rejected": -77.7344970703125, "step": 8020 }, { "epoch": 0.803, "grad_norm": 1.8630241446770945e-11, "learning_rate": 5.681660662566225e-07, "logits/chosen": -1.1092045307159424, "logits/rejected": 1.042096734046936, "logps/chosen": -316.0285339355469, "logps/rejected": -1484.916259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.377889633178711, "rewards/margins": 94.16575622558594, "rewards/rejected": -107.54365539550781, "step": 8030 }, { "epoch": 0.804, "grad_norm": 0.0, "learning_rate": 5.626387782395515e-07, "logits/chosen": -0.5211082696914673, "logits/rejected": 0.8625160455703735, "logps/chosen": -279.7019348144531, "logps/rejected": -1177.743896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.55776309967041, "rewards/margins": 74.29469299316406, "rewards/rejected": -86.85245513916016, "step": 8040 }, { "epoch": 0.805, "grad_norm": 1.746179100280054e-18, "learning_rate": 5.571350963575728e-07, "logits/chosen": -0.792542576789856, "logits/rejected": 0.5339727401733398, "logps/chosen": -340.4499816894531, "logps/rejected": -1162.2421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.852516174316406, "rewards/margins": 57.709815979003906, "rewards/rejected": -74.56233215332031, "step": 8050 }, { "epoch": 0.806, "grad_norm": 0.0, "learning_rate": 5.516550876713142e-07, "logits/chosen": -0.9227225184440613, "logits/rejected": 0.7504435777664185, "logps/chosen": -334.015625, "logps/rejected": -1463.6187744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.994755744934082, "rewards/margins": 93.83234405517578, "rewards/rejected": -106.82710266113281, "step": 8060 }, { "epoch": 0.807, "grad_norm": 3.1763735522036263e-22, "learning_rate": 5.461988189529529e-07, "logits/chosen": -0.7259895205497742, "logits/rejected": 0.7027822136878967, "logps/chosen": -424.6468200683594, "logps/rejected": -1181.9427490234375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -16.932525634765625, "rewards/margins": 67.4690170288086, "rewards/rejected": -84.40153503417969, "step": 8070 }, { "epoch": 0.808, "grad_norm": 0.0, "learning_rate": 5.407663566854008e-07, "logits/chosen": -0.6267033815383911, "logits/rejected": 0.7194575071334839, "logps/chosen": -426.80389404296875, "logps/rejected": -1407.231201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.166587829589844, "rewards/margins": 86.96299743652344, "rewards/rejected": -104.12960052490234, "step": 8080 }, { "epoch": 0.809, "grad_norm": 4.3124832933603033e-17, "learning_rate": 5.353577670614951e-07, "logits/chosen": -0.5230453014373779, "logits/rejected": 1.0515474081039429, "logps/chosen": -363.8052062988281, "logps/rejected": -1169.013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.506147384643555, "rewards/margins": 74.06648254394531, "rewards/rejected": -92.57263946533203, "step": 8090 }, { "epoch": 0.81, "grad_norm": 7.021869350865018e-06, "learning_rate": 5.299731159831953e-07, "logits/chosen": -1.17167329788208, "logits/rejected": 0.774400532245636, "logps/chosen": -375.32366943359375, "logps/rejected": -1388.881103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.649917602539062, "rewards/margins": 77.11151123046875, "rewards/rejected": -99.76142883300781, "step": 8100 }, { "epoch": 0.811, "grad_norm": 8.09361074207104e-18, "learning_rate": 5.24612469060774e-07, "logits/chosen": -0.42869800329208374, "logits/rejected": 0.7785958051681519, "logps/chosen": -322.61175537109375, "logps/rejected": -896.5892333984375, "loss": 0.6231, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -14.81994915008545, "rewards/margins": 45.00872039794922, "rewards/rejected": -59.82868194580078, "step": 8110 }, { "epoch": 0.812, "grad_norm": 0.0, "learning_rate": 5.192758916120236e-07, "logits/chosen": -0.291696161031723, "logits/rejected": 0.6524218320846558, "logps/chosen": -639.3327026367188, "logps/rejected": -1339.4818115234375, "loss": 0.0581, "rewards/accuracies": 1.0, "rewards/chosen": -23.298053741455078, "rewards/margins": 75.19126892089844, "rewards/rejected": -98.48933410644531, "step": 8120 }, { "epoch": 0.813, "grad_norm": 0.0, "learning_rate": 5.139634486614544e-07, "logits/chosen": -0.8535143136978149, "logits/rejected": 0.38042861223220825, "logps/chosen": -614.6556396484375, "logps/rejected": -1314.8524169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.462032318115234, "rewards/margins": 69.82005310058594, "rewards/rejected": -87.2820816040039, "step": 8130 }, { "epoch": 0.814, "grad_norm": 1.0093644127718106e-17, "learning_rate": 5.086752049395097e-07, "logits/chosen": -0.5521525740623474, "logits/rejected": 0.7084673047065735, "logps/chosen": -460.20452880859375, "logps/rejected": -1026.403564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.651652336120605, "rewards/margins": 60.66973876953125, "rewards/rejected": -71.32139587402344, "step": 8140 }, { "epoch": 0.815, "grad_norm": 0.0, "learning_rate": 5.034112248817685e-07, "logits/chosen": -0.33179759979248047, "logits/rejected": 0.34709256887435913, "logps/chosen": -472.8624572753906, "logps/rejected": -989.8903198242188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -15.012979507446289, "rewards/margins": 56.96765899658203, "rewards/rejected": -71.98063659667969, "step": 8150 }, { "epoch": 0.816, "grad_norm": 4.864253169706096e-19, "learning_rate": 4.981715726281666e-07, "logits/chosen": -0.70337975025177, "logits/rejected": 0.5703068971633911, "logps/chosen": -290.66314697265625, "logps/rejected": -938.6148681640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.626321792602539, "rewards/margins": 58.3425178527832, "rewards/rejected": -67.96883392333984, "step": 8160 }, { "epoch": 0.817, "grad_norm": 0.0, "learning_rate": 4.929563120222142e-07, "logits/chosen": -0.40683525800704956, "logits/rejected": 0.7035341262817383, "logps/chosen": -352.96405029296875, "logps/rejected": -1056.8221435546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.423770904541016, "rewards/margins": 65.76141357421875, "rewards/rejected": -78.18519592285156, "step": 8170 }, { "epoch": 0.818, "grad_norm": 1.2423618353016648e-17, "learning_rate": 4.87765506610215e-07, "logits/chosen": -0.3892587721347809, "logits/rejected": 0.3781268894672394, "logps/chosen": -630.7948608398438, "logps/rejected": -1205.042724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.79370403289795, "rewards/margins": 66.3752212524414, "rewards/rejected": -82.16891479492188, "step": 8180 }, { "epoch": 0.819, "grad_norm": 0.0, "learning_rate": 4.825992196404958e-07, "logits/chosen": -0.958387017250061, "logits/rejected": 0.9771413803100586, "logps/chosen": -246.85403442382812, "logps/rejected": -1029.256103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.022276878356934, "rewards/margins": 64.25834655761719, "rewards/rejected": -76.28062438964844, "step": 8190 }, { "epoch": 0.82, "grad_norm": 0.0, "learning_rate": 4.774575140626317e-07, "logits/chosen": -1.1014697551727295, "logits/rejected": 0.7440928220748901, "logps/chosen": -451.4864807128906, "logps/rejected": -1390.8321533203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.898099899291992, "rewards/margins": 77.3144760131836, "rewards/rejected": -97.21258544921875, "step": 8200 }, { "epoch": 0.821, "grad_norm": 0.0, "learning_rate": 4.7234045252668393e-07, "logits/chosen": -0.47737008333206177, "logits/rejected": 0.47926831245422363, "logps/chosen": -418.79327392578125, "logps/rejected": -1029.6893310546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.025524139404297, "rewards/margins": 53.46759796142578, "rewards/rejected": -76.49312591552734, "step": 8210 }, { "epoch": 0.822, "grad_norm": 3.9256209502687434e-14, "learning_rate": 4.672480973824312e-07, "logits/chosen": -0.9004614949226379, "logits/rejected": 0.6222228407859802, "logps/chosen": -295.7037048339844, "logps/rejected": -1047.0849609375, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -13.201852798461914, "rewards/margins": 66.40901947021484, "rewards/rejected": -79.61087036132812, "step": 8220 }, { "epoch": 0.823, "grad_norm": 0.0, "learning_rate": 4.6218051067861423e-07, "logits/chosen": -1.2087002992630005, "logits/rejected": 0.8243007659912109, "logps/chosen": -274.3550109863281, "logps/rejected": -1274.161376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.670193672180176, "rewards/margins": 78.62632751464844, "rewards/rejected": -87.2965316772461, "step": 8230 }, { "epoch": 0.824, "grad_norm": 0.0, "learning_rate": 4.5713775416217884e-07, "logits/chosen": -0.5355249643325806, "logits/rejected": 0.35910564661026, "logps/chosen": -438.91094970703125, "logps/rejected": -999.2205200195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.278973579406738, "rewards/margins": 58.531837463378906, "rewards/rejected": -73.81080627441406, "step": 8240 }, { "epoch": 0.825, "grad_norm": 0.0, "learning_rate": 4.5211988927752026e-07, "logits/chosen": -0.2851003408432007, "logits/rejected": 0.439518541097641, "logps/chosen": -613.4663696289062, "logps/rejected": -1385.641845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.915315628051758, "rewards/margins": 81.16056823730469, "rewards/rejected": -95.07588195800781, "step": 8250 }, { "epoch": 0.826, "grad_norm": 0.0, "learning_rate": 4.4712697716573994e-07, "logits/chosen": -0.9923319816589355, "logits/rejected": 0.5478132963180542, "logps/chosen": -262.02734375, "logps/rejected": -1090.7371826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.208616256713867, "rewards/margins": 70.6222152709961, "rewards/rejected": -80.83082580566406, "step": 8260 }, { "epoch": 0.827, "grad_norm": 3.422428153051528e-16, "learning_rate": 4.421590786638952e-07, "logits/chosen": -1.0225954055786133, "logits/rejected": 0.7992622256278992, "logps/chosen": -465.78814697265625, "logps/rejected": -1263.924072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.851648330688477, "rewards/margins": 67.95602416992188, "rewards/rejected": -82.80766296386719, "step": 8270 }, { "epoch": 0.828, "grad_norm": 9.069101298121805e-16, "learning_rate": 4.372162543042624e-07, "logits/chosen": -0.723979115486145, "logits/rejected": 0.7619308233261108, "logps/chosen": -454.51763916015625, "logps/rejected": -1181.912841796875, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -15.952386856079102, "rewards/margins": 69.74320983886719, "rewards/rejected": -85.69558715820312, "step": 8280 }, { "epoch": 0.829, "grad_norm": 0.0, "learning_rate": 4.3229856431359516e-07, "logits/chosen": -0.6105092167854309, "logits/rejected": 0.9017621278762817, "logps/chosen": -449.3888244628906, "logps/rejected": -1334.6583251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.543269157409668, "rewards/margins": 81.08878326416016, "rewards/rejected": -96.6320571899414, "step": 8290 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 4.27406068612396e-07, "logits/chosen": -0.769940197467804, "logits/rejected": 0.6583008766174316, "logps/chosen": -528.0979614257812, "logps/rejected": -1267.00048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.954166412353516, "rewards/margins": 64.58262634277344, "rewards/rejected": -81.53678894042969, "step": 8300 }, { "epoch": 0.831, "grad_norm": 0.0, "learning_rate": 4.225388268141797e-07, "logits/chosen": -0.6798024773597717, "logits/rejected": 0.6938012838363647, "logps/chosen": -287.25360107421875, "logps/rejected": -1024.869384765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.446317672729492, "rewards/margins": 64.9866714477539, "rewards/rejected": -82.43299102783203, "step": 8310 }, { "epoch": 0.832, "grad_norm": 0.0, "learning_rate": 4.1769689822475147e-07, "logits/chosen": -0.9560664296150208, "logits/rejected": 0.6266263127326965, "logps/chosen": -274.5146789550781, "logps/rejected": -1223.1676025390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.361339569091797, "rewards/margins": 76.72416687011719, "rewards/rejected": -89.08551025390625, "step": 8320 }, { "epoch": 0.833, "grad_norm": 1.053383248683648e-11, "learning_rate": 4.12880341841484e-07, "logits/chosen": -0.6871368288993835, "logits/rejected": 0.6744714379310608, "logps/chosen": -611.6934814453125, "logps/rejected": -1321.7281494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.083663940429688, "rewards/margins": 68.05198669433594, "rewards/rejected": -88.1356430053711, "step": 8330 }, { "epoch": 0.834, "grad_norm": 0.0, "learning_rate": 4.0808921635259595e-07, "logits/chosen": -0.8540255427360535, "logits/rejected": 0.6868570446968079, "logps/chosen": -434.99517822265625, "logps/rejected": -1390.0816650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.621461868286133, "rewards/margins": 85.2814712524414, "rewards/rejected": -104.9029312133789, "step": 8340 }, { "epoch": 0.835, "grad_norm": 5.242558923731424e-20, "learning_rate": 4.033235801364402e-07, "logits/chosen": -0.18468718230724335, "logits/rejected": 0.8583782315254211, "logps/chosen": -509.08551025390625, "logps/rejected": -1399.2689208984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.07135772705078, "rewards/margins": 88.06640625, "rewards/rejected": -106.13775634765625, "step": 8350 }, { "epoch": 0.836, "grad_norm": 6.893914701322501e-07, "learning_rate": 3.9858349126078945e-07, "logits/chosen": -0.6687838435173035, "logits/rejected": 0.7791027426719666, "logps/chosen": -348.71868896484375, "logps/rejected": -1039.650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.388275146484375, "rewards/margins": 58.85942459106445, "rewards/rejected": -73.24771118164062, "step": 8360 }, { "epoch": 0.837, "grad_norm": 1.6544232650517188e-08, "learning_rate": 3.938690074821314e-07, "logits/chosen": -0.4394214153289795, "logits/rejected": 0.9431339502334595, "logps/chosen": -399.00872802734375, "logps/rejected": -1156.00146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.371219635009766, "rewards/margins": 69.17243194580078, "rewards/rejected": -85.54365539550781, "step": 8370 }, { "epoch": 0.838, "grad_norm": 0.0, "learning_rate": 3.891801862449629e-07, "logits/chosen": -0.7251814603805542, "logits/rejected": 0.5887377262115479, "logps/chosen": -369.7410583496094, "logps/rejected": -1277.55712890625, "loss": 0.2999, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.028943061828613, "rewards/margins": 79.9480209350586, "rewards/rejected": -92.97696685791016, "step": 8380 }, { "epoch": 0.839, "grad_norm": 2.708196821822914e-19, "learning_rate": 3.8451708468109026e-07, "logits/chosen": -0.7329934239387512, "logits/rejected": 0.7062281370162964, "logps/chosen": -432.52197265625, "logps/rejected": -1181.098876953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.970266342163086, "rewards/margins": 67.001220703125, "rewards/rejected": -83.97148895263672, "step": 8390 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 3.798797596089351e-07, "logits/chosen": -1.2603354454040527, "logits/rejected": 0.7970761060714722, "logps/chosen": -318.9318542480469, "logps/rejected": -1589.118408203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.669502258300781, "rewards/margins": 99.53646087646484, "rewards/rejected": -113.2059555053711, "step": 8400 }, { "epoch": 0.841, "grad_norm": 0.0, "learning_rate": 3.7526826753284065e-07, "logits/chosen": -0.8607357740402222, "logits/rejected": 0.9649609327316284, "logps/chosen": -422.95477294921875, "logps/rejected": -1409.5218505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.955909729003906, "rewards/margins": 81.52864074707031, "rewards/rejected": -104.48453521728516, "step": 8410 }, { "epoch": 0.842, "grad_norm": 0.0, "learning_rate": 3.7068266464238085e-07, "logits/chosen": -0.9515706300735474, "logits/rejected": 0.6792925596237183, "logps/chosen": -400.7596130371094, "logps/rejected": -1421.045654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.31169319152832, "rewards/margins": 90.6186294555664, "rewards/rejected": -102.93033599853516, "step": 8420 }, { "epoch": 0.843, "grad_norm": 6.503150795644785e-10, "learning_rate": 3.661230068116811e-07, "logits/chosen": -0.9090437889099121, "logits/rejected": 0.6916595697402954, "logps/chosen": -412.43756103515625, "logps/rejected": -1149.2764892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.873706817626953, "rewards/margins": 64.63572692871094, "rewards/rejected": -81.5094223022461, "step": 8430 }, { "epoch": 0.844, "grad_norm": 0.0, "learning_rate": 3.615893495987335e-07, "logits/chosen": -0.7333610653877258, "logits/rejected": 0.6454871892929077, "logps/chosen": -484.1697692871094, "logps/rejected": -1221.397705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.366098403930664, "rewards/margins": 64.28791809082031, "rewards/rejected": -81.6540298461914, "step": 8440 }, { "epoch": 0.845, "grad_norm": 0.0, "learning_rate": 3.5708174824471947e-07, "logits/chosen": -0.8094123601913452, "logits/rejected": 0.7109456062316895, "logps/chosen": -383.9529724121094, "logps/rejected": -1309.468017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.436595916748047, "rewards/margins": 81.26898193359375, "rewards/rejected": -97.70558166503906, "step": 8450 }, { "epoch": 0.846, "grad_norm": 0.0, "learning_rate": 3.5260025767333894e-07, "logits/chosen": -0.6134425401687622, "logits/rejected": 0.8118023872375488, "logps/chosen": -523.8150634765625, "logps/rejected": -1320.671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.67328643798828, "rewards/margins": 70.70545196533203, "rewards/rejected": -94.37873840332031, "step": 8460 }, { "epoch": 0.847, "grad_norm": 0.0, "learning_rate": 3.481449324901412e-07, "logits/chosen": -0.7447524666786194, "logits/rejected": 1.0092899799346924, "logps/chosen": -383.1685791015625, "logps/rejected": -1373.7222900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.77007484436035, "rewards/margins": 84.83432006835938, "rewards/rejected": -103.6043930053711, "step": 8470 }, { "epoch": 0.848, "grad_norm": 2.2618764500270672e-11, "learning_rate": 3.4371582698185636e-07, "logits/chosen": -0.6285834312438965, "logits/rejected": 0.22365979850292206, "logps/chosen": -504.1358947753906, "logps/rejected": -983.326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.448890686035156, "rewards/margins": 49.23127746582031, "rewards/rejected": -67.68016052246094, "step": 8480 }, { "epoch": 0.849, "grad_norm": 3.920732488671419e-11, "learning_rate": 3.393129951157384e-07, "logits/chosen": -0.7750714421272278, "logits/rejected": 0.47617942094802856, "logps/chosen": -256.8757019042969, "logps/rejected": -1155.247314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.384836196899414, "rewards/margins": 75.5903549194336, "rewards/rejected": -87.97518157958984, "step": 8490 }, { "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 3.3493649053890325e-07, "logits/chosen": -0.6838713884353638, "logits/rejected": 0.6836413741111755, "logps/chosen": -524.8140869140625, "logps/rejected": -1271.4749755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.020029067993164, "rewards/margins": 75.66142272949219, "rewards/rejected": -88.68144989013672, "step": 8500 }, { "epoch": 0.851, "grad_norm": 0.0, "learning_rate": 3.3058636657767927e-07, "logits/chosen": -0.3944496810436249, "logits/rejected": 0.7332874536514282, "logps/chosen": -472.6421813964844, "logps/rejected": -1221.7069091796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.113519668579102, "rewards/margins": 76.74790954589844, "rewards/rejected": -87.8614273071289, "step": 8510 }, { "epoch": 0.852, "grad_norm": 6.442988373333725e-19, "learning_rate": 3.262626762369525e-07, "logits/chosen": -0.8572524785995483, "logits/rejected": 0.6225007772445679, "logps/chosen": -361.4649658203125, "logps/rejected": -1145.927001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.924433708190918, "rewards/margins": 66.92786407470703, "rewards/rejected": -82.852294921875, "step": 8520 }, { "epoch": 0.853, "grad_norm": 8.605229723235297e-11, "learning_rate": 3.219654721995266e-07, "logits/chosen": -0.5762674808502197, "logits/rejected": 0.42858409881591797, "logps/chosen": -341.80096435546875, "logps/rejected": -897.2291259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.997732162475586, "rewards/margins": 48.20355987548828, "rewards/rejected": -66.20128631591797, "step": 8530 }, { "epoch": 0.854, "grad_norm": 1.694723438472095e-20, "learning_rate": 3.176948068254762e-07, "logits/chosen": -0.9069119691848755, "logits/rejected": 0.4635804295539856, "logps/chosen": -293.68377685546875, "logps/rejected": -1079.21826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.914752006530762, "rewards/margins": 67.25696563720703, "rewards/rejected": -79.17171478271484, "step": 8540 }, { "epoch": 0.855, "grad_norm": 0.0, "learning_rate": 3.134507321515107e-07, "logits/chosen": -1.0798920392990112, "logits/rejected": 0.9683173894882202, "logps/chosen": -320.7782287597656, "logps/rejected": -1481.5472412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.124308586120605, "rewards/margins": 96.64144134521484, "rewards/rejected": -109.76573181152344, "step": 8550 }, { "epoch": 0.856, "grad_norm": 0.0, "learning_rate": 3.0923329989034134e-07, "logits/chosen": -0.7146934866905212, "logits/rejected": 0.7739596366882324, "logps/chosen": -322.8594665527344, "logps/rejected": -1208.615478515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.003753662109375, "rewards/margins": 80.2951889038086, "rewards/rejected": -90.2989501953125, "step": 8560 }, { "epoch": 0.857, "grad_norm": 0.0, "learning_rate": 3.050425614300487e-07, "logits/chosen": -0.807713508605957, "logits/rejected": 0.5876402854919434, "logps/chosen": -404.4460754394531, "logps/rejected": -1126.020751953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.31097412109375, "rewards/margins": 68.10762786865234, "rewards/rejected": -84.4186019897461, "step": 8570 }, { "epoch": 0.858, "grad_norm": 1.0998899002950119e-14, "learning_rate": 3.0087856783345916e-07, "logits/chosen": -0.5385525226593018, "logits/rejected": 0.42952489852905273, "logps/chosen": -628.0839233398438, "logps/rejected": -1292.969482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.087167739868164, "rewards/margins": 66.9642105102539, "rewards/rejected": -81.05137634277344, "step": 8580 }, { "epoch": 0.859, "grad_norm": 0.0, "learning_rate": 2.967413698375196e-07, "logits/chosen": -1.0018390417099, "logits/rejected": 0.8285702466964722, "logps/chosen": -506.5467224121094, "logps/rejected": -1452.560302734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.048295974731445, "rewards/margins": 94.05846405029297, "rewards/rejected": -109.10675048828125, "step": 8590 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 2.9263101785268253e-07, "logits/chosen": -0.26270899176597595, "logits/rejected": 0.7826281785964966, "logps/chosen": -354.89642333984375, "logps/rejected": -1114.381591796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.83215045928955, "rewards/margins": 69.8828353881836, "rewards/rejected": -85.71498107910156, "step": 8600 }, { "epoch": 0.861, "grad_norm": 0.0, "learning_rate": 2.8854756196229017e-07, "logits/chosen": -0.2683184742927551, "logits/rejected": 0.3286227285861969, "logps/chosen": -564.1398315429688, "logps/rejected": -1226.2554931640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.448040008544922, "rewards/margins": 64.25608825683594, "rewards/rejected": -94.70413208007812, "step": 8610 }, { "epoch": 0.862, "grad_norm": 0.0, "learning_rate": 2.844910519219632e-07, "logits/chosen": -0.8165909051895142, "logits/rejected": 0.6119577288627625, "logps/chosen": -444.7513122558594, "logps/rejected": -1322.9306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.9857177734375, "rewards/margins": 77.33323669433594, "rewards/rejected": -98.3189468383789, "step": 8620 }, { "epoch": 0.863, "grad_norm": 0.0, "learning_rate": 2.8046153715899695e-07, "logits/chosen": -0.3791617751121521, "logits/rejected": 0.6677260398864746, "logps/chosen": -353.25006103515625, "logps/rejected": -1090.166259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.440378189086914, "rewards/margins": 68.6875, "rewards/rejected": -86.12787628173828, "step": 8630 }, { "epoch": 0.864, "grad_norm": 0.0, "learning_rate": 2.7645906677175594e-07, "logits/chosen": -0.7198137044906616, "logits/rejected": 0.9997223615646362, "logps/chosen": -322.55755615234375, "logps/rejected": -1218.536865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.987421035766602, "rewards/margins": 70.57664489746094, "rewards/rejected": -84.56407165527344, "step": 8640 }, { "epoch": 0.865, "grad_norm": 0.0, "learning_rate": 2.7248368952908055e-07, "logits/chosen": -0.34666958451271057, "logits/rejected": 0.5319896340370178, "logps/chosen": -393.0049743652344, "logps/rejected": -1030.311767578125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -15.975300788879395, "rewards/margins": 60.349082946777344, "rewards/rejected": -76.32437896728516, "step": 8650 }, { "epoch": 0.866, "grad_norm": 0.0, "learning_rate": 2.6853545386968607e-07, "logits/chosen": -0.4963590204715729, "logits/rejected": 0.9394834637641907, "logps/chosen": -427.63958740234375, "logps/rejected": -1305.783447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.72681427001953, "rewards/margins": 79.85389709472656, "rewards/rejected": -97.58070373535156, "step": 8660 }, { "epoch": 0.867, "grad_norm": 0.0, "learning_rate": 2.6461440790157974e-07, "logits/chosen": -0.7823432683944702, "logits/rejected": 0.6521458625793457, "logps/chosen": -349.2547607421875, "logps/rejected": -1170.160400390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.58772850036621, "rewards/margins": 69.0348892211914, "rewards/rejected": -86.62260437011719, "step": 8670 }, { "epoch": 0.868, "grad_norm": 0.0, "learning_rate": 2.6072059940146775e-07, "logits/chosen": -1.124406099319458, "logits/rejected": 0.8629137277603149, "logps/chosen": -329.8213806152344, "logps/rejected": -1356.882080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.49382495880127, "rewards/margins": 81.21879577636719, "rewards/rejected": -96.7126235961914, "step": 8680 }, { "epoch": 0.869, "grad_norm": 0.0, "learning_rate": 2.568540758141791e-07, "logits/chosen": -1.092452883720398, "logits/rejected": 0.8541895747184753, "logps/chosen": -219.02163696289062, "logps/rejected": -1051.510986328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.051870346069336, "rewards/margins": 67.15811157226562, "rewards/rejected": -79.20997619628906, "step": 8690 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 2.53014884252083e-07, "logits/chosen": -0.841894268989563, "logits/rejected": 0.8022940754890442, "logps/chosen": -382.36529541015625, "logps/rejected": -1325.561279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.09293556213379, "rewards/margins": 80.35551452636719, "rewards/rejected": -97.44845581054688, "step": 8700 }, { "epoch": 0.871, "grad_norm": 5.792997215066887e-20, "learning_rate": 2.492030714945162e-07, "logits/chosen": -0.6809241771697998, "logits/rejected": 0.5929954648017883, "logps/chosen": -248.1226043701172, "logps/rejected": -1119.2655029296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.076974868774414, "rewards/margins": 76.9704360961914, "rewards/rejected": -88.04740905761719, "step": 8710 }, { "epoch": 0.872, "grad_norm": 1.9078297465225963e-16, "learning_rate": 2.454186839872158e-07, "logits/chosen": -0.5456374883651733, "logits/rejected": 0.8816617727279663, "logps/chosen": -416.0848083496094, "logps/rejected": -1127.365966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.77010726928711, "rewards/margins": 67.05216979980469, "rewards/rejected": -83.822265625, "step": 8720 }, { "epoch": 0.873, "grad_norm": 0.0, "learning_rate": 2.416617678417482e-07, "logits/chosen": -1.4353996515274048, "logits/rejected": 0.6446617841720581, "logps/chosen": -272.59814453125, "logps/rejected": -1227.3382568359375, "loss": 0.4983, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.279558181762695, "rewards/margins": 69.13825225830078, "rewards/rejected": -82.41781616210938, "step": 8730 }, { "epoch": 0.874, "grad_norm": 0.0, "learning_rate": 2.3793236883495164e-07, "logits/chosen": -0.7536391019821167, "logits/rejected": 0.7938761711120605, "logps/chosen": -371.52349853515625, "logps/rejected": -1249.294677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.000296592712402, "rewards/margins": 82.83332061767578, "rewards/rejected": -91.83361053466797, "step": 8740 }, { "epoch": 0.875, "grad_norm": 0.0, "learning_rate": 2.3423053240837518e-07, "logits/chosen": -0.8754490613937378, "logits/rejected": 0.880510151386261, "logps/chosen": -293.3321228027344, "logps/rejected": -993.5623168945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.815601348876953, "rewards/margins": 66.59302520751953, "rewards/rejected": -75.40861511230469, "step": 8750 }, { "epoch": 0.876, "grad_norm": 0.0, "learning_rate": 2.3055630366772857e-07, "logits/chosen": -0.5658458471298218, "logits/rejected": 0.7149707078933716, "logps/chosen": -543.1046752929688, "logps/rejected": -1117.0491943359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.547900199890137, "rewards/margins": 62.61817169189453, "rewards/rejected": -76.16607666015625, "step": 8760 }, { "epoch": 0.877, "grad_norm": 0.0, "learning_rate": 2.269097273823287e-07, "logits/chosen": -0.766351044178009, "logits/rejected": 0.5960129499435425, "logps/chosen": -310.980224609375, "logps/rejected": -1305.391357421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.58404541015625, "rewards/margins": 84.26557922363281, "rewards/rejected": -98.84963989257812, "step": 8770 }, { "epoch": 0.878, "grad_norm": 5.707560185288728e-19, "learning_rate": 2.2329084798455747e-07, "logits/chosen": -0.7957559823989868, "logits/rejected": 0.573180079460144, "logps/chosen": -346.6117248535156, "logps/rejected": -1181.4512939453125, "loss": 0.7611, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.210107803344727, "rewards/margins": 76.22367858886719, "rewards/rejected": -89.43378448486328, "step": 8780 }, { "epoch": 0.879, "grad_norm": 2.2693656578486737e-13, "learning_rate": 2.1969970956931762e-07, "logits/chosen": -1.0760763883590698, "logits/rejected": 0.9111539125442505, "logps/chosen": -238.1721649169922, "logps/rejected": -1140.282958984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.755216598510742, "rewards/margins": 67.73008728027344, "rewards/rejected": -80.48530578613281, "step": 8790 }, { "epoch": 0.88, "grad_norm": 0.0, "learning_rate": 2.1613635589349756e-07, "logits/chosen": -0.5612384080886841, "logits/rejected": 0.535578727722168, "logps/chosen": -456.2445373535156, "logps/rejected": -1187.373779296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -23.860172271728516, "rewards/margins": 63.09345626831055, "rewards/rejected": -86.9536361694336, "step": 8800 }, { "epoch": 0.881, "grad_norm": 0.0, "learning_rate": 2.1260083037543817e-07, "logits/chosen": -0.7693039178848267, "logits/rejected": 0.8871771097183228, "logps/chosen": -213.7225341796875, "logps/rejected": -999.7437744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.34305477142334, "rewards/margins": 63.86836624145508, "rewards/rejected": -73.21142578125, "step": 8810 }, { "epoch": 0.882, "grad_norm": 0.0, "learning_rate": 2.0909317609440093e-07, "logits/chosen": -0.8072845339775085, "logits/rejected": 1.0142757892608643, "logps/chosen": -285.8123474121094, "logps/rejected": -1325.4332275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.264997482299805, "rewards/margins": 89.09748840332031, "rewards/rejected": -101.36248779296875, "step": 8820 }, { "epoch": 0.883, "grad_norm": 1.6345581241948418e-15, "learning_rate": 2.0561343579004716e-07, "logits/chosen": -0.7068424820899963, "logits/rejected": 0.4062952399253845, "logps/chosen": -512.4677734375, "logps/rejected": -1109.9837646484375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -12.541139602661133, "rewards/margins": 57.4853630065918, "rewards/rejected": -70.02650451660156, "step": 8830 }, { "epoch": 0.884, "grad_norm": 5.385317886075214e-11, "learning_rate": 2.0216165186191406e-07, "logits/chosen": -0.7862073183059692, "logits/rejected": 0.6577833890914917, "logps/chosen": -203.6630859375, "logps/rejected": -947.8547973632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.80967903137207, "rewards/margins": 59.02329635620117, "rewards/rejected": -68.8329849243164, "step": 8840 }, { "epoch": 0.885, "grad_norm": 0.0, "learning_rate": 1.9873786636889908e-07, "logits/chosen": -0.8603401184082031, "logits/rejected": 0.7639999985694885, "logps/chosen": -337.18145751953125, "logps/rejected": -1197.7193603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.979072570800781, "rewards/margins": 70.65501403808594, "rewards/rejected": -84.63409423828125, "step": 8850 }, { "epoch": 0.886, "grad_norm": 5.867188956898417e-10, "learning_rate": 1.95342121028749e-07, "logits/chosen": -0.3125002980232239, "logits/rejected": -0.0011765360832214355, "logps/chosen": -694.6043090820312, "logps/rejected": -1111.1990966796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.125167846679688, "rewards/margins": 62.53199005126953, "rewards/rejected": -76.65715026855469, "step": 8860 }, { "epoch": 0.887, "grad_norm": 0.000368534674635157, "learning_rate": 1.9197445721754777e-07, "logits/chosen": -1.0669571161270142, "logits/rejected": 0.7461029887199402, "logps/chosen": -293.17388916015625, "logps/rejected": -1265.558349609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.657773971557617, "rewards/margins": 76.96913146972656, "rewards/rejected": -89.62691497802734, "step": 8870 }, { "epoch": 0.888, "grad_norm": 6.757189956331666e-20, "learning_rate": 1.8863491596921745e-07, "logits/chosen": -0.6019073128700256, "logits/rejected": 0.6984752416610718, "logps/chosen": -375.70751953125, "logps/rejected": -1243.9033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.6553955078125, "rewards/margins": 80.62157440185547, "rewards/rejected": -96.27696990966797, "step": 8880 }, { "epoch": 0.889, "grad_norm": 0.0, "learning_rate": 1.8532353797501318e-07, "logits/chosen": -0.36620140075683594, "logits/rejected": 0.5800802111625671, "logps/chosen": -458.1512756347656, "logps/rejected": -996.4847412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.343849182128906, "rewards/margins": 56.926979064941406, "rewards/rejected": -71.27082824707031, "step": 8890 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 1.8204036358303173e-07, "logits/chosen": -0.6936923265457153, "logits/rejected": 0.49889129400253296, "logps/chosen": -288.4562683105469, "logps/rejected": -958.9161987304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.709049224853516, "rewards/margins": 58.544578552246094, "rewards/rejected": -75.25361633300781, "step": 8900 }, { "epoch": 0.891, "grad_norm": 0.0, "learning_rate": 1.787854327977162e-07, "logits/chosen": -0.9275741577148438, "logits/rejected": 0.6485947370529175, "logps/chosen": -330.7060546875, "logps/rejected": -1296.8360595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.509262084960938, "rewards/margins": 77.84185791015625, "rewards/rejected": -93.35111236572266, "step": 8910 }, { "epoch": 0.892, "grad_norm": 0.0, "learning_rate": 1.7555878527937164e-07, "logits/chosen": -1.2011134624481201, "logits/rejected": 0.6075594425201416, "logps/chosen": -312.7022705078125, "logps/rejected": -1189.131103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.9640474319458, "rewards/margins": 70.05687713623047, "rewards/rejected": -83.02093505859375, "step": 8920 }, { "epoch": 0.893, "grad_norm": 7.314049539630663e-14, "learning_rate": 1.7236046034367959e-07, "logits/chosen": -0.8679525256156921, "logits/rejected": 0.5001112222671509, "logps/chosen": -436.89337158203125, "logps/rejected": -974.1031494140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.955209732055664, "rewards/margins": 54.162025451660156, "rewards/rejected": -68.11723327636719, "step": 8930 }, { "epoch": 0.894, "grad_norm": 6.464930812910552e-09, "learning_rate": 1.6919049696121957e-07, "logits/chosen": -0.7242705225944519, "logits/rejected": 0.9949488639831543, "logps/chosen": -402.5562744140625, "logps/rejected": -1276.095947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.125904083251953, "rewards/margins": 72.64402770996094, "rewards/rejected": -88.76992797851562, "step": 8940 }, { "epoch": 0.895, "grad_norm": 0.0, "learning_rate": 1.6604893375699594e-07, "logits/chosen": -0.8151917457580566, "logits/rejected": 0.576383650302887, "logps/chosen": -461.36309814453125, "logps/rejected": -1293.8941650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.251383781433105, "rewards/margins": 72.68366241455078, "rewards/rejected": -87.93505096435547, "step": 8950 }, { "epoch": 0.896, "grad_norm": 1.5773185534310555e-18, "learning_rate": 1.629358090099639e-07, "logits/chosen": -0.7810664772987366, "logits/rejected": 0.6394819021224976, "logps/chosen": -414.66259765625, "logps/rejected": -1331.1300048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.047119140625, "rewards/margins": 71.95145416259766, "rewards/rejected": -88.99857330322266, "step": 8960 }, { "epoch": 0.897, "grad_norm": 0.0, "learning_rate": 1.5985116065256683e-07, "logits/chosen": -0.6492348909378052, "logits/rejected": 0.7947621941566467, "logps/chosen": -443.70635986328125, "logps/rejected": -1371.603271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.04248332977295, "rewards/margins": 90.69773864746094, "rewards/rejected": -104.740234375, "step": 8970 }, { "epoch": 0.898, "grad_norm": 1.7848679588031597e-16, "learning_rate": 1.567950262702714e-07, "logits/chosen": -0.4879019260406494, "logits/rejected": 0.915458083152771, "logps/chosen": -289.3120422363281, "logps/rejected": -1085.024658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.671075820922852, "rewards/margins": 72.11126708984375, "rewards/rejected": -85.78233337402344, "step": 8980 }, { "epoch": 0.899, "grad_norm": 0.0, "learning_rate": 1.5376744310111019e-07, "logits/chosen": -0.7717766761779785, "logits/rejected": 0.8272747993469238, "logps/chosen": -352.9714050292969, "logps/rejected": -1321.5255126953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.448582649230957, "rewards/margins": 82.11119079589844, "rewards/rejected": -96.55977630615234, "step": 8990 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 1.507684480352292e-07, "logits/chosen": -0.7919676303863525, "logits/rejected": 0.6880122423171997, "logps/chosen": -229.85098266601562, "logps/rejected": -1107.984130859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.7733154296875, "rewards/margins": 74.07725524902344, "rewards/rejected": -82.85057067871094, "step": 9000 }, { "epoch": 0.901, "grad_norm": 0.0, "learning_rate": 1.4779807761443638e-07, "logits/chosen": -0.7188352346420288, "logits/rejected": -0.2085111141204834, "logps/chosen": -403.20623779296875, "logps/rejected": -886.162109375, "loss": 0.6933, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.076815605163574, "rewards/margins": 49.329856872558594, "rewards/rejected": -62.40666961669922, "step": 9010 }, { "epoch": 0.902, "grad_norm": 0.0, "learning_rate": 1.4485636803175828e-07, "logits/chosen": -1.0808542966842651, "logits/rejected": 0.4395454525947571, "logps/chosen": -299.3458557128906, "logps/rejected": -1083.4163818359375, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -11.202362060546875, "rewards/margins": 69.3813705444336, "rewards/rejected": -80.58373260498047, "step": 9020 }, { "epoch": 0.903, "grad_norm": 0.0, "learning_rate": 1.419433551309976e-07, "logits/chosen": -0.831488311290741, "logits/rejected": 0.843484103679657, "logps/chosen": -359.81561279296875, "logps/rejected": -1077.2774658203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.43194580078125, "rewards/margins": 61.063232421875, "rewards/rejected": -74.49516296386719, "step": 9030 }, { "epoch": 0.904, "grad_norm": 2.340053706362255e-10, "learning_rate": 1.3905907440629752e-07, "logits/chosen": -0.6054459810256958, "logits/rejected": 0.43269747495651245, "logps/chosen": -384.861083984375, "logps/rejected": -831.53076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.977055549621582, "rewards/margins": 45.349388122558594, "rewards/rejected": -60.326438903808594, "step": 9040 }, { "epoch": 0.905, "grad_norm": 3.8065961743351227e-17, "learning_rate": 1.362035610017079e-07, "logits/chosen": -0.4957023561000824, "logits/rejected": 0.2642466425895691, "logps/chosen": -495.8026428222656, "logps/rejected": -1130.28759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.497753143310547, "rewards/margins": 60.88861846923828, "rewards/rejected": -77.3863754272461, "step": 9050 }, { "epoch": 0.906, "grad_norm": 2.3743223557877146e-17, "learning_rate": 1.3337684971075932e-07, "logits/chosen": -0.7933815717697144, "logits/rejected": 0.6370527744293213, "logps/chosen": -235.04129028320312, "logps/rejected": -1101.3182373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.383734703063965, "rewards/margins": 78.91749572753906, "rewards/rejected": -91.30121612548828, "step": 9060 }, { "epoch": 0.907, "grad_norm": 0.0, "learning_rate": 1.305789749760361e-07, "logits/chosen": -0.6855477094650269, "logits/rejected": 0.779290497303009, "logps/chosen": -388.5061950683594, "logps/rejected": -1220.8798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.840558052062988, "rewards/margins": 76.36566162109375, "rewards/rejected": -88.20621490478516, "step": 9070 }, { "epoch": 0.908, "grad_norm": 1.1141914078643608e-13, "learning_rate": 1.278099708887587e-07, "logits/chosen": -0.7415876984596252, "logits/rejected": 0.4227770268917084, "logps/chosen": -351.1391296386719, "logps/rejected": -1024.048095703125, "loss": 0.1366, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.448495864868164, "rewards/margins": 56.8670768737793, "rewards/rejected": -70.3155746459961, "step": 9080 }, { "epoch": 0.909, "grad_norm": 0.0, "learning_rate": 1.2506987118836912e-07, "logits/chosen": -0.6299499273300171, "logits/rejected": 0.7337206602096558, "logps/chosen": -351.2576599121094, "logps/rejected": -1170.62255859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.964860916137695, "rewards/margins": 70.13188934326172, "rewards/rejected": -85.09674835205078, "step": 9090 }, { "epoch": 0.91, "grad_norm": 7.363024984640906e-14, "learning_rate": 1.223587092621162e-07, "logits/chosen": -0.21692593395709991, "logits/rejected": 0.2910541892051697, "logps/chosen": -529.19873046875, "logps/rejected": -1077.606201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.683971405029297, "rewards/margins": 62.53369140625, "rewards/rejected": -82.21766662597656, "step": 9100 }, { "epoch": 0.911, "grad_norm": 3.743392066509216e-23, "learning_rate": 1.1967651814465353e-07, "logits/chosen": -0.5983961820602417, "logits/rejected": 0.39085835218429565, "logps/chosen": -580.361572265625, "logps/rejected": -1208.87451171875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -15.13195514678955, "rewards/margins": 64.27543640136719, "rewards/rejected": -79.40740203857422, "step": 9110 }, { "epoch": 0.912, "grad_norm": 5.2141750120031276e-17, "learning_rate": 1.1702333051763271e-07, "logits/chosen": -0.702211856842041, "logits/rejected": 0.6391454935073853, "logps/chosen": -380.1633605957031, "logps/rejected": -1212.1474609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.09587574005127, "rewards/margins": 75.67768096923828, "rewards/rejected": -84.77355194091797, "step": 9120 }, { "epoch": 0.913, "grad_norm": 0.0, "learning_rate": 1.1439917870930795e-07, "logits/chosen": -0.5169418454170227, "logits/rejected": 0.09019921720027924, "logps/chosen": -513.6130981445312, "logps/rejected": -1079.996337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.596075057983398, "rewards/margins": 60.556312561035156, "rewards/rejected": -75.15238952636719, "step": 9130 }, { "epoch": 0.914, "grad_norm": 0.0, "learning_rate": 1.1180409469414094e-07, "logits/chosen": -0.6889699697494507, "logits/rejected": 0.548774778842926, "logps/chosen": -328.4002685546875, "logps/rejected": -948.4647216796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.28093147277832, "rewards/margins": 53.268798828125, "rewards/rejected": -65.54973602294922, "step": 9140 }, { "epoch": 0.915, "grad_norm": 0.0, "learning_rate": 1.0923811009241142e-07, "logits/chosen": -0.7122704386711121, "logits/rejected": 0.8708732724189758, "logps/chosen": -335.0247497558594, "logps/rejected": -1252.97265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.75001049041748, "rewards/margins": 78.19270324707031, "rewards/rejected": -89.94271850585938, "step": 9150 }, { "epoch": 0.916, "grad_norm": 3.938313189073678e-18, "learning_rate": 1.067012561698319e-07, "logits/chosen": -0.7977027297019958, "logits/rejected": 0.4715547561645508, "logps/chosen": -382.7290344238281, "logps/rejected": -1013.48046875, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": -15.897479057312012, "rewards/margins": 54.91753005981445, "rewards/rejected": -70.81500244140625, "step": 9160 }, { "epoch": 0.917, "grad_norm": 0.0, "learning_rate": 1.041935638371669e-07, "logits/chosen": -0.5972322225570679, "logits/rejected": 0.8759559392929077, "logps/chosen": -480.037353515625, "logps/rejected": -1522.079345703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.036136627197266, "rewards/margins": 93.04458618164062, "rewards/rejected": -112.0807113647461, "step": 9170 }, { "epoch": 0.918, "grad_norm": 0.0, "learning_rate": 1.0171506364985622e-07, "logits/chosen": -0.6566218733787537, "logits/rejected": 0.7978938817977905, "logps/chosen": -257.0160827636719, "logps/rejected": -1240.74462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.77754020690918, "rewards/margins": 84.71733093261719, "rewards/rejected": -99.49488067626953, "step": 9180 }, { "epoch": 0.919, "grad_norm": 0.0, "learning_rate": 9.926578580764234e-08, "logits/chosen": -0.6492460370063782, "logits/rejected": 0.5774198770523071, "logps/chosen": -416.5879821777344, "logps/rejected": -1290.6796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.999757766723633, "rewards/margins": 77.02781677246094, "rewards/rejected": -93.02757263183594, "step": 9190 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 9.684576015420277e-08, "logits/chosen": -0.8115363121032715, "logits/rejected": 0.26468801498413086, "logps/chosen": -296.90667724609375, "logps/rejected": -1148.5294189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.467525482177734, "rewards/margins": 76.24222564697266, "rewards/rejected": -88.70976257324219, "step": 9200 }, { "epoch": 0.921, "grad_norm": 0.0, "learning_rate": 9.445501617678654e-08, "logits/chosen": -0.8983446955680847, "logits/rejected": 0.6710523962974548, "logps/chosen": -518.798828125, "logps/rejected": -1462.599365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.77184009552002, "rewards/margins": 87.92976379394531, "rewards/rejected": -100.70159912109375, "step": 9210 }, { "epoch": 0.922, "grad_norm": 0.0, "learning_rate": 9.209358300585474e-08, "logits/chosen": -0.9570168256759644, "logits/rejected": 0.8016937375068665, "logps/chosen": -448.8213806152344, "logps/rejected": -1629.7894287109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.019401550292969, "rewards/margins": 96.5383529663086, "rewards/rejected": -111.55775451660156, "step": 9220 }, { "epoch": 0.923, "grad_norm": 0.0, "learning_rate": 8.9761489414725e-08, "logits/chosen": -0.8167027235031128, "logits/rejected": 0.7627574801445007, "logps/chosen": -385.311767578125, "logps/rejected": -1270.958251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.449134826660156, "rewards/margins": 77.73607635498047, "rewards/rejected": -91.18521118164062, "step": 9230 }, { "epoch": 0.924, "grad_norm": 3.164236744665942e-19, "learning_rate": 8.745876381922147e-08, "logits/chosen": -0.8649279475212097, "logits/rejected": 0.11709287017583847, "logps/chosen": -489.70123291015625, "logps/rejected": -1005.6638793945312, "loss": 0.8828, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -17.700468063354492, "rewards/margins": 52.23369216918945, "rewards/rejected": -69.93415832519531, "step": 9240 }, { "epoch": 0.925, "grad_norm": 0.0, "learning_rate": 8.518543427732951e-08, "logits/chosen": -1.1081702709197998, "logits/rejected": 0.8128757476806641, "logps/chosen": -323.5766296386719, "logps/rejected": -1218.4962158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.58479118347168, "rewards/margins": 69.68755340576172, "rewards/rejected": -82.27234649658203, "step": 9250 }, { "epoch": 0.926, "grad_norm": 0.0, "learning_rate": 8.294152848885156e-08, "logits/chosen": -0.5787280201911926, "logits/rejected": 0.5253779888153076, "logps/chosen": -272.241943359375, "logps/rejected": -1050.86962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.380825996398926, "rewards/margins": 66.82584381103516, "rewards/rejected": -81.2066650390625, "step": 9260 }, { "epoch": 0.927, "grad_norm": 1.3496992307596107e-22, "learning_rate": 8.072707379507217e-08, "logits/chosen": -0.6485394239425659, "logits/rejected": 0.680305004119873, "logps/chosen": -477.92919921875, "logps/rejected": -1094.7174072265625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -11.70506477355957, "rewards/margins": 63.1876106262207, "rewards/rejected": -74.8926773071289, "step": 9270 }, { "epoch": 0.928, "grad_norm": 0.0, "learning_rate": 7.854209717842231e-08, "logits/chosen": -1.0379279851913452, "logits/rejected": 0.70207679271698, "logps/chosen": -348.22869873046875, "logps/rejected": -1431.361083984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.762044906616211, "rewards/margins": 92.30265808105469, "rewards/rejected": -107.06471252441406, "step": 9280 }, { "epoch": 0.929, "grad_norm": 0.0, "learning_rate": 7.638662526215284e-08, "logits/chosen": -0.8047307729721069, "logits/rejected": 0.7297149896621704, "logps/chosen": -355.40277099609375, "logps/rejected": -1222.614990234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.765396118164062, "rewards/margins": 76.424560546875, "rewards/rejected": -89.18995666503906, "step": 9290 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 7.426068431000883e-08, "logits/chosen": -0.7463569641113281, "logits/rejected": 0.5908794403076172, "logps/chosen": -397.95501708984375, "logps/rejected": -1326.8583984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.535120010375977, "rewards/margins": 80.03962707519531, "rewards/rejected": -93.57474517822266, "step": 9300 }, { "epoch": 0.931, "grad_norm": 4.476295639648952e-20, "learning_rate": 7.216430022591009e-08, "logits/chosen": -0.44624462723731995, "logits/rejected": 0.4880369305610657, "logps/chosen": -513.1954956054688, "logps/rejected": -1160.2886962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.831964492797852, "rewards/margins": 70.88438415527344, "rewards/rejected": -81.71633911132812, "step": 9310 }, { "epoch": 0.932, "grad_norm": 0.0, "learning_rate": 7.009749855363457e-08, "logits/chosen": -1.1946498155593872, "logits/rejected": 0.6728850603103638, "logps/chosen": -326.53936767578125, "logps/rejected": -1230.041748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.130026817321777, "rewards/margins": 72.26325225830078, "rewards/rejected": -83.39328002929688, "step": 9320 }, { "epoch": 0.933, "grad_norm": 0.0, "learning_rate": 6.806030447650879e-08, "logits/chosen": -0.31252819299697876, "logits/rejected": 0.6711708307266235, "logps/chosen": -412.03631591796875, "logps/rejected": -1164.569580078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.800756454467773, "rewards/margins": 69.16845703125, "rewards/rejected": -83.96920776367188, "step": 9330 }, { "epoch": 0.934, "grad_norm": 0.00047836932935751975, "learning_rate": 6.605274281709929e-08, "logits/chosen": -0.17928871512413025, "logits/rejected": 0.322933167219162, "logps/chosen": -420.1446838378906, "logps/rejected": -962.8092651367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.62596893310547, "rewards/margins": 52.55628204345703, "rewards/rejected": -69.1822509765625, "step": 9340 }, { "epoch": 0.935, "grad_norm": 0.0, "learning_rate": 6.407483803691216e-08, "logits/chosen": -1.3535900115966797, "logits/rejected": 0.7449843287467957, "logps/chosen": -324.685546875, "logps/rejected": -1163.7113037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.4141263961792, "rewards/margins": 67.3392333984375, "rewards/rejected": -80.75336456298828, "step": 9350 }, { "epoch": 0.936, "grad_norm": 5.005370894650696e-06, "learning_rate": 6.212661423609184e-08, "logits/chosen": -0.7937058806419373, "logits/rejected": 0.8053327798843384, "logps/chosen": -362.6256103515625, "logps/rejected": -1156.7620849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.173696517944336, "rewards/margins": 70.54964447021484, "rewards/rejected": -83.72335052490234, "step": 9360 }, { "epoch": 0.937, "grad_norm": 0.0, "learning_rate": 6.020809515313141e-08, "logits/chosen": -0.5480384230613708, "logits/rejected": 0.6836110949516296, "logps/chosen": -542.61962890625, "logps/rejected": -1206.66552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.34486961364746, "rewards/margins": 60.3333854675293, "rewards/rejected": -79.67825317382812, "step": 9370 }, { "epoch": 0.938, "grad_norm": 0.0, "learning_rate": 5.83193041645802e-08, "logits/chosen": -0.6533951759338379, "logits/rejected": 0.6204730868339539, "logps/chosen": -371.08489990234375, "logps/rejected": -1113.1427001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.826315879821777, "rewards/margins": 66.26902770996094, "rewards/rejected": -78.09534454345703, "step": 9380 }, { "epoch": 0.939, "grad_norm": 0.0, "learning_rate": 5.6460264284760316e-08, "logits/chosen": -0.37336036562919617, "logits/rejected": 1.1612873077392578, "logps/chosen": -526.3222045898438, "logps/rejected": -1342.774169921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.058244705200195, "rewards/margins": 78.78479766845703, "rewards/rejected": -100.84303283691406, "step": 9390 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 5.463099816548578e-08, "logits/chosen": -0.535962700843811, "logits/rejected": 0.4035312533378601, "logps/chosen": -367.12335205078125, "logps/rejected": -1092.9337158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.50265121459961, "rewards/margins": 63.38078689575195, "rewards/rejected": -80.88343048095703, "step": 9400 }, { "epoch": 0.941, "grad_norm": 4.522228316673297e-21, "learning_rate": 5.283152809578751e-08, "logits/chosen": -1.1403727531433105, "logits/rejected": 0.8676480054855347, "logps/chosen": -311.7748107910156, "logps/rejected": -1319.3763427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.405827522277832, "rewards/margins": 82.8596420288086, "rewards/rejected": -97.26547241210938, "step": 9410 }, { "epoch": 0.942, "grad_norm": 0.0, "learning_rate": 5.106187600163987e-08, "logits/chosen": -0.6772249341011047, "logits/rejected": 0.4636703133583069, "logps/chosen": -499.16021728515625, "logps/rejected": -1222.0574951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.858734130859375, "rewards/margins": 63.798866271972656, "rewards/rejected": -84.65760040283203, "step": 9420 }, { "epoch": 0.943, "grad_norm": 0.0, "learning_rate": 4.932206344569562e-08, "logits/chosen": -1.084229826927185, "logits/rejected": 0.500342071056366, "logps/chosen": -337.5640563964844, "logps/rejected": -1246.141357421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -17.161239624023438, "rewards/margins": 69.31696319580078, "rewards/rejected": -86.47819519042969, "step": 9430 }, { "epoch": 0.944, "grad_norm": 0.0, "learning_rate": 4.761211162702117e-08, "logits/chosen": -0.7408514618873596, "logits/rejected": 0.5778809785842896, "logps/chosen": -296.14483642578125, "logps/rejected": -1266.046142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.5400447845459, "rewards/margins": 83.34046173095703, "rewards/rejected": -99.88050842285156, "step": 9440 }, { "epoch": 0.945, "grad_norm": 1.6371098564433791e-18, "learning_rate": 4.593204138084006e-08, "logits/chosen": -0.6246211528778076, "logits/rejected": 0.469452440738678, "logps/chosen": -369.33599853515625, "logps/rejected": -1046.853515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.994848251342773, "rewards/margins": 59.64809036254883, "rewards/rejected": -77.64293670654297, "step": 9450 }, { "epoch": 0.946, "grad_norm": 0.0, "learning_rate": 4.428187317827848e-08, "logits/chosen": -0.5617231726646423, "logits/rejected": 0.5635863542556763, "logps/chosen": -419.0165100097656, "logps/rejected": -1149.50341796875, "loss": 0.0865, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -20.520648956298828, "rewards/margins": 63.31085205078125, "rewards/rejected": -83.83150482177734, "step": 9460 }, { "epoch": 0.947, "grad_norm": 0.0, "learning_rate": 4.26616271261146e-08, "logits/chosen": -0.5824400782585144, "logits/rejected": 0.41373205184936523, "logps/chosen": -272.3680419921875, "logps/rejected": -848.8267822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.654070854187012, "rewards/margins": 44.72352981567383, "rewards/rejected": -60.377593994140625, "step": 9470 }, { "epoch": 0.948, "grad_norm": 1.2813891132657006e-15, "learning_rate": 4.1071322966535487e-08, "logits/chosen": -0.7137543559074402, "logits/rejected": 0.5696569085121155, "logps/chosen": -308.44830322265625, "logps/rejected": -1056.5924072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.067167282104492, "rewards/margins": 62.045066833496094, "rewards/rejected": -77.11224365234375, "step": 9480 }, { "epoch": 0.949, "grad_norm": 2.2281272382684847e-09, "learning_rate": 3.95109800768953e-08, "logits/chosen": -0.41896852850914, "logits/rejected": 0.4885942041873932, "logps/chosen": -361.94439697265625, "logps/rejected": -972.2320556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.41004753112793, "rewards/margins": 54.427696228027344, "rewards/rejected": -66.83775329589844, "step": 9490 }, { "epoch": 0.95, "grad_norm": 2.754418360062516e-10, "learning_rate": 3.798061746947995e-08, "logits/chosen": -0.7670290470123291, "logits/rejected": 0.12368150055408478, "logps/chosen": -377.0394287109375, "logps/rejected": -937.1624755859375, "loss": 0.1833, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -15.164652824401855, "rewards/margins": 53.78802490234375, "rewards/rejected": -68.95268249511719, "step": 9500 }, { "epoch": 0.951, "grad_norm": 0.0, "learning_rate": 3.648025379127479e-08, "logits/chosen": -0.6356409192085266, "logits/rejected": 0.4663293957710266, "logps/chosen": -452.65240478515625, "logps/rejected": -1188.810791015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.409733772277832, "rewards/margins": 72.15763854980469, "rewards/rejected": -83.56737518310547, "step": 9510 }, { "epoch": 0.952, "grad_norm": 0.0, "learning_rate": 3.5009907323737826e-08, "logits/chosen": -0.8558729887008667, "logits/rejected": 0.4594835340976715, "logps/chosen": -386.46319580078125, "logps/rejected": -1148.697998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.500055313110352, "rewards/margins": 66.51141357421875, "rewards/rejected": -79.01146697998047, "step": 9520 }, { "epoch": 0.953, "grad_norm": 0.0, "learning_rate": 3.3569595982576584e-08, "logits/chosen": -0.13464701175689697, "logits/rejected": 0.6260603666305542, "logps/chosen": -449.32867431640625, "logps/rejected": -979.0067138671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.389322280883789, "rewards/margins": 54.79975509643555, "rewards/rejected": -67.18907165527344, "step": 9530 }, { "epoch": 0.954, "grad_norm": 0.0, "learning_rate": 3.2159337317530234e-08, "logits/chosen": -0.9340829849243164, "logits/rejected": 0.6286576986312866, "logps/chosen": -289.25457763671875, "logps/rejected": -977.1263427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.772436141967773, "rewards/margins": 60.88823699951172, "rewards/rejected": -71.66067504882812, "step": 9540 }, { "epoch": 0.955, "grad_norm": 0.0, "learning_rate": 3.077914851215585e-08, "logits/chosen": -0.2953462600708008, "logits/rejected": 0.702400803565979, "logps/chosen": -394.4523010253906, "logps/rejected": -1066.02001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.709041595458984, "rewards/margins": 65.71452331542969, "rewards/rejected": -82.4235610961914, "step": 9550 }, { "epoch": 0.956, "grad_norm": 0.0, "learning_rate": 2.9429046383618042e-08, "logits/chosen": -0.6391351222991943, "logits/rejected": 0.7442789673805237, "logps/chosen": -252.177490234375, "logps/rejected": -1095.124267578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.858415603637695, "rewards/margins": 69.86174774169922, "rewards/rejected": -80.72015380859375, "step": 9560 }, { "epoch": 0.957, "grad_norm": 0.0, "learning_rate": 2.810904738248549e-08, "logits/chosen": -0.6452876329421997, "logits/rejected": 0.4415220618247986, "logps/chosen": -365.7802429199219, "logps/rejected": -1228.2845458984375, "loss": 0.2902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.759931564331055, "rewards/margins": 76.06869506835938, "rewards/rejected": -89.82862091064453, "step": 9570 }, { "epoch": 0.958, "grad_norm": 0.0, "learning_rate": 2.681916759252917e-08, "logits/chosen": -0.8312497138977051, "logits/rejected": 0.8043157458305359, "logps/chosen": -297.5036926269531, "logps/rejected": -1181.446044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.166455268859863, "rewards/margins": 72.9308853149414, "rewards/rejected": -87.09734344482422, "step": 9580 }, { "epoch": 0.959, "grad_norm": 8.871064730355953e-17, "learning_rate": 2.555942273052753e-08, "logits/chosen": -0.5505019426345825, "logits/rejected": 0.7471826076507568, "logps/chosen": -363.4208679199219, "logps/rejected": -1077.77783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.896112442016602, "rewards/margins": 64.6919174194336, "rewards/rejected": -77.58802032470703, "step": 9590 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 2.4329828146074096e-08, "logits/chosen": -0.6976087689399719, "logits/rejected": 0.6933576464653015, "logps/chosen": -284.07476806640625, "logps/rejected": -1206.358642578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.92124080657959, "rewards/margins": 80.40800476074219, "rewards/rejected": -91.32923889160156, "step": 9600 }, { "epoch": 0.961, "grad_norm": 1.819727244059747e-18, "learning_rate": 2.313039882139101e-08, "logits/chosen": -1.001491904258728, "logits/rejected": 0.6855885982513428, "logps/chosen": -151.57177734375, "logps/rejected": -759.6219482421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -6.934805870056152, "rewards/margins": 45.081241607666016, "rewards/rejected": -52.01605224609375, "step": 9610 }, { "epoch": 0.962, "grad_norm": 0.0, "learning_rate": 2.1961149371145795e-08, "logits/chosen": -0.08522322028875351, "logits/rejected": 0.5817008018493652, "logps/chosen": -430.6034240722656, "logps/rejected": -980.8968505859375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -17.8110294342041, "rewards/margins": 53.823448181152344, "rewards/rejected": -71.63447570800781, "step": 9620 }, { "epoch": 0.963, "grad_norm": 0.0, "learning_rate": 2.082209404227403e-08, "logits/chosen": -0.5273114442825317, "logits/rejected": 0.6189590692520142, "logps/chosen": -364.1752014160156, "logps/rejected": -1239.5257568359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.691871643066406, "rewards/margins": 82.79158020019531, "rewards/rejected": -94.48345947265625, "step": 9630 }, { "epoch": 0.964, "grad_norm": 0.0, "learning_rate": 1.9713246713805588e-08, "logits/chosen": -0.6452358365058899, "logits/rejected": 0.6586201190948486, "logps/chosen": -361.86376953125, "logps/rejected": -1184.059326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.489391326904297, "rewards/margins": 72.41313171386719, "rewards/rejected": -88.90251159667969, "step": 9640 }, { "epoch": 0.965, "grad_norm": 5.003034098116643e-10, "learning_rate": 1.8634620896695044e-08, "logits/chosen": -0.3010661005973816, "logits/rejected": 0.596214234828949, "logps/chosen": -331.79730224609375, "logps/rejected": -967.5947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -19.522335052490234, "rewards/margins": 57.9662971496582, "rewards/rejected": -77.48863220214844, "step": 9650 }, { "epoch": 0.966, "grad_norm": 0.0, "learning_rate": 1.7586229733657646e-08, "logits/chosen": -0.9829071760177612, "logits/rejected": 0.6873185634613037, "logps/chosen": -311.52313232421875, "logps/rejected": -1215.0775146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.806241989135742, "rewards/margins": 69.84815979003906, "rewards/rejected": -85.65440368652344, "step": 9660 }, { "epoch": 0.967, "grad_norm": 0.0, "learning_rate": 1.6568085999008886e-08, "logits/chosen": -0.9289532899856567, "logits/rejected": 0.6419546604156494, "logps/chosen": -428.91400146484375, "logps/rejected": -1263.3416748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.84474754333496, "rewards/margins": 70.19862365722656, "rewards/rejected": -87.04336547851562, "step": 9670 }, { "epoch": 0.968, "grad_norm": 0.0, "learning_rate": 1.5580202098509078e-08, "logits/chosen": -0.8172246813774109, "logits/rejected": 0.6110397577285767, "logps/chosen": -309.43719482421875, "logps/rejected": -1129.2818603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -11.678646087646484, "rewards/margins": 72.56365966796875, "rewards/rejected": -84.24231719970703, "step": 9680 }, { "epoch": 0.969, "grad_norm": 0.0, "learning_rate": 1.4622590069211517e-08, "logits/chosen": -0.9688760042190552, "logits/rejected": 0.6006779074668884, "logps/chosen": -274.1756591796875, "logps/rejected": -1105.856201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -9.810132026672363, "rewards/margins": 67.08036804199219, "rewards/rejected": -76.89048767089844, "step": 9690 }, { "epoch": 0.97, "grad_norm": 14.400453567504883, "learning_rate": 1.3695261579316776e-08, "logits/chosen": -0.7687379121780396, "logits/rejected": 0.6138051748275757, "logps/chosen": -404.05914306640625, "logps/rejected": -1301.695556640625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -14.873037338256836, "rewards/margins": 77.2698745727539, "rewards/rejected": -92.14290618896484, "step": 9700 }, { "epoch": 0.971, "grad_norm": 0.0, "learning_rate": 1.2798227928029483e-08, "logits/chosen": -0.6523748636245728, "logits/rejected": 0.47052305936813354, "logps/chosen": -480.67681884765625, "logps/rejected": -1377.9613037109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.083057403564453, "rewards/margins": 82.21842956542969, "rewards/rejected": -100.3014907836914, "step": 9710 }, { "epoch": 0.972, "grad_norm": 0.0, "learning_rate": 1.193150004542204e-08, "logits/chosen": -0.5860597491264343, "logits/rejected": 1.2617757320404053, "logps/chosen": -317.54974365234375, "logps/rejected": -1091.484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.348005294799805, "rewards/margins": 67.207275390625, "rewards/rejected": -83.5552749633789, "step": 9720 }, { "epoch": 0.973, "grad_norm": 0.0, "learning_rate": 1.109508849230001e-08, "logits/chosen": -0.4087650179862976, "logits/rejected": 0.43272989988327026, "logps/chosen": -370.06488037109375, "logps/rejected": -1022.3318481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.40913200378418, "rewards/margins": 60.79473876953125, "rewards/rejected": -74.20386505126953, "step": 9730 }, { "epoch": 0.974, "grad_norm": 0.0, "learning_rate": 1.0289003460074165e-08, "logits/chosen": -0.6892791986465454, "logits/rejected": 0.5428653359413147, "logps/chosen": -627.8890380859375, "logps/rejected": -1332.8411865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.19067668914795, "rewards/margins": 69.77383422851562, "rewards/rejected": -84.96450805664062, "step": 9740 }, { "epoch": 0.975, "grad_norm": 0.0, "learning_rate": 9.513254770636138e-09, "logits/chosen": -0.8619287610054016, "logits/rejected": 0.5513601899147034, "logps/chosen": -349.90069580078125, "logps/rejected": -1181.427001953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.454333305358887, "rewards/margins": 68.1052474975586, "rewards/rejected": -81.55958557128906, "step": 9750 }, { "epoch": 0.976, "grad_norm": 0.0, "learning_rate": 8.767851876239075e-09, "logits/chosen": -0.4201585352420807, "logits/rejected": 0.7723037004470825, "logps/chosen": -571.9500732421875, "logps/rejected": -1227.5380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.813634872436523, "rewards/margins": 63.92681121826172, "rewards/rejected": -82.74044799804688, "step": 9760 }, { "epoch": 0.977, "grad_norm": 0.0, "learning_rate": 8.052803859382174e-09, "logits/chosen": -0.6666244864463806, "logits/rejected": 0.718521773815155, "logps/chosen": -256.37188720703125, "logps/rejected": -1023.0693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.587654113769531, "rewards/margins": 69.85871124267578, "rewards/rejected": -80.44636535644531, "step": 9770 }, { "epoch": 0.978, "grad_norm": 5.396772849053377e-06, "learning_rate": 7.368119432699383e-09, "logits/chosen": -0.6672367453575134, "logits/rejected": 0.411163330078125, "logps/chosen": -361.1486511230469, "logps/rejected": -951.3388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.695712089538574, "rewards/margins": 52.358123779296875, "rewards/rejected": -67.05384063720703, "step": 9780 }, { "epoch": 0.979, "grad_norm": 4.77047155948609e-16, "learning_rate": 6.7138069388547614e-09, "logits/chosen": -1.2189319133758545, "logits/rejected": 0.6601986885070801, "logps/chosen": -294.63934326171875, "logps/rejected": -1052.442626953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -9.933565139770508, "rewards/margins": 63.85259246826172, "rewards/rejected": -73.7861557006836, "step": 9790 }, { "epoch": 0.98, "grad_norm": 8.221260281435824e-15, "learning_rate": 6.089874350439507e-09, "logits/chosen": -0.8612964749336243, "logits/rejected": 0.48667994141578674, "logps/chosen": -276.33428955078125, "logps/rejected": -1090.6768798828125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -10.823007583618164, "rewards/margins": 67.98606872558594, "rewards/rejected": -78.80908203125, "step": 9800 }, { "epoch": 0.981, "grad_norm": 8.672072620090093e-19, "learning_rate": 5.4963292698750896e-09, "logits/chosen": -0.344230979681015, "logits/rejected": -0.045682210475206375, "logps/chosen": -622.1007080078125, "logps/rejected": -998.2276611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -15.51715087890625, "rewards/margins": 42.443077087402344, "rewards/rejected": -57.960227966308594, "step": 9810 }, { "epoch": 0.982, "grad_norm": 3.64466545797206e-15, "learning_rate": 4.933178929321103e-09, "logits/chosen": -0.5035933256149292, "logits/rejected": 0.6916291117668152, "logps/chosen": -372.07379150390625, "logps/rejected": -1095.00732421875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -17.06197738647461, "rewards/margins": 60.83811569213867, "rewards/rejected": -77.90009307861328, "step": 9820 }, { "epoch": 0.983, "grad_norm": 0.0, "learning_rate": 4.400430190586724e-09, "logits/chosen": -0.6599819660186768, "logits/rejected": 0.5303076505661011, "logps/chosen": -485.1739807128906, "logps/rejected": -1263.5828857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.154559135437012, "rewards/margins": 69.51759338378906, "rewards/rejected": -83.67214965820312, "step": 9830 }, { "epoch": 0.984, "grad_norm": 3.5732080050365767e-07, "learning_rate": 3.8980895450474455e-09, "logits/chosen": -0.7814174294471741, "logits/rejected": 0.642800509929657, "logps/chosen": -427.40643310546875, "logps/rejected": -1239.892578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -16.07494354248047, "rewards/margins": 76.21549987792969, "rewards/rejected": -92.29043579101562, "step": 9840 }, { "epoch": 0.985, "grad_norm": 0.0, "learning_rate": 3.4261631135654174e-09, "logits/chosen": -0.8822159767150879, "logits/rejected": 0.41710543632507324, "logps/chosen": -504.44342041015625, "logps/rejected": -1304.072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.102876663208008, "rewards/margins": 72.31712341308594, "rewards/rejected": -86.41999816894531, "step": 9850 }, { "epoch": 0.986, "grad_norm": 0.0, "learning_rate": 2.984656646415063e-09, "logits/chosen": -0.38282960653305054, "logits/rejected": 0.1189902052283287, "logps/chosen": -471.0121154785156, "logps/rejected": -1003.5185546875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": -11.816166877746582, "rewards/margins": 56.61848831176758, "rewards/rejected": -68.43465423583984, "step": 9860 }, { "epoch": 0.987, "grad_norm": 6.396711269288306e-22, "learning_rate": 2.573575523213412e-09, "logits/chosen": -0.5950853824615479, "logits/rejected": 0.6149926781654358, "logps/chosen": -238.8068084716797, "logps/rejected": -932.0408935546875, "loss": 0.0283, "rewards/accuracies": 1.0, "rewards/chosen": -11.022882461547852, "rewards/margins": 59.13804244995117, "rewards/rejected": -70.16092681884766, "step": 9870 }, { "epoch": 0.988, "grad_norm": 0.0, "learning_rate": 2.192924752854042e-09, "logits/chosen": -0.7473016977310181, "logits/rejected": 0.5859171748161316, "logps/chosen": -331.48406982421875, "logps/rejected": -1262.8597412109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.349139213562012, "rewards/margins": 80.0912094116211, "rewards/rejected": -93.44035339355469, "step": 9880 }, { "epoch": 0.989, "grad_norm": 0.0, "learning_rate": 1.842708973447127e-09, "logits/chosen": -0.7945876121520996, "logits/rejected": 0.9833480715751648, "logps/chosen": -267.7002258300781, "logps/rejected": -1046.1787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -13.28888988494873, "rewards/margins": 64.49101257324219, "rewards/rejected": -77.7799072265625, "step": 9890 }, { "epoch": 0.99, "grad_norm": 1.0340225176748083e-13, "learning_rate": 1.5229324522605949e-09, "logits/chosen": -0.530807375907898, "logits/rejected": 0.39602330327033997, "logps/chosen": -627.578857421875, "logps/rejected": -1317.6207275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.532297134399414, "rewards/margins": 70.44889831542969, "rewards/rejected": -87.98119354248047, "step": 9900 }, { "epoch": 0.991, "grad_norm": 0.0, "learning_rate": 1.2335990856710001e-09, "logits/chosen": -0.5323010683059692, "logits/rejected": 0.8327838182449341, "logps/chosen": -462.7493591308594, "logps/rejected": -1313.9512939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.25348472595215, "rewards/margins": 77.57730865478516, "rewards/rejected": -94.8307876586914, "step": 9910 }, { "epoch": 0.992, "grad_norm": 0.0, "learning_rate": 9.747123991141193e-10, "logits/chosen": -0.5736783742904663, "logits/rejected": 0.5615373849868774, "logps/chosen": -391.64105224609375, "logps/rejected": -1180.13134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.440969467163086, "rewards/margins": 73.5404052734375, "rewards/rejected": -87.98137664794922, "step": 9920 }, { "epoch": 0.993, "grad_norm": 0.0, "learning_rate": 7.462755470422078e-10, "logits/chosen": -0.8006995916366577, "logits/rejected": 0.5760239362716675, "logps/chosen": -287.2140197753906, "logps/rejected": -910.4788208007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.046598434448242, "rewards/margins": 51.611488342285156, "rewards/rejected": -61.6580924987793, "step": 9930 }, { "epoch": 0.994, "grad_norm": 0.0, "learning_rate": 5.48291312886251e-10, "logits/chosen": -0.19751985371112823, "logits/rejected": 0.23999378085136414, "logps/chosen": -447.41473388671875, "logps/rejected": -837.7698974609375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -16.676952362060547, "rewards/margins": 47.939002990722656, "rewards/rejected": -64.61595153808594, "step": 9940 }, { "epoch": 0.995, "grad_norm": 0.0, "learning_rate": 3.8076210902182607e-10, "logits/chosen": -0.6151745915412903, "logits/rejected": 0.7026041150093079, "logps/chosen": -376.2629089355469, "logps/rejected": -1227.098388671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.793085098266602, "rewards/margins": 81.91486358642578, "rewards/rejected": -94.70793914794922, "step": 9950 }, { "epoch": 0.996, "grad_norm": 5.696053004267121e-11, "learning_rate": 2.43689976739403e-10, "logits/chosen": -0.647061824798584, "logits/rejected": 0.48720255494117737, "logps/chosen": -303.76898193359375, "logps/rejected": -941.9822387695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -12.006851196289062, "rewards/margins": 54.89765548706055, "rewards/rejected": -66.90450286865234, "step": 9960 }, { "epoch": 0.997, "grad_norm": 0.0, "learning_rate": 1.3707658621964216e-10, "logits/chosen": -0.9006298780441284, "logits/rejected": 0.7990083694458008, "logps/chosen": -274.3728942871094, "logps/rejected": -1247.52880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -8.891749382019043, "rewards/margins": 87.26151275634766, "rewards/rejected": -96.15326690673828, "step": 9970 }, { "epoch": 0.998, "grad_norm": 0.0, "learning_rate": 6.092323651313293e-11, "logits/chosen": -0.8165764808654785, "logits/rejected": 0.2772344648838043, "logps/chosen": -351.58526611328125, "logps/rejected": -1041.4344482421875, "loss": 1.2084, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.576261520385742, "rewards/margins": 58.36811065673828, "rewards/rejected": -70.94436645507812, "step": 9980 }, { "epoch": 0.999, "grad_norm": 0.0, "learning_rate": 1.5230855524017708e-11, "logits/chosen": -1.0131959915161133, "logits/rejected": 0.66839599609375, "logps/chosen": -294.83953857421875, "logps/rejected": -1126.336669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -14.798008918762207, "rewards/margins": 65.11726379394531, "rewards/rejected": -79.91526794433594, "step": 9990 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 0.0, "logits/chosen": -0.6975020170211792, "logits/rejected": 0.882247805595398, "logps/chosen": -402.93658447265625, "logps/rejected": -1386.987548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -18.254676818847656, "rewards/margins": 91.36619567871094, "rewards/rejected": -109.6208724975586, "step": 10000 }, { "epoch": 1.0, "step": 10000, "total_flos": 5.747405857487585e+17, "train_loss": 0.08356396047416255, "train_runtime": 17143.627, "train_samples_per_second": 0.583, "train_steps_per_second": 0.583 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.747405857487585e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }