{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.992, "eval_steps": 500, "global_step": 561, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02666666666666667, "grad_norm": 18.39926528930664, "learning_rate": 8.771929824561403e-08, "logits/chosen": 0.053128667175769806, "logits/rejected": 0.06011464446783066, "logps/chosen": -70.8781967163086, "logps/rejected": -68.79312896728516, "loss": 0.6906, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.0009149238467216492, "rewards/margins": 0.0024268084671348333, "rewards/rejected": -0.003341731382533908, "step": 5 }, { "epoch": 0.05333333333333334, "grad_norm": 19.233495712280273, "learning_rate": 1.7543859649122805e-07, "logits/chosen": -0.09774302691221237, "logits/rejected": -0.09648840129375458, "logps/chosen": -46.74890899658203, "logps/rejected": -57.538856506347656, "loss": 0.6755, "rewards/accuracies": 0.75, "rewards/chosen": 0.04250483959913254, "rewards/margins": 0.04128213971853256, "rewards/rejected": 0.0012226973194628954, "step": 10 }, { "epoch": 0.08, "grad_norm": 17.17833137512207, "learning_rate": 2.631578947368421e-07, "logits/chosen": -0.09319419413805008, "logits/rejected": -0.08535219728946686, "logps/chosen": -58.279396057128906, "logps/rejected": -69.68245697021484, "loss": 0.6361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0669654831290245, "rewards/margins": 0.10347993671894073, "rewards/rejected": -0.03651442751288414, "step": 15 }, { "epoch": 0.10666666666666667, "grad_norm": 16.598756790161133, "learning_rate": 3.508771929824561e-07, "logits/chosen": 0.03794529289007187, "logits/rejected": 0.044014494866132736, "logps/chosen": -63.522361755371094, "logps/rejected": -73.55606842041016, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": 0.07537873089313507, "rewards/margins": 0.3096093237400055, "rewards/rejected": -0.23423054814338684, "step": 20 }, { "epoch": 0.13333333333333333, "grad_norm": 14.252389907836914, "learning_rate": 4.3859649122807013e-07, "logits/chosen": 0.06569056957960129, "logits/rejected": 0.0650763288140297, "logps/chosen": -70.42298126220703, "logps/rejected": -77.9859390258789, "loss": 0.5961, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17782427370548248, "rewards/margins": 0.32860898971557617, "rewards/rejected": -0.15078473091125488, "step": 25 }, { "epoch": 0.16, "grad_norm": 13.940037727355957, "learning_rate": 5.263157894736842e-07, "logits/chosen": -0.007893741130828857, "logits/rejected": -0.010488653555512428, "logps/chosen": -63.06987380981445, "logps/rejected": -64.85173034667969, "loss": 0.549, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.1359255164861679, "rewards/margins": 0.7183843851089478, "rewards/rejected": -0.5824588537216187, "step": 30 }, { "epoch": 0.18666666666666668, "grad_norm": 16.549161911010742, "learning_rate": 6.140350877192982e-07, "logits/chosen": 0.07654356211423874, "logits/rejected": 0.07278482615947723, "logps/chosen": -71.77564239501953, "logps/rejected": -67.6545639038086, "loss": 0.509, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.32316628098487854, "rewards/margins": 0.4559944272041321, "rewards/rejected": -0.13282814621925354, "step": 35 }, { "epoch": 0.21333333333333335, "grad_norm": 15.91128158569336, "learning_rate": 7.017543859649122e-07, "logits/chosen": -0.06502251327037811, "logits/rejected": -0.06176159903407097, "logps/chosen": -91.53266143798828, "logps/rejected": -77.68993377685547, "loss": 0.4975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005904299207031727, "rewards/margins": 0.4281623959541321, "rewards/rejected": -0.43406668305397034, "step": 40 }, { "epoch": 0.24, "grad_norm": 14.597318649291992, "learning_rate": 7.894736842105263e-07, "logits/chosen": -0.07799122482538223, "logits/rejected": -0.07605376839637756, "logps/chosen": -58.30717086791992, "logps/rejected": -68.22291564941406, "loss": 0.4913, "rewards/accuracies": 0.75, "rewards/chosen": 0.33519667387008667, "rewards/margins": 0.9599436521530151, "rewards/rejected": -0.6247469186782837, "step": 45 }, { "epoch": 0.26666666666666666, "grad_norm": 13.722618103027344, "learning_rate": 8.771929824561403e-07, "logits/chosen": -0.014609629288315773, "logits/rejected": -0.014899415895342827, "logps/chosen": -84.4870834350586, "logps/rejected": -94.78424072265625, "loss": 0.4642, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.10160064697265625, "rewards/margins": 1.3170650005340576, "rewards/rejected": -1.4186656475067139, "step": 50 }, { "epoch": 0.29333333333333333, "grad_norm": 17.38532257080078, "learning_rate": 9.649122807017545e-07, "logits/chosen": -0.036196161061525345, "logits/rejected": -0.03828999400138855, "logps/chosen": -68.3674545288086, "logps/rejected": -74.53666687011719, "loss": 0.5491, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10605410486459732, "rewards/margins": 0.8501850366592407, "rewards/rejected": -0.9562392234802246, "step": 55 }, { "epoch": 0.32, "grad_norm": 13.876227378845215, "learning_rate": 1e-06, "logits/chosen": -0.03917285054922104, "logits/rejected": -0.03898438438773155, "logps/chosen": -69.72596740722656, "logps/rejected": -63.88629913330078, "loss": 0.5169, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1265568882226944, "rewards/margins": 1.1786383390426636, "rewards/rejected": -1.0520813465118408, "step": 60 }, { "epoch": 0.3466666666666667, "grad_norm": 11.621339797973633, "learning_rate": 1e-06, "logits/chosen": -0.15784303843975067, "logits/rejected": -0.15451130270957947, "logps/chosen": -70.8653564453125, "logps/rejected": -67.95909118652344, "loss": 0.5204, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21460843086242676, "rewards/margins": 0.8792555928230286, "rewards/rejected": -0.664647102355957, "step": 65 }, { "epoch": 0.37333333333333335, "grad_norm": 12.923187255859375, "learning_rate": 1e-06, "logits/chosen": 0.043206293135881424, "logits/rejected": 0.06076166778802872, "logps/chosen": -71.95893859863281, "logps/rejected": -98.98312377929688, "loss": 0.3642, "rewards/accuracies": 0.75, "rewards/chosen": 0.035441040992736816, "rewards/margins": 1.6655327081680298, "rewards/rejected": -1.630091667175293, "step": 70 }, { "epoch": 0.4, "grad_norm": 13.109973907470703, "learning_rate": 1e-06, "logits/chosen": 0.00962201226502657, "logits/rejected": 0.015961844474077225, "logps/chosen": -62.07453155517578, "logps/rejected": -86.6424560546875, "loss": 0.4846, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.27357736229896545, "rewards/margins": 1.7215477228164673, "rewards/rejected": -1.4479701519012451, "step": 75 }, { "epoch": 0.4266666666666667, "grad_norm": 12.61295223236084, "learning_rate": 1e-06, "logits/chosen": -0.024258632212877274, "logits/rejected": -0.015315225347876549, "logps/chosen": -40.27369689941406, "logps/rejected": -69.33142852783203, "loss": 0.4061, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.4955378472805023, "rewards/margins": 1.1658498048782349, "rewards/rejected": -0.6703119277954102, "step": 80 }, { "epoch": 0.4533333333333333, "grad_norm": 12.405911445617676, "learning_rate": 1e-06, "logits/chosen": -0.14968207478523254, "logits/rejected": -0.14379464089870453, "logps/chosen": -74.81858825683594, "logps/rejected": -89.59300231933594, "loss": 0.3693, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3189702332019806, "rewards/margins": 1.6341769695281982, "rewards/rejected": -1.31520676612854, "step": 85 }, { "epoch": 0.48, "grad_norm": 12.492094039916992, "learning_rate": 1e-06, "logits/chosen": 0.04283991456031799, "logits/rejected": 0.04497765749692917, "logps/chosen": -88.62035369873047, "logps/rejected": -99.63616180419922, "loss": 0.4934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29155534505844116, "rewards/margins": 1.47171950340271, "rewards/rejected": -1.7632747888565063, "step": 90 }, { "epoch": 0.5066666666666667, "grad_norm": 15.640336990356445, "learning_rate": 1e-06, "logits/chosen": -0.04965194687247276, "logits/rejected": -0.043795522302389145, "logps/chosen": -75.30198669433594, "logps/rejected": -96.59324645996094, "loss": 0.4722, "rewards/accuracies": 0.75, "rewards/chosen": -0.1831967532634735, "rewards/margins": 1.4340693950653076, "rewards/rejected": -1.617266297340393, "step": 95 }, { "epoch": 0.5333333333333333, "grad_norm": 17.030439376831055, "learning_rate": 1e-06, "logits/chosen": -0.14364857971668243, "logits/rejected": -0.13528576493263245, "logps/chosen": -37.00883102416992, "logps/rejected": -51.563194274902344, "loss": 0.4453, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1994268000125885, "rewards/margins": 1.2294137477874756, "rewards/rejected": -1.029987096786499, "step": 100 }, { "epoch": 0.56, "grad_norm": 16.2526798248291, "learning_rate": 1e-06, "logits/chosen": -0.0951358824968338, "logits/rejected": -0.08968516439199448, "logps/chosen": -81.79414367675781, "logps/rejected": -89.765625, "loss": 0.4725, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15804192423820496, "rewards/margins": 1.7332594394683838, "rewards/rejected": -1.891301155090332, "step": 105 }, { "epoch": 0.5866666666666667, "grad_norm": 15.710427284240723, "learning_rate": 1e-06, "logits/chosen": -0.027723263949155807, "logits/rejected": -0.01763475313782692, "logps/chosen": -37.25303649902344, "logps/rejected": -52.58305740356445, "loss": 0.5258, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01768249273300171, "rewards/margins": 0.608422577381134, "rewards/rejected": -0.626105010509491, "step": 110 }, { "epoch": 0.6133333333333333, "grad_norm": 16.18587303161621, "learning_rate": 1e-06, "logits/chosen": -0.09127525985240936, "logits/rejected": -0.08952955901622772, "logps/chosen": -51.15592956542969, "logps/rejected": -61.84421920776367, "loss": 0.446, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.43013912439346313, "rewards/margins": 1.3957525491714478, "rewards/rejected": -0.9656132459640503, "step": 115 }, { "epoch": 0.64, "grad_norm": 12.457794189453125, "learning_rate": 1e-06, "logits/chosen": -0.0474393330514431, "logits/rejected": -0.04658551886677742, "logps/chosen": -58.86711883544922, "logps/rejected": -64.27616119384766, "loss": 0.4365, "rewards/accuracies": 0.75, "rewards/chosen": 0.10293619334697723, "rewards/margins": 1.2258039712905884, "rewards/rejected": -1.1228678226470947, "step": 120 }, { "epoch": 0.6666666666666666, "grad_norm": 16.62484359741211, "learning_rate": 1e-06, "logits/chosen": 0.0391828715801239, "logits/rejected": 0.059719525277614594, "logps/chosen": -91.99595642089844, "logps/rejected": -112.70484924316406, "loss": 0.4484, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5286062955856323, "rewards/margins": 2.0147881507873535, "rewards/rejected": -2.5433950424194336, "step": 125 }, { "epoch": 0.6933333333333334, "grad_norm": 12.6619873046875, "learning_rate": 1e-06, "logits/chosen": -0.0015835389494895935, "logits/rejected": 0.004649341106414795, "logps/chosen": -76.80113220214844, "logps/rejected": -89.04924774169922, "loss": 0.4096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.28463220596313477, "rewards/margins": 1.117056131362915, "rewards/rejected": -1.4016883373260498, "step": 130 }, { "epoch": 0.72, "grad_norm": 16.22713279724121, "learning_rate": 1e-06, "logits/chosen": -0.10093293339014053, "logits/rejected": -0.0960889607667923, "logps/chosen": -78.48757934570312, "logps/rejected": -96.75239562988281, "loss": 0.4776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07072822749614716, "rewards/margins": 1.4200364351272583, "rewards/rejected": -1.4907647371292114, "step": 135 }, { "epoch": 0.7466666666666667, "grad_norm": 15.163298606872559, "learning_rate": 1e-06, "logits/chosen": -0.08841624110937119, "logits/rejected": -0.0812966376543045, "logps/chosen": -49.91742706298828, "logps/rejected": -54.39896774291992, "loss": 0.4361, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.10890891402959824, "rewards/margins": 1.1275047063827515, "rewards/rejected": -1.2364134788513184, "step": 140 }, { "epoch": 0.7733333333333333, "grad_norm": 13.627829551696777, "learning_rate": 1e-06, "logits/chosen": -0.07901357114315033, "logits/rejected": -0.07952677458524704, "logps/chosen": -58.6712760925293, "logps/rejected": -60.3519287109375, "loss": 0.4263, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.07741154730319977, "rewards/margins": 0.6744439005851746, "rewards/rejected": -0.5970323085784912, "step": 145 }, { "epoch": 0.8, "grad_norm": 14.12767505645752, "learning_rate": 1e-06, "logits/chosen": -0.07844635099172592, "logits/rejected": -0.07670246064662933, "logps/chosen": -79.21514892578125, "logps/rejected": -95.0439682006836, "loss": 0.3984, "rewards/accuracies": 0.75, "rewards/chosen": -0.20406706631183624, "rewards/margins": 1.3347957134246826, "rewards/rejected": -1.538862705230713, "step": 150 }, { "epoch": 0.8266666666666667, "grad_norm": 10.027976989746094, "learning_rate": 1e-06, "logits/chosen": -0.0394299291074276, "logits/rejected": -0.037283238023519516, "logps/chosen": -59.8607177734375, "logps/rejected": -90.07058715820312, "loss": 0.3387, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.25796985626220703, "rewards/margins": 2.074141263961792, "rewards/rejected": -1.816171646118164, "step": 155 }, { "epoch": 0.8533333333333334, "grad_norm": 12.08365535736084, "learning_rate": 1e-06, "logits/chosen": -0.10505978763103485, "logits/rejected": -0.09901513159275055, "logps/chosen": -47.506202697753906, "logps/rejected": -67.22758483886719, "loss": 0.4439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.07217444479465485, "rewards/margins": 1.0274462699890137, "rewards/rejected": -0.9552720189094543, "step": 160 }, { "epoch": 0.88, "grad_norm": 10.322546005249023, "learning_rate": 1e-06, "logits/chosen": -0.034024275839328766, "logits/rejected": -0.03133025020360947, "logps/chosen": -58.316184997558594, "logps/rejected": -66.901123046875, "loss": 0.4655, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07615742832422256, "rewards/margins": 0.675209641456604, "rewards/rejected": -0.7513669729232788, "step": 165 }, { "epoch": 0.9066666666666666, "grad_norm": 12.919961929321289, "learning_rate": 1e-06, "logits/chosen": -0.007387078367173672, "logits/rejected": -0.0006680100923404098, "logps/chosen": -66.50263977050781, "logps/rejected": -88.05205535888672, "loss": 0.4127, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3587184250354767, "rewards/margins": 1.6810741424560547, "rewards/rejected": -2.039792537689209, "step": 170 }, { "epoch": 0.9333333333333333, "grad_norm": 14.320988655090332, "learning_rate": 1e-06, "logits/chosen": -0.0416673943400383, "logits/rejected": -0.03895064815878868, "logps/chosen": -60.38318634033203, "logps/rejected": -73.31221008300781, "loss": 0.3713, "rewards/accuracies": 0.75, "rewards/chosen": 0.043798673897981644, "rewards/margins": 1.690246343612671, "rewards/rejected": -1.646447777748108, "step": 175 }, { "epoch": 0.96, "grad_norm": 9.279186248779297, "learning_rate": 1e-06, "logits/chosen": -0.011715270578861237, "logits/rejected": 0.005149384029209614, "logps/chosen": -76.2380599975586, "logps/rejected": -107.5550308227539, "loss": 0.3898, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.13200218975543976, "rewards/margins": 2.5765540599823, "rewards/rejected": -2.7085559368133545, "step": 180 }, { "epoch": 0.9866666666666667, "grad_norm": 11.731532096862793, "learning_rate": 1e-06, "logits/chosen": -0.11099245399236679, "logits/rejected": -0.10901761054992676, "logps/chosen": -63.856910705566406, "logps/rejected": -63.48603439331055, "loss": 0.369, "rewards/accuracies": 0.75, "rewards/chosen": 0.09118829667568207, "rewards/margins": 1.6201965808868408, "rewards/rejected": -1.529008150100708, "step": 185 }, { "epoch": 1.0133333333333334, "grad_norm": 9.0255765914917, "learning_rate": 1e-06, "logits/chosen": -0.04720262438058853, "logits/rejected": -0.05262360721826553, "logps/chosen": -59.25706100463867, "logps/rejected": -85.06912994384766, "loss": 0.353, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.4835248589515686, "rewards/margins": 1.9951941967010498, "rewards/rejected": -1.5116695165634155, "step": 190 }, { "epoch": 1.04, "grad_norm": 5.959653377532959, "learning_rate": 1e-06, "logits/chosen": -0.041944488883018494, "logits/rejected": -0.020493442192673683, "logps/chosen": -51.811729431152344, "logps/rejected": -80.3885726928711, "loss": 0.2245, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6860127449035645, "rewards/margins": 1.6832406520843506, "rewards/rejected": -0.9972278475761414, "step": 195 }, { "epoch": 1.0666666666666667, "grad_norm": 5.370494365692139, "learning_rate": 1e-06, "logits/chosen": -0.13854077458381653, "logits/rejected": -0.1376122683286667, "logps/chosen": -45.62406921386719, "logps/rejected": -66.38761901855469, "loss": 0.2236, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5624949932098389, "rewards/margins": 2.370830535888672, "rewards/rejected": -1.808335542678833, "step": 200 }, { "epoch": 1.0933333333333333, "grad_norm": 11.40556812286377, "learning_rate": 1e-06, "logits/chosen": -0.03235360607504845, "logits/rejected": -0.026641175150871277, "logps/chosen": -77.10560607910156, "logps/rejected": -116.6834716796875, "loss": 0.198, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.003305965568870306, "rewards/margins": 2.9688169956207275, "rewards/rejected": -2.97212290763855, "step": 205 }, { "epoch": 1.12, "grad_norm": 6.852243423461914, "learning_rate": 1e-06, "logits/chosen": -0.08796464651823044, "logits/rejected": -0.08174140751361847, "logps/chosen": -63.3108024597168, "logps/rejected": -100.68711853027344, "loss": 0.2087, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.06613560020923615, "rewards/margins": 3.523066759109497, "rewards/rejected": -3.5892021656036377, "step": 210 }, { "epoch": 1.1466666666666667, "grad_norm": 3.7613370418548584, "learning_rate": 1e-06, "logits/chosen": -0.18352393805980682, "logits/rejected": -0.1822497546672821, "logps/chosen": -73.46556091308594, "logps/rejected": -111.50262451171875, "loss": 0.1781, "rewards/accuracies": 1.0, "rewards/chosen": 0.06992371380329132, "rewards/margins": 3.4505341053009033, "rewards/rejected": -3.380610704421997, "step": 215 }, { "epoch": 1.1733333333333333, "grad_norm": 10.82043743133545, "learning_rate": 1e-06, "logits/chosen": -0.0707927718758583, "logits/rejected": -0.06452594697475433, "logps/chosen": -74.3930435180664, "logps/rejected": -109.25379943847656, "loss": 0.1996, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5582473874092102, "rewards/margins": 2.8837478160858154, "rewards/rejected": -3.4419949054718018, "step": 220 }, { "epoch": 1.2, "grad_norm": 13.249909400939941, "learning_rate": 1e-06, "logits/chosen": -0.15726926922798157, "logits/rejected": -0.15615758299827576, "logps/chosen": -84.48081970214844, "logps/rejected": -106.8475341796875, "loss": 0.2885, "rewards/accuracies": 0.75, "rewards/chosen": -1.2800536155700684, "rewards/margins": 2.9067673683166504, "rewards/rejected": -4.186820983886719, "step": 225 }, { "epoch": 1.2266666666666666, "grad_norm": 8.66231918334961, "learning_rate": 1e-06, "logits/chosen": -0.15227961540222168, "logits/rejected": -0.14247794449329376, "logps/chosen": -77.54243469238281, "logps/rejected": -116.9886474609375, "loss": 0.2311, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.4163047671318054, "rewards/margins": 3.9100730419158936, "rewards/rejected": -4.32637882232666, "step": 230 }, { "epoch": 1.2533333333333334, "grad_norm": 4.258462905883789, "learning_rate": 1e-06, "logits/chosen": -0.14345099031925201, "logits/rejected": -0.1368933767080307, "logps/chosen": -41.03181457519531, "logps/rejected": -74.01554107666016, "loss": 0.2209, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6364781260490417, "rewards/margins": 3.3563053607940674, "rewards/rejected": -2.71982741355896, "step": 235 }, { "epoch": 1.28, "grad_norm": 8.00922679901123, "learning_rate": 1e-06, "logits/chosen": -0.13605789840221405, "logits/rejected": -0.13225141167640686, "logps/chosen": -45.99299621582031, "logps/rejected": -70.01762390136719, "loss": 0.2223, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.18717975914478302, "rewards/margins": 2.5417323112487793, "rewards/rejected": -2.3545525074005127, "step": 240 }, { "epoch": 1.3066666666666666, "grad_norm": 6.644509792327881, "learning_rate": 1e-06, "logits/chosen": -0.05485178157687187, "logits/rejected": -0.032927896827459335, "logps/chosen": -78.5289306640625, "logps/rejected": -105.24593353271484, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": -0.4212195873260498, "rewards/margins": 3.12190580368042, "rewards/rejected": -3.5431251525878906, "step": 245 }, { "epoch": 1.3333333333333333, "grad_norm": 6.512491703033447, "learning_rate": 1e-06, "logits/chosen": -0.08749254047870636, "logits/rejected": -0.08491937816143036, "logps/chosen": -79.62251281738281, "logps/rejected": -112.38929748535156, "loss": 0.1805, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7281423807144165, "rewards/margins": 2.684180736541748, "rewards/rejected": -3.412322998046875, "step": 250 }, { "epoch": 1.3599999999999999, "grad_norm": 7.965065002441406, "learning_rate": 1e-06, "logits/chosen": -0.0565565750002861, "logits/rejected": -0.05955087020993233, "logps/chosen": -77.64044952392578, "logps/rejected": -106.85416412353516, "loss": 0.2022, "rewards/accuracies": 0.75, "rewards/chosen": -0.5686990022659302, "rewards/margins": 3.395534038543701, "rewards/rejected": -3.9642326831817627, "step": 255 }, { "epoch": 1.3866666666666667, "grad_norm": 8.91444206237793, "learning_rate": 1e-06, "logits/chosen": -0.1623281091451645, "logits/rejected": -0.15568025410175323, "logps/chosen": -76.49116516113281, "logps/rejected": -105.77110290527344, "loss": 0.2286, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6722622513771057, "rewards/margins": 3.6157422065734863, "rewards/rejected": -4.288004398345947, "step": 260 }, { "epoch": 1.4133333333333333, "grad_norm": 3.856618642807007, "learning_rate": 1e-06, "logits/chosen": -0.14635060727596283, "logits/rejected": -0.14141127467155457, "logps/chosen": -73.30412292480469, "logps/rejected": -102.57535552978516, "loss": 0.1777, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.15799687802791595, "rewards/margins": 3.1063895225524902, "rewards/rejected": -3.2643864154815674, "step": 265 }, { "epoch": 1.44, "grad_norm": 9.558956146240234, "learning_rate": 1e-06, "logits/chosen": -0.16010445356369019, "logits/rejected": -0.16003289818763733, "logps/chosen": -83.60963439941406, "logps/rejected": -108.95814514160156, "loss": 0.1934, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.03610507398843765, "rewards/margins": 3.566607713699341, "rewards/rejected": -3.602712631225586, "step": 270 }, { "epoch": 1.4666666666666668, "grad_norm": 14.296300888061523, "learning_rate": 1e-06, "logits/chosen": -0.19041122496128082, "logits/rejected": -0.19675543904304504, "logps/chosen": -86.94904327392578, "logps/rejected": -108.98336029052734, "loss": 0.2672, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.45788565278053284, "rewards/margins": 2.7378134727478027, "rewards/rejected": -3.195699453353882, "step": 275 }, { "epoch": 1.4933333333333334, "grad_norm": 11.429224967956543, "learning_rate": 1e-06, "logits/chosen": -0.2332134246826172, "logits/rejected": -0.22619101405143738, "logps/chosen": -41.212432861328125, "logps/rejected": -63.74433517456055, "loss": 0.3017, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.22647647559642792, "rewards/margins": 2.678001880645752, "rewards/rejected": -2.4515256881713867, "step": 280 }, { "epoch": 1.52, "grad_norm": 15.175474166870117, "learning_rate": 1e-06, "logits/chosen": -0.1731823831796646, "logits/rejected": -0.17582915723323822, "logps/chosen": -55.227027893066406, "logps/rejected": -78.37431335449219, "loss": 0.2917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3550594449043274, "rewards/margins": 2.29359769821167, "rewards/rejected": -2.6486573219299316, "step": 285 }, { "epoch": 1.5466666666666666, "grad_norm": 9.650674819946289, "learning_rate": 1e-06, "logits/chosen": -0.1489884853363037, "logits/rejected": -0.14982970058918, "logps/chosen": -58.45500564575195, "logps/rejected": -91.8953857421875, "loss": 0.1967, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3005248010158539, "rewards/margins": 3.695702314376831, "rewards/rejected": -3.3951778411865234, "step": 290 }, { "epoch": 1.5733333333333333, "grad_norm": 6.528473854064941, "learning_rate": 1e-06, "logits/chosen": -0.1766640692949295, "logits/rejected": -0.17292292416095734, "logps/chosen": -56.97791290283203, "logps/rejected": -85.62743377685547, "loss": 0.2127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.12213984876871109, "rewards/margins": 2.8668904304504395, "rewards/rejected": -2.744750499725342, "step": 295 }, { "epoch": 1.6, "grad_norm": 10.14043140411377, "learning_rate": 1e-06, "logits/chosen": -0.19708013534545898, "logits/rejected": -0.18595156073570251, "logps/chosen": -54.1507453918457, "logps/rejected": -79.53749084472656, "loss": 0.2324, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.12599337100982666, "rewards/margins": 2.579333543777466, "rewards/rejected": -2.4533400535583496, "step": 300 }, { "epoch": 1.6266666666666667, "grad_norm": 8.498900413513184, "learning_rate": 1e-06, "logits/chosen": -0.17206253111362457, "logits/rejected": -0.15581965446472168, "logps/chosen": -72.01545715332031, "logps/rejected": -153.42544555664062, "loss": 0.1785, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.38456493616104126, "rewards/margins": 4.787345886230469, "rewards/rejected": -5.171910762786865, "step": 305 }, { "epoch": 1.6533333333333333, "grad_norm": 16.62505340576172, "learning_rate": 1e-06, "logits/chosen": -0.2148733139038086, "logits/rejected": -0.2032153606414795, "logps/chosen": -62.933837890625, "logps/rejected": -106.54484558105469, "loss": 0.1895, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.18351763486862183, "rewards/margins": 3.5239806175231934, "rewards/rejected": -3.707498550415039, "step": 310 }, { "epoch": 1.6800000000000002, "grad_norm": 11.589250564575195, "learning_rate": 1e-06, "logits/chosen": -0.21502284705638885, "logits/rejected": -0.1963229775428772, "logps/chosen": -80.04829406738281, "logps/rejected": -141.29750061035156, "loss": 0.1646, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9600610733032227, "rewards/margins": 4.689765930175781, "rewards/rejected": -5.649827003479004, "step": 315 }, { "epoch": 1.7066666666666666, "grad_norm": 10.770737648010254, "learning_rate": 1e-06, "logits/chosen": -0.24441838264465332, "logits/rejected": -0.2412232607603073, "logps/chosen": -86.85003662109375, "logps/rejected": -106.89388275146484, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": -0.8239480257034302, "rewards/margins": 3.0451276302337646, "rewards/rejected": -3.8690757751464844, "step": 320 }, { "epoch": 1.7333333333333334, "grad_norm": 12.021537780761719, "learning_rate": 1e-06, "logits/chosen": -0.2027956247329712, "logits/rejected": -0.19365160167217255, "logps/chosen": -83.82646179199219, "logps/rejected": -117.0644302368164, "loss": 0.2087, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9700089693069458, "rewards/margins": 3.3046371936798096, "rewards/rejected": -4.274645805358887, "step": 325 }, { "epoch": 1.76, "grad_norm": 16.727617263793945, "learning_rate": 1e-06, "logits/chosen": -0.2380843162536621, "logits/rejected": -0.23556020855903625, "logps/chosen": -85.53111267089844, "logps/rejected": -136.96200561523438, "loss": 0.19, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7303121089935303, "rewards/margins": 4.339911460876465, "rewards/rejected": -5.070223808288574, "step": 330 }, { "epoch": 1.7866666666666666, "grad_norm": 8.647014617919922, "learning_rate": 1e-06, "logits/chosen": -0.2260773628950119, "logits/rejected": -0.22472091019153595, "logps/chosen": -70.34246063232422, "logps/rejected": -130.5174560546875, "loss": 0.1145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8340579271316528, "rewards/margins": 4.325329780578613, "rewards/rejected": -5.159387588500977, "step": 335 }, { "epoch": 1.8133333333333335, "grad_norm": 13.704692840576172, "learning_rate": 1e-06, "logits/chosen": -0.19743473827838898, "logits/rejected": -0.18280264735221863, "logps/chosen": -92.22306060791016, "logps/rejected": -126.66398620605469, "loss": 0.1549, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.9958213567733765, "rewards/margins": 3.9418296813964844, "rewards/rejected": -4.937651634216309, "step": 340 }, { "epoch": 1.8399999999999999, "grad_norm": 9.447907447814941, "learning_rate": 1e-06, "logits/chosen": -0.23114176094532013, "logits/rejected": -0.22814705967903137, "logps/chosen": -73.06866455078125, "logps/rejected": -92.7379150390625, "loss": 0.2321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.1413997858762741, "rewards/margins": 3.44819974899292, "rewards/rejected": -3.589600086212158, "step": 345 }, { "epoch": 1.8666666666666667, "grad_norm": 13.660387992858887, "learning_rate": 1e-06, "logits/chosen": -0.2316075563430786, "logits/rejected": -0.2243129014968872, "logps/chosen": -40.69365692138672, "logps/rejected": -77.8421630859375, "loss": 0.1929, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2429409772157669, "rewards/margins": 3.414602756500244, "rewards/rejected": -3.657543659210205, "step": 350 }, { "epoch": 1.8933333333333333, "grad_norm": 7.376171588897705, "learning_rate": 1e-06, "logits/chosen": -0.24942800402641296, "logits/rejected": -0.24522526562213898, "logps/chosen": -72.5764389038086, "logps/rejected": -108.04423522949219, "loss": 0.1523, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.38573572039604187, "rewards/margins": 3.300957441329956, "rewards/rejected": -3.6866931915283203, "step": 355 }, { "epoch": 1.92, "grad_norm": 14.980485916137695, "learning_rate": 1e-06, "logits/chosen": -0.22098588943481445, "logits/rejected": -0.20870821177959442, "logps/chosen": -70.51148986816406, "logps/rejected": -113.34062194824219, "loss": 0.1941, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5927928686141968, "rewards/margins": 3.418287754058838, "rewards/rejected": -4.011080741882324, "step": 360 }, { "epoch": 1.9466666666666668, "grad_norm": 7.132907390594482, "learning_rate": 1e-06, "logits/chosen": -0.27342313528060913, "logits/rejected": -0.26478415727615356, "logps/chosen": -91.9891586303711, "logps/rejected": -129.16757202148438, "loss": 0.174, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.369034767150879, "rewards/margins": 4.006806373596191, "rewards/rejected": -5.37584114074707, "step": 365 }, { "epoch": 1.9733333333333334, "grad_norm": 11.890949249267578, "learning_rate": 1e-06, "logits/chosen": -0.23843975365161896, "logits/rejected": -0.2383933812379837, "logps/chosen": -74.76898956298828, "logps/rejected": -107.2107162475586, "loss": 0.2254, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.747979462146759, "rewards/margins": 3.267796277999878, "rewards/rejected": -4.015775680541992, "step": 370 }, { "epoch": 2.0, "grad_norm": 17.679744720458984, "learning_rate": 1e-06, "logits/chosen": -0.16960373520851135, "logits/rejected": -0.17325231432914734, "logps/chosen": -105.846435546875, "logps/rejected": -122.9857406616211, "loss": 0.2049, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8144283294677734, "rewards/margins": 3.5779411792755127, "rewards/rejected": -5.392369270324707, "step": 375 }, { "epoch": 2.026666666666667, "grad_norm": 6.943394184112549, "learning_rate": 1e-06, "logits/chosen": -0.2742716670036316, "logits/rejected": -0.26814788579940796, "logps/chosen": -66.23405456542969, "logps/rejected": -103.7399673461914, "loss": 0.0704, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8214471936225891, "rewards/margins": 4.027198791503906, "rewards/rejected": -4.8486456871032715, "step": 380 }, { "epoch": 2.0533333333333332, "grad_norm": 3.1160390377044678, "learning_rate": 1e-06, "logits/chosen": -0.30924856662750244, "logits/rejected": -0.29427191615104675, "logps/chosen": -50.3244743347168, "logps/rejected": -112.04768371582031, "loss": 0.0837, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8333573341369629, "rewards/margins": 4.71429967880249, "rewards/rejected": -5.547657012939453, "step": 385 }, { "epoch": 2.08, "grad_norm": 4.3047380447387695, "learning_rate": 1e-06, "logits/chosen": -0.25628188252449036, "logits/rejected": -0.24855029582977295, "logps/chosen": -86.0821533203125, "logps/rejected": -138.10360717773438, "loss": 0.1163, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7257312536239624, "rewards/margins": 4.333987236022949, "rewards/rejected": -6.059718608856201, "step": 390 }, { "epoch": 2.1066666666666665, "grad_norm": 4.764650344848633, "learning_rate": 1e-06, "logits/chosen": -0.1932232677936554, "logits/rejected": -0.19700609147548676, "logps/chosen": -78.89598083496094, "logps/rejected": -130.88352966308594, "loss": 0.0877, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0742863416671753, "rewards/margins": 4.378272533416748, "rewards/rejected": -5.452559471130371, "step": 395 }, { "epoch": 2.1333333333333333, "grad_norm": 4.10252046585083, "learning_rate": 1e-06, "logits/chosen": -0.27843141555786133, "logits/rejected": -0.2746765911579132, "logps/chosen": -76.2488784790039, "logps/rejected": -127.501220703125, "loss": 0.0918, "rewards/accuracies": 1.0, "rewards/chosen": -0.9270022511482239, "rewards/margins": 5.307347297668457, "rewards/rejected": -6.234349727630615, "step": 400 }, { "epoch": 2.16, "grad_norm": 7.236265659332275, "learning_rate": 1e-06, "logits/chosen": -0.31720614433288574, "logits/rejected": -0.3048458695411682, "logps/chosen": -76.51472473144531, "logps/rejected": -135.60202026367188, "loss": 0.0616, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.512957215309143, "rewards/margins": 6.148130416870117, "rewards/rejected": -7.6610870361328125, "step": 405 }, { "epoch": 2.1866666666666665, "grad_norm": 2.3235349655151367, "learning_rate": 1e-06, "logits/chosen": -0.24595189094543457, "logits/rejected": -0.24323758482933044, "logps/chosen": -94.84211730957031, "logps/rejected": -145.022216796875, "loss": 0.0961, "rewards/accuracies": 1.0, "rewards/chosen": -2.402531385421753, "rewards/margins": 5.436099052429199, "rewards/rejected": -7.838630676269531, "step": 410 }, { "epoch": 2.2133333333333334, "grad_norm": 4.8239874839782715, "learning_rate": 1e-06, "logits/chosen": -0.2481319159269333, "logits/rejected": -0.25086286664009094, "logps/chosen": -83.86447143554688, "logps/rejected": -130.08944702148438, "loss": 0.0767, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.552152395248413, "rewards/margins": 5.058899879455566, "rewards/rejected": -7.611052513122559, "step": 415 }, { "epoch": 2.24, "grad_norm": 10.188889503479004, "learning_rate": 1e-06, "logits/chosen": -0.2911505699157715, "logits/rejected": -0.2887571156024933, "logps/chosen": -89.61388397216797, "logps/rejected": -124.80122375488281, "loss": 0.0954, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.090373992919922, "rewards/margins": 4.806578636169434, "rewards/rejected": -6.8969526290893555, "step": 420 }, { "epoch": 2.2666666666666666, "grad_norm": 5.293187618255615, "learning_rate": 1e-06, "logits/chosen": -0.2788965106010437, "logits/rejected": -0.2694849371910095, "logps/chosen": -90.92457580566406, "logps/rejected": -142.2576446533203, "loss": 0.0733, "rewards/accuracies": 1.0, "rewards/chosen": -1.9070699214935303, "rewards/margins": 4.678387641906738, "rewards/rejected": -6.585457801818848, "step": 425 }, { "epoch": 2.2933333333333334, "grad_norm": 6.162081718444824, "learning_rate": 1e-06, "logits/chosen": -0.3094247877597809, "logits/rejected": -0.3067726492881775, "logps/chosen": -101.09193420410156, "logps/rejected": -136.2590789794922, "loss": 0.1087, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3400262594223022, "rewards/margins": 4.651651859283447, "rewards/rejected": -5.991677761077881, "step": 430 }, { "epoch": 2.32, "grad_norm": 5.37161111831665, "learning_rate": 1e-06, "logits/chosen": -0.3167645335197449, "logits/rejected": -0.30754679441452026, "logps/chosen": -90.11158752441406, "logps/rejected": -128.84521484375, "loss": 0.07, "rewards/accuracies": 1.0, "rewards/chosen": -1.3145698308944702, "rewards/margins": 5.076085567474365, "rewards/rejected": -6.390655517578125, "step": 435 }, { "epoch": 2.3466666666666667, "grad_norm": 8.718563079833984, "learning_rate": 1e-06, "logits/chosen": -0.3047015964984894, "logits/rejected": -0.3067510724067688, "logps/chosen": -76.70709991455078, "logps/rejected": -124.02851867675781, "loss": 0.1167, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4980465173721313, "rewards/margins": 4.969003677368164, "rewards/rejected": -6.467051029205322, "step": 440 }, { "epoch": 2.3733333333333335, "grad_norm": 3.066976547241211, "learning_rate": 1e-06, "logits/chosen": -0.32817643880844116, "logits/rejected": -0.3236837387084961, "logps/chosen": -119.95536804199219, "logps/rejected": -168.2990264892578, "loss": 0.0509, "rewards/accuracies": 1.0, "rewards/chosen": -2.097543239593506, "rewards/margins": 6.6215500831604, "rewards/rejected": -8.719093322753906, "step": 445 }, { "epoch": 2.4, "grad_norm": 7.526630878448486, "learning_rate": 1e-06, "logits/chosen": -0.33517691493034363, "logits/rejected": -0.3308143615722656, "logps/chosen": -65.50419616699219, "logps/rejected": -114.26656341552734, "loss": 0.0958, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.0172278881072998, "rewards/margins": 5.359133720397949, "rewards/rejected": -6.376360893249512, "step": 450 }, { "epoch": 2.4266666666666667, "grad_norm": 4.8166937828063965, "learning_rate": 1e-06, "logits/chosen": -0.3010093867778778, "logits/rejected": -0.29728105664253235, "logps/chosen": -95.86582946777344, "logps/rejected": -154.4734649658203, "loss": 0.0702, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.552085280418396, "rewards/margins": 5.720345973968506, "rewards/rejected": -7.27243185043335, "step": 455 }, { "epoch": 2.453333333333333, "grad_norm": 2.3862478733062744, "learning_rate": 1e-06, "logits/chosen": -0.31689682602882385, "logits/rejected": -0.31278690695762634, "logps/chosen": -104.4582290649414, "logps/rejected": -167.82778930664062, "loss": 0.0756, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.952106237411499, "rewards/margins": 6.5650482177734375, "rewards/rejected": -8.517154693603516, "step": 460 }, { "epoch": 2.48, "grad_norm": 1.4673610925674438, "learning_rate": 1e-06, "logits/chosen": -0.33551472425460815, "logits/rejected": -0.33083659410476685, "logps/chosen": -87.67115783691406, "logps/rejected": -160.4077911376953, "loss": 0.0675, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0451600551605225, "rewards/margins": 6.929755210876465, "rewards/rejected": -8.974915504455566, "step": 465 }, { "epoch": 2.506666666666667, "grad_norm": 5.761256217956543, "learning_rate": 1e-06, "logits/chosen": -0.30712994933128357, "logits/rejected": -0.2985348701477051, "logps/chosen": -75.22799682617188, "logps/rejected": -119.52055358886719, "loss": 0.0806, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5171074867248535, "rewards/margins": 4.610350608825684, "rewards/rejected": -6.127457618713379, "step": 470 }, { "epoch": 2.533333333333333, "grad_norm": 2.6889760494232178, "learning_rate": 1e-06, "logits/chosen": -0.3337317407131195, "logits/rejected": -0.33648866415023804, "logps/chosen": -84.6175765991211, "logps/rejected": -114.95267486572266, "loss": 0.0912, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.932594656944275, "rewards/margins": 4.491971969604492, "rewards/rejected": -6.424566745758057, "step": 475 }, { "epoch": 2.56, "grad_norm": 18.071474075317383, "learning_rate": 1e-06, "logits/chosen": -0.3041028082370758, "logits/rejected": -0.2970563471317291, "logps/chosen": -99.09244537353516, "logps/rejected": -154.91946411132812, "loss": 0.1146, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.7287986278533936, "rewards/margins": 6.216522693634033, "rewards/rejected": -7.945321559906006, "step": 480 }, { "epoch": 2.586666666666667, "grad_norm": 1.98813796043396, "learning_rate": 1e-06, "logits/chosen": -0.3482723832130432, "logits/rejected": -0.344726026058197, "logps/chosen": -78.27327728271484, "logps/rejected": -170.27120971679688, "loss": 0.0872, "rewards/accuracies": 1.0, "rewards/chosen": -2.34009051322937, "rewards/margins": 6.879552364349365, "rewards/rejected": -9.219643592834473, "step": 485 }, { "epoch": 2.6133333333333333, "grad_norm": 14.037318229675293, "learning_rate": 1e-06, "logits/chosen": -0.32685285806655884, "logits/rejected": -0.3174216151237488, "logps/chosen": -90.9804916381836, "logps/rejected": -159.81094360351562, "loss": 0.1495, "rewards/accuracies": 1.0, "rewards/chosen": -2.121194362640381, "rewards/margins": 5.947864532470703, "rewards/rejected": -8.069058418273926, "step": 490 }, { "epoch": 2.64, "grad_norm": 10.764850616455078, "learning_rate": 1e-06, "logits/chosen": -0.3625045418739319, "logits/rejected": -0.3534373342990875, "logps/chosen": -66.63526153564453, "logps/rejected": -135.00489807128906, "loss": 0.0934, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0588821172714233, "rewards/margins": 6.410633087158203, "rewards/rejected": -7.469515323638916, "step": 495 }, { "epoch": 2.6666666666666665, "grad_norm": 7.240427494049072, "learning_rate": 1e-06, "logits/chosen": -0.3743410110473633, "logits/rejected": -0.3734128773212433, "logps/chosen": -71.13905334472656, "logps/rejected": -139.22691345214844, "loss": 0.0534, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7334591746330261, "rewards/margins": 6.263424873352051, "rewards/rejected": -6.996884346008301, "step": 500 }, { "epoch": 2.6933333333333334, "grad_norm": 4.419618606567383, "learning_rate": 1e-06, "logits/chosen": -0.353333055973053, "logits/rejected": -0.34663665294647217, "logps/chosen": -88.02120971679688, "logps/rejected": -138.94662475585938, "loss": 0.0621, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0562851428985596, "rewards/margins": 5.7193145751953125, "rewards/rejected": -7.775599479675293, "step": 505 }, { "epoch": 2.7199999999999998, "grad_norm": 4.452930450439453, "learning_rate": 1e-06, "logits/chosen": -0.39688724279403687, "logits/rejected": -0.3942062556743622, "logps/chosen": -73.48619079589844, "logps/rejected": -114.01251220703125, "loss": 0.0936, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.882142424583435, "rewards/margins": 4.740755081176758, "rewards/rejected": -6.622898101806641, "step": 510 }, { "epoch": 2.7466666666666666, "grad_norm": 6.147470474243164, "learning_rate": 1e-06, "logits/chosen": -0.36246275901794434, "logits/rejected": -0.3594650328159332, "logps/chosen": -75.21955871582031, "logps/rejected": -138.40557861328125, "loss": 0.0508, "rewards/accuracies": 1.0, "rewards/chosen": -1.0342657566070557, "rewards/margins": 6.567025184631348, "rewards/rejected": -7.601290702819824, "step": 515 }, { "epoch": 2.7733333333333334, "grad_norm": 5.928589820861816, "learning_rate": 1e-06, "logits/chosen": -0.3479396104812622, "logits/rejected": -0.33934539556503296, "logps/chosen": -87.25235748291016, "logps/rejected": -147.29940795898438, "loss": 0.0882, "rewards/accuracies": 1.0, "rewards/chosen": -1.92647385597229, "rewards/margins": 6.275478839874268, "rewards/rejected": -8.20195198059082, "step": 520 }, { "epoch": 2.8, "grad_norm": 8.153724670410156, "learning_rate": 1e-06, "logits/chosen": -0.3809276819229126, "logits/rejected": -0.37726661562919617, "logps/chosen": -88.18365478515625, "logps/rejected": -133.28257751464844, "loss": 0.0715, "rewards/accuracies": 1.0, "rewards/chosen": -1.543839693069458, "rewards/margins": 6.156442165374756, "rewards/rejected": -7.700281620025635, "step": 525 }, { "epoch": 2.8266666666666667, "grad_norm": 7.274991512298584, "learning_rate": 1e-06, "logits/chosen": -0.39806967973709106, "logits/rejected": -0.39120930433273315, "logps/chosen": -80.41667938232422, "logps/rejected": -145.32870483398438, "loss": 0.0849, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.581139326095581, "rewards/margins": 6.18198299407959, "rewards/rejected": -7.76312255859375, "step": 530 }, { "epoch": 2.8533333333333335, "grad_norm": 6.882216453552246, "learning_rate": 1e-06, "logits/chosen": -0.3769987225532532, "logits/rejected": -0.37087202072143555, "logps/chosen": -78.6170425415039, "logps/rejected": -114.8215103149414, "loss": 0.1168, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4115530252456665, "rewards/margins": 4.665685176849365, "rewards/rejected": -6.077239036560059, "step": 535 }, { "epoch": 2.88, "grad_norm": 6.004029750823975, "learning_rate": 1e-06, "logits/chosen": -0.35111719369888306, "logits/rejected": -0.3472958207130432, "logps/chosen": -79.73258972167969, "logps/rejected": -135.81594848632812, "loss": 0.1653, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9881089925765991, "rewards/margins": 5.74216890335083, "rewards/rejected": -7.730278015136719, "step": 540 }, { "epoch": 2.9066666666666667, "grad_norm": 2.1099255084991455, "learning_rate": 1e-06, "logits/chosen": -0.3623197674751282, "logits/rejected": -0.35646793246269226, "logps/chosen": -94.4450912475586, "logps/rejected": -168.900146484375, "loss": 0.0705, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.358279228210449, "rewards/margins": 5.906458854675293, "rewards/rejected": -8.264737129211426, "step": 545 }, { "epoch": 2.9333333333333336, "grad_norm": 8.101237297058105, "learning_rate": 1e-06, "logits/chosen": -0.3904687166213989, "logits/rejected": -0.3798937499523163, "logps/chosen": -81.44142150878906, "logps/rejected": -130.85696411132812, "loss": 0.0788, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1261515617370605, "rewards/margins": 5.361577987670898, "rewards/rejected": -7.487729072570801, "step": 550 }, { "epoch": 2.96, "grad_norm": 10.77298641204834, "learning_rate": 1e-06, "logits/chosen": -0.3544600307941437, "logits/rejected": -0.3497422933578491, "logps/chosen": -75.84606170654297, "logps/rejected": -137.5305938720703, "loss": 0.093, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.848976492881775, "rewards/margins": 6.227209568023682, "rewards/rejected": -8.07618522644043, "step": 555 }, { "epoch": 2.986666666666667, "grad_norm": 1.2543996572494507, "learning_rate": 1e-06, "logits/chosen": -0.36508709192276, "logits/rejected": -0.3570384979248047, "logps/chosen": -79.93736267089844, "logps/rejected": -145.72976684570312, "loss": 0.0764, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.660933494567871, "rewards/margins": 6.339797019958496, "rewards/rejected": -8.000730514526367, "step": 560 }, { "epoch": 2.992, "step": 561, "total_flos": 0.0, "train_loss": 0.2571469163073894, "train_runtime": 1548.982, "train_samples_per_second": 11.619, "train_steps_per_second": 0.362 } ], "logging_steps": 5, "max_steps": 561, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }