{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2594458438287153, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 722.9521026611328, "epoch": 0.012594458438287154, "grad_norm": 0.07664977191949898, "kl": 0.00014810562133789064, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.46451360136270525, "reward_std": 0.10854418443050236, "rewards/wrapped_prediction_reward_func": 0.46451360136270525, "step": 5 }, { "completion_length": 730.1583602905273, "epoch": 0.02518891687657431, "grad_norm": 0.08074394124833935, "kl": 0.00022151470184326172, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.46406558752059934, "reward_std": 0.09366982954088598, "rewards/wrapped_prediction_reward_func": 0.46406558752059934, "step": 10 }, { "completion_length": 732.6021026611328, "epoch": 0.037783375314861464, "grad_norm": 0.07971439169776136, "kl": 0.00023424625396728516, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.4597253814339638, "reward_std": 0.09797948987688869, "rewards/wrapped_prediction_reward_func": 0.4597253814339638, "step": 15 }, { "completion_length": 706.5271057128906, "epoch": 0.05037783375314862, "grad_norm": 0.07675378344787744, "kl": 0.00025036334991455076, "learning_rate": 9.997377845227574e-07, "loss": 0.0, "reward": 0.47573257163167, "reward_std": 0.09649912532186136, "rewards/wrapped_prediction_reward_func": 0.47573257163167, "step": 20 }, { "completion_length": 709.5146011352539, "epoch": 0.06297229219143577, "grad_norm": 0.07792838886271755, "kl": 0.00035452842712402344, "learning_rate": 9.989514131188558e-07, "loss": 0.0, "reward": 0.4748247370123863, "reward_std": 0.08561795827117749, "rewards/wrapped_prediction_reward_func": 0.4748247370123863, "step": 25 }, { "completion_length": 746.320849609375, "epoch": 0.07556675062972293, "grad_norm": 0.0755043003664804, "kl": 0.0006094932556152344, "learning_rate": 9.97641710583307e-07, "loss": 0.0, "reward": 0.459003009647131, "reward_std": 0.09646977222291753, "rewards/wrapped_prediction_reward_func": 0.459003009647131, "step": 30 }, { "completion_length": 739.545849609375, "epoch": 0.08816120906801007, "grad_norm": 0.06831186280995111, "kl": 0.00098114013671875, "learning_rate": 9.958100506132126e-07, "loss": 0.0, "reward": 0.491700629144907, "reward_std": 0.08609421639703214, "rewards/wrapped_prediction_reward_func": 0.491700629144907, "step": 35 }, { "completion_length": 735.9604385375976, "epoch": 0.10075566750629723, "grad_norm": 0.06820840594292561, "kl": 0.00146484375, "learning_rate": 9.934583543669453e-07, "loss": 0.0, "reward": 0.4918279483914375, "reward_std": 0.10381489984574728, "rewards/wrapped_prediction_reward_func": 0.4918279483914375, "step": 40 }, { "completion_length": 761.6375183105469, "epoch": 0.11335012594458438, "grad_norm": 0.07306576306189429, "kl": 0.0023061752319335936, "learning_rate": 9.905890884491194e-07, "loss": 0.0, "reward": 0.4687023714184761, "reward_std": 0.10100088955368847, "rewards/wrapped_prediction_reward_func": 0.4687023714184761, "step": 45 }, { "completion_length": 762.1875213623047, "epoch": 0.12594458438287154, "grad_norm": 0.0803081141044505, "kl": 0.0034528732299804687, "learning_rate": 9.872052623234631e-07, "loss": 0.0, "reward": 0.47996846288442613, "reward_std": 0.09802622515708208, "rewards/wrapped_prediction_reward_func": 0.47996846288442613, "step": 50 }, { "completion_length": 790.0812713623047, "epoch": 0.1385390428211587, "grad_norm": 0.0704359174357588, "kl": 0.003958320617675782, "learning_rate": 9.833104251563055e-07, "loss": 0.0, "reward": 0.48190433755517004, "reward_std": 0.08568599077407271, "rewards/wrapped_prediction_reward_func": 0.48190433755517004, "step": 55 }, { "completion_length": 768.5521057128906, "epoch": 0.15113350125944586, "grad_norm": 0.06457778900067923, "kl": 0.005392837524414063, "learning_rate": 9.789086620939935e-07, "loss": 0.0, "reward": 0.48327849954366686, "reward_std": 0.09457279161288171, "rewards/wrapped_prediction_reward_func": 0.48327849954366686, "step": 60 }, { "completion_length": 770.9166870117188, "epoch": 0.163727959697733, "grad_norm": 0.06812633405378012, "kl": 0.006523513793945312, "learning_rate": 9.740045899781352e-07, "loss": 0.0, "reward": 0.5019405260682106, "reward_std": 0.07568035275908187, "rewards/wrapped_prediction_reward_func": 0.5019405260682106, "step": 65 }, { "completion_length": 749.1146057128906, "epoch": 0.17632241813602015, "grad_norm": 0.06685526422767288, "kl": 0.007244110107421875, "learning_rate": 9.686033525031719e-07, "loss": 0.0, "reward": 0.501540158689022, "reward_std": 0.07553914005402476, "rewards/wrapped_prediction_reward_func": 0.501540158689022, "step": 70 }, { "completion_length": 761.0937683105469, "epoch": 0.1889168765743073, "grad_norm": 0.0722156044539115, "kl": 0.007537460327148438, "learning_rate": 9.62710614821352e-07, "loss": 0.0, "reward": 0.4996995717287064, "reward_std": 0.0968896305654198, "rewards/wrapped_prediction_reward_func": 0.4996995717287064, "step": 75 }, { "completion_length": 767.5000213623047, "epoch": 0.20151133501259447, "grad_norm": 0.078058173076431, "kl": 0.0085418701171875, "learning_rate": 9.5633255760077e-07, "loss": 0.0, "reward": 0.4846237309277058, "reward_std": 0.10277664107270539, "rewards/wrapped_prediction_reward_func": 0.4846237309277058, "step": 80 }, { "completion_length": 737.7312728881836, "epoch": 0.2141057934508816, "grad_norm": 0.07298119554807302, "kl": 0.010308837890625, "learning_rate": 9.494758705426976e-07, "loss": 0.0, "reward": 0.5145397499203682, "reward_std": 0.09095026951981708, "rewards/wrapped_prediction_reward_func": 0.5145397499203682, "step": 85 }, { "completion_length": 780.7896041870117, "epoch": 0.22670025188916876, "grad_norm": 0.07249771083699665, "kl": 0.0119293212890625, "learning_rate": 9.421477453650117e-07, "loss": 0.0, "reward": 0.5147036135196685, "reward_std": 0.09051896830787882, "rewards/wrapped_prediction_reward_func": 0.5147036135196685, "step": 90 }, { "completion_length": 761.1833541870117, "epoch": 0.23929471032745592, "grad_norm": 0.07426543152283624, "kl": 0.01405029296875, "learning_rate": 9.343558682590755e-07, "loss": 0.0, "reward": 0.5229489721357823, "reward_std": 0.1045097156893462, "rewards/wrapped_prediction_reward_func": 0.5229489721357823, "step": 95 }, { "completion_length": 747.8270980834961, "epoch": 0.2518891687657431, "grad_norm": 0.08009584068360569, "kl": 0.016656494140625, "learning_rate": 9.261084118279846e-07, "loss": 0.0, "reward": 0.5308140270411968, "reward_std": 0.10999625454423949, "rewards/wrapped_prediction_reward_func": 0.5308140270411968, "step": 100 }, { "completion_length": 755.6771011352539, "epoch": 0.26448362720403024, "grad_norm": 0.0717796741561669, "kl": 0.01963958740234375, "learning_rate": 9.174140265146355e-07, "loss": 0.0, "reward": 0.5279295884072781, "reward_std": 0.12174326665699482, "rewards/wrapped_prediction_reward_func": 0.5279295884072781, "step": 105 }, { "completion_length": 741.0396072387696, "epoch": 0.2770780856423174, "grad_norm": 0.06788337142864205, "kl": 0.0238006591796875, "learning_rate": 9.082818315286054e-07, "loss": 0.0, "reward": 0.5241287641227246, "reward_std": 0.08864832047838718, "rewards/wrapped_prediction_reward_func": 0.5241287641227246, "step": 110 }, { "completion_length": 785.7208526611328, "epoch": 0.28967254408060455, "grad_norm": 0.0701122609050844, "kl": 0.025714111328125, "learning_rate": 8.987214052813603e-07, "loss": 0.0, "reward": 0.5118412531912326, "reward_std": 0.11687530118506402, "rewards/wrapped_prediction_reward_func": 0.5118412531912326, "step": 115 }, { "completion_length": 785.070849609375, "epoch": 0.3022670025188917, "grad_norm": 0.07472239714054472, "kl": 0.0284881591796875, "learning_rate": 8.887427753398247e-07, "loss": 0.0, "reward": 0.5511950686573982, "reward_std": 0.07816179413348437, "rewards/wrapped_prediction_reward_func": 0.5511950686573982, "step": 120 }, { "completion_length": 833.4291870117188, "epoch": 0.3148614609571788, "grad_norm": 0.07422495442339502, "kl": 0.027734375, "learning_rate": 8.783564079088476e-07, "loss": 0.0, "reward": 0.5402487128973007, "reward_std": 0.13062875589821488, "rewards/wrapped_prediction_reward_func": 0.5402487128973007, "step": 125 }, { "completion_length": 814.3541854858398, "epoch": 0.327455919395466, "grad_norm": 0.06823337736452399, "kl": 0.02915191650390625, "learning_rate": 8.675731968536002e-07, "loss": 0.0, "reward": 0.530899728089571, "reward_std": 0.11218150332570076, "rewards/wrapped_prediction_reward_func": 0.530899728089571, "step": 130 }, { "completion_length": 812.7791900634766, "epoch": 0.34005037783375314, "grad_norm": 0.07003585561947805, "kl": 0.03168487548828125, "learning_rate": 8.564044522734146e-07, "loss": 0.0, "reward": 0.5358106784522534, "reward_std": 0.10542880366556347, "rewards/wrapped_prediction_reward_func": 0.5358106784522534, "step": 135 }, { "completion_length": 817.2354385375977, "epoch": 0.3526448362720403, "grad_norm": 0.07278698573023756, "kl": 0.03175811767578125, "learning_rate": 8.448618886390521e-07, "loss": 0.0, "reward": 0.549003791064024, "reward_std": 0.0949756694608368, "rewards/wrapped_prediction_reward_func": 0.549003791064024, "step": 140 }, { "completion_length": 761.7271057128906, "epoch": 0.36523929471032746, "grad_norm": 0.0726454943691525, "kl": 0.0360321044921875, "learning_rate": 8.329576125058405e-07, "loss": 0.0, "reward": 0.5476619251072407, "reward_std": 0.08561794870765879, "rewards/wrapped_prediction_reward_func": 0.5476619251072407, "step": 145 }, { "completion_length": 784.9791854858398, "epoch": 0.3778337531486146, "grad_norm": 0.0734552135965303, "kl": 0.03544921875, "learning_rate": 8.207041098155699e-07, "loss": 0.0, "reward": 0.5536449149250984, "reward_std": 0.09187358255730942, "rewards/wrapped_prediction_reward_func": 0.5536449149250984, "step": 150 }, { "completion_length": 750.6729354858398, "epoch": 0.3904282115869018, "grad_norm": 0.08269464276856019, "kl": 0.0421600341796875, "learning_rate": 8.081142328004636e-07, "loss": 0.0, "reward": 0.560660557448864, "reward_std": 0.08419233104214072, "rewards/wrapped_prediction_reward_func": 0.560660557448864, "step": 155 }, { "completion_length": 736.0833541870118, "epoch": 0.40302267002518893, "grad_norm": 0.07934493494179942, "kl": 0.2113433837890625, "learning_rate": 7.952011865029613e-07, "loss": 0.0001, "reward": 0.5728602230548858, "reward_std": 0.08108794330037199, "rewards/wrapped_prediction_reward_func": 0.5728602230548858, "step": 160 }, { "completion_length": 751.6396011352539, "epoch": 0.4156171284634761, "grad_norm": 0.077727529432095, "kl": 0.0558685302734375, "learning_rate": 7.819785149254532e-07, "loss": 0.0, "reward": 0.5558124162256718, "reward_std": 0.07910064500756561, "rewards/wrapped_prediction_reward_func": 0.5558124162256718, "step": 165 }, { "completion_length": 794.9833511352539, "epoch": 0.4282115869017632, "grad_norm": 0.08198281407723804, "kl": 0.063677978515625, "learning_rate": 7.684600868244919e-07, "loss": 0.0001, "reward": 0.5688013978302479, "reward_std": 0.08710555551806465, "rewards/wrapped_prediction_reward_func": 0.5688013978302479, "step": 170 }, { "completion_length": 765.7625213623047, "epoch": 0.44080604534005036, "grad_norm": 0.07454384571252022, "kl": 0.0582550048828125, "learning_rate": 7.546600811643816e-07, "loss": 0.0001, "reward": 0.5501547582447529, "reward_std": 0.06000117462826893, "rewards/wrapped_prediction_reward_func": 0.5501547582447529, "step": 175 }, { "completion_length": 804.5687713623047, "epoch": 0.4534005037783375, "grad_norm": 0.0759962714852466, "kl": 0.051287841796875, "learning_rate": 7.405929722454025e-07, "loss": 0.0, "reward": 0.5598295979201794, "reward_std": 0.08419673591852188, "rewards/wrapped_prediction_reward_func": 0.5598295979201794, "step": 180 }, { "completion_length": 762.9896026611328, "epoch": 0.4659949622166247, "grad_norm": 0.05921819318149447, "kl": 0.059259033203125, "learning_rate": 7.262735145222695e-07, "loss": 0.0001, "reward": 0.5768641114234925, "reward_std": 0.0767961086006835, "rewards/wrapped_prediction_reward_func": 0.5768641114234925, "step": 185 }, { "completion_length": 756.3416885375976, "epoch": 0.47858942065491183, "grad_norm": 0.07564514416368207, "kl": 0.0628173828125, "learning_rate": 7.117167271287452e-07, "loss": 0.0, "reward": 0.5682952009141445, "reward_std": 0.07079082436393946, "rewards/wrapped_prediction_reward_func": 0.5682952009141445, "step": 190 }, { "completion_length": 792.5271087646485, "epoch": 0.491183879093199, "grad_norm": 0.05939427024749797, "kl": 0.0619049072265625, "learning_rate": 6.969378781246436e-07, "loss": 0.0, "reward": 0.555230350792408, "reward_std": 0.05828617985825986, "rewards/wrapped_prediction_reward_func": 0.555230350792408, "step": 195 }, { "completion_length": 805.2312698364258, "epoch": 0.5037783375314862, "grad_norm": 0.053418514095113, "kl": 0.0571746826171875, "learning_rate": 6.819524684817438e-07, "loss": 0.0001, "reward": 0.5742501869797707, "reward_std": 0.04643813910661265, "rewards/wrapped_prediction_reward_func": 0.5742501869797707, "step": 200 }, { "completion_length": 817.6541931152344, "epoch": 0.5163727959697733, "grad_norm": 0.059367612614114815, "kl": 0.0566925048828125, "learning_rate": 6.667762158254103e-07, "loss": 0.0, "reward": 0.5665707983076572, "reward_std": 0.05952281908830628, "rewards/wrapped_prediction_reward_func": 0.5665707983076572, "step": 205 }, { "completion_length": 785.7229415893555, "epoch": 0.5289672544080605, "grad_norm": 0.06832063353240915, "kl": 0.063702392578125, "learning_rate": 6.514250379489753e-07, "loss": 0.0001, "reward": 0.5705230697989464, "reward_std": 0.055411939066834746, "rewards/wrapped_prediction_reward_func": 0.5705230697989464, "step": 210 }, { "completion_length": 845.8625198364258, "epoch": 0.5415617128463476, "grad_norm": 0.059630081848652705, "kl": 0.055462646484375, "learning_rate": 6.359150361181714e-07, "loss": 0.0001, "reward": 0.5717524968087673, "reward_std": 0.058589562051929535, "rewards/wrapped_prediction_reward_func": 0.5717524968087673, "step": 215 }, { "completion_length": 842.5708572387696, "epoch": 0.5541561712846348, "grad_norm": 0.06370639502021583, "kl": 0.0563568115234375, "learning_rate": 6.202624781831268e-07, "loss": 0.0001, "reward": 0.5911780953407287, "reward_std": 0.05995338945649564, "rewards/wrapped_prediction_reward_func": 0.5911780953407287, "step": 220 }, { "completion_length": 819.0875198364258, "epoch": 0.5667506297229219, "grad_norm": 0.05972945064618038, "kl": 0.1108428955078125, "learning_rate": 6.044837815156376e-07, "loss": 0.0001, "reward": 0.5578226193785667, "reward_std": 0.04124137028120458, "rewards/wrapped_prediction_reward_func": 0.5578226193785667, "step": 225 }, { "completion_length": 815.4666931152344, "epoch": 0.5793450881612091, "grad_norm": 0.05137674910063656, "kl": 0.0625457763671875, "learning_rate": 5.885954957896115e-07, "loss": 0.0001, "reward": 0.5861903376877308, "reward_std": 0.04839982387493365, "rewards/wrapped_prediction_reward_func": 0.5861903376877308, "step": 230 }, { "completion_length": 802.9166900634766, "epoch": 0.5919395465994962, "grad_norm": 0.052644946129698264, "kl": 0.065142822265625, "learning_rate": 5.726142856227452e-07, "loss": 0.0001, "reward": 0.5952158778905868, "reward_std": 0.05475405912147835, "rewards/wrapped_prediction_reward_func": 0.5952158778905868, "step": 235 }, { "completion_length": 839.5187728881835, "epoch": 0.6045340050377834, "grad_norm": 0.0590405309579095, "kl": 0.0682373046875, "learning_rate": 5.565569130976422e-07, "loss": 0.0, "reward": 0.5894757807254791, "reward_std": 0.06357808914035559, "rewards/wrapped_prediction_reward_func": 0.5894757807254791, "step": 240 }, { "completion_length": 835.6958541870117, "epoch": 0.6171284634760705, "grad_norm": 0.047712245488407155, "kl": 0.064862060546875, "learning_rate": 5.404402201807021e-07, "loss": 0.0001, "reward": 0.5752220213413238, "reward_std": 0.04643567528109997, "rewards/wrapped_prediction_reward_func": 0.5752220213413238, "step": 245 }, { "completion_length": 821.4750274658203, "epoch": 0.6297229219143576, "grad_norm": 0.05442474889712706, "kl": 0.0702850341796875, "learning_rate": 5.242811110572242e-07, "loss": 0.0001, "reward": 0.5673072084784507, "reward_std": 0.03941638254909776, "rewards/wrapped_prediction_reward_func": 0.5673072084784507, "step": 250 }, { "completion_length": 781.9187728881836, "epoch": 0.6423173803526449, "grad_norm": 0.05809885115112598, "kl": 0.070941162109375, "learning_rate": 5.080965344012508e-07, "loss": 0.0001, "reward": 0.5991133064031601, "reward_std": 0.04335737858200446, "rewards/wrapped_prediction_reward_func": 0.5991133064031601, "step": 255 }, { "completion_length": 817.3083541870117, "epoch": 0.654911838790932, "grad_norm": 0.03682776713225834, "kl": 0.0657196044921875, "learning_rate": 4.919034655987492e-07, "loss": 0.0001, "reward": 0.5929536901414394, "reward_std": 0.06003379854373634, "rewards/wrapped_prediction_reward_func": 0.5929536901414394, "step": 260 }, { "completion_length": 808.1771072387695, "epoch": 0.6675062972292192, "grad_norm": 0.05594643798861689, "kl": 0.071002197265625, "learning_rate": 4.75718888942776e-07, "loss": 0.0001, "reward": 0.5693795874714851, "reward_std": 0.04352557165548206, "rewards/wrapped_prediction_reward_func": 0.5693795874714851, "step": 265 }, { "completion_length": 850.0666854858398, "epoch": 0.6801007556675063, "grad_norm": 0.05396780654610799, "kl": 0.066058349609375, "learning_rate": 4.595597798192979e-07, "loss": 0.0001, "reward": 0.5841031737625599, "reward_std": 0.054474068916169925, "rewards/wrapped_prediction_reward_func": 0.5841031737625599, "step": 270 }, { "completion_length": 807.5771041870117, "epoch": 0.6926952141057935, "grad_norm": 0.05692079461397134, "kl": 0.072412109375, "learning_rate": 4.434430869023579e-07, "loss": 0.0001, "reward": 0.6022952854633331, "reward_std": 0.05330510977655649, "rewards/wrapped_prediction_reward_func": 0.6022952854633331, "step": 275 }, { "completion_length": 846.0312774658203, "epoch": 0.7052896725440806, "grad_norm": 0.04955339683095041, "kl": 0.065411376953125, "learning_rate": 4.2738571437725496e-07, "loss": 0.0001, "reward": 0.5892672084271908, "reward_std": 0.05133460226934403, "rewards/wrapped_prediction_reward_func": 0.5892672084271908, "step": 280 }, { "completion_length": 781.9396057128906, "epoch": 0.7178841309823678, "grad_norm": 0.056799969581110395, "kl": 0.071551513671875, "learning_rate": 4.1140450421038866e-07, "loss": 0.0001, "reward": 0.5798937246203423, "reward_std": 0.051263501844368876, "rewards/wrapped_prediction_reward_func": 0.5798937246203423, "step": 285 }, { "completion_length": 797.3041900634765, "epoch": 0.7304785894206549, "grad_norm": 0.050561172649658176, "kl": 0.077020263671875, "learning_rate": 3.955162184843624e-07, "loss": 0.0001, "reward": 0.5860190108418465, "reward_std": 0.04162131273187697, "rewards/wrapped_prediction_reward_func": 0.5860190108418465, "step": 290 }, { "completion_length": 822.4833526611328, "epoch": 0.743073047858942, "grad_norm": 0.0434111466318861, "kl": 0.0822509765625, "learning_rate": 3.7973752181687327e-07, "loss": 0.0001, "reward": 0.5851313889026641, "reward_std": 0.05500690509798005, "rewards/wrapped_prediction_reward_func": 0.5851313889026641, "step": 295 }, { "completion_length": 855.5312805175781, "epoch": 0.7556675062972292, "grad_norm": 0.051422594050945035, "kl": 0.0636383056640625, "learning_rate": 3.640849638818285e-07, "loss": 0.0001, "reward": 0.5980591103434563, "reward_std": 0.036254264996387064, "rewards/wrapped_prediction_reward_func": 0.5980591103434563, "step": 300 }, { "completion_length": 792.4396087646485, "epoch": 0.7682619647355163, "grad_norm": 0.03997352927393124, "kl": 0.0724853515625, "learning_rate": 3.485749620510247e-07, "loss": 0.0001, "reward": 0.6093361288309097, "reward_std": 0.03319716795813292, "rewards/wrapped_prediction_reward_func": 0.6093361288309097, "step": 305 }, { "completion_length": 812.2625213623047, "epoch": 0.7808564231738035, "grad_norm": 0.058958040086389545, "kl": 0.072906494140625, "learning_rate": 3.3322378417458977e-07, "loss": 0.0001, "reward": 0.613701456785202, "reward_std": 0.041399752866709605, "rewards/wrapped_prediction_reward_func": 0.613701456785202, "step": 310 }, { "completion_length": 830.6354431152344, "epoch": 0.7934508816120907, "grad_norm": 0.03534980235662457, "kl": 0.067303466796875, "learning_rate": 3.1804753151825627e-07, "loss": 0.0001, "reward": 0.5734647884964943, "reward_std": 0.051193205296294765, "rewards/wrapped_prediction_reward_func": 0.5734647884964943, "step": 315 }, { "completion_length": 813.2833557128906, "epoch": 0.8060453400503779, "grad_norm": 0.06257699195249379, "kl": 0.068646240234375, "learning_rate": 3.030621218753565e-07, "loss": 0.0001, "reward": 0.6155773043632508, "reward_std": 0.039483394497074185, "rewards/wrapped_prediction_reward_func": 0.6155773043632508, "step": 320 }, { "completion_length": 798.3354370117188, "epoch": 0.818639798488665, "grad_norm": 0.047751764206928074, "kl": 0.0704833984375, "learning_rate": 2.8828327287125507e-07, "loss": 0.0001, "reward": 0.608567351102829, "reward_std": 0.03854468032368459, "rewards/wrapped_prediction_reward_func": 0.608567351102829, "step": 325 }, { "completion_length": 828.9812713623047, "epoch": 0.8312342569269522, "grad_norm": 0.04585758374215142, "kl": 0.0710540771484375, "learning_rate": 2.7372648547773056e-07, "loss": 0.0001, "reward": 0.5888618856668473, "reward_std": 0.03441393570974469, "rewards/wrapped_prediction_reward_func": 0.5888618856668473, "step": 330 }, { "completion_length": 821.6791931152344, "epoch": 0.8438287153652393, "grad_norm": 0.04800095851199432, "kl": 0.070928955078125, "learning_rate": 2.5940702775459744e-07, "loss": 0.0001, "reward": 0.5711221612989903, "reward_std": 0.037968299351632595, "rewards/wrapped_prediction_reward_func": 0.5711221612989903, "step": 335 }, { "completion_length": 825.0750167846679, "epoch": 0.8564231738035264, "grad_norm": 0.032868154882953755, "kl": 0.074005126953125, "learning_rate": 2.4533991883561867e-07, "loss": 0.0002, "reward": 0.5961314618587494, "reward_std": 0.02036965051665902, "rewards/wrapped_prediction_reward_func": 0.5961314618587494, "step": 340 }, { "completion_length": 789.3958572387695, "epoch": 0.8690176322418136, "grad_norm": 0.04730084864351471, "kl": 0.0718597412109375, "learning_rate": 2.3153991317550808e-07, "loss": 0.0001, "reward": 0.5693817652761937, "reward_std": 0.027137306291842835, "rewards/wrapped_prediction_reward_func": 0.5693817652761937, "step": 345 }, { "completion_length": 825.4916915893555, "epoch": 0.8816120906801007, "grad_norm": 0.05929565464749555, "kl": 0.0830902099609375, "learning_rate": 2.180214850745467e-07, "loss": 0.0001, "reward": 0.5720083937048912, "reward_std": 0.04108033460797742, "rewards/wrapped_prediction_reward_func": 0.5720083937048912, "step": 350 }, { "completion_length": 794.3854446411133, "epoch": 0.8942065491183879, "grad_norm": 0.042508843892672864, "kl": 0.0805755615234375, "learning_rate": 2.0479881349703882e-07, "loss": 0.0001, "reward": 0.6149648398160934, "reward_std": 0.028531424299580978, "rewards/wrapped_prediction_reward_func": 0.6149648398160934, "step": 355 }, { "completion_length": 828.1271102905273, "epoch": 0.906801007556675, "grad_norm": 0.13929773259081887, "kl": 0.1125732421875, "learning_rate": 1.918857671995363e-07, "loss": 0.0001, "reward": 0.5847930148243904, "reward_std": 0.044799246825277805, "rewards/wrapped_prediction_reward_func": 0.5847930148243904, "step": 360 }, { "completion_length": 843.9896087646484, "epoch": 0.9193954659949622, "grad_norm": 0.044992966442867634, "kl": 0.0656494140625, "learning_rate": 1.7929589018443014e-07, "loss": 0.0001, "reward": 0.5992885082960129, "reward_std": 0.029869268811307848, "rewards/wrapped_prediction_reward_func": 0.5992885082960129, "step": 365 }, { "completion_length": 791.6791900634765, "epoch": 0.9319899244332494, "grad_norm": 0.03456117446845568, "kl": 0.07545166015625, "learning_rate": 1.6704238749415955e-07, "loss": 0.0002, "reward": 0.5825795724987983, "reward_std": 0.021053510857746004, "rewards/wrapped_prediction_reward_func": 0.5825795724987983, "step": 370 }, { "completion_length": 795.2250274658203, "epoch": 0.9445843828715366, "grad_norm": 0.057925315822869305, "kl": 0.083660888671875, "learning_rate": 1.5513811136094785e-07, "loss": 0.0001, "reward": 0.5829600065946579, "reward_std": 0.034487396385520695, "rewards/wrapped_prediction_reward_func": 0.5829600065946579, "step": 375 }, { "completion_length": 846.0875274658204, "epoch": 0.9571788413098237, "grad_norm": 0.038214011850815244, "kl": 0.072222900390625, "learning_rate": 1.435955477265855e-07, "loss": 0.0001, "reward": 0.6045917004346848, "reward_std": 0.05368394823744893, "rewards/wrapped_prediction_reward_func": 0.6045917004346848, "step": 380 }, { "completion_length": 815.0458526611328, "epoch": 0.9697732997481109, "grad_norm": 0.030377786493206754, "kl": 0.078546142578125, "learning_rate": 1.3242680314639993e-07, "loss": 0.0001, "reward": 0.6047316044569016, "reward_std": 0.024706879258155824, "rewards/wrapped_prediction_reward_func": 0.6047316044569016, "step": 385 }, { "completion_length": 823.7146057128906, "epoch": 0.982367758186398, "grad_norm": 0.04345292422364211, "kl": 0.07333984375, "learning_rate": 1.2164359209115232e-07, "loss": 0.0001, "reward": 0.5892785102128982, "reward_std": 0.03435427148942836, "rewards/wrapped_prediction_reward_func": 0.5892785102128982, "step": 390 }, { "completion_length": 854.6458587646484, "epoch": 0.9949622166246851, "grad_norm": 0.05065816791479697, "kl": 0.0732177734375, "learning_rate": 1.1125722466017545e-07, "loss": 0.0001, "reward": 0.5809489727020264, "reward_std": 0.052034072624519465, "rewards/wrapped_prediction_reward_func": 0.5809489727020264, "step": 395 }, { "completion_length": 810.3968933105468, "epoch": 1.0075566750629723, "grad_norm": 0.03509442740105049, "kl": 0.075762939453125, "learning_rate": 1.0127859471863969e-07, "loss": 0.0001, "reward": 0.6079857856035232, "reward_std": 0.02203769111074507, "rewards/wrapped_prediction_reward_func": 0.6079857856035232, "step": 400 }, { "completion_length": 790.1479385375976, "epoch": 1.0201511335012594, "grad_norm": 0.03530726181544221, "kl": 0.0824920654296875, "learning_rate": 9.171816847139447e-08, "loss": 0.0001, "reward": 0.6192175649106503, "reward_std": 0.0369229513395112, "rewards/wrapped_prediction_reward_func": 0.6192175649106503, "step": 405 }, { "completion_length": 793.627099609375, "epoch": 1.0327455919395465, "grad_norm": 0.032582398864671, "kl": 0.077081298828125, "learning_rate": 8.258597348536451e-08, "loss": 0.0001, "reward": 0.6033682949841023, "reward_std": 0.034924795664846894, "rewards/wrapped_prediction_reward_func": 0.6033682949841023, "step": 410 }, { "completion_length": 820.5625213623047, "epoch": 1.0453400503778338, "grad_norm": 0.03594193252721311, "kl": 0.0728424072265625, "learning_rate": 7.389158817201541e-08, "loss": 0.0001, "reward": 0.5938129395246505, "reward_std": 0.025039249239489435, "rewards/wrapped_prediction_reward_func": 0.5938129395246505, "step": 415 }, { "completion_length": 791.6812713623046, "epoch": 1.057934508816121, "grad_norm": 0.2660556227962935, "kl": 0.178399658203125, "learning_rate": 6.564413174092443e-08, "loss": 0.0001, "reward": 0.5894127897918224, "reward_std": 0.026753197464859112, "rewards/wrapped_prediction_reward_func": 0.5894127897918224, "step": 420 }, { "completion_length": 816.7833587646485, "epoch": 1.070528967254408, "grad_norm": 0.04448961501333409, "kl": 0.074652099609375, "learning_rate": 5.785225463498828e-08, "loss": 0.0002, "reward": 0.6132952854037285, "reward_std": 0.04002904643421061, "rewards/wrapped_prediction_reward_func": 0.6132952854037285, "step": 425 }, { "completion_length": 848.4500198364258, "epoch": 1.0831234256926952, "grad_norm": 0.048148418977785004, "kl": 0.073956298828125, "learning_rate": 5.052412945730239e-08, "loss": 0.0001, "reward": 0.5875438123941421, "reward_std": 0.043150549242272976, "rewards/wrapped_prediction_reward_func": 0.5875438123941421, "step": 430 }, { "completion_length": 830.6750244140625, "epoch": 1.0957178841309823, "grad_norm": 0.05194640005960778, "kl": 0.082000732421875, "learning_rate": 4.366744239922998e-08, "loss": 0.0002, "reward": 0.6154551565647125, "reward_std": 0.040564915537834166, "rewards/wrapped_prediction_reward_func": 0.6154551565647125, "step": 435 }, { "completion_length": 836.8979400634765, "epoch": 1.1083123425692696, "grad_norm": 0.01997423624486904, "kl": 0.080609130859375, "learning_rate": 3.7289385178647935e-08, "loss": 0.0001, "reward": 0.60478435754776, "reward_std": 0.026485644932836293, "rewards/wrapped_prediction_reward_func": 0.60478435754776, "step": 440 }, { "completion_length": 832.4625228881836, "epoch": 1.1209068010075567, "grad_norm": 0.017870734412291134, "kl": 0.0710479736328125, "learning_rate": 3.1396647496828244e-08, "loss": 0.0001, "reward": 0.6108729064464569, "reward_std": 0.03767378572374582, "rewards/wrapped_prediction_reward_func": 0.6108729064464569, "step": 445 }, { "completion_length": 882.7062759399414, "epoch": 1.1335012594458438, "grad_norm": 0.05015364292113255, "kl": 0.0659332275390625, "learning_rate": 2.5995410021864783e-08, "loss": 0.0001, "reward": 0.5940894782543182, "reward_std": 0.04104239339358173, "rewards/wrapped_prediction_reward_func": 0.5940894782543182, "step": 450 }, { "completion_length": 820.8708557128906, "epoch": 1.146095717884131, "grad_norm": 0.03890549939275878, "kl": 0.079840087890625, "learning_rate": 2.109133790600648e-08, "loss": 0.0002, "reward": 0.6024712681770324, "reward_std": 0.03324083940824494, "rewards/wrapped_prediction_reward_func": 0.6024712681770324, "step": 455 }, { "completion_length": 816.364598083496, "epoch": 1.1586901763224182, "grad_norm": 0.03882316928789839, "kl": 0.07708740234375, "learning_rate": 1.6689574843694432e-08, "loss": 0.0001, "reward": 0.575566939264536, "reward_std": 0.049419730342924593, "rewards/wrapped_prediction_reward_func": 0.575566939264536, "step": 460 }, { "completion_length": 854.1916915893555, "epoch": 1.1712846347607053, "grad_norm": 0.018046225222633556, "kl": 0.06839599609375, "learning_rate": 1.2794737676536993e-08, "loss": 0.0001, "reward": 0.5933746129274369, "reward_std": 0.02758802007883787, "rewards/wrapped_prediction_reward_func": 0.5933746129274369, "step": 465 }, { "completion_length": 877.6416931152344, "epoch": 1.1838790931989924, "grad_norm": 0.04393883375775292, "kl": 0.064520263671875, "learning_rate": 9.410911550880474e-09, "loss": 0.0001, "reward": 0.5791490338742733, "reward_std": 0.02540514359716326, "rewards/wrapped_prediction_reward_func": 0.5791490338742733, "step": 470 }, { "completion_length": 823.2521118164062, "epoch": 1.1964735516372795, "grad_norm": 0.019200187518650894, "kl": 0.072186279296875, "learning_rate": 6.541645633054649e-09, "loss": 0.0002, "reward": 0.577340692281723, "reward_std": 0.024969300418160856, "rewards/wrapped_prediction_reward_func": 0.577340692281723, "step": 475 }, { "completion_length": 835.3208541870117, "epoch": 1.2090680100755669, "grad_norm": 0.03179886042365874, "kl": 0.07593994140625, "learning_rate": 4.189949386787462e-09, "loss": 0.0, "reward": 0.6053503692150116, "reward_std": 0.05023272330872715, "rewards/wrapped_prediction_reward_func": 0.6053503692150116, "step": 480 }, { "completion_length": 800.9750183105468, "epoch": 1.221662468513854, "grad_norm": 0.031246606223002214, "kl": 0.090142822265625, "learning_rate": 2.3582894166930267e-09, "loss": 0.0001, "reward": 0.6070939235389232, "reward_std": 0.03144086834508926, "rewards/wrapped_prediction_reward_func": 0.6070939235389232, "step": 485 }, { "completion_length": 848.7771072387695, "epoch": 1.234256926952141, "grad_norm": 0.05601067005700773, "kl": 0.07252197265625, "learning_rate": 1.0485868811441756e-09, "loss": 0.0001, "reward": 0.5712633952498436, "reward_std": 0.04291044136043638, "rewards/wrapped_prediction_reward_func": 0.5712633952498436, "step": 490 }, { "completion_length": 771.6208572387695, "epoch": 1.2468513853904282, "grad_norm": 0.03724004112267365, "kl": 0.085308837890625, "learning_rate": 2.6221547724253333e-10, "loss": 0.0001, "reward": 0.6077594101428986, "reward_std": 0.026009649853222072, "rewards/wrapped_prediction_reward_func": 0.6077594101428986, "step": 495 }, { "completion_length": 796.8458557128906, "epoch": 1.2594458438287153, "grad_norm": 0.053721210517970415, "kl": 0.0768798828125, "learning_rate": 0.0, "loss": 0.0002, "reward": 0.601429545879364, "reward_std": 0.024498459114693106, "rewards/wrapped_prediction_reward_func": 0.601429545879364, "step": 500 }, { "epoch": 1.2594458438287153, "step": 500, "total_flos": 0.0, "train_loss": 7.065190569642255e-05, "train_runtime": 24090.8459, "train_samples_per_second": 1.992, "train_steps_per_second": 0.021 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }