{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4948198546466677, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 706.4500244140625, "epoch": 0.0012370496366166692, "grad_norm": 0.07395388084773334, "kl": 0.00013015270233154296, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.2687237920239568, "reward_std": 0.27002699561417104, "rewards/wrapped_prediction_reward_func": 0.2687237920239568, "step": 5 }, { "completion_length": 688.8208526611328, "epoch": 0.0024740992732333383, "grad_norm": 0.07617417033897633, "kl": 0.00021941661834716796, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.2440833143889904, "reward_std": 0.2801619090139866, "rewards/wrapped_prediction_reward_func": 0.2440833143889904, "step": 10 }, { "completion_length": 744.8208572387696, "epoch": 0.003711148909850008, "grad_norm": 0.0819078272459872, "kl": 0.00023250579833984376, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.26395664354786275, "reward_std": 0.28396731354296206, "rewards/wrapped_prediction_reward_func": 0.26395664354786275, "step": 15 }, { "completion_length": 708.2479370117187, "epoch": 0.004948198546466677, "grad_norm": 0.076939745426713, "kl": 0.0003298521041870117, "learning_rate": 9.997377845227574e-07, "loss": 0.0, "reward": 0.26672716801986096, "reward_std": 0.2962087593972683, "rewards/wrapped_prediction_reward_func": 0.26672716801986096, "step": 20 }, { "completion_length": 708.7979354858398, "epoch": 0.006185248183083346, "grad_norm": 0.07979211449944758, "kl": 0.000590658187866211, "learning_rate": 9.989514131188558e-07, "loss": 0.0, "reward": 0.26328447796404364, "reward_std": 0.268215361982584, "rewards/wrapped_prediction_reward_func": 0.26328447796404364, "step": 25 }, { "completion_length": 699.4666870117187, "epoch": 0.007422297819700016, "grad_norm": 0.07645997382536411, "kl": 0.00087738037109375, "learning_rate": 9.97641710583307e-07, "loss": 0.0, "reward": 0.2993859386071563, "reward_std": 0.2841612804681063, "rewards/wrapped_prediction_reward_func": 0.2993859386071563, "step": 30 }, { "completion_length": 698.4104400634766, "epoch": 0.008659347456316685, "grad_norm": 0.08229005033469632, "kl": 0.0011083602905273438, "learning_rate": 9.958100506132126e-07, "loss": 0.0, "reward": 0.24814135134220122, "reward_std": 0.27837129943072797, "rewards/wrapped_prediction_reward_func": 0.24814135134220122, "step": 35 }, { "completion_length": 730.5583557128906, "epoch": 0.009896397092933353, "grad_norm": 0.08032008739221451, "kl": 0.0014670372009277343, "learning_rate": 9.934583543669453e-07, "loss": 0.0, "reward": 0.28453848622739314, "reward_std": 0.30600441098213194, "rewards/wrapped_prediction_reward_func": 0.28453848622739314, "step": 40 }, { "completion_length": 717.2500183105469, "epoch": 0.011133446729550023, "grad_norm": 0.07636479099600918, "kl": 0.0020627975463867188, "learning_rate": 9.905890884491194e-07, "loss": 0.0, "reward": 0.3236827469430864, "reward_std": 0.28328912034630777, "rewards/wrapped_prediction_reward_func": 0.3236827469430864, "step": 45 }, { "completion_length": 703.4291854858399, "epoch": 0.012370496366166692, "grad_norm": 0.08479182777264302, "kl": 0.0030239105224609377, "learning_rate": 9.872052623234631e-07, "loss": 0.0, "reward": 0.33847837764769795, "reward_std": 0.27859835997223853, "rewards/wrapped_prediction_reward_func": 0.33847837764769795, "step": 50 }, { "completion_length": 713.7958557128907, "epoch": 0.013607546002783362, "grad_norm": 0.07397783131579905, "kl": 0.004088401794433594, "learning_rate": 9.833104251563055e-07, "loss": 0.0, "reward": 0.25986622236669066, "reward_std": 0.27613487355411054, "rewards/wrapped_prediction_reward_func": 0.25986622236669066, "step": 55 }, { "completion_length": 726.5437713623047, "epoch": 0.014844595639400032, "grad_norm": 0.07426145325116877, "kl": 0.0040454864501953125, "learning_rate": 9.789086620939935e-07, "loss": 0.0, "reward": 0.2972335124388337, "reward_std": 0.2872866731137037, "rewards/wrapped_prediction_reward_func": 0.2972335124388337, "step": 60 }, { "completion_length": 731.0708602905273, "epoch": 0.0160816452760167, "grad_norm": 0.07737706819507911, "kl": 0.0043487548828125, "learning_rate": 9.740045899781352e-07, "loss": 0.0, "reward": 0.32037065653130414, "reward_std": 0.302394513040781, "rewards/wrapped_prediction_reward_func": 0.32037065653130414, "step": 65 }, { "completion_length": 719.6500183105469, "epoch": 0.01731869491263337, "grad_norm": 0.0795805434296881, "kl": 0.0043670654296875, "learning_rate": 9.686033525031719e-07, "loss": 0.0, "reward": 0.33162283562123773, "reward_std": 0.2712084986269474, "rewards/wrapped_prediction_reward_func": 0.33162283562123773, "step": 70 }, { "completion_length": 712.7771057128906, "epoch": 0.01855574454925004, "grad_norm": 0.0880575611800613, "kl": 0.005062484741210937, "learning_rate": 9.62710614821352e-07, "loss": 0.0, "reward": 0.2511910041794181, "reward_std": 0.28312002196908, "rewards/wrapped_prediction_reward_func": 0.2511910041794181, "step": 75 }, { "completion_length": 724.2229370117187, "epoch": 0.019792794185866706, "grad_norm": 0.07036361173864525, "kl": 0.0055816650390625, "learning_rate": 9.5633255760077e-07, "loss": 0.0, "reward": 0.3067868994548917, "reward_std": 0.28412686809897425, "rewards/wrapped_prediction_reward_func": 0.3067868994548917, "step": 80 }, { "completion_length": 752.7083541870118, "epoch": 0.021029843822483376, "grad_norm": 0.07199687991636121, "kl": 0.006278228759765625, "learning_rate": 9.494758705426976e-07, "loss": 0.0, "reward": 0.32415693569928405, "reward_std": 0.27934016212821006, "rewards/wrapped_prediction_reward_func": 0.32415693569928405, "step": 85 }, { "completion_length": 722.7916900634766, "epoch": 0.022266893459100046, "grad_norm": 0.08949060752280585, "kl": 0.0082305908203125, "learning_rate": 9.421477453650117e-07, "loss": 0.0, "reward": 0.24697376862168313, "reward_std": 0.27428305745124815, "rewards/wrapped_prediction_reward_func": 0.24697376862168313, "step": 90 }, { "completion_length": 722.5000183105469, "epoch": 0.023503943095716715, "grad_norm": 0.0848084970118332, "kl": 0.00894317626953125, "learning_rate": 9.343558682590755e-07, "loss": 0.0, "reward": 0.28274349831044676, "reward_std": 0.2850473988801241, "rewards/wrapped_prediction_reward_func": 0.28274349831044676, "step": 95 }, { "completion_length": 742.6771026611328, "epoch": 0.024740992732333385, "grad_norm": 0.07208469639827622, "kl": 0.009989166259765625, "learning_rate": 9.261084118279846e-07, "loss": 0.0, "reward": 0.3533043290488422, "reward_std": 0.2748906321823597, "rewards/wrapped_prediction_reward_func": 0.3533043290488422, "step": 100 }, { "completion_length": 763.0521026611328, "epoch": 0.025978042368950054, "grad_norm": 0.08257655118000982, "kl": 0.0105560302734375, "learning_rate": 9.174140265146355e-07, "loss": 0.0, "reward": 0.3249462183564901, "reward_std": 0.28961299136281016, "rewards/wrapped_prediction_reward_func": 0.3249462183564901, "step": 105 }, { "completion_length": 740.0521057128906, "epoch": 0.027215092005566724, "grad_norm": 0.0762279359554735, "kl": 0.01315155029296875, "learning_rate": 9.082818315286054e-07, "loss": 0.0, "reward": 0.3339743584394455, "reward_std": 0.2952582038938999, "rewards/wrapped_prediction_reward_func": 0.3339743584394455, "step": 110 }, { "completion_length": 757.6833557128906, "epoch": 0.028452141642183394, "grad_norm": 0.08396678957726936, "kl": 0.0153076171875, "learning_rate": 8.987214052813603e-07, "loss": 0.0, "reward": 0.33160307751968504, "reward_std": 0.29202965721488, "rewards/wrapped_prediction_reward_func": 0.33160307751968504, "step": 115 }, { "completion_length": 749.6187683105469, "epoch": 0.029689191278800063, "grad_norm": 0.07596631346489087, "kl": 0.01697845458984375, "learning_rate": 8.887427753398247e-07, "loss": 0.0, "reward": 0.3451491242274642, "reward_std": 0.29567699022591115, "rewards/wrapped_prediction_reward_func": 0.3451491242274642, "step": 120 }, { "completion_length": 754.2937698364258, "epoch": 0.030926240915416733, "grad_norm": 0.07804459830763263, "kl": 0.0177276611328125, "learning_rate": 8.783564079088476e-07, "loss": 0.0, "reward": 0.28280929680913686, "reward_std": 0.27272967286407945, "rewards/wrapped_prediction_reward_func": 0.28280929680913686, "step": 125 }, { "completion_length": 738.1771072387695, "epoch": 0.0321632905520334, "grad_norm": 0.08343258118775494, "kl": 0.0186279296875, "learning_rate": 8.675731968536002e-07, "loss": 0.0, "reward": 0.36268146373331545, "reward_std": 0.30270115882158277, "rewards/wrapped_prediction_reward_func": 0.36268146373331545, "step": 130 }, { "completion_length": 744.5104385375977, "epoch": 0.03340034018865007, "grad_norm": 0.08347592360346824, "kl": 0.01798858642578125, "learning_rate": 8.564044522734146e-07, "loss": 0.0, "reward": 0.30483949054032566, "reward_std": 0.2955338716506958, "rewards/wrapped_prediction_reward_func": 0.30483949054032566, "step": 135 }, { "completion_length": 774.2041900634765, "epoch": 0.03463738982526674, "grad_norm": 0.08210188698222519, "kl": 0.01839752197265625, "learning_rate": 8.448618886390521e-07, "loss": 0.0, "reward": 0.33811669340357187, "reward_std": 0.28610839396715165, "rewards/wrapped_prediction_reward_func": 0.33811669340357187, "step": 140 }, { "completion_length": 771.485432434082, "epoch": 0.03587443946188341, "grad_norm": 0.08231309017722833, "kl": 0.0191131591796875, "learning_rate": 8.329576125058405e-07, "loss": 0.0, "reward": 0.32734864968806504, "reward_std": 0.3108094479888678, "rewards/wrapped_prediction_reward_func": 0.32734864968806504, "step": 145 }, { "completion_length": 762.2562698364258, "epoch": 0.03711148909850008, "grad_norm": 0.08035168896650213, "kl": 0.0212799072265625, "learning_rate": 8.207041098155699e-07, "loss": 0.0, "reward": 0.326921210065484, "reward_std": 0.291541776061058, "rewards/wrapped_prediction_reward_func": 0.326921210065484, "step": 150 }, { "completion_length": 725.4208541870117, "epoch": 0.03834853873511675, "grad_norm": 0.08547192597275226, "kl": 0.0233428955078125, "learning_rate": 8.081142328004636e-07, "loss": 0.0, "reward": 0.3439413908869028, "reward_std": 0.29944013953208926, "rewards/wrapped_prediction_reward_func": 0.3439413908869028, "step": 155 }, { "completion_length": 759.2291854858398, "epoch": 0.03958558837173341, "grad_norm": 0.08506426381536784, "kl": 0.02423095703125, "learning_rate": 7.952011865029613e-07, "loss": 0.0, "reward": 0.3297223553061485, "reward_std": 0.3085209414362907, "rewards/wrapped_prediction_reward_func": 0.3297223553061485, "step": 160 }, { "completion_length": 758.3500198364258, "epoch": 0.040822638008350086, "grad_norm": 0.08034425112065993, "kl": 0.0279388427734375, "learning_rate": 7.819785149254532e-07, "loss": 0.0, "reward": 0.35085770562291146, "reward_std": 0.2952751278877258, "rewards/wrapped_prediction_reward_func": 0.35085770562291146, "step": 165 }, { "completion_length": 743.0666854858398, "epoch": 0.04205968764496675, "grad_norm": 0.08511673303593013, "kl": 0.02681884765625, "learning_rate": 7.684600868244919e-07, "loss": 0.0, "reward": 0.3263757932931185, "reward_std": 0.2954482719302177, "rewards/wrapped_prediction_reward_func": 0.3263757932931185, "step": 170 }, { "completion_length": 738.7437698364258, "epoch": 0.043296737281583425, "grad_norm": 0.08796509264783618, "kl": 0.027349853515625, "learning_rate": 7.546600811643816e-07, "loss": 0.0, "reward": 0.3590401239693165, "reward_std": 0.308769679069519, "rewards/wrapped_prediction_reward_func": 0.3590401239693165, "step": 175 }, { "completion_length": 747.3666885375976, "epoch": 0.04453378691820009, "grad_norm": 0.08179670112302775, "kl": 0.0264739990234375, "learning_rate": 7.405929722454025e-07, "loss": 0.0, "reward": 0.38826956897974013, "reward_std": 0.3231815077364445, "rewards/wrapped_prediction_reward_func": 0.38826956897974013, "step": 180 }, { "completion_length": 742.806265258789, "epoch": 0.045770836554816764, "grad_norm": 0.08578134015011815, "kl": 0.02674560546875, "learning_rate": 7.262735145222695e-07, "loss": 0.0, "reward": 0.3505730252712965, "reward_std": 0.2775091715157032, "rewards/wrapped_prediction_reward_func": 0.3505730252712965, "step": 185 }, { "completion_length": 718.962516784668, "epoch": 0.04700788619143343, "grad_norm": 0.07994897184415051, "kl": 0.0296417236328125, "learning_rate": 7.117167271287452e-07, "loss": 0.0, "reward": 0.35921268798410894, "reward_std": 0.30513893701136113, "rewards/wrapped_prediction_reward_func": 0.35921268798410894, "step": 190 }, { "completion_length": 748.2687713623047, "epoch": 0.0482449358280501, "grad_norm": 0.0865770688943119, "kl": 0.028778076171875, "learning_rate": 6.969378781246436e-07, "loss": 0.0, "reward": 0.32933947034180167, "reward_std": 0.30865853317081926, "rewards/wrapped_prediction_reward_func": 0.32933947034180167, "step": 195 }, { "completion_length": 759.0875183105469, "epoch": 0.04948198546466677, "grad_norm": 0.08067297784748526, "kl": 0.0297637939453125, "learning_rate": 6.819524684817438e-07, "loss": 0.0, "reward": 0.36759350784122946, "reward_std": 0.3031803611665964, "rewards/wrapped_prediction_reward_func": 0.36759350784122946, "step": 200 }, { "completion_length": 743.3833541870117, "epoch": 0.050719035101283436, "grad_norm": 0.08733780092177663, "kl": 0.0316497802734375, "learning_rate": 6.667762158254103e-07, "loss": 0.0, "reward": 0.3384020393714309, "reward_std": 0.2984684105962515, "rewards/wrapped_prediction_reward_func": 0.3384020393714309, "step": 205 }, { "completion_length": 773.5250213623046, "epoch": 0.05195608473790011, "grad_norm": 0.0806218167874605, "kl": 0.030438232421875, "learning_rate": 6.514250379489753e-07, "loss": 0.0, "reward": 0.3939262267202139, "reward_std": 0.30142598897218703, "rewards/wrapped_prediction_reward_func": 0.3939262267202139, "step": 210 }, { "completion_length": 747.8646072387695, "epoch": 0.053193134374516775, "grad_norm": 0.09049898430027699, "kl": 0.0313079833984375, "learning_rate": 6.359150361181714e-07, "loss": 0.0, "reward": 0.32692537792027, "reward_std": 0.3000472154468298, "rewards/wrapped_prediction_reward_func": 0.32692537792027, "step": 215 }, { "completion_length": 742.7000167846679, "epoch": 0.05443018401113345, "grad_norm": 0.08423570258982128, "kl": 0.035028076171875, "learning_rate": 6.202624781831268e-07, "loss": 0.0, "reward": 0.3765925022773445, "reward_std": 0.2830514904111624, "rewards/wrapped_prediction_reward_func": 0.3765925022773445, "step": 220 }, { "completion_length": 797.6562728881836, "epoch": 0.055667233647750114, "grad_norm": 0.07736975706873805, "kl": 0.0339813232421875, "learning_rate": 6.044837815156376e-07, "loss": 0.0, "reward": 0.40539272837340834, "reward_std": 0.3082651127129793, "rewards/wrapped_prediction_reward_func": 0.40539272837340834, "step": 225 }, { "completion_length": 780.8875167846679, "epoch": 0.05690428328436679, "grad_norm": 0.07827098940866907, "kl": 0.0318695068359375, "learning_rate": 5.885954957896115e-07, "loss": 0.0, "reward": 0.38181981910020113, "reward_std": 0.2932945691049099, "rewards/wrapped_prediction_reward_func": 0.38181981910020113, "step": 230 }, { "completion_length": 754.2896072387696, "epoch": 0.05814133292098345, "grad_norm": 0.0826828904689384, "kl": 0.034869384765625, "learning_rate": 5.726142856227452e-07, "loss": 0.0, "reward": 0.3463834885507822, "reward_std": 0.28481542877852917, "rewards/wrapped_prediction_reward_func": 0.3463834885507822, "step": 235 }, { "completion_length": 771.1562713623047, "epoch": 0.059378382557600126, "grad_norm": 0.07713887324957552, "kl": 0.0342132568359375, "learning_rate": 5.565569130976422e-07, "loss": 0.0, "reward": 0.3932892482727766, "reward_std": 0.28364565074443815, "rewards/wrapped_prediction_reward_func": 0.3932892482727766, "step": 240 }, { "completion_length": 774.7646041870117, "epoch": 0.06061543219421679, "grad_norm": 0.08270494108267515, "kl": 0.0363555908203125, "learning_rate": 5.404402201807021e-07, "loss": 0.0, "reward": 0.32618654407560826, "reward_std": 0.2739331152290106, "rewards/wrapped_prediction_reward_func": 0.32618654407560826, "step": 245 }, { "completion_length": 768.7041900634765, "epoch": 0.061852481830833465, "grad_norm": 0.08274418456934633, "kl": 0.0361663818359375, "learning_rate": 5.242811110572242e-07, "loss": 0.0, "reward": 0.34980231150984764, "reward_std": 0.28961083069443705, "rewards/wrapped_prediction_reward_func": 0.34980231150984764, "step": 250 }, { "completion_length": 772.8771057128906, "epoch": 0.06308953146745012, "grad_norm": 0.08218602446897527, "kl": 0.037335205078125, "learning_rate": 5.080965344012508e-07, "loss": 0.0, "reward": 0.3701079810038209, "reward_std": 0.2713898040354252, "rewards/wrapped_prediction_reward_func": 0.3701079810038209, "step": 255 }, { "completion_length": 778.5416854858398, "epoch": 0.0643265811040668, "grad_norm": 0.07617862186579623, "kl": 0.0373199462890625, "learning_rate": 4.919034655987492e-07, "loss": 0.0, "reward": 0.3581130625680089, "reward_std": 0.2770910169929266, "rewards/wrapped_prediction_reward_func": 0.3581130625680089, "step": 260 }, { "completion_length": 780.9250213623047, "epoch": 0.06556363074068347, "grad_norm": 0.07656477927043985, "kl": 0.03756103515625, "learning_rate": 4.75718888942776e-07, "loss": 0.0, "reward": 0.36717594107612966, "reward_std": 0.28662597499787806, "rewards/wrapped_prediction_reward_func": 0.36717594107612966, "step": 265 }, { "completion_length": 781.2896041870117, "epoch": 0.06680068037730014, "grad_norm": 0.08014122515360159, "kl": 0.038311767578125, "learning_rate": 4.595597798192979e-07, "loss": 0.0, "reward": 0.3816223323345184, "reward_std": 0.271556431427598, "rewards/wrapped_prediction_reward_func": 0.3816223323345184, "step": 270 }, { "completion_length": 798.4146026611328, "epoch": 0.0680377300139168, "grad_norm": 0.07982046766678585, "kl": 0.0353759765625, "learning_rate": 4.434430869023579e-07, "loss": 0.0, "reward": 0.4386338355951011, "reward_std": 0.2730592340230942, "rewards/wrapped_prediction_reward_func": 0.4386338355951011, "step": 275 }, { "completion_length": 785.2729354858399, "epoch": 0.06927477965053348, "grad_norm": 0.08494904461725931, "kl": 0.0390533447265625, "learning_rate": 4.2738571437725496e-07, "loss": 0.0, "reward": 0.35094334864988924, "reward_std": 0.28738664388656615, "rewards/wrapped_prediction_reward_func": 0.35094334864988924, "step": 280 }, { "completion_length": 782.0437713623047, "epoch": 0.07051182928715015, "grad_norm": 0.0803877655033883, "kl": 0.0365753173828125, "learning_rate": 4.1140450421038866e-07, "loss": 0.0, "reward": 0.3688418876379728, "reward_std": 0.2656211916357279, "rewards/wrapped_prediction_reward_func": 0.3688418876379728, "step": 285 }, { "completion_length": 768.6500213623046, "epoch": 0.07174887892376682, "grad_norm": 0.08304158927054404, "kl": 0.0390380859375, "learning_rate": 3.955162184843624e-07, "loss": 0.0, "reward": 0.39955376647412777, "reward_std": 0.28139798939228056, "rewards/wrapped_prediction_reward_func": 0.39955376647412777, "step": 290 }, { "completion_length": 789.9041854858399, "epoch": 0.07298592856038348, "grad_norm": 0.07747208474050314, "kl": 0.037847900390625, "learning_rate": 3.7973752181687327e-07, "loss": 0.0, "reward": 0.38146291133016347, "reward_std": 0.26271351538598536, "rewards/wrapped_prediction_reward_func": 0.38146291133016347, "step": 295 }, { "completion_length": 783.5833541870118, "epoch": 0.07422297819700016, "grad_norm": 0.08812425557340961, "kl": 0.039569091796875, "learning_rate": 3.640849638818285e-07, "loss": 0.0, "reward": 0.392831420712173, "reward_std": 0.27882115095853804, "rewards/wrapped_prediction_reward_func": 0.392831420712173, "step": 300 }, { "completion_length": 766.4041885375976, "epoch": 0.07546002783361683, "grad_norm": 0.08528324272383152, "kl": 0.04200439453125, "learning_rate": 3.485749620510247e-07, "loss": 0.0, "reward": 0.32817457215860485, "reward_std": 0.28904502242803576, "rewards/wrapped_prediction_reward_func": 0.32817457215860485, "step": 305 }, { "completion_length": 795.2208572387696, "epoch": 0.0766970774702335, "grad_norm": 0.0768071024120368, "kl": 0.03858642578125, "learning_rate": 3.3322378417458977e-07, "loss": 0.0, "reward": 0.4111718636006117, "reward_std": 0.28420086577534676, "rewards/wrapped_prediction_reward_func": 0.4111718636006117, "step": 310 }, { "completion_length": 782.3541900634766, "epoch": 0.07793412710685016, "grad_norm": 0.0844336302569656, "kl": 0.037799072265625, "learning_rate": 3.1804753151825627e-07, "loss": 0.0, "reward": 0.4124441655352712, "reward_std": 0.2818894196301699, "rewards/wrapped_prediction_reward_func": 0.4124441655352712, "step": 315 }, { "completion_length": 793.0646057128906, "epoch": 0.07917117674346683, "grad_norm": 0.08066624577311658, "kl": 0.0383544921875, "learning_rate": 3.030621218753565e-07, "loss": 0.0, "reward": 0.3580753861926496, "reward_std": 0.2660487644374371, "rewards/wrapped_prediction_reward_func": 0.3580753861926496, "step": 320 }, { "completion_length": 772.2937713623047, "epoch": 0.0804082263800835, "grad_norm": 0.08605776021258232, "kl": 0.03887939453125, "learning_rate": 2.8828327287125507e-07, "loss": 0.0, "reward": 0.4216658443212509, "reward_std": 0.24999102614820004, "rewards/wrapped_prediction_reward_func": 0.4216658443212509, "step": 325 }, { "completion_length": 756.6062744140625, "epoch": 0.08164527601670017, "grad_norm": 0.08228331152607465, "kl": 0.0389862060546875, "learning_rate": 2.7372648547773056e-07, "loss": 0.0, "reward": 0.3480538224801421, "reward_std": 0.290723355486989, "rewards/wrapped_prediction_reward_func": 0.3480538224801421, "step": 330 }, { "completion_length": 756.4979415893555, "epoch": 0.08288232565331684, "grad_norm": 0.07897908185759743, "kl": 0.0384063720703125, "learning_rate": 2.5940702775459744e-07, "loss": 0.0, "reward": 0.43084534499794247, "reward_std": 0.24301714226603507, "rewards/wrapped_prediction_reward_func": 0.43084534499794247, "step": 335 }, { "completion_length": 770.8208541870117, "epoch": 0.0841193752899335, "grad_norm": 0.08022436158476051, "kl": 0.037506103515625, "learning_rate": 2.4533991883561867e-07, "loss": 0.0, "reward": 0.36631291452795267, "reward_std": 0.2661047846078873, "rewards/wrapped_prediction_reward_func": 0.36631291452795267, "step": 340 }, { "completion_length": 802.768765258789, "epoch": 0.08535642492655018, "grad_norm": 0.07716945019631045, "kl": 0.0375732421875, "learning_rate": 2.3153991317550808e-07, "loss": 0.0, "reward": 0.3613167183473706, "reward_std": 0.2558280661702156, "rewards/wrapped_prediction_reward_func": 0.3613167183473706, "step": 345 }, { "completion_length": 783.9896057128906, "epoch": 0.08659347456316685, "grad_norm": 0.08090571273012268, "kl": 0.0387115478515625, "learning_rate": 2.180214850745467e-07, "loss": 0.0, "reward": 0.3723128205165267, "reward_std": 0.2817312143743038, "rewards/wrapped_prediction_reward_func": 0.3723128205165267, "step": 350 }, { "completion_length": 796.6708541870117, "epoch": 0.08783052419978352, "grad_norm": 0.08318196973274349, "kl": 0.0369293212890625, "learning_rate": 2.0479881349703882e-07, "loss": 0.0, "reward": 0.39644258581101893, "reward_std": 0.2782046254724264, "rewards/wrapped_prediction_reward_func": 0.39644258581101893, "step": 355 }, { "completion_length": 791.4229400634765, "epoch": 0.08906757383640018, "grad_norm": 0.08028917182552821, "kl": 0.0388916015625, "learning_rate": 1.918857671995363e-07, "loss": 0.0, "reward": 0.3609810034744442, "reward_std": 0.2516393680125475, "rewards/wrapped_prediction_reward_func": 0.3609810034744442, "step": 360 }, { "completion_length": 809.1104385375977, "epoch": 0.09030462347301685, "grad_norm": 0.07242816103482426, "kl": 0.040625, "learning_rate": 1.7929589018443014e-07, "loss": 0.0, "reward": 0.42961078975349665, "reward_std": 0.22062474824488162, "rewards/wrapped_prediction_reward_func": 0.42961078975349665, "step": 365 }, { "completion_length": 790.6604354858398, "epoch": 0.09154167310963353, "grad_norm": 0.06518629761186134, "kl": 0.0382080078125, "learning_rate": 1.6704238749415955e-07, "loss": 0.0, "reward": 0.4093520544469357, "reward_std": 0.2597980920225382, "rewards/wrapped_prediction_reward_func": 0.4093520544469357, "step": 370 }, { "completion_length": 814.1271087646485, "epoch": 0.0927787227462502, "grad_norm": 0.07628172339508517, "kl": 0.0365966796875, "learning_rate": 1.5513811136094785e-07, "loss": 0.0, "reward": 0.3505824860185385, "reward_std": 0.27299620136618613, "rewards/wrapped_prediction_reward_func": 0.3505824860185385, "step": 375 }, { "completion_length": 783.1896026611328, "epoch": 0.09401577238286686, "grad_norm": 0.08117291333238552, "kl": 0.0421905517578125, "learning_rate": 1.435955477265855e-07, "loss": 0.0, "reward": 0.37833018638193605, "reward_std": 0.25184023678302764, "rewards/wrapped_prediction_reward_func": 0.37833018638193605, "step": 380 }, { "completion_length": 770.8104370117187, "epoch": 0.09525282201948353, "grad_norm": 0.07871228753758207, "kl": 0.03914794921875, "learning_rate": 1.3242680314639993e-07, "loss": 0.0, "reward": 0.40263458620756865, "reward_std": 0.267032390832901, "rewards/wrapped_prediction_reward_func": 0.40263458620756865, "step": 385 }, { "completion_length": 773.722932434082, "epoch": 0.0964898716561002, "grad_norm": 0.07816804053249117, "kl": 0.0387786865234375, "learning_rate": 1.2164359209115232e-07, "loss": 0.0, "reward": 0.37163355564698575, "reward_std": 0.27504593059420585, "rewards/wrapped_prediction_reward_func": 0.37163355564698575, "step": 390 }, { "completion_length": 805.1875228881836, "epoch": 0.09772692129271687, "grad_norm": 0.08176318130659695, "kl": 0.03763427734375, "learning_rate": 1.1125722466017545e-07, "loss": 0.0, "reward": 0.33388274917379024, "reward_std": 0.24225291460752488, "rewards/wrapped_prediction_reward_func": 0.33388274917379024, "step": 395 }, { "completion_length": 805.0125213623047, "epoch": 0.09896397092933354, "grad_norm": 0.07854195016666575, "kl": 0.03751220703125, "learning_rate": 1.0127859471863969e-07, "loss": 0.0, "reward": 0.3642613720148802, "reward_std": 0.2789838884025812, "rewards/wrapped_prediction_reward_func": 0.3642613720148802, "step": 400 }, { "completion_length": 798.8812683105468, "epoch": 0.1002010205659502, "grad_norm": 0.07989341104679692, "kl": 0.03834228515625, "learning_rate": 9.171816847139447e-08, "loss": 0.0, "reward": 0.4235969323664904, "reward_std": 0.25955073721706867, "rewards/wrapped_prediction_reward_func": 0.4235969323664904, "step": 405 }, { "completion_length": 791.5479385375977, "epoch": 0.10143807020256687, "grad_norm": 0.08048023695623538, "kl": 0.0393402099609375, "learning_rate": 8.258597348536451e-08, "loss": 0.0, "reward": 0.3954638376832008, "reward_std": 0.24767913483083248, "rewards/wrapped_prediction_reward_func": 0.3954638376832008, "step": 410 }, { "completion_length": 817.8625198364258, "epoch": 0.10267511983918355, "grad_norm": 0.08216061039091842, "kl": 0.0398895263671875, "learning_rate": 7.389158817201541e-08, "loss": 0.0, "reward": 0.3182598289102316, "reward_std": 0.2491139207035303, "rewards/wrapped_prediction_reward_func": 0.3182598289102316, "step": 415 }, { "completion_length": 826.4541885375977, "epoch": 0.10391216947580022, "grad_norm": 0.08142431509048977, "kl": 0.039398193359375, "learning_rate": 6.564413174092443e-08, "loss": 0.0, "reward": 0.39901285879313947, "reward_std": 0.24879398196935654, "rewards/wrapped_prediction_reward_func": 0.39901285879313947, "step": 420 }, { "completion_length": 809.1500213623046, "epoch": 0.10514921911241688, "grad_norm": 0.08029175453124296, "kl": 0.0395721435546875, "learning_rate": 5.785225463498828e-08, "loss": 0.0, "reward": 0.3680599992163479, "reward_std": 0.20950759314000605, "rewards/wrapped_prediction_reward_func": 0.3680599992163479, "step": 425 }, { "completion_length": 793.9958541870117, "epoch": 0.10638626874903355, "grad_norm": 0.07686355454055262, "kl": 0.03958740234375, "learning_rate": 5.052412945730239e-08, "loss": 0.0, "reward": 0.34870776841416956, "reward_std": 0.23834120780229567, "rewards/wrapped_prediction_reward_func": 0.34870776841416956, "step": 430 }, { "completion_length": 784.295849609375, "epoch": 0.10762331838565023, "grad_norm": 0.07235699039223786, "kl": 0.0407958984375, "learning_rate": 4.366744239922998e-08, "loss": 0.0, "reward": 0.396230211853981, "reward_std": 0.26439183205366135, "rewards/wrapped_prediction_reward_func": 0.396230211853981, "step": 435 }, { "completion_length": 790.9791824340821, "epoch": 0.1088603680222669, "grad_norm": 0.0769781526573448, "kl": 0.0420654296875, "learning_rate": 3.7289385178647935e-08, "loss": 0.0, "reward": 0.4468363219872117, "reward_std": 0.25682389736175537, "rewards/wrapped_prediction_reward_func": 0.4468363219872117, "step": 440 }, { "completion_length": 786.5437728881836, "epoch": 0.11009741765888356, "grad_norm": 0.08235585919602932, "kl": 0.040509033203125, "learning_rate": 3.1396647496828244e-08, "loss": 0.0, "reward": 0.35082917027175425, "reward_std": 0.2520928043872118, "rewards/wrapped_prediction_reward_func": 0.35082917027175425, "step": 445 }, { "completion_length": 769.989599609375, "epoch": 0.11133446729550023, "grad_norm": 0.07935213941797192, "kl": 0.040728759765625, "learning_rate": 2.5995410021864783e-08, "loss": 0.0, "reward": 0.3685959761030972, "reward_std": 0.2553810145705938, "rewards/wrapped_prediction_reward_func": 0.3685959761030972, "step": 450 }, { "completion_length": 774.2687698364258, "epoch": 0.11257151693211691, "grad_norm": 0.08104915119350102, "kl": 0.043157958984375, "learning_rate": 2.109133790600648e-08, "loss": 0.0, "reward": 0.4045447456650436, "reward_std": 0.22608437277376653, "rewards/wrapped_prediction_reward_func": 0.4045447456650436, "step": 455 }, { "completion_length": 778.7187713623047, "epoch": 0.11380856656873357, "grad_norm": 0.08508676755947016, "kl": 0.0403411865234375, "learning_rate": 1.6689574843694432e-08, "loss": 0.0, "reward": 0.41044446006417273, "reward_std": 0.2603937637060881, "rewards/wrapped_prediction_reward_func": 0.41044446006417273, "step": 460 }, { "completion_length": 773.2146057128906, "epoch": 0.11504561620535024, "grad_norm": 0.08389470478912907, "kl": 0.04073486328125, "learning_rate": 1.2794737676536993e-08, "loss": 0.0, "reward": 0.41877091517671944, "reward_std": 0.24430075138807297, "rewards/wrapped_prediction_reward_func": 0.41877091517671944, "step": 465 }, { "completion_length": 815.6666854858398, "epoch": 0.1162826658419669, "grad_norm": 0.08008276331100919, "kl": 0.0399566650390625, "learning_rate": 9.410911550880474e-09, "loss": 0.0, "reward": 0.41310662887990474, "reward_std": 0.24822358265519143, "rewards/wrapped_prediction_reward_func": 0.41310662887990474, "step": 470 }, { "completion_length": 823.8187683105468, "epoch": 0.11751971547858357, "grad_norm": 0.07768470157971422, "kl": 0.038525390625, "learning_rate": 6.541645633054649e-09, "loss": 0.0, "reward": 0.417940921895206, "reward_std": 0.2483165491372347, "rewards/wrapped_prediction_reward_func": 0.417940921895206, "step": 475 }, { "completion_length": 769.9375198364257, "epoch": 0.11875676511520025, "grad_norm": 0.061168943284836415, "kl": 0.0412109375, "learning_rate": 4.189949386787462e-09, "loss": 0.0, "reward": 0.3644705488346517, "reward_std": 0.24229183606803417, "rewards/wrapped_prediction_reward_func": 0.3644705488346517, "step": 480 }, { "completion_length": 769.1396057128907, "epoch": 0.11999381475181692, "grad_norm": 0.07733858326687615, "kl": 0.04036865234375, "learning_rate": 2.3582894166930267e-09, "loss": 0.0, "reward": 0.4005460660904646, "reward_std": 0.25129300765693185, "rewards/wrapped_prediction_reward_func": 0.4005460660904646, "step": 485 }, { "completion_length": 787.2479385375976, "epoch": 0.12123086438843358, "grad_norm": 0.06128073169280309, "kl": 0.040618896484375, "learning_rate": 1.0485868811441756e-09, "loss": 0.0, "reward": 0.4271965937688947, "reward_std": 0.23377392888069154, "rewards/wrapped_prediction_reward_func": 0.4271965937688947, "step": 490 }, { "completion_length": 781.3208557128906, "epoch": 0.12246791402505025, "grad_norm": 0.07846556728701895, "kl": 0.040643310546875, "learning_rate": 2.6221547724253333e-10, "loss": 0.0, "reward": 0.34105976140126587, "reward_std": 0.24784295596182346, "rewards/wrapped_prediction_reward_func": 0.34105976140126587, "step": 495 }, { "completion_length": 794.2500198364257, "epoch": 0.12370496366166693, "grad_norm": 0.08430129800965754, "kl": 0.040234375, "learning_rate": 0.0, "loss": 0.0, "reward": 0.4289707732386887, "reward_std": 0.2418113723397255, "rewards/wrapped_prediction_reward_func": 0.4289707732386887, "step": 500 }, { "completion_length": 770.7791885375976, "epoch": 0.1249420132982836, "grad_norm": 0.0766438290633719, "kl": 0.037908935546875, "learning_rate": 8.75697374457722e-07, "loss": 0.0, "reward": 0.3715986987575889, "reward_std": 0.23276742324233055, "rewards/wrapped_prediction_reward_func": 0.3715986987575889, "step": 505 }, { "completion_length": 785.1041854858398, "epoch": 0.12617906293490025, "grad_norm": 0.0825692500772398, "kl": 0.04063720703125, "learning_rate": 8.73013710571623e-07, "loss": 0.0, "reward": 0.39223422314971684, "reward_std": 0.2475352793931961, "rewards/wrapped_prediction_reward_func": 0.39223422314971684, "step": 510 }, { "completion_length": 797.4375198364257, "epoch": 0.12741611257151694, "grad_norm": 0.08008720883542288, "kl": 0.0422119140625, "learning_rate": 8.703055921895199e-07, "loss": 0.0, "reward": 0.39032291090115906, "reward_std": 0.24012117199599742, "rewards/wrapped_prediction_reward_func": 0.39032291090115906, "step": 515 }, { "completion_length": 803.2333526611328, "epoch": 0.1286531622081336, "grad_norm": 0.07992687380521488, "kl": 0.044830322265625, "learning_rate": 8.675731968536002e-07, "loss": 0.0, "reward": 0.40386023046448827, "reward_std": 0.26609053984284403, "rewards/wrapped_prediction_reward_func": 0.40386023046448827, "step": 520 }, { "completion_length": 807.8625198364258, "epoch": 0.12989021184475028, "grad_norm": 0.0651664900707625, "kl": 0.04769287109375, "learning_rate": 8.648167036976302e-07, "loss": 0.0, "reward": 0.4148947631008923, "reward_std": 0.24836577251553535, "rewards/wrapped_prediction_reward_func": 0.4148947631008923, "step": 525 }, { "completion_length": 787.3375198364258, "epoch": 0.13112726148136694, "grad_norm": 0.0804633427575449, "kl": 0.047113037109375, "learning_rate": 8.620362934352108e-07, "loss": 0.0, "reward": 0.38152020014822485, "reward_std": 0.19838638603687286, "rewards/wrapped_prediction_reward_func": 0.38152020014822485, "step": 530 }, { "completion_length": 819.6854354858399, "epoch": 0.1323643111179836, "grad_norm": 0.06798793868581916, "kl": 0.051385498046875, "learning_rate": 8.592321483479303e-07, "loss": 0.0, "reward": 0.433369527477771, "reward_std": 0.20336254984140395, "rewards/wrapped_prediction_reward_func": 0.433369527477771, "step": 535 }, { "completion_length": 830.5083572387696, "epoch": 0.13360136075460027, "grad_norm": 0.07899410448354012, "kl": 0.053228759765625, "learning_rate": 8.564044522734146e-07, "loss": 0.0, "reward": 0.38872367832809684, "reward_std": 0.24438437521457673, "rewards/wrapped_prediction_reward_func": 0.38872367832809684, "step": 540 }, { "completion_length": 801.6979354858398, "epoch": 0.13483841039121694, "grad_norm": 0.05817921232742871, "kl": 0.0536865234375, "learning_rate": 8.535533905932737e-07, "loss": 0.0001, "reward": 0.469266626611352, "reward_std": 0.1918729618191719, "rewards/wrapped_prediction_reward_func": 0.469266626611352, "step": 545 }, { "completion_length": 792.9979385375976, "epoch": 0.1360754600278336, "grad_norm": 0.07588868644465757, "kl": 0.055865478515625, "learning_rate": 8.506791502209496e-07, "loss": 0.0, "reward": 0.3656269868835807, "reward_std": 0.21258986741304398, "rewards/wrapped_prediction_reward_func": 0.3656269868835807, "step": 550 }, { "completion_length": 781.2562683105468, "epoch": 0.1373125096644503, "grad_norm": 0.07289023388435747, "kl": 0.057379150390625, "learning_rate": 8.477819195894614e-07, "loss": 0.0, "reward": 0.38633032850921156, "reward_std": 0.24328391440212727, "rewards/wrapped_prediction_reward_func": 0.38633032850921156, "step": 555 }, { "completion_length": 811.3333511352539, "epoch": 0.13854955930106697, "grad_norm": 0.06172576914012836, "kl": 0.056732177734375, "learning_rate": 8.448618886390521e-07, "loss": 0.0001, "reward": 0.48336418326944114, "reward_std": 0.1790323607623577, "rewards/wrapped_prediction_reward_func": 0.48336418326944114, "step": 560 }, { "completion_length": 801.7708557128906, "epoch": 0.13978660893768363, "grad_norm": 0.0635314600858956, "kl": 0.055340576171875, "learning_rate": 8.419192488047369e-07, "loss": 0.0, "reward": 0.41982030756771566, "reward_std": 0.20342389829456806, "rewards/wrapped_prediction_reward_func": 0.41982030756771566, "step": 565 }, { "completion_length": 781.3166885375977, "epoch": 0.1410236585743003, "grad_norm": 0.08225311947234257, "kl": 0.058062744140625, "learning_rate": 8.389541930037516e-07, "loss": 0.0, "reward": 0.4451749456115067, "reward_std": 0.22370504327118396, "rewards/wrapped_prediction_reward_func": 0.4451749456115067, "step": 570 }, { "completion_length": 768.1000213623047, "epoch": 0.14226070821091696, "grad_norm": 0.07512188509392514, "kl": 0.0589599609375, "learning_rate": 8.359669156229061e-07, "loss": 0.0, "reward": 0.3749851009808481, "reward_std": 0.24626117795705796, "rewards/wrapped_prediction_reward_func": 0.3749851009808481, "step": 575 }, { "completion_length": 761.2083511352539, "epoch": 0.14349775784753363, "grad_norm": 0.07823221240071995, "kl": 0.065716552734375, "learning_rate": 8.329576125058405e-07, "loss": 0.0001, "reward": 0.4845332940109074, "reward_std": 0.1959636751562357, "rewards/wrapped_prediction_reward_func": 0.4845332940109074, "step": 580 }, { "completion_length": 765.8646041870118, "epoch": 0.1447348074841503, "grad_norm": 0.08670755026371187, "kl": 0.073675537109375, "learning_rate": 8.299264809401849e-07, "loss": 0.0, "reward": 0.40676400661468504, "reward_std": 0.21064249239861965, "rewards/wrapped_prediction_reward_func": 0.40676400661468504, "step": 585 }, { "completion_length": 744.7854354858398, "epoch": 0.14597185712076696, "grad_norm": 0.08330397852319552, "kl": 0.06956787109375, "learning_rate": 8.268737196446263e-07, "loss": 0.0, "reward": 0.44193693343549967, "reward_std": 0.22661157809197902, "rewards/wrapped_prediction_reward_func": 0.44193693343549967, "step": 590 }, { "completion_length": 759.2687713623047, "epoch": 0.14720890675738363, "grad_norm": 0.06723958978245864, "kl": 0.0719482421875, "learning_rate": 8.237995287558801e-07, "loss": 0.0, "reward": 0.42541108606383204, "reward_std": 0.22029799968004227, "rewards/wrapped_prediction_reward_func": 0.42541108606383204, "step": 595 }, { "completion_length": 792.7062683105469, "epoch": 0.14844595639400032, "grad_norm": 0.060005195130206926, "kl": 0.069671630859375, "learning_rate": 8.207041098155699e-07, "loss": 0.0001, "reward": 0.45301763620227575, "reward_std": 0.17232261933386325, "rewards/wrapped_prediction_reward_func": 0.45301763620227575, "step": 600 }, { "completion_length": 753.3437728881836, "epoch": 0.149683006030617, "grad_norm": 0.08112767534890353, "kl": 0.067376708984375, "learning_rate": 8.175876657570143e-07, "loss": 0.0001, "reward": 0.40281443875283, "reward_std": 0.22278440222144127, "rewards/wrapped_prediction_reward_func": 0.40281443875283, "step": 605 }, { "completion_length": 761.7666931152344, "epoch": 0.15092005566723365, "grad_norm": 0.07447474868170324, "kl": 0.065679931640625, "learning_rate": 8.144504008919222e-07, "loss": 0.0001, "reward": 0.39046034114435313, "reward_std": 0.18068717531859874, "rewards/wrapped_prediction_reward_func": 0.39046034114435313, "step": 610 }, { "completion_length": 762.4146011352539, "epoch": 0.15215710530385032, "grad_norm": 0.07699589897624498, "kl": 0.06287841796875, "learning_rate": 8.112925208969994e-07, "loss": 0.0, "reward": 0.42069863677024844, "reward_std": 0.21883668787777424, "rewards/wrapped_prediction_reward_func": 0.42069863677024844, "step": 615 }, { "completion_length": 768.6000183105468, "epoch": 0.153394154940467, "grad_norm": 0.08285866838394296, "kl": 0.066473388671875, "learning_rate": 8.081142328004636e-07, "loss": 0.0001, "reward": 0.4248110662214458, "reward_std": 0.1913703478872776, "rewards/wrapped_prediction_reward_func": 0.4248110662214458, "step": 620 }, { "completion_length": 748.643765258789, "epoch": 0.15463120457708365, "grad_norm": 0.07742750766969832, "kl": 0.064129638671875, "learning_rate": 8.049157449684722e-07, "loss": 0.0001, "reward": 0.47440802659839393, "reward_std": 0.19457920640707016, "rewards/wrapped_prediction_reward_func": 0.47440802659839393, "step": 625 }, { "completion_length": 752.6083557128907, "epoch": 0.15586825421370032, "grad_norm": 0.0667704718715496, "kl": 0.062310791015625, "learning_rate": 8.016972670914623e-07, "loss": 0.0, "reward": 0.47874220851808785, "reward_std": 0.21144339703023435, "rewards/wrapped_prediction_reward_func": 0.47874220851808785, "step": 630 }, { "completion_length": 769.3291839599609, "epoch": 0.15710530385031699, "grad_norm": 0.070797648531755, "kl": 0.06376953125, "learning_rate": 7.984590101704025e-07, "loss": 0.0001, "reward": 0.4929979900829494, "reward_std": 0.19738633297383784, "rewards/wrapped_prediction_reward_func": 0.4929979900829494, "step": 635 }, { "completion_length": 775.6666839599609, "epoch": 0.15834235348693365, "grad_norm": 0.08401653786497307, "kl": 0.062115478515625, "learning_rate": 7.952011865029613e-07, "loss": 0.0001, "reward": 0.4604580769315362, "reward_std": 0.1704392459243536, "rewards/wrapped_prediction_reward_func": 0.4604580769315362, "step": 640 }, { "completion_length": 797.9416915893555, "epoch": 0.15957940312355035, "grad_norm": 0.08134595383271465, "kl": 0.059991455078125, "learning_rate": 7.919240096695876e-07, "loss": 0.0001, "reward": 0.40527423843741417, "reward_std": 0.20551539584994316, "rewards/wrapped_prediction_reward_func": 0.40527423843741417, "step": 645 }, { "completion_length": 830.0625198364257, "epoch": 0.160816452760167, "grad_norm": 0.06191912556343784, "kl": 0.06328125, "learning_rate": 7.886276945195097e-07, "loss": 0.0001, "reward": 0.4366656182333827, "reward_std": 0.20696669742465018, "rewards/wrapped_prediction_reward_func": 0.4366656182333827, "step": 650 }, { "completion_length": 819.0000213623047, "epoch": 0.16205350239678368, "grad_norm": 0.05742925894238354, "kl": 0.06392822265625, "learning_rate": 7.853124571566491e-07, "loss": 0.0001, "reward": 0.452233337983489, "reward_std": 0.15266201496124268, "rewards/wrapped_prediction_reward_func": 0.452233337983489, "step": 655 }, { "completion_length": 817.3541870117188, "epoch": 0.16329055203340034, "grad_norm": 0.07022289820550312, "kl": 0.065838623046875, "learning_rate": 7.819785149254532e-07, "loss": 0.0, "reward": 0.4598813742399216, "reward_std": 0.20557740591466428, "rewards/wrapped_prediction_reward_func": 0.4598813742399216, "step": 660 }, { "completion_length": 800.5146072387695, "epoch": 0.164527601670017, "grad_norm": 0.06987985689631089, "kl": 0.066314697265625, "learning_rate": 7.786260863966467e-07, "loss": 0.0, "reward": 0.4142461620271206, "reward_std": 0.19170173518359662, "rewards/wrapped_prediction_reward_func": 0.4142461620271206, "step": 665 }, { "completion_length": 776.4146011352539, "epoch": 0.16576465130663368, "grad_norm": 0.06820738140125537, "kl": 0.0666259765625, "learning_rate": 7.752553913529018e-07, "loss": 0.0001, "reward": 0.39475581003353, "reward_std": 0.16375044211745263, "rewards/wrapped_prediction_reward_func": 0.39475581003353, "step": 670 }, { "completion_length": 806.5333541870117, "epoch": 0.16700170094325034, "grad_norm": 0.06195915019317933, "kl": 0.070855712890625, "learning_rate": 7.718666507744292e-07, "loss": 0.0, "reward": 0.431813295930624, "reward_std": 0.19085994884371757, "rewards/wrapped_prediction_reward_func": 0.431813295930624, "step": 675 }, { "completion_length": 781.3583557128907, "epoch": 0.168238750579867, "grad_norm": 0.0763785110981607, "kl": 0.07294921875, "learning_rate": 7.684600868244919e-07, "loss": 0.0001, "reward": 0.44598067868500946, "reward_std": 0.18708554245531558, "rewards/wrapped_prediction_reward_func": 0.44598067868500946, "step": 680 }, { "completion_length": 832.3021041870118, "epoch": 0.16947580021648367, "grad_norm": 0.06885728431184913, "kl": 0.071917724609375, "learning_rate": 7.650359228348389e-07, "loss": 0.0, "reward": 0.4754990879446268, "reward_std": 0.20640644915401934, "rewards/wrapped_prediction_reward_func": 0.4754990879446268, "step": 685 }, { "completion_length": 809.1479339599609, "epoch": 0.17071284985310037, "grad_norm": 0.059678793704242616, "kl": 0.076617431640625, "learning_rate": 7.61594383291065e-07, "loss": 0.0001, "reward": 0.4832779194228351, "reward_std": 0.15159060172736644, "rewards/wrapped_prediction_reward_func": 0.4832779194228351, "step": 690 }, { "completion_length": 833.0708587646484, "epoch": 0.17194989948971703, "grad_norm": 0.07661467366441205, "kl": 0.07445068359375, "learning_rate": 7.581356938178929e-07, "loss": 0.0, "reward": 0.41111797727644445, "reward_std": 0.17684606350958348, "rewards/wrapped_prediction_reward_func": 0.41111797727644445, "step": 695 }, { "completion_length": 774.0208557128906, "epoch": 0.1731869491263337, "grad_norm": 0.05955255967215497, "kl": 0.081622314453125, "learning_rate": 7.546600811643816e-07, "loss": 0.0001, "reward": 0.44893514458090067, "reward_std": 0.15385543517768382, "rewards/wrapped_prediction_reward_func": 0.44893514458090067, "step": 700 }, { "completion_length": 787.1604354858398, "epoch": 0.17442399876295037, "grad_norm": 0.06726030689642411, "kl": 0.08187255859375, "learning_rate": 7.51167773189061e-07, "loss": 0.0001, "reward": 0.45595725160092115, "reward_std": 0.150199169293046, "rewards/wrapped_prediction_reward_func": 0.45595725160092115, "step": 705 }, { "completion_length": 762.1125198364258, "epoch": 0.17566104839956703, "grad_norm": 0.08248976320903421, "kl": 0.081787109375, "learning_rate": 7.476589988449938e-07, "loss": 0.0, "reward": 0.43107919618487356, "reward_std": 0.14732478000223637, "rewards/wrapped_prediction_reward_func": 0.43107919618487356, "step": 710 }, { "completion_length": 782.9416885375977, "epoch": 0.1768980980361837, "grad_norm": 0.06796394392080257, "kl": 0.079656982421875, "learning_rate": 7.441339881647652e-07, "loss": 0.0001, "reward": 0.3995546201243997, "reward_std": 0.15563040487468244, "rewards/wrapped_prediction_reward_func": 0.3995546201243997, "step": 715 }, { "completion_length": 817.3625228881835, "epoch": 0.17813514767280036, "grad_norm": 0.06605880748783581, "kl": 0.083050537109375, "learning_rate": 7.405929722454025e-07, "loss": 0.0001, "reward": 0.5041571153327823, "reward_std": 0.14504916854202748, "rewards/wrapped_prediction_reward_func": 0.5041571153327823, "step": 720 }, { "completion_length": 777.9646102905274, "epoch": 0.17937219730941703, "grad_norm": 0.07241250724831831, "kl": 0.08251953125, "learning_rate": 7.37036183233224e-07, "loss": 0.0001, "reward": 0.4676838730461895, "reward_std": 0.17891859896481038, "rewards/wrapped_prediction_reward_func": 0.4676838730461895, "step": 725 }, { "completion_length": 790.3812728881836, "epoch": 0.1806092469460337, "grad_norm": 0.08047871155066995, "kl": 0.083935546875, "learning_rate": 7.334638543086203e-07, "loss": 0.0, "reward": 0.3849124165251851, "reward_std": 0.16877745948731898, "rewards/wrapped_prediction_reward_func": 0.3849124165251851, "step": 730 }, { "completion_length": 808.8854354858398, "epoch": 0.1818462965826504, "grad_norm": 0.08123146370485908, "kl": 0.084710693359375, "learning_rate": 7.298762196707668e-07, "loss": 0.0, "reward": 0.4050226330757141, "reward_std": 0.17611695490777493, "rewards/wrapped_prediction_reward_func": 0.4050226330757141, "step": 735 }, { "completion_length": 813.5125228881836, "epoch": 0.18308334621926706, "grad_norm": 0.06622297839524613, "kl": 0.08431396484375, "learning_rate": 7.262735145222695e-07, "loss": 0.0001, "reward": 0.4576717883348465, "reward_std": 0.1358294315636158, "rewards/wrapped_prediction_reward_func": 0.4576717883348465, "step": 740 }, { "completion_length": 774.8291870117188, "epoch": 0.18432039585588372, "grad_norm": 0.07819610347947448, "kl": 0.091357421875, "learning_rate": 7.226559750537461e-07, "loss": 0.0, "reward": 0.4723569771274924, "reward_std": 0.1277159068733454, "rewards/wrapped_prediction_reward_func": 0.4723569771274924, "step": 745 }, { "completion_length": 755.6312698364258, "epoch": 0.1855574454925004, "grad_norm": 0.05391480343805562, "kl": 0.092724609375, "learning_rate": 7.190238384283412e-07, "loss": 0.0001, "reward": 0.5140476296655834, "reward_std": 0.12782078310847284, "rewards/wrapped_prediction_reward_func": 0.5140476296655834, "step": 750 }, { "completion_length": 775.8166870117187, "epoch": 0.18679449512911706, "grad_norm": 0.0801122990010347, "kl": 0.09473876953125, "learning_rate": 7.153773427661773e-07, "loss": 0.0001, "reward": 0.4313332408666611, "reward_std": 0.15192715004086493, "rewards/wrapped_prediction_reward_func": 0.4313332408666611, "step": 755 }, { "completion_length": 768.3479385375977, "epoch": 0.18803154476573372, "grad_norm": 0.0805031146961764, "kl": 0.091558837890625, "learning_rate": 7.117167271287452e-07, "loss": 0.0001, "reward": 0.4566848175600171, "reward_std": 0.16660304851830005, "rewards/wrapped_prediction_reward_func": 0.4566848175600171, "step": 760 }, { "completion_length": 758.7333557128907, "epoch": 0.1892685944023504, "grad_norm": 0.04816693674184975, "kl": 0.09146728515625, "learning_rate": 7.080422315032297e-07, "loss": 0.0001, "reward": 0.5302044345065952, "reward_std": 0.11999310851097107, "rewards/wrapped_prediction_reward_func": 0.5302044345065952, "step": 765 }, { "completion_length": 785.3500183105468, "epoch": 0.19050564403896705, "grad_norm": 0.06508240549734828, "kl": 0.09281005859375, "learning_rate": 7.043540967867781e-07, "loss": 0.0001, "reward": 0.42681183107197285, "reward_std": 0.16044160537421703, "rewards/wrapped_prediction_reward_func": 0.42681183107197285, "step": 770 }, { "completion_length": 781.1312713623047, "epoch": 0.19174269367558372, "grad_norm": 0.07419752458939916, "kl": 0.095703125, "learning_rate": 7.006525647707053e-07, "loss": 0.0001, "reward": 0.5246783096343279, "reward_std": 0.13629969395697117, "rewards/wrapped_prediction_reward_func": 0.5246783096343279, "step": 775 }, { "completion_length": 794.4812698364258, "epoch": 0.1929797433122004, "grad_norm": 0.05569209701956418, "kl": 0.0977783203125, "learning_rate": 6.969378781246436e-07, "loss": 0.0001, "reward": 0.44251202829182146, "reward_std": 0.11432850509881973, "rewards/wrapped_prediction_reward_func": 0.44251202829182146, "step": 780 }, { "completion_length": 789.975016784668, "epoch": 0.19421679294881708, "grad_norm": 0.06689970905283685, "kl": 0.100927734375, "learning_rate": 6.932102803806324e-07, "loss": 0.0001, "reward": 0.47695821728557347, "reward_std": 0.14632086157798768, "rewards/wrapped_prediction_reward_func": 0.47695821728557347, "step": 785 }, { "completion_length": 769.1250198364257, "epoch": 0.19545384258543375, "grad_norm": 0.053449898788407335, "kl": 0.1048583984375, "learning_rate": 6.894700159171534e-07, "loss": 0.0001, "reward": 0.3842352098785341, "reward_std": 0.15045471973717212, "rewards/wrapped_prediction_reward_func": 0.3842352098785341, "step": 790 }, { "completion_length": 782.5437728881836, "epoch": 0.1966908922220504, "grad_norm": 0.06568315109126881, "kl": 0.1030517578125, "learning_rate": 6.857173299431083e-07, "loss": 0.0001, "reward": 0.4196577539667487, "reward_std": 0.1359492100775242, "rewards/wrapped_prediction_reward_func": 0.4196577539667487, "step": 795 }, { "completion_length": 783.5750152587891, "epoch": 0.19792794185866708, "grad_norm": 0.06177387185970035, "kl": 0.09818115234375, "learning_rate": 6.819524684817438e-07, "loss": 0.0001, "reward": 0.45632972903549673, "reward_std": 0.10240259878337384, "rewards/wrapped_prediction_reward_func": 0.45632972903549673, "step": 800 }, { "completion_length": 773.2750213623046, "epoch": 0.19916499149528374, "grad_norm": 0.07623099795002276, "kl": 0.10533447265625, "learning_rate": 6.781756783545224e-07, "loss": 0.0001, "reward": 0.45273180212825537, "reward_std": 0.13811454214155675, "rewards/wrapped_prediction_reward_func": 0.45273180212825537, "step": 805 }, { "completion_length": 756.9291854858399, "epoch": 0.2004020411319004, "grad_norm": 0.05779949116049259, "kl": 0.103173828125, "learning_rate": 6.743872071649411e-07, "loss": 0.0001, "reward": 0.47565967934206127, "reward_std": 0.1099685937166214, "rewards/wrapped_prediction_reward_func": 0.47565967934206127, "step": 810 }, { "completion_length": 790.0687759399414, "epoch": 0.20163909076851708, "grad_norm": 0.0702083971084798, "kl": 0.10361328125, "learning_rate": 6.70587303282298e-07, "loss": 0.0, "reward": 0.3754935884848237, "reward_std": 0.12223252654075623, "rewards/wrapped_prediction_reward_func": 0.3754935884848237, "step": 815 }, { "completion_length": 767.4937728881836, "epoch": 0.20287614040513374, "grad_norm": 0.07700172814841658, "kl": 0.10714111328125, "learning_rate": 6.667762158254103e-07, "loss": 0.0001, "reward": 0.44241261184215547, "reward_std": 0.14867481328547, "rewards/wrapped_prediction_reward_func": 0.44241261184215547, "step": 820 }, { "completion_length": 753.5541885375976, "epoch": 0.20411319004175044, "grad_norm": 0.08645027582478916, "kl": 0.10916748046875, "learning_rate": 6.629541946462816e-07, "loss": 0.0001, "reward": 0.406618938036263, "reward_std": 0.17157106809318065, "rewards/wrapped_prediction_reward_func": 0.406618938036263, "step": 825 }, { "completion_length": 762.4250213623047, "epoch": 0.2053502396783671, "grad_norm": 0.06817866132947824, "kl": 0.11602783203125, "learning_rate": 6.59121490313722e-07, "loss": 0.0, "reward": 0.41990571906790136, "reward_std": 0.11060777269303798, "rewards/wrapped_prediction_reward_func": 0.41990571906790136, "step": 830 }, { "completion_length": 766.9729370117187, "epoch": 0.20658728931498377, "grad_norm": 0.057052348192183175, "kl": 0.106201171875, "learning_rate": 6.552783540969211e-07, "loss": 0.0001, "reward": 0.45523099079728124, "reward_std": 0.1364333875477314, "rewards/wrapped_prediction_reward_func": 0.45523099079728124, "step": 835 }, { "completion_length": 758.5146041870117, "epoch": 0.20782433895160043, "grad_norm": 0.059268222968820714, "kl": 0.113818359375, "learning_rate": 6.514250379489753e-07, "loss": 0.0001, "reward": 0.5336175758391619, "reward_std": 0.15124866738915443, "rewards/wrapped_prediction_reward_func": 0.5336175758391619, "step": 840 }, { "completion_length": 749.1750198364258, "epoch": 0.2090613885882171, "grad_norm": 0.06152242378532451, "kl": 0.11639404296875, "learning_rate": 6.475617944903691e-07, "loss": 0.0001, "reward": 0.486631984077394, "reward_std": 0.09064425453543663, "rewards/wrapped_prediction_reward_func": 0.486631984077394, "step": 845 }, { "completion_length": 746.2396041870118, "epoch": 0.21029843822483377, "grad_norm": 0.08638392820808985, "kl": 0.11378173828125, "learning_rate": 6.436888769924141e-07, "loss": 0.0, "reward": 0.46668662438169123, "reward_std": 0.13883897066116332, "rewards/wrapped_prediction_reward_func": 0.46668662438169123, "step": 850 }, { "completion_length": 754.8187683105468, "epoch": 0.21153548786145043, "grad_norm": 0.043200843394990134, "kl": 0.114599609375, "learning_rate": 6.398065393606444e-07, "loss": 0.0001, "reward": 0.4405731033533812, "reward_std": 0.12028250098228455, "rewards/wrapped_prediction_reward_func": 0.4405731033533812, "step": 855 }, { "completion_length": 732.3000244140625, "epoch": 0.2127725374980671, "grad_norm": 0.04246092848515359, "kl": 0.11920166015625, "learning_rate": 6.359150361181714e-07, "loss": 0.0001, "reward": 0.5387031998485327, "reward_std": 0.09682522714138031, "rewards/wrapped_prediction_reward_func": 0.5387031998485327, "step": 860 }, { "completion_length": 730.4625244140625, "epoch": 0.2140095871346838, "grad_norm": 0.06090310952402415, "kl": 0.1197509765625, "learning_rate": 6.320146223889965e-07, "loss": 0.0001, "reward": 0.4410721279680729, "reward_std": 0.13679569475352765, "rewards/wrapped_prediction_reward_func": 0.4410721279680729, "step": 865 }, { "completion_length": 756.3166854858398, "epoch": 0.21524663677130046, "grad_norm": 0.055746368220646346, "kl": 0.1222900390625, "learning_rate": 6.281055538812861e-07, "loss": 0.0, "reward": 0.39723333092406393, "reward_std": 0.14023556001484394, "rewards/wrapped_prediction_reward_func": 0.39723333092406393, "step": 870 }, { "completion_length": 736.4666900634766, "epoch": 0.21648368640791713, "grad_norm": 0.05323607730248726, "kl": 0.11514892578125, "learning_rate": 6.241880868706074e-07, "loss": 0.0001, "reward": 0.44148183073848485, "reward_std": 0.06728704944252968, "rewards/wrapped_prediction_reward_func": 0.44148183073848485, "step": 875 }, { "completion_length": 749.914599609375, "epoch": 0.2177207360445338, "grad_norm": 0.02328522139339984, "kl": 0.1231201171875, "learning_rate": 6.202624781831268e-07, "loss": 0.0001, "reward": 0.5017391091212631, "reward_std": 0.0775448054075241, "rewards/wrapped_prediction_reward_func": 0.5017391091212631, "step": 880 }, { "completion_length": 753.7229370117187, "epoch": 0.21895778568115046, "grad_norm": 0.07323592909007214, "kl": 0.1176025390625, "learning_rate": 6.163289851787731e-07, "loss": 0.0001, "reward": 0.4388370859436691, "reward_std": 0.1239024505019188, "rewards/wrapped_prediction_reward_func": 0.4388370859436691, "step": 885 }, { "completion_length": 780.2312728881836, "epoch": 0.22019483531776712, "grad_norm": 0.07109242102927597, "kl": 0.1156494140625, "learning_rate": 6.123878657343647e-07, "loss": 0.0001, "reward": 0.5519936315715313, "reward_std": 0.12339010536670685, "rewards/wrapped_prediction_reward_func": 0.5519936315715313, "step": 890 }, { "completion_length": 748.4187667846679, "epoch": 0.2214318849543838, "grad_norm": 0.06440441205278268, "kl": 0.12310791015625, "learning_rate": 6.084393782267039e-07, "loss": 0.0001, "reward": 0.41257810574024917, "reward_std": 0.10009315423667431, "rewards/wrapped_prediction_reward_func": 0.41257810574024917, "step": 895 }, { "completion_length": 743.8541900634766, "epoch": 0.22266893459100046, "grad_norm": 0.05032560825850314, "kl": 0.128369140625, "learning_rate": 6.044837815156376e-07, "loss": 0.0001, "reward": 0.5202655298635364, "reward_std": 0.05850386098027229, "rewards/wrapped_prediction_reward_func": 0.5202655298635364, "step": 900 }, { "completion_length": 764.8750213623047, "epoch": 0.22390598422761712, "grad_norm": 0.07019113736755674, "kl": 0.12353515625, "learning_rate": 6.005213349270864e-07, "loss": 0.0001, "reward": 0.4067071693018079, "reward_std": 0.14382442086935043, "rewards/wrapped_prediction_reward_func": 0.4067071693018079, "step": 905 }, { "completion_length": 774.0062744140625, "epoch": 0.22514303386423382, "grad_norm": 0.05277402438496134, "kl": 0.1413330078125, "learning_rate": 5.96552298236044e-07, "loss": 0.0001, "reward": 0.44951862171292306, "reward_std": 0.10927974320948124, "rewards/wrapped_prediction_reward_func": 0.44951862171292306, "step": 910 }, { "completion_length": 749.3937683105469, "epoch": 0.22638008350085048, "grad_norm": 0.03501839896533366, "kl": 0.13756103515625, "learning_rate": 5.925769316495461e-07, "loss": 0.0001, "reward": 0.43234148118644955, "reward_std": 0.08405314721167087, "rewards/wrapped_prediction_reward_func": 0.43234148118644955, "step": 915 }, { "completion_length": 742.0333587646485, "epoch": 0.22761713313746715, "grad_norm": 0.0560900771911583, "kl": 0.13082275390625, "learning_rate": 5.885954957896115e-07, "loss": 0.0, "reward": 0.4151491061784327, "reward_std": 0.1518498443067074, "rewards/wrapped_prediction_reward_func": 0.4151491061784327, "step": 920 }, { "completion_length": 721.391682434082, "epoch": 0.22885418277408381, "grad_norm": 0.06301445089634264, "kl": 0.128955078125, "learning_rate": 5.846082516761557e-07, "loss": 0.0001, "reward": 0.641039339452982, "reward_std": 0.07052504420280456, "rewards/wrapped_prediction_reward_func": 0.641039339452982, "step": 925 }, { "completion_length": 735.0521041870118, "epoch": 0.23009123241070048, "grad_norm": 0.041940671749711285, "kl": 0.151806640625, "learning_rate": 5.806154607098799e-07, "loss": 0.0001, "reward": 0.4932118478231132, "reward_std": 0.04339260496199131, "rewards/wrapped_prediction_reward_func": 0.4932118478231132, "step": 930 }, { "completion_length": 723.4562698364258, "epoch": 0.23132828204731715, "grad_norm": 0.06049013497522889, "kl": 0.13043212890625, "learning_rate": 5.766173846551316e-07, "loss": 0.0001, "reward": 0.5325720165856183, "reward_std": 0.0835997462272644, "rewards/wrapped_prediction_reward_func": 0.5325720165856183, "step": 935 }, { "completion_length": 734.1229370117187, "epoch": 0.2325653316839338, "grad_norm": 0.04572102446891012, "kl": 0.13204345703125, "learning_rate": 5.726142856227452e-07, "loss": 0.0001, "reward": 0.4795490915887058, "reward_std": 0.0886494155973196, "rewards/wrapped_prediction_reward_func": 0.4795490915887058, "step": 940 }, { "completion_length": 727.2208526611328, "epoch": 0.23380238132055048, "grad_norm": 0.06456918116108919, "kl": 0.13282470703125, "learning_rate": 5.686064260528577e-07, "loss": 0.0001, "reward": 0.4241417799144983, "reward_std": 0.1043927937746048, "rewards/wrapped_prediction_reward_func": 0.4241417799144983, "step": 945 }, { "completion_length": 711.4729370117187, "epoch": 0.23503943095716714, "grad_norm": 0.05954120185098794, "kl": 0.129296875, "learning_rate": 5.645940686977032e-07, "loss": 0.0001, "reward": 0.5206036314368248, "reward_std": 0.08329091891646385, "rewards/wrapped_prediction_reward_func": 0.5206036314368248, "step": 950 }, { "completion_length": 728.4146041870117, "epoch": 0.23627648059378384, "grad_norm": 0.052822064777129474, "kl": 0.1328125, "learning_rate": 5.605774766043873e-07, "loss": 0.0001, "reward": 0.4754035800695419, "reward_std": 0.040189435705542564, "rewards/wrapped_prediction_reward_func": 0.4754035800695419, "step": 955 }, { "completion_length": 715.4312683105469, "epoch": 0.2375135302304005, "grad_norm": 0.04749608821560791, "kl": 0.14237060546875, "learning_rate": 5.565569130976422e-07, "loss": 0.0001, "reward": 0.4664379990659654, "reward_std": 0.05981869548559189, "rewards/wrapped_prediction_reward_func": 0.4664379990659654, "step": 960 }, { "completion_length": 732.2646026611328, "epoch": 0.23875057986701717, "grad_norm": 0.07892021899444322, "kl": 0.13857421875, "learning_rate": 5.52532641762562e-07, "loss": 0.0001, "reward": 0.3501896098256111, "reward_std": 0.1018542367964983, "rewards/wrapped_prediction_reward_func": 0.3501896098256111, "step": 965 }, { "completion_length": 723.845851135254, "epoch": 0.23998762950363384, "grad_norm": 0.06488985238375704, "kl": 0.1370361328125, "learning_rate": 5.485049264273241e-07, "loss": 0.0001, "reward": 0.4604840692132711, "reward_std": 0.10753555037081242, "rewards/wrapped_prediction_reward_func": 0.4604840692132711, "step": 970 }, { "completion_length": 696.810432434082, "epoch": 0.2412246791402505, "grad_norm": 0.03545636549292491, "kl": 0.14051513671875, "learning_rate": 5.444740311458914e-07, "loss": 0.0001, "reward": 0.4175055561587214, "reward_std": 0.07677363753318786, "rewards/wrapped_prediction_reward_func": 0.4175055561587214, "step": 975 }, { "completion_length": 734.493766784668, "epoch": 0.24246172877686717, "grad_norm": 0.04452324065170496, "kl": 0.1341552734375, "learning_rate": 5.404402201807021e-07, "loss": 0.0001, "reward": 0.5556499321013689, "reward_std": 0.058116089552640915, "rewards/wrapped_prediction_reward_func": 0.5556499321013689, "step": 980 }, { "completion_length": 758.0166885375977, "epoch": 0.24369877841348384, "grad_norm": 0.03583796247769736, "kl": 0.1330810546875, "learning_rate": 5.364037579853439e-07, "loss": 0.0001, "reward": 0.5044961895793676, "reward_std": 0.05573735199868679, "rewards/wrapped_prediction_reward_func": 0.5044961895793676, "step": 985 }, { "completion_length": 724.5583465576171, "epoch": 0.2449358280501005, "grad_norm": 0.0495697032985658, "kl": 0.13350830078125, "learning_rate": 5.323649091872178e-07, "loss": 0.0001, "reward": 0.456222964450717, "reward_std": 0.07662210948765277, "rewards/wrapped_prediction_reward_func": 0.456222964450717, "step": 990 }, { "completion_length": 734.0896041870117, "epoch": 0.24617287768671717, "grad_norm": 0.0777113973289712, "kl": 0.13773193359375, "learning_rate": 5.283239385701881e-07, "loss": 0.0001, "reward": 0.4875594067387283, "reward_std": 0.10679161138832569, "rewards/wrapped_prediction_reward_func": 0.4875594067387283, "step": 995 }, { "completion_length": 723.2583557128906, "epoch": 0.24740992732333386, "grad_norm": 0.04018229701367503, "kl": 0.1403076171875, "learning_rate": 5.242811110572242e-07, "loss": 0.0001, "reward": 0.47034588232636454, "reward_std": 0.10031413286924362, "rewards/wrapped_prediction_reward_func": 0.47034588232636454, "step": 1000 }, { "completion_length": 722.1000213623047, "epoch": 0.24864697695995053, "grad_norm": 0.06319998888243976, "kl": 0.1427001953125, "learning_rate": 5.202366916930319e-07, "loss": 0.0001, "reward": 0.4664687434211373, "reward_std": 0.06945351138710976, "rewards/wrapped_prediction_reward_func": 0.4664687434211373, "step": 1005 }, { "completion_length": 714.7625213623047, "epoch": 0.2498840265965672, "grad_norm": 0.04780952914464219, "kl": 0.1403564453125, "learning_rate": 5.16190945626678e-07, "loss": 0.0001, "reward": 0.5023269861936569, "reward_std": 0.0749455738812685, "rewards/wrapped_prediction_reward_func": 0.5023269861936569, "step": 1010 }, { "completion_length": 715.3166900634766, "epoch": 0.25112107623318386, "grad_norm": 0.046438519510363555, "kl": 0.1429931640625, "learning_rate": 5.121441380942065e-07, "loss": 0.0001, "reward": 0.5037658797577024, "reward_std": 0.09602665789425373, "rewards/wrapped_prediction_reward_func": 0.5037658797577024, "step": 1015 }, { "completion_length": 741.4854339599609, "epoch": 0.2523581258698005, "grad_norm": 0.06698438474579395, "kl": 0.137060546875, "learning_rate": 5.080965344012508e-07, "loss": 0.0002, "reward": 0.47632112530991433, "reward_std": 0.06293314322829247, "rewards/wrapped_prediction_reward_func": 0.47632112530991433, "step": 1020 }, { "completion_length": 739.1562728881836, "epoch": 0.2535951755064172, "grad_norm": 0.058013481138543904, "kl": 0.13350830078125, "learning_rate": 5.040483999056393e-07, "loss": 0.0001, "reward": 0.515098605491221, "reward_std": 0.07238813154399396, "rewards/wrapped_prediction_reward_func": 0.515098605491221, "step": 1025 }, { "completion_length": 743.8000213623047, "epoch": 0.2548322251430339, "grad_norm": 0.060273991716443535, "kl": 0.136572265625, "learning_rate": 5e-07, "loss": 0.0001, "reward": 0.5013489624485373, "reward_std": 0.0855951450765133, "rewards/wrapped_prediction_reward_func": 0.5013489624485373, "step": 1030 }, { "completion_length": 739.9583526611328, "epoch": 0.2560692747796505, "grad_norm": 0.04267638133407205, "kl": 0.13192138671875, "learning_rate": 4.959516000943607e-07, "loss": 0.0001, "reward": 0.48855996150523423, "reward_std": 0.08283850885927677, "rewards/wrapped_prediction_reward_func": 0.48855996150523423, "step": 1035 }, { "completion_length": 736.2333526611328, "epoch": 0.2573063244162672, "grad_norm": 0.056194858634804855, "kl": 0.136962890625, "learning_rate": 4.919034655987492e-07, "loss": 0.0001, "reward": 0.5468234160915018, "reward_std": 0.04913493171334267, "rewards/wrapped_prediction_reward_func": 0.5468234160915018, "step": 1040 }, { "completion_length": 741.8041839599609, "epoch": 0.25854337405288386, "grad_norm": 0.042737342793500686, "kl": 0.13411865234375, "learning_rate": 4.878558619057935e-07, "loss": 0.0001, "reward": 0.49116963930428026, "reward_std": 0.07535258755087852, "rewards/wrapped_prediction_reward_func": 0.49116963930428026, "step": 1045 }, { "completion_length": 757.3708511352539, "epoch": 0.25978042368950055, "grad_norm": 0.056567908131289514, "kl": 0.13594970703125, "learning_rate": 4.838090543733221e-07, "loss": 0.0, "reward": 0.45989798260852693, "reward_std": 0.12630628235638142, "rewards/wrapped_prediction_reward_func": 0.45989798260852693, "step": 1050 }, { "completion_length": 744.529183959961, "epoch": 0.2610174733261172, "grad_norm": 0.07296659596834233, "kl": 0.136279296875, "learning_rate": 4.797633083069683e-07, "loss": 0.0001, "reward": 0.4250240566208959, "reward_std": 0.1043800551444292, "rewards/wrapped_prediction_reward_func": 0.4250240566208959, "step": 1055 }, { "completion_length": 765.4250213623047, "epoch": 0.2622545229627339, "grad_norm": 0.03300528661951444, "kl": 0.133837890625, "learning_rate": 4.75718888942776e-07, "loss": 0.0001, "reward": 0.4467176416888833, "reward_std": 0.059732082113623616, "rewards/wrapped_prediction_reward_func": 0.4467176416888833, "step": 1060 }, { "completion_length": 761.6333526611328, "epoch": 0.2634915725993506, "grad_norm": 0.054926075448281146, "kl": 0.13094482421875, "learning_rate": 4.7167606142981173e-07, "loss": 0.0001, "reward": 0.4962974702939391, "reward_std": 0.05834553986787796, "rewards/wrapped_prediction_reward_func": 0.4962974702939391, "step": 1065 }, { "completion_length": 751.285432434082, "epoch": 0.2647286222359672, "grad_norm": 0.054779102268880255, "kl": 0.136328125, "learning_rate": 4.676350908127821e-07, "loss": 0.0, "reward": 0.5350621856749058, "reward_std": 0.06961509063839913, "rewards/wrapped_prediction_reward_func": 0.5350621856749058, "step": 1070 }, { "completion_length": 742.206266784668, "epoch": 0.2659656718725839, "grad_norm": 0.04670914742181485, "kl": 0.1391357421875, "learning_rate": 4.6359624201465597e-07, "loss": 0.0002, "reward": 0.49164473423734306, "reward_std": 0.04531940743327141, "rewards/wrapped_prediction_reward_func": 0.49164473423734306, "step": 1075 }, { "completion_length": 761.3604415893554, "epoch": 0.26720272150920055, "grad_norm": 0.0824981427300696, "kl": 0.1406494140625, "learning_rate": 4.595597798192979e-07, "loss": 0.0001, "reward": 0.5422801088541747, "reward_std": 0.07643798477947712, "rewards/wrapped_prediction_reward_func": 0.5422801088541747, "step": 1080 }, { "completion_length": 745.1208557128906, "epoch": 0.26843977114581724, "grad_norm": 8.853549205613618e-05, "kl": 0.1326171875, "learning_rate": 4.555259688541086e-07, "loss": 0.0001, "reward": 0.4162906426936388, "reward_std": 0.06242701783776283, "rewards/wrapped_prediction_reward_func": 0.4162906426936388, "step": 1085 }, { "completion_length": 758.2562683105468, "epoch": 0.2696768207824339, "grad_norm": 0.06036301574518875, "kl": 0.131787109375, "learning_rate": 4.5149507357267597e-07, "loss": 0.0001, "reward": 0.4528342138044536, "reward_std": 0.08761736638844013, "rewards/wrapped_prediction_reward_func": 0.4528342138044536, "step": 1090 }, { "completion_length": 744.564599609375, "epoch": 0.2709138704190506, "grad_norm": 0.05784457396424586, "kl": 0.13895263671875, "learning_rate": 4.47467358237438e-07, "loss": 0.0001, "reward": 0.5124209210276603, "reward_std": 0.06776915676891804, "rewards/wrapped_prediction_reward_func": 0.5124209210276603, "step": 1095 }, { "completion_length": 758.1916870117187, "epoch": 0.2721509200556672, "grad_norm": 0.03833588842141912, "kl": 0.13505859375, "learning_rate": 4.434430869023579e-07, "loss": 0.0002, "reward": 0.520728993974626, "reward_std": 0.07866840586066245, "rewards/wrapped_prediction_reward_func": 0.520728993974626, "step": 1100 }, { "completion_length": 741.7937667846679, "epoch": 0.2733879696922839, "grad_norm": 0.028550247516094713, "kl": 0.13800048828125, "learning_rate": 4.394225233956127e-07, "loss": 0.0001, "reward": 0.4832553478889167, "reward_std": 0.06090699248015881, "rewards/wrapped_prediction_reward_func": 0.4832553478889167, "step": 1105 }, { "completion_length": 746.0771057128907, "epoch": 0.2746250193289006, "grad_norm": 0.029999791710605777, "kl": 0.14295654296875, "learning_rate": 4.354059313022969e-07, "loss": 0.0001, "reward": 0.4988814054057002, "reward_std": 0.04718155525624752, "rewards/wrapped_prediction_reward_func": 0.4988814054057002, "step": 1110 }, { "completion_length": 780.0666900634766, "epoch": 0.27586206896551724, "grad_norm": 0.06360316714915734, "kl": 0.13826904296875, "learning_rate": 4.313935739471425e-07, "loss": 0.0001, "reward": 0.5211097911931575, "reward_std": 0.07867550514638424, "rewards/wrapped_prediction_reward_func": 0.5211097911931575, "step": 1115 }, { "completion_length": 776.1833526611329, "epoch": 0.27709911860213393, "grad_norm": 0.031282867146274325, "kl": 0.141357421875, "learning_rate": 4.2738571437725496e-07, "loss": 0.0, "reward": 0.46273208521306514, "reward_std": 0.08763996586203575, "rewards/wrapped_prediction_reward_func": 0.46273208521306514, "step": 1120 }, { "completion_length": 755.2521057128906, "epoch": 0.27833616823875057, "grad_norm": 0.05765718864555108, "kl": 0.14185791015625, "learning_rate": 4.233826153448684e-07, "loss": 0.0001, "reward": 0.5034419315867126, "reward_std": 0.05688044168055058, "rewards/wrapped_prediction_reward_func": 0.5034419315867126, "step": 1125 }, { "completion_length": 773.8250244140625, "epoch": 0.27957321787536726, "grad_norm": 0.03212472653438419, "kl": 0.1411376953125, "learning_rate": 4.193845392901201e-07, "loss": 0.0001, "reward": 0.4969831809401512, "reward_std": 0.06161353215575218, "rewards/wrapped_prediction_reward_func": 0.4969831809401512, "step": 1130 }, { "completion_length": 768.9000198364258, "epoch": 0.2808102675119839, "grad_norm": 0.044294315839148474, "kl": 0.1407958984375, "learning_rate": 4.1539174832384415e-07, "loss": 0.0001, "reward": 0.4423209439963102, "reward_std": 0.10879503712058067, "rewards/wrapped_prediction_reward_func": 0.4423209439963102, "step": 1135 }, { "completion_length": 759.7916885375977, "epoch": 0.2820473171486006, "grad_norm": 0.05154810914734898, "kl": 0.14052734375, "learning_rate": 4.1140450421038866e-07, "loss": 0.0002, "reward": 0.5333056742325425, "reward_std": 0.059134163334965704, "rewards/wrapped_prediction_reward_func": 0.5333056742325425, "step": 1140 }, { "completion_length": 745.5791870117188, "epoch": 0.28328436678521723, "grad_norm": 0.04366707071138725, "kl": 0.14813232421875, "learning_rate": 4.07423068350454e-07, "loss": 0.0001, "reward": 0.49964107451960443, "reward_std": 0.0779968474060297, "rewards/wrapped_prediction_reward_func": 0.49964107451960443, "step": 1145 }, { "completion_length": 756.002101135254, "epoch": 0.28452141642183393, "grad_norm": 0.06613287561842682, "kl": 0.1453857421875, "learning_rate": 4.0344770176395606e-07, "loss": 0.0001, "reward": 0.43359270226210356, "reward_std": 0.08028181083500385, "rewards/wrapped_prediction_reward_func": 0.43359270226210356, "step": 1150 }, { "completion_length": 766.206266784668, "epoch": 0.2857584660584506, "grad_norm": 0.05897935371348746, "kl": 0.15054931640625, "learning_rate": 3.994786650729136e-07, "loss": 0.0001, "reward": 0.48795836959034206, "reward_std": 0.07301567979156971, "rewards/wrapped_prediction_reward_func": 0.48795836959034206, "step": 1155 }, { "completion_length": 782.1146011352539, "epoch": 0.28699551569506726, "grad_norm": 0.03155362165509309, "kl": 0.1465087890625, "learning_rate": 3.955162184843624e-07, "loss": 0.0001, "reward": 0.5090317256748677, "reward_std": 0.05801623947918415, "rewards/wrapped_prediction_reward_func": 0.5090317256748677, "step": 1160 }, { "completion_length": 777.0479385375977, "epoch": 0.28823256533168395, "grad_norm": 0.048948750670197015, "kl": 0.14256591796875, "learning_rate": 3.915606217732962e-07, "loss": 0.0001, "reward": 0.46853276770561936, "reward_std": 0.09102714322507381, "rewards/wrapped_prediction_reward_func": 0.46853276770561936, "step": 1165 }, { "completion_length": 752.1979339599609, "epoch": 0.2894696149683006, "grad_norm": 0.05745386141094398, "kl": 0.149951171875, "learning_rate": 3.8761213426563543e-07, "loss": 0.0001, "reward": 0.4479881014674902, "reward_std": 0.07678309492766858, "rewards/wrapped_prediction_reward_func": 0.4479881014674902, "step": 1170 }, { "completion_length": 767.7125274658204, "epoch": 0.2907066646049173, "grad_norm": 0.05871330468562011, "kl": 0.14627685546875, "learning_rate": 3.8367101482122705e-07, "loss": 0.0, "reward": 0.5194343756884336, "reward_std": 0.06846481077373028, "rewards/wrapped_prediction_reward_func": 0.5194343756884336, "step": 1175 }, { "completion_length": 757.5354415893555, "epoch": 0.2919437142415339, "grad_norm": 0.06468685343835685, "kl": 0.14593505859375, "learning_rate": 3.7973752181687327e-07, "loss": 0.0001, "reward": 0.4868099356070161, "reward_std": 0.0667126391083002, "rewards/wrapped_prediction_reward_func": 0.4868099356070161, "step": 1180 }, { "completion_length": 762.2833526611328, "epoch": 0.2931807638781506, "grad_norm": 0.03781585514266224, "kl": 0.14259033203125, "learning_rate": 3.758119131293925e-07, "loss": 0.0001, "reward": 0.5278406769037247, "reward_std": 0.04370214119553566, "rewards/wrapped_prediction_reward_func": 0.5278406769037247, "step": 1185 }, { "completion_length": 751.9229415893554, "epoch": 0.29441781351476726, "grad_norm": 0.05463535205865628, "kl": 0.14635009765625, "learning_rate": 3.718944461187138e-07, "loss": 0.0001, "reward": 0.5110627200454474, "reward_std": 0.06480493322014809, "rewards/wrapped_prediction_reward_func": 0.5110627200454474, "step": 1190 }, { "completion_length": 767.4083541870117, "epoch": 0.29565486315138395, "grad_norm": 0.023093640853687706, "kl": 0.14705810546875, "learning_rate": 3.6798537761100347e-07, "loss": 0.0001, "reward": 0.5570095289498568, "reward_std": 0.045046417787671086, "rewards/wrapped_prediction_reward_func": 0.5570095289498568, "step": 1195 }, { "completion_length": 752.3916900634765, "epoch": 0.29689191278800064, "grad_norm": 0.035037312345744864, "kl": 0.14373779296875, "learning_rate": 3.640849638818285e-07, "loss": 0.0001, "reward": 0.48504778649657965, "reward_std": 0.05712633766233921, "rewards/wrapped_prediction_reward_func": 0.48504778649657965, "step": 1200 }, { "completion_length": 744.1250228881836, "epoch": 0.2981289624246173, "grad_norm": 0.0452081864505814, "kl": 0.145458984375, "learning_rate": 3.601934606393555e-07, "loss": 0.0001, "reward": 0.5049970051273703, "reward_std": 0.03301082365214825, "rewards/wrapped_prediction_reward_func": 0.5049970051273703, "step": 1205 }, { "completion_length": 737.6062728881836, "epoch": 0.299366012061234, "grad_norm": 0.03461453333102068, "kl": 0.14241943359375, "learning_rate": 3.563111230075859e-07, "loss": 0.0001, "reward": 0.45103547666221855, "reward_std": 0.08837438561022282, "rewards/wrapped_prediction_reward_func": 0.45103547666221855, "step": 1210 }, { "completion_length": 744.9187713623047, "epoch": 0.3006030616978506, "grad_norm": 0.02925206148235934, "kl": 0.1472900390625, "learning_rate": 3.524382055096308e-07, "loss": 0.0001, "reward": 0.4963268724270165, "reward_std": 0.03887149877846241, "rewards/wrapped_prediction_reward_func": 0.4963268724270165, "step": 1215 }, { "completion_length": 735.9521057128907, "epoch": 0.3018401113344673, "grad_norm": 0.052363859621846864, "kl": 0.1492919921875, "learning_rate": 3.485749620510247e-07, "loss": 0.0001, "reward": 0.5228683982975781, "reward_std": 0.05148884318768978, "rewards/wrapped_prediction_reward_func": 0.5228683982975781, "step": 1220 }, { "completion_length": 749.2208557128906, "epoch": 0.30307716097108395, "grad_norm": 0.02841696148809038, "kl": 0.13983154296875, "learning_rate": 3.447216459030789e-07, "loss": 0.0001, "reward": 0.49922923389822244, "reward_std": 0.057556836307048796, "rewards/wrapped_prediction_reward_func": 0.49922923389822244, "step": 1225 }, { "completion_length": 740.9250244140625, "epoch": 0.30431421060770064, "grad_norm": 0.03420368771409472, "kl": 0.14375, "learning_rate": 3.408785096862782e-07, "loss": 0.0001, "reward": 0.4923368003219366, "reward_std": 0.05154770351946354, "rewards/wrapped_prediction_reward_func": 0.4923368003219366, "step": 1230 }, { "completion_length": 742.1500183105469, "epoch": 0.3055512602443173, "grad_norm": 0.06760217222096482, "kl": 0.14427490234375, "learning_rate": 3.3704580535371857e-07, "loss": 0.0001, "reward": 0.5322297625243664, "reward_std": 0.05761535912752151, "rewards/wrapped_prediction_reward_func": 0.5322297625243664, "step": 1235 }, { "completion_length": 737.283351135254, "epoch": 0.306788309880934, "grad_norm": 0.03372498932323085, "kl": 0.14359130859375, "learning_rate": 3.3322378417458977e-07, "loss": 0.0001, "reward": 0.5538479458540678, "reward_std": 0.046281592547893526, "rewards/wrapped_prediction_reward_func": 0.5538479458540678, "step": 1240 }, { "completion_length": 750.5229354858399, "epoch": 0.30802535951755067, "grad_norm": 0.05934890447360817, "kl": 0.14742431640625, "learning_rate": 3.294126967177019e-07, "loss": 0.0002, "reward": 0.47018489595502616, "reward_std": 0.05935581475496292, "rewards/wrapped_prediction_reward_func": 0.47018489595502616, "step": 1245 }, { "completion_length": 760.1312698364258, "epoch": 0.3092624091541673, "grad_norm": 0.05568026298885921, "kl": 0.15076904296875, "learning_rate": 3.2561279283505884e-07, "loss": 0.0001, "reward": 0.5032477361150086, "reward_std": 0.08390399180352688, "rewards/wrapped_prediction_reward_func": 0.5032477361150086, "step": 1250 }, { "completion_length": 737.9583541870118, "epoch": 0.310499458790784, "grad_norm": 0.06324668098161558, "kl": 0.13980712890625, "learning_rate": 3.2182432164547744e-07, "loss": 0.0001, "reward": 0.5101997158490121, "reward_std": 0.07026028409600257, "rewards/wrapped_prediction_reward_func": 0.5101997158490121, "step": 1255 }, { "completion_length": 734.4146026611328, "epoch": 0.31173650842740064, "grad_norm": 0.061901146568645454, "kl": 0.15511474609375, "learning_rate": 3.1804753151825627e-07, "loss": 0.0001, "reward": 0.4728803929872811, "reward_std": 0.0600818395614624, "rewards/wrapped_prediction_reward_func": 0.4728803929872811, "step": 1260 }, { "completion_length": 742.5354354858398, "epoch": 0.31297355806401733, "grad_norm": 0.05284941413468379, "kl": 0.1459716796875, "learning_rate": 3.142826700568918e-07, "loss": 0.0001, "reward": 0.4964865125715733, "reward_std": 0.08052329532802105, "rewards/wrapped_prediction_reward_func": 0.4964865125715733, "step": 1265 }, { "completion_length": 736.889599609375, "epoch": 0.31421060770063397, "grad_norm": 0.05911227822994137, "kl": 0.147021484375, "learning_rate": 3.105299840828466e-07, "loss": 0.0001, "reward": 0.41497225435450674, "reward_std": 0.06442424207925797, "rewards/wrapped_prediction_reward_func": 0.41497225435450674, "step": 1270 }, { "completion_length": 732.7375183105469, "epoch": 0.31544765733725066, "grad_norm": 0.034683442072314795, "kl": 0.14261474609375, "learning_rate": 3.0678971961936764e-07, "loss": 0.0001, "reward": 0.4467615975998342, "reward_std": 0.07464952245354653, "rewards/wrapped_prediction_reward_func": 0.4467615975998342, "step": 1275 }, { "completion_length": 735.139599609375, "epoch": 0.3166847069738673, "grad_norm": 0.05703397206412208, "kl": 0.14874267578125, "learning_rate": 3.030621218753565e-07, "loss": 0.0001, "reward": 0.46127833873033525, "reward_std": 0.07033722139894963, "rewards/wrapped_prediction_reward_func": 0.46127833873033525, "step": 1280 }, { "completion_length": 758.6771057128906, "epoch": 0.317921756610484, "grad_norm": 0.07289664444149485, "kl": 0.15394287109375, "learning_rate": 2.9934743522929473e-07, "loss": 0.0001, "reward": 0.4588036834262311, "reward_std": 0.07971772141754627, "rewards/wrapped_prediction_reward_func": 0.4588036834262311, "step": 1285 }, { "completion_length": 740.6937713623047, "epoch": 0.3191588062471007, "grad_norm": 0.024827132296538046, "kl": 0.1473388671875, "learning_rate": 2.95645903213222e-07, "loss": 0.0001, "reward": 0.4552743576467037, "reward_std": 0.060056638717651364, "rewards/wrapped_prediction_reward_func": 0.4552743576467037, "step": 1290 }, { "completion_length": 783.358349609375, "epoch": 0.32039585588371733, "grad_norm": 0.05011389514184883, "kl": 0.1437255859375, "learning_rate": 2.9195776849677035e-07, "loss": 0.0002, "reward": 0.5194063542410732, "reward_std": 0.06745818518102169, "rewards/wrapped_prediction_reward_func": 0.5194063542410732, "step": 1295 }, { "completion_length": 745.1604415893555, "epoch": 0.321632905520334, "grad_norm": 0.04791673295103965, "kl": 0.1472900390625, "learning_rate": 2.8828327287125507e-07, "loss": 0.0001, "reward": 0.4598150375299156, "reward_std": 0.07030069567263127, "rewards/wrapped_prediction_reward_func": 0.4598150375299156, "step": 1300 }, { "completion_length": 761.9896072387695, "epoch": 0.32286995515695066, "grad_norm": 0.06202794999519762, "kl": 0.14462890625, "learning_rate": 2.846226572338225e-07, "loss": 0.0001, "reward": 0.45550857987254856, "reward_std": 0.09050950668752193, "rewards/wrapped_prediction_reward_func": 0.45550857987254856, "step": 1305 }, { "completion_length": 761.0021057128906, "epoch": 0.32410700479356735, "grad_norm": 9.337979594423026e-05, "kl": 0.151513671875, "learning_rate": 2.8097616157165885e-07, "loss": 0.0001, "reward": 0.42143569206818937, "reward_std": 0.05340727642178535, "rewards/wrapped_prediction_reward_func": 0.42143569206818937, "step": 1310 }, { "completion_length": 760.0625152587891, "epoch": 0.325344054430184, "grad_norm": 0.05117196128396242, "kl": 0.15203857421875, "learning_rate": 2.773440249462539e-07, "loss": 0.0001, "reward": 0.5622663186863065, "reward_std": 0.05105588287115097, "rewards/wrapped_prediction_reward_func": 0.5622663186863065, "step": 1315 }, { "completion_length": 729.2021011352539, "epoch": 0.3265811040668007, "grad_norm": 0.04457801418432316, "kl": 0.15322265625, "learning_rate": 2.7372648547773056e-07, "loss": 0.0001, "reward": 0.48678106535226107, "reward_std": 0.06915879845619202, "rewards/wrapped_prediction_reward_func": 0.48678106535226107, "step": 1320 }, { "completion_length": 754.2333557128907, "epoch": 0.3278181537034173, "grad_norm": 0.06104577718468036, "kl": 0.1523681640625, "learning_rate": 2.7012378032923343e-07, "loss": 0.0001, "reward": 0.39820886654779314, "reward_std": 0.0868304219096899, "rewards/wrapped_prediction_reward_func": 0.39820886654779314, "step": 1325 }, { "completion_length": 749.2229385375977, "epoch": 0.329055203340034, "grad_norm": 0.03760453413417284, "kl": 0.15206298828125, "learning_rate": 2.665361456913797e-07, "loss": 0.0001, "reward": 0.5090361749753356, "reward_std": 0.04264990463852882, "rewards/wrapped_prediction_reward_func": 0.5090361749753356, "step": 1330 }, { "completion_length": 756.4562698364258, "epoch": 0.3302922529766507, "grad_norm": 0.045285407813967476, "kl": 0.15771484375, "learning_rate": 2.6296381676677604e-07, "loss": 0.0, "reward": 0.471368836145848, "reward_std": 0.03997128494083881, "rewards/wrapped_prediction_reward_func": 0.471368836145848, "step": 1335 }, { "completion_length": 785.2771011352539, "epoch": 0.33152930261326735, "grad_norm": 0.05666666430889106, "kl": 0.1537353515625, "learning_rate": 2.5940702775459744e-07, "loss": 0.0001, "reward": 0.4646269915625453, "reward_std": 0.08307505026459694, "rewards/wrapped_prediction_reward_func": 0.4646269915625453, "step": 1340 }, { "completion_length": 745.4750228881836, "epoch": 0.33276635224988405, "grad_norm": 0.05587900927281758, "kl": 0.156396484375, "learning_rate": 2.558660118352348e-07, "loss": 0.0001, "reward": 0.5266566187143326, "reward_std": 0.07642874605953694, "rewards/wrapped_prediction_reward_func": 0.5266566187143326, "step": 1345 }, { "completion_length": 778.0146011352539, "epoch": 0.3340034018865007, "grad_norm": 0.07736406417997756, "kl": 0.14967041015625, "learning_rate": 2.523410011550064e-07, "loss": 0.0001, "reward": 0.4839884364977479, "reward_std": 0.0813891638070345, "rewards/wrapped_prediction_reward_func": 0.4839884364977479, "step": 1350 }, { "completion_length": 768.2854370117187, "epoch": 0.3352404515231174, "grad_norm": 0.05551049997786796, "kl": 0.1556396484375, "learning_rate": 2.4883222681093914e-07, "loss": 0.0001, "reward": 0.510473963804543, "reward_std": 0.04325590431690216, "rewards/wrapped_prediction_reward_func": 0.510473963804543, "step": 1355 }, { "completion_length": 748.0187698364258, "epoch": 0.336477501159734, "grad_norm": 0.03272503416709584, "kl": 0.161865234375, "learning_rate": 2.4533991883561867e-07, "loss": 0.0001, "reward": 0.4323475265875459, "reward_std": 0.05246281549334526, "rewards/wrapped_prediction_reward_func": 0.4323475265875459, "step": 1360 }, { "completion_length": 746.9583511352539, "epoch": 0.3377145507963507, "grad_norm": 0.06989282207830576, "kl": 0.15294189453125, "learning_rate": 2.4186430618210703e-07, "loss": 0.0002, "reward": 0.4897276235744357, "reward_std": 0.09300289861857891, "rewards/wrapped_prediction_reward_func": 0.4897276235744357, "step": 1365 }, { "completion_length": 759.379183959961, "epoch": 0.33895160043296735, "grad_norm": 0.00012444919081304812, "kl": 0.1503662109375, "learning_rate": 2.3840561670893495e-07, "loss": 0.0001, "reward": 0.4754153328947723, "reward_std": 0.05299051590263844, "rewards/wrapped_prediction_reward_func": 0.4754153328947723, "step": 1370 }, { "completion_length": 755.9479370117188, "epoch": 0.34018865006958404, "grad_norm": 0.045710266587869884, "kl": 0.1547607421875, "learning_rate": 2.349640771651611e-07, "loss": 0.0002, "reward": 0.5152722026221455, "reward_std": 0.04641593098640442, "rewards/wrapped_prediction_reward_func": 0.5152722026221455, "step": 1375 }, { "completion_length": 753.8812683105468, "epoch": 0.34142569970620074, "grad_norm": 0.038542543400419356, "kl": 0.1513427734375, "learning_rate": 2.3153991317550808e-07, "loss": 0.0001, "reward": 0.5372180460952223, "reward_std": 0.026408378407359122, "rewards/wrapped_prediction_reward_func": 0.5372180460952223, "step": 1380 }, { "completion_length": 742.6000213623047, "epoch": 0.3426627493428174, "grad_norm": 0.04486346970334233, "kl": 0.15272216796875, "learning_rate": 2.2813334922557077e-07, "loss": 0.0001, "reward": 0.4491668539121747, "reward_std": 0.05710873305797577, "rewards/wrapped_prediction_reward_func": 0.4491668539121747, "step": 1385 }, { "completion_length": 767.8291839599609, "epoch": 0.34389979897943407, "grad_norm": 0.04731115608172714, "kl": 0.145654296875, "learning_rate": 2.247446086470982e-07, "loss": 0.0001, "reward": 0.49003294911235573, "reward_std": 0.05564797632396221, "rewards/wrapped_prediction_reward_func": 0.49003294911235573, "step": 1390 }, { "completion_length": 739.6479385375976, "epoch": 0.3451368486160507, "grad_norm": 0.0324983006353205, "kl": 0.14913330078125, "learning_rate": 2.2137391360335328e-07, "loss": 0.0001, "reward": 0.5153373683802783, "reward_std": 0.07056629844009876, "rewards/wrapped_prediction_reward_func": 0.5153373683802783, "step": 1395 }, { "completion_length": 762.4291839599609, "epoch": 0.3463738982526674, "grad_norm": 0.05337055219373856, "kl": 0.14796142578125, "learning_rate": 2.180214850745467e-07, "loss": 0.0001, "reward": 0.49461562614887955, "reward_std": 0.049253585562109944, "rewards/wrapped_prediction_reward_func": 0.49461562614887955, "step": 1400 }, { "completion_length": 770.5396057128906, "epoch": 0.34761094788928404, "grad_norm": 0.043774273760773766, "kl": 0.15047607421875, "learning_rate": 2.1468754284335095e-07, "loss": 0.0001, "reward": 0.45189595930278303, "reward_std": 0.03938252069056034, "rewards/wrapped_prediction_reward_func": 0.45189595930278303, "step": 1405 }, { "completion_length": 744.8791870117187, "epoch": 0.34884799752590073, "grad_norm": 0.0001267998857188239, "kl": 0.15303955078125, "learning_rate": 2.113723054804904e-07, "loss": 0.0001, "reward": 0.5570487318560481, "reward_std": 0.04401119910180569, "rewards/wrapped_prediction_reward_func": 0.5570487318560481, "step": 1410 }, { "completion_length": 757.8062698364258, "epoch": 0.35008504716251737, "grad_norm": 0.0498086351813676, "kl": 0.15150146484375, "learning_rate": 2.0807599033041234e-07, "loss": 0.0001, "reward": 0.5071487789973617, "reward_std": 0.04892464466392994, "rewards/wrapped_prediction_reward_func": 0.5071487789973617, "step": 1415 }, { "completion_length": 752.029183959961, "epoch": 0.35132209679913406, "grad_norm": 0.06009230719026399, "kl": 0.15191650390625, "learning_rate": 2.0479881349703882e-07, "loss": 0.0001, "reward": 0.4836918482556939, "reward_std": 0.0635766088962555, "rewards/wrapped_prediction_reward_func": 0.4836918482556939, "step": 1420 }, { "completion_length": 779.5896057128906, "epoch": 0.35255914643575076, "grad_norm": 0.042367462070118926, "kl": 0.14569091796875, "learning_rate": 2.0154098982959744e-07, "loss": 0.0001, "reward": 0.4936805974692106, "reward_std": 0.0917021807283163, "rewards/wrapped_prediction_reward_func": 0.4936805974692106, "step": 1425 }, { "completion_length": 765.8104385375976, "epoch": 0.3537961960723674, "grad_norm": 0.060104262818723914, "kl": 0.15184326171875, "learning_rate": 1.9830273290853766e-07, "loss": 0.0002, "reward": 0.4905229499563575, "reward_std": 0.05690435692667961, "rewards/wrapped_prediction_reward_func": 0.4905229499563575, "step": 1430 }, { "completion_length": 774.7583511352539, "epoch": 0.3550332457089841, "grad_norm": 0.0679189158115039, "kl": 0.15130615234375, "learning_rate": 1.950842550315277e-07, "loss": 0.0, "reward": 0.4336330755613744, "reward_std": 0.07599727883934974, "rewards/wrapped_prediction_reward_func": 0.4336330755613744, "step": 1435 }, { "completion_length": 750.2333541870117, "epoch": 0.35627029534560073, "grad_norm": 0.06114114975927169, "kl": 0.15255126953125, "learning_rate": 1.918857671995363e-07, "loss": 0.0001, "reward": 0.44727214947342875, "reward_std": 0.0692370492964983, "rewards/wrapped_prediction_reward_func": 0.44727214947342875, "step": 1440 }, { "completion_length": 778.3250152587891, "epoch": 0.3575073449822174, "grad_norm": 0.05114708737824657, "kl": 0.1462646484375, "learning_rate": 1.8870747910300062e-07, "loss": 0.0001, "reward": 0.5533999393694102, "reward_std": 0.03972948119044304, "rewards/wrapped_prediction_reward_func": 0.5533999393694102, "step": 1445 }, { "completion_length": 774.8916809082032, "epoch": 0.35874439461883406, "grad_norm": 0.03191733866566709, "kl": 0.1515869140625, "learning_rate": 1.8554959910807772e-07, "loss": 0.0001, "reward": 0.4622044451534748, "reward_std": 0.04412779547274113, "rewards/wrapped_prediction_reward_func": 0.4622044451534748, "step": 1450 }, { "completion_length": 767.9625198364258, "epoch": 0.35998144425545076, "grad_norm": 0.04601119455590098, "kl": 0.1469970703125, "learning_rate": 1.824123342429858e-07, "loss": 0.0001, "reward": 0.43539866311475633, "reward_std": 0.06656052842736244, "rewards/wrapped_prediction_reward_func": 0.43539866311475633, "step": 1455 }, { "completion_length": 766.8562713623047, "epoch": 0.3612184938920674, "grad_norm": 0.030777046652940513, "kl": 0.15399169921875, "learning_rate": 1.7929589018443014e-07, "loss": 0.0001, "reward": 0.4539327438920736, "reward_std": 0.05378706008195877, "rewards/wrapped_prediction_reward_func": 0.4539327438920736, "step": 1460 }, { "completion_length": 745.0771026611328, "epoch": 0.3624555435286841, "grad_norm": 0.0365067112121767, "kl": 0.1519775390625, "learning_rate": 1.7620047124411997e-07, "loss": 0.0001, "reward": 0.528686286881566, "reward_std": 0.07875644378364086, "rewards/wrapped_prediction_reward_func": 0.528686286881566, "step": 1465 }, { "completion_length": 769.3083541870117, "epoch": 0.3636925931653008, "grad_norm": 0.052151633906379134, "kl": 0.15118408203125, "learning_rate": 1.7312628035537386e-07, "loss": 0.0002, "reward": 0.48016759315505625, "reward_std": 0.06370407566428185, "rewards/wrapped_prediction_reward_func": 0.48016759315505625, "step": 1470 }, { "completion_length": 762.533349609375, "epoch": 0.3649296428019174, "grad_norm": 0.057484167906522984, "kl": 0.15108642578125, "learning_rate": 1.700735190598151e-07, "loss": 0.0002, "reward": 0.5555347263813019, "reward_std": 0.058147551491856575, "rewards/wrapped_prediction_reward_func": 0.5555347263813019, "step": 1475 }, { "completion_length": 772.9312698364258, "epoch": 0.3661666924385341, "grad_norm": 0.031036044630949514, "kl": 0.1529296875, "learning_rate": 1.6704238749415955e-07, "loss": 0.0001, "reward": 0.5027113445103168, "reward_std": 0.024290652200579643, "rewards/wrapped_prediction_reward_func": 0.5027113445103168, "step": 1480 }, { "completion_length": 741.9271087646484, "epoch": 0.36740374207515075, "grad_norm": 0.03630489782016798, "kl": 0.15869140625, "learning_rate": 1.6403308437709378e-07, "loss": 0.0001, "reward": 0.47990547139197587, "reward_std": 0.049695948511362074, "rewards/wrapped_prediction_reward_func": 0.47990547139197587, "step": 1485 }, { "completion_length": 786.0208511352539, "epoch": 0.36864079171176745, "grad_norm": 9.429495306569628e-05, "kl": 0.1482177734375, "learning_rate": 1.6104580699624837e-07, "loss": 0.0001, "reward": 0.44344895239919424, "reward_std": 0.047899579256772996, "rewards/wrapped_prediction_reward_func": 0.44344895239919424, "step": 1490 }, { "completion_length": 758.7166915893555, "epoch": 0.3698778413483841, "grad_norm": 0.04897781522960424, "kl": 0.14615478515625, "learning_rate": 1.5808075119526322e-07, "loss": 0.0001, "reward": 0.4485797148197889, "reward_std": 0.04900593645870686, "rewards/wrapped_prediction_reward_func": 0.4485797148197889, "step": 1495 }, { "completion_length": 772.6021026611328, "epoch": 0.3711148909850008, "grad_norm": 0.05872401120327781, "kl": 0.1535400390625, "learning_rate": 1.5513811136094785e-07, "loss": 0.0001, "reward": 0.506781590450555, "reward_std": 0.06982601843774319, "rewards/wrapped_prediction_reward_func": 0.506781590450555, "step": 1500 }, { "completion_length": 762.8125198364257, "epoch": 0.3723519406216174, "grad_norm": 0.04839130205815754, "kl": 0.1456787109375, "learning_rate": 1.5221808041053873e-07, "loss": 0.0002, "reward": 0.4759699860587716, "reward_std": 0.06842627525329589, "rewards/wrapped_prediction_reward_func": 0.4759699860587716, "step": 1505 }, { "completion_length": 782.0437683105469, "epoch": 0.3735889902582341, "grad_norm": 0.04399460427448582, "kl": 0.1383056640625, "learning_rate": 1.493208497790504e-07, "loss": 0.0001, "reward": 0.5117153042927385, "reward_std": 0.0500517588108778, "rewards/wrapped_prediction_reward_func": 0.5117153042927385, "step": 1510 }, { "completion_length": 767.9604385375976, "epoch": 0.3748260398948508, "grad_norm": 0.050052699414806945, "kl": 0.1404296875, "learning_rate": 1.4644660940672627e-07, "loss": 0.0001, "reward": 0.5012076039798558, "reward_std": 0.04680187664926052, "rewards/wrapped_prediction_reward_func": 0.5012076039798558, "step": 1515 }, { "completion_length": 757.0708557128906, "epoch": 0.37606308953146744, "grad_norm": 0.05455627418786826, "kl": 0.1415771484375, "learning_rate": 1.435955477265855e-07, "loss": 0.0001, "reward": 0.5445947676897049, "reward_std": 0.04895281940698624, "rewards/wrapped_prediction_reward_func": 0.5445947676897049, "step": 1520 }, { "completion_length": 777.6125198364258, "epoch": 0.37730013916808414, "grad_norm": 0.04494172824394549, "kl": 0.13907470703125, "learning_rate": 1.4076785165206962e-07, "loss": 0.0001, "reward": 0.46489023678004743, "reward_std": 0.06057768724858761, "rewards/wrapped_prediction_reward_func": 0.46489023678004743, "step": 1525 }, { "completion_length": 775.8208511352539, "epoch": 0.3785371888047008, "grad_norm": 0.06722507992640166, "kl": 0.13720703125, "learning_rate": 1.3796370656478934e-07, "loss": 0.0001, "reward": 0.5138565480709076, "reward_std": 0.062168893218040464, "rewards/wrapped_prediction_reward_func": 0.5138565480709076, "step": 1530 }, { "completion_length": 783.4396026611328, "epoch": 0.37977423844131747, "grad_norm": 0.05021749997186356, "kl": 0.146875, "learning_rate": 1.3518329630236987e-07, "loss": 0.0001, "reward": 0.5363890752196312, "reward_std": 0.06623037457466126, "rewards/wrapped_prediction_reward_func": 0.5363890752196312, "step": 1535 }, { "completion_length": 765.2208557128906, "epoch": 0.3810112880779341, "grad_norm": 0.05126568829531852, "kl": 0.1437255859375, "learning_rate": 1.3242680314639993e-07, "loss": 0.0001, "reward": 0.48529754765331745, "reward_std": 0.06138504259288311, "rewards/wrapped_prediction_reward_func": 0.48529754765331745, "step": 1540 }, { "completion_length": 782.0541900634765, "epoch": 0.3822483377145508, "grad_norm": 0.051312150532255456, "kl": 0.13614501953125, "learning_rate": 1.2969440781048013e-07, "loss": 0.0001, "reward": 0.4957831010222435, "reward_std": 0.0628000695258379, "rewards/wrapped_prediction_reward_func": 0.4957831010222435, "step": 1545 }, { "completion_length": 759.1812713623046, "epoch": 0.38348538735116744, "grad_norm": 0.047994898560529846, "kl": 0.1449951171875, "learning_rate": 1.2698628942837697e-07, "loss": 0.0001, "reward": 0.4296943170018494, "reward_std": 0.06352304741740226, "rewards/wrapped_prediction_reward_func": 0.4296943170018494, "step": 1550 }, { "completion_length": 739.9604339599609, "epoch": 0.38472243698778413, "grad_norm": 0.034036253633998176, "kl": 0.14443359375, "learning_rate": 1.24302625542278e-07, "loss": 0.0001, "reward": 0.46531251408159735, "reward_std": 0.013734906911849976, "rewards/wrapped_prediction_reward_func": 0.46531251408159735, "step": 1555 }, { "completion_length": 762.8291885375977, "epoch": 0.3859594866244008, "grad_norm": 0.03131530897111836, "kl": 0.14432373046875, "learning_rate": 1.2164359209115232e-07, "loss": 0.0001, "reward": 0.5800471115857363, "reward_std": 0.0439479622989893, "rewards/wrapped_prediction_reward_func": 0.5800471115857363, "step": 1560 }, { "completion_length": 752.1229370117187, "epoch": 0.38719653626101747, "grad_norm": 0.04879627545155898, "kl": 0.1422607421875, "learning_rate": 1.1900936339921691e-07, "loss": 0.0002, "reward": 0.4967854070477188, "reward_std": 0.06513516306877136, "rewards/wrapped_prediction_reward_func": 0.4967854070477188, "step": 1565 }, { "completion_length": 776.4437683105468, "epoch": 0.38843358589763416, "grad_norm": 0.00011729412392632692, "kl": 0.13814697265625, "learning_rate": 1.1640011216450691e-07, "loss": 0.0002, "reward": 0.49604256078600883, "reward_std": 0.019370370730757713, "rewards/wrapped_prediction_reward_func": 0.49604256078600883, "step": 1570 }, { "completion_length": 763.1708511352539, "epoch": 0.3896706355342508, "grad_norm": 0.05642687610576811, "kl": 0.14141845703125, "learning_rate": 1.1381600944755492e-07, "loss": 0.0001, "reward": 0.543320269882679, "reward_std": 0.05773443952202797, "rewards/wrapped_prediction_reward_func": 0.543320269882679, "step": 1575 }, { "completion_length": 755.8083541870117, "epoch": 0.3909076851708675, "grad_norm": 0.05682755316076186, "kl": 0.1427001953125, "learning_rate": 1.1125722466017545e-07, "loss": 0.0001, "reward": 0.4996555058285594, "reward_std": 0.058728890493512154, "rewards/wrapped_prediction_reward_func": 0.4996555058285594, "step": 1580 }, { "completion_length": 752.2166900634766, "epoch": 0.39214473480748413, "grad_norm": 0.03241688944199977, "kl": 0.14398193359375, "learning_rate": 1.0872392555435855e-07, "loss": 0.0002, "reward": 0.5142341868951916, "reward_std": 0.02207615002989769, "rewards/wrapped_prediction_reward_func": 0.5142341868951916, "step": 1585 }, { "completion_length": 752.3291870117188, "epoch": 0.3933817844441008, "grad_norm": 0.04309650671521825, "kl": 0.14443359375, "learning_rate": 1.0621627821127288e-07, "loss": 0.0001, "reward": 0.5233104303479195, "reward_std": 0.03760427758097649, "rewards/wrapped_prediction_reward_func": 0.5233104303479195, "step": 1590 }, { "completion_length": 760.1229370117187, "epoch": 0.39461883408071746, "grad_norm": 0.0381429773416315, "kl": 0.14403076171875, "learning_rate": 1.0373444703037643e-07, "loss": 0.0001, "reward": 0.5427602794952691, "reward_std": 0.03513999842107296, "rewards/wrapped_prediction_reward_func": 0.5427602794952691, "step": 1595 }, { "completion_length": 735.8187713623047, "epoch": 0.39585588371733416, "grad_norm": 0.051544249973767636, "kl": 0.147705078125, "learning_rate": 1.0127859471863969e-07, "loss": 0.0001, "reward": 0.43322402676567434, "reward_std": 0.04156874977052212, "rewards/wrapped_prediction_reward_func": 0.43322402676567434, "step": 1600 }, { "completion_length": 774.018765258789, "epoch": 0.39709293335395085, "grad_norm": 0.043317686062817676, "kl": 0.1367431640625, "learning_rate": 9.884888227987759e-08, "loss": 0.0001, "reward": 0.5254854131489992, "reward_std": 0.06487410739064217, "rewards/wrapped_prediction_reward_func": 0.5254854131489992, "step": 1605 }, { "completion_length": 763.2312713623047, "epoch": 0.3983299829905675, "grad_norm": 0.03455817567084182, "kl": 0.1447265625, "learning_rate": 9.644546900419531e-08, "loss": 0.0001, "reward": 0.5098982187919319, "reward_std": 0.04394041560590267, "rewards/wrapped_prediction_reward_func": 0.5098982187919319, "step": 1610 }, { "completion_length": 755.7333526611328, "epoch": 0.3995670326271842, "grad_norm": 0.03359307975500474, "kl": 0.1416748046875, "learning_rate": 9.406851245754477e-08, "loss": 0.0001, "reward": 0.46502544172108173, "reward_std": 0.05649275071918965, "rewards/wrapped_prediction_reward_func": 0.46502544172108173, "step": 1615 }, { "completion_length": 750.8291900634765, "epoch": 0.4008040822638008, "grad_norm": 0.025871550461342725, "kl": 0.14180908203125, "learning_rate": 9.171816847139447e-08, "loss": 0.0001, "reward": 0.5463628959842026, "reward_std": 0.040695762261748314, "rewards/wrapped_prediction_reward_func": 0.5463628959842026, "step": 1620 }, { "completion_length": 760.0562698364258, "epoch": 0.4020411319004175, "grad_norm": 0.03771895135672017, "kl": 0.14371337890625, "learning_rate": 8.939459113251407e-08, "loss": 0.0001, "reward": 0.4566898914054036, "reward_std": 0.04758390225470066, "rewards/wrapped_prediction_reward_func": 0.4566898914054036, "step": 1625 }, { "completion_length": 778.7625198364258, "epoch": 0.40327818153703415, "grad_norm": 8.600004130211611e-05, "kl": 0.1397705078125, "learning_rate": 8.70979327728718e-08, "loss": 0.0001, "reward": 0.5543803449720144, "reward_std": 0.0506971575319767, "rewards/wrapped_prediction_reward_func": 0.5543803449720144, "step": 1630 }, { "completion_length": 743.8729354858399, "epoch": 0.40451523117365085, "grad_norm": 0.032680011980232924, "kl": 0.13988037109375, "learning_rate": 8.4828343959648e-08, "loss": 0.0002, "reward": 0.512663864903152, "reward_std": 0.04345934502780437, "rewards/wrapped_prediction_reward_func": 0.512663864903152, "step": 1635 }, { "completion_length": 772.1750244140625, "epoch": 0.4057522808102675, "grad_norm": 0.055208206665661175, "kl": 0.140478515625, "learning_rate": 8.258597348536451e-08, "loss": 0.0001, "reward": 0.47507090829312804, "reward_std": 0.060999739542603494, "rewards/wrapped_prediction_reward_func": 0.47507090829312804, "step": 1640 }, { "completion_length": 763.8396026611329, "epoch": 0.4069893304468842, "grad_norm": 0.03287960799147501, "kl": 0.1375, "learning_rate": 8.037096835812884e-08, "loss": 0.0002, "reward": 0.5474552355706692, "reward_std": 0.04405520483851433, "rewards/wrapped_prediction_reward_func": 0.5474552355706692, "step": 1645 }, { "completion_length": 762.4437728881836, "epoch": 0.4082263800835009, "grad_norm": 0.032737530215416505, "kl": 0.143310546875, "learning_rate": 7.81834737919978e-08, "loss": 0.0001, "reward": 0.5030878148972988, "reward_std": 0.05089644566178322, "rewards/wrapped_prediction_reward_func": 0.5030878148972988, "step": 1650 }, { "completion_length": 741.6333511352539, "epoch": 0.4094634297201175, "grad_norm": 0.04583515648311365, "kl": 0.144970703125, "learning_rate": 7.602363319745608e-08, "loss": 0.0002, "reward": 0.5225781170651317, "reward_std": 0.049219540879130365, "rewards/wrapped_prediction_reward_func": 0.5225781170651317, "step": 1655 }, { "completion_length": 754.0771072387695, "epoch": 0.4107004793567342, "grad_norm": 0.04196940407781161, "kl": 0.14228515625, "learning_rate": 7.389158817201541e-08, "loss": 0.0001, "reward": 0.49301483752205966, "reward_std": 0.07339239381253719, "rewards/wrapped_prediction_reward_func": 0.49301483752205966, "step": 1660 }, { "completion_length": 785.6875152587891, "epoch": 0.41193752899335084, "grad_norm": 0.054749189941343014, "kl": 0.138916015625, "learning_rate": 7.178747849093092e-08, "loss": 0.0001, "reward": 0.4913643542677164, "reward_std": 0.06684700101613998, "rewards/wrapped_prediction_reward_func": 0.4913643542677164, "step": 1665 }, { "completion_length": 772.337515258789, "epoch": 0.41317457862996754, "grad_norm": 0.03262129755927388, "kl": 0.14556884765625, "learning_rate": 6.971144209803736e-08, "loss": 0.0001, "reward": 0.5181374161504209, "reward_std": 0.046925947442650794, "rewards/wrapped_prediction_reward_func": 0.5181374161504209, "step": 1670 }, { "completion_length": 740.4083557128906, "epoch": 0.4144116282665842, "grad_norm": 0.036425258581473476, "kl": 0.14171142578125, "learning_rate": 6.766361509670687e-08, "loss": 0.0001, "reward": 0.4255114896222949, "reward_std": 0.020622413605451584, "rewards/wrapped_prediction_reward_func": 0.4255114896222949, "step": 1675 }, { "completion_length": 755.9937683105469, "epoch": 0.41564867790320087, "grad_norm": 0.029847885513060255, "kl": 0.14302978515625, "learning_rate": 6.564413174092443e-08, "loss": 0.0001, "reward": 0.5372514877468347, "reward_std": 0.05991799384355545, "rewards/wrapped_prediction_reward_func": 0.5372514877468347, "step": 1680 }, { "completion_length": 752.8479400634766, "epoch": 0.4168857275398175, "grad_norm": 0.03601716697901792, "kl": 0.14769287109375, "learning_rate": 6.365312442648769e-08, "loss": 0.0001, "reward": 0.48223327063024046, "reward_std": 0.07882548943161964, "rewards/wrapped_prediction_reward_func": 0.48223327063024046, "step": 1685 }, { "completion_length": 727.6000198364258, "epoch": 0.4181227771764342, "grad_norm": 0.050344523465414515, "kl": 0.14466552734375, "learning_rate": 6.16907236823262e-08, "loss": 0.0002, "reward": 0.4485375763848424, "reward_std": 0.05133052319288254, "rewards/wrapped_prediction_reward_func": 0.4485375763848424, "step": 1690 }, { "completion_length": 746.8271102905273, "epoch": 0.4193598268130509, "grad_norm": 0.034461655798979626, "kl": 0.1434326171875, "learning_rate": 5.97570581619446e-08, "loss": 0.0001, "reward": 0.56209014672786, "reward_std": 0.05782781988382339, "rewards/wrapped_prediction_reward_func": 0.56209014672786, "step": 1695 }, { "completion_length": 747.6312698364258, "epoch": 0.42059687644966753, "grad_norm": 0.0496150633082679, "kl": 0.1508056640625, "learning_rate": 5.785225463498828e-08, "loss": 0.0001, "reward": 0.43970810091122986, "reward_std": 0.046640587598085405, "rewards/wrapped_prediction_reward_func": 0.43970810091122986, "step": 1700 }, { "completion_length": 750.3708480834961, "epoch": 0.4218339260862842, "grad_norm": 0.04626199920119799, "kl": 0.143994140625, "learning_rate": 5.5976437978931755e-08, "loss": 0.0002, "reward": 0.5715684699825943, "reward_std": 0.03970764800906181, "rewards/wrapped_prediction_reward_func": 0.5715684699825943, "step": 1705 }, { "completion_length": 755.2041885375977, "epoch": 0.42307097572290087, "grad_norm": 0.06379999429256483, "kl": 0.1360107421875, "learning_rate": 5.412973117089287e-08, "loss": 0.0001, "reward": 0.5217526070773602, "reward_std": 0.09348910339176655, "rewards/wrapped_prediction_reward_func": 0.5217526070773602, "step": 1710 }, { "completion_length": 757.2125198364258, "epoch": 0.42430802535951756, "grad_norm": 0.03175776738632087, "kl": 0.14915771484375, "learning_rate": 5.231225527956923e-08, "loss": 0.0001, "reward": 0.4222654209472239, "reward_std": 0.03946601375937462, "rewards/wrapped_prediction_reward_func": 0.4222654209472239, "step": 1715 }, { "completion_length": 751.1312759399414, "epoch": 0.4255450749961342, "grad_norm": 0.028540883052275715, "kl": 0.14818115234375, "learning_rate": 5.052412945730239e-08, "loss": 0.0001, "reward": 0.5118896417319775, "reward_std": 0.041083069518208504, "rewards/wrapped_prediction_reward_func": 0.5118896417319775, "step": 1720 }, { "completion_length": 774.6166854858399, "epoch": 0.4267821246327509, "grad_norm": 0.00015132025761285738, "kl": 0.14119873046875, "learning_rate": 4.876547093226513e-08, "loss": 0.0001, "reward": 0.4455501016229391, "reward_std": 0.04328007437288761, "rewards/wrapped_prediction_reward_func": 0.4455501016229391, "step": 1725 }, { "completion_length": 758.783351135254, "epoch": 0.4280191742693676, "grad_norm": 0.06960276931645297, "kl": 0.13900146484375, "learning_rate": 4.703639500077655e-08, "loss": 0.0001, "reward": 0.5217124177142978, "reward_std": 0.04956845268607139, "rewards/wrapped_prediction_reward_func": 0.5217124177142978, "step": 1730 }, { "completion_length": 741.8604400634765, "epoch": 0.4292562239059842, "grad_norm": 0.00014180257255233717, "kl": 0.14415283203125, "learning_rate": 4.533701501974391e-08, "loss": 0.0002, "reward": 0.5378104700706899, "reward_std": 0.03337589018046856, "rewards/wrapped_prediction_reward_func": 0.5378104700706899, "step": 1735 }, { "completion_length": 748.5937713623047, "epoch": 0.4304932735426009, "grad_norm": 0.07373584001322955, "kl": 0.144921875, "learning_rate": 4.366744239922998e-08, "loss": 0.0002, "reward": 0.5325272038578988, "reward_std": 0.055879438295960426, "rewards/wrapped_prediction_reward_func": 0.5325272038578988, "step": 1740 }, { "completion_length": 762.002101135254, "epoch": 0.43173032317921756, "grad_norm": 0.00010285504577377182, "kl": 0.14385986328125, "learning_rate": 4.202778659514955e-08, "loss": 0.0001, "reward": 0.5330773379653693, "reward_std": 0.036233051866292956, "rewards/wrapped_prediction_reward_func": 0.5330773379653693, "step": 1745 }, { "completion_length": 749.4771026611328, "epoch": 0.43296737281583425, "grad_norm": 0.05589910211482898, "kl": 0.1441162109375, "learning_rate": 4.041815510209395e-08, "loss": 0.0001, "reward": 0.5022383161820472, "reward_std": 0.06651435121893882, "rewards/wrapped_prediction_reward_func": 0.5022383161820472, "step": 1750 }, { "completion_length": 764.1750213623047, "epoch": 0.4342044224524509, "grad_norm": 0.05340531857854081, "kl": 0.14259033203125, "learning_rate": 3.8838653446283065e-08, "loss": 0.0001, "reward": 0.47590462071821094, "reward_std": 0.05835465528070927, "rewards/wrapped_prediction_reward_func": 0.47590462071821094, "step": 1755 }, { "completion_length": 757.6666839599609, "epoch": 0.4354414720890676, "grad_norm": 0.031853006873851716, "kl": 0.14306640625, "learning_rate": 3.7289385178647935e-08, "loss": 0.0001, "reward": 0.47347617177292706, "reward_std": 0.04657198339700699, "rewards/wrapped_prediction_reward_func": 0.47347617177292706, "step": 1760 }, { "completion_length": 752.1833557128906, "epoch": 0.4366785217256842, "grad_norm": 0.04525867948077769, "kl": 0.14658203125, "learning_rate": 3.5770451868041174e-08, "loss": 0.0001, "reward": 0.5058192176744342, "reward_std": 0.03406770564615726, "rewards/wrapped_prediction_reward_func": 0.5058192176744342, "step": 1765 }, { "completion_length": 756.6062637329102, "epoch": 0.4379155713623009, "grad_norm": 0.00011051878816484868, "kl": 0.1396484375, "learning_rate": 3.4281953094578875e-08, "loss": 0.0001, "reward": 0.5579410493373871, "reward_std": 0.04511531516909599, "rewards/wrapped_prediction_reward_func": 0.5579410493373871, "step": 1770 }, { "completion_length": 753.6041900634766, "epoch": 0.4391526209989176, "grad_norm": 0.029306512522029707, "kl": 0.14267578125, "learning_rate": 3.282398644311185e-08, "loss": 0.0001, "reward": 0.42345950920134784, "reward_std": 0.03382036946713925, "rewards/wrapped_prediction_reward_func": 0.42345950920134784, "step": 1775 }, { "completion_length": 747.4583511352539, "epoch": 0.44038967063553425, "grad_norm": 0.0002058250914418045, "kl": 0.1451171875, "learning_rate": 3.1396647496828244e-08, "loss": 0.0001, "reward": 0.5162877984344959, "reward_std": 0.04818576686084271, "rewards/wrapped_prediction_reward_func": 0.5162877984344959, "step": 1780 }, { "completion_length": 768.1354385375977, "epoch": 0.44162672027215094, "grad_norm": 0.03547501706218823, "kl": 0.1423828125, "learning_rate": 3.000002983098693e-08, "loss": 0.0001, "reward": 0.41099606081843376, "reward_std": 0.02396547980606556, "rewards/wrapped_prediction_reward_func": 0.41099606081843376, "step": 1785 }, { "completion_length": 752.0541839599609, "epoch": 0.4428637699087676, "grad_norm": 0.04960837632996653, "kl": 0.1407470703125, "learning_rate": 2.8634225006782864e-08, "loss": 0.0001, "reward": 0.46006874069571496, "reward_std": 0.04551262743771076, "rewards/wrapped_prediction_reward_func": 0.46006874069571496, "step": 1790 }, { "completion_length": 747.8979385375976, "epoch": 0.4441008195453843, "grad_norm": 0.05518866244394545, "kl": 0.1421875, "learning_rate": 2.7299322565344953e-08, "loss": 0.0001, "reward": 0.5255239962600171, "reward_std": 0.0466438353061676, "rewards/wrapped_prediction_reward_func": 0.5255239962600171, "step": 1795 }, { "completion_length": 732.4771057128906, "epoch": 0.4453378691820009, "grad_norm": 0.04429062939006552, "kl": 0.14476318359375, "learning_rate": 2.5995410021864783e-08, "loss": 0.0002, "reward": 0.5237077623605728, "reward_std": 0.03708002977073192, "rewards/wrapped_prediction_reward_func": 0.5237077623605728, "step": 1800 }, { "completion_length": 764.3146026611328, "epoch": 0.4465749188186176, "grad_norm": 0.0426046136995963, "kl": 0.1398193359375, "learning_rate": 2.4722572859859903e-08, "loss": 0.0001, "reward": 0.46183042526245116, "reward_std": 0.05240595079958439, "rewards/wrapped_prediction_reward_func": 0.46183042526245116, "step": 1805 }, { "completion_length": 741.4208572387695, "epoch": 0.44781196845523424, "grad_norm": 0.00010630448758537318, "kl": 0.1475341796875, "learning_rate": 2.348089452556956e-08, "loss": 0.0002, "reward": 0.5036526273936033, "reward_std": 0.043657009676098826, "rewards/wrapped_prediction_reward_func": 0.5036526273936033, "step": 1810 }, { "completion_length": 761.2416854858399, "epoch": 0.44904901809185094, "grad_norm": 0.05598308984077719, "kl": 0.14114990234375, "learning_rate": 2.2270456422483653e-08, "loss": 0.0001, "reward": 0.45990248378366233, "reward_std": 0.06032248325645924, "rewards/wrapped_prediction_reward_func": 0.45990248378366233, "step": 1815 }, { "completion_length": 783.6937774658203, "epoch": 0.45028606772846763, "grad_norm": 0.049329391197086056, "kl": 0.13870849609375, "learning_rate": 2.109133790600648e-08, "loss": 0.0001, "reward": 0.5239573128521442, "reward_std": 0.05790785625576973, "rewards/wrapped_prediction_reward_func": 0.5239573128521442, "step": 1820 }, { "completion_length": 777.2375137329102, "epoch": 0.45152311736508427, "grad_norm": 0.0001434458111005123, "kl": 0.1373779296875, "learning_rate": 1.9943616278253638e-08, "loss": 0.0001, "reward": 0.5233666568994522, "reward_std": 0.02228833958506584, "rewards/wrapped_prediction_reward_func": 0.5233666568994522, "step": 1825 }, { "completion_length": 760.3166870117187, "epoch": 0.45276016700170096, "grad_norm": 0.041186229726982146, "kl": 0.1435302734375, "learning_rate": 1.882736678298491e-08, "loss": 0.0002, "reward": 0.5085546795278788, "reward_std": 0.06498942896723747, "rewards/wrapped_prediction_reward_func": 0.5085546795278788, "step": 1830 }, { "completion_length": 749.2041900634765, "epoch": 0.4539972166383176, "grad_norm": 0.030215326205040582, "kl": 0.14215087890625, "learning_rate": 1.7742662600670642e-08, "loss": 0.0001, "reward": 0.4173439702950418, "reward_std": 0.06593191176652909, "rewards/wrapped_prediction_reward_func": 0.4173439702950418, "step": 1835 }, { "completion_length": 756.1958526611328, "epoch": 0.4552342662749343, "grad_norm": 0.03994673662533993, "kl": 0.14322509765625, "learning_rate": 1.6689574843694432e-08, "loss": 0.0001, "reward": 0.47786121647804974, "reward_std": 0.06695570573210716, "rewards/wrapped_prediction_reward_func": 0.47786121647804974, "step": 1840 }, { "completion_length": 744.8500244140625, "epoch": 0.45647131591155093, "grad_norm": 0.06569594236789683, "kl": 0.14256591796875, "learning_rate": 1.5668172551691174e-08, "loss": 0.0001, "reward": 0.430814937222749, "reward_std": 0.04550225920975208, "rewards/wrapped_prediction_reward_func": 0.430814937222749, "step": 1845 }, { "completion_length": 738.6708526611328, "epoch": 0.45770836554816763, "grad_norm": 0.06816211508503128, "kl": 0.1371826171875, "learning_rate": 1.4678522687020412e-08, "loss": 0.0001, "reward": 0.45761711671948435, "reward_std": 0.05669744499027729, "rewards/wrapped_prediction_reward_func": 0.45761711671948435, "step": 1850 }, { "completion_length": 721.3833511352539, "epoch": 0.45894541518478427, "grad_norm": 0.06618371616756627, "kl": 0.14259033203125, "learning_rate": 1.3720690130377022e-08, "loss": 0.0001, "reward": 0.45178795969113705, "reward_std": 0.07305734120309353, "rewards/wrapped_prediction_reward_func": 0.45178795969113705, "step": 1855 }, { "completion_length": 757.1062744140625, "epoch": 0.46018246482140096, "grad_norm": 0.060005008678144026, "kl": 0.14178466796875, "learning_rate": 1.2794737676536993e-08, "loss": 0.0002, "reward": 0.43401491129770875, "reward_std": 0.06557008102536202, "rewards/wrapped_prediction_reward_func": 0.43401491129770875, "step": 1860 }, { "completion_length": 753.7271072387696, "epoch": 0.46141951445801765, "grad_norm": 0.07608640613059564, "kl": 0.135595703125, "learning_rate": 1.1900726030241004e-08, "loss": 0.0001, "reward": 0.4443151566199958, "reward_std": 0.08900549970567226, "rewards/wrapped_prediction_reward_func": 0.4443151566199958, "step": 1865 }, { "completion_length": 753.8896072387695, "epoch": 0.4626565640946343, "grad_norm": 0.058578778804687946, "kl": 0.14295654296875, "learning_rate": 1.1038713802214717e-08, "loss": 0.0001, "reward": 0.4238445356488228, "reward_std": 0.0466154221445322, "rewards/wrapped_prediction_reward_func": 0.4238445356488228, "step": 1870 }, { "completion_length": 781.470851135254, "epoch": 0.463893613731251, "grad_norm": 0.0001426283671727442, "kl": 0.137255859375, "learning_rate": 1.0208757505326015e-08, "loss": 0.0002, "reward": 0.49530622363090515, "reward_std": 0.033482743054628374, "rewards/wrapped_prediction_reward_func": 0.49530622363090515, "step": 1875 }, { "completion_length": 737.5729354858398, "epoch": 0.4651306633678676, "grad_norm": 0.05725258748554909, "kl": 0.14696044921875, "learning_rate": 9.410911550880474e-09, "loss": 0.0001, "reward": 0.4649914444424212, "reward_std": 0.0744082860648632, "rewards/wrapped_prediction_reward_func": 0.4649914444424212, "step": 1880 }, { "completion_length": 751.4291839599609, "epoch": 0.4663677130044843, "grad_norm": 0.03475875975637069, "kl": 0.14403076171875, "learning_rate": 8.645228245053759e-09, "loss": 0.0001, "reward": 0.534701827634126, "reward_std": 0.04333846867084503, "rewards/wrapped_prediction_reward_func": 0.534701827634126, "step": 1885 }, { "completion_length": 744.2021041870117, "epoch": 0.46760476264110096, "grad_norm": 0.00013473826478947477, "kl": 0.14542236328125, "learning_rate": 7.91175778546288e-09, "loss": 0.0002, "reward": 0.5801377366296947, "reward_std": 0.0406968604773283, "rewards/wrapped_prediction_reward_func": 0.5801377366296947, "step": 1890 }, { "completion_length": 766.1812698364258, "epoch": 0.46884181227771765, "grad_norm": 0.03228682835223789, "kl": 0.14100341796875, "learning_rate": 7.2105482578749265e-09, "loss": 0.0001, "reward": 0.4830586672760546, "reward_std": 0.049609319865703584, "rewards/wrapped_prediction_reward_func": 0.4830586672760546, "step": 1895 }, { "completion_length": 764.9041854858399, "epoch": 0.4700788619143343, "grad_norm": 0.04102730180703454, "kl": 0.142333984375, "learning_rate": 6.541645633054649e-09, "loss": 0.0002, "reward": 0.45141522530466316, "reward_std": 0.04762217551469803, "rewards/wrapped_prediction_reward_func": 0.45141522530466316, "step": 1900 }, { "completion_length": 746.8646011352539, "epoch": 0.471315911550951, "grad_norm": 0.05042809846963281, "kl": 0.14332275390625, "learning_rate": 5.90509376375109e-09, "loss": 0.0001, "reward": 0.4712002633139491, "reward_std": 0.06454807668924331, "rewards/wrapped_prediction_reward_func": 0.4712002633139491, "step": 1905 }, { "completion_length": 767.4666900634766, "epoch": 0.4725529611875677, "grad_norm": 0.028621613174543297, "kl": 0.14637451171875, "learning_rate": 5.3009343818219975e-09, "loss": 0.0001, "reward": 0.5114058044739067, "reward_std": 0.025011104345321656, "rewards/wrapped_prediction_reward_func": 0.5114058044739067, "step": 1910 }, { "completion_length": 761.6020980834961, "epoch": 0.4737900108241843, "grad_norm": 0.0578058158113799, "kl": 0.1398681640625, "learning_rate": 4.7292070954983445e-09, "loss": 0.0001, "reward": 0.4528739302419126, "reward_std": 0.04959133602678776, "rewards/wrapped_prediction_reward_func": 0.4528739302419126, "step": 1915 }, { "completion_length": 733.0416885375977, "epoch": 0.475027060460801, "grad_norm": 0.08437093256128045, "kl": 0.1397705078125, "learning_rate": 4.189949386787462e-09, "loss": 0.0001, "reward": 0.45349957039579747, "reward_std": 0.09353262223303319, "rewards/wrapped_prediction_reward_func": 0.45349957039579747, "step": 1920 }, { "completion_length": 729.735432434082, "epoch": 0.47626411009741765, "grad_norm": 0.04806114261006065, "kl": 0.14573974609375, "learning_rate": 3.683196609015782e-09, "loss": 0.0001, "reward": 0.48266843175515534, "reward_std": 0.03269868195056915, "rewards/wrapped_prediction_reward_func": 0.48266843175515534, "step": 1925 }, { "completion_length": 759.877101135254, "epoch": 0.47750115973403434, "grad_norm": 0.052960726423666026, "kl": 0.1370849609375, "learning_rate": 3.2089819845111944e-09, "loss": 0.0001, "reward": 0.49991451846435664, "reward_std": 0.0764271680265665, "rewards/wrapped_prediction_reward_func": 0.49991451846435664, "step": 1930 }, { "completion_length": 768.1479339599609, "epoch": 0.478738209370651, "grad_norm": 0.00013734348321962073, "kl": 0.14188232421875, "learning_rate": 2.767336602424786e-09, "loss": 0.0002, "reward": 0.5727842427790165, "reward_std": 0.0439902514219284, "rewards/wrapped_prediction_reward_func": 0.5727842427790165, "step": 1935 }, { "completion_length": 735.6062759399414, "epoch": 0.4799752590072677, "grad_norm": 0.05340228660676688, "kl": 0.146630859375, "learning_rate": 2.3582894166930267e-09, "loss": 0.0002, "reward": 0.5651108752004802, "reward_std": 0.048683862388134005, "rewards/wrapped_prediction_reward_func": 0.5651108752004802, "step": 1940 }, { "completion_length": 764.1625213623047, "epoch": 0.4812123086438843, "grad_norm": 0.045130824646274305, "kl": 0.1431640625, "learning_rate": 1.9818672441391237e-09, "loss": 0.0001, "reward": 0.48173327138647437, "reward_std": 0.09359192438423633, "rewards/wrapped_prediction_reward_func": 0.48173327138647437, "step": 1945 }, { "completion_length": 754.6708526611328, "epoch": 0.482449358280501, "grad_norm": 0.04891115639221876, "kl": 0.1383056640625, "learning_rate": 1.638094762715314e-09, "loss": 0.0002, "reward": 0.5446788365021348, "reward_std": 0.04786500036716461, "rewards/wrapped_prediction_reward_func": 0.5446788365021348, "step": 1950 }, { "completion_length": 757.4166854858398, "epoch": 0.4836864079171177, "grad_norm": 0.054143438527486494, "kl": 0.14033203125, "learning_rate": 1.3269945098847713e-09, "loss": 0.0002, "reward": 0.45030141416937114, "reward_std": 0.05802864618599415, "rewards/wrapped_prediction_reward_func": 0.45030141416937114, "step": 1955 }, { "completion_length": 753.3354354858399, "epoch": 0.48492345755373434, "grad_norm": 0.032792774518752794, "kl": 0.14150390625, "learning_rate": 1.0485868811441756e-09, "loss": 0.0001, "reward": 0.4954256805591285, "reward_std": 0.0408743605017662, "rewards/wrapped_prediction_reward_func": 0.4954256805591285, "step": 1960 }, { "completion_length": 759.2583511352539, "epoch": 0.48616050719035103, "grad_norm": 0.06427057024543754, "kl": 0.14178466796875, "learning_rate": 8.02890128686562e-10, "loss": 0.0001, "reward": 0.45416686236858367, "reward_std": 0.05258323810994625, "rewards/wrapped_prediction_reward_func": 0.45416686236858367, "step": 1965 }, { "completion_length": 777.1646057128906, "epoch": 0.48739755682696767, "grad_norm": 0.06313926645380846, "kl": 0.137158203125, "learning_rate": 5.899203602046654e-10, "loss": 0.0001, "reward": 0.46284006256610155, "reward_std": 0.05382467992603779, "rewards/wrapped_prediction_reward_func": 0.46284006256610155, "step": 1970 }, { "completion_length": 737.4250198364258, "epoch": 0.48863460646358436, "grad_norm": 0.031500260168922385, "kl": 0.144970703125, "learning_rate": 4.0969153783498854e-10, "loss": 0.0001, "reward": 0.4604743555188179, "reward_std": 0.053301149234175685, "rewards/wrapped_prediction_reward_func": 0.4604743555188179, "step": 1975 }, { "completion_length": 762.087516784668, "epoch": 0.489871656100201, "grad_norm": 0.045774442452721636, "kl": 0.141162109375, "learning_rate": 2.6221547724253333e-10, "loss": 0.0001, "reward": 0.46325629111379385, "reward_std": 0.06786729171872138, "rewards/wrapped_prediction_reward_func": 0.46325629111379385, "step": 1980 }, { "completion_length": 753.0937683105469, "epoch": 0.4911087057368177, "grad_norm": 0.05242761440483202, "kl": 0.14195556640625, "learning_rate": 1.4750184684597656e-10, "loss": 0.0001, "reward": 0.4899668443016708, "reward_std": 0.05773559994995594, "rewards/wrapped_prediction_reward_func": 0.4899668443016708, "step": 1985 }, { "completion_length": 740.2375228881835, "epoch": 0.49234575537343434, "grad_norm": 0.031428286908261924, "kl": 0.14801025390625, "learning_rate": 6.555816718389895e-11, "loss": 0.0002, "reward": 0.5367826625704766, "reward_std": 0.03765491470694542, "rewards/wrapped_prediction_reward_func": 0.5367826625704766, "step": 1990 }, { "completion_length": 772.6854370117187, "epoch": 0.49358280501005103, "grad_norm": 0.06630679255640828, "kl": 0.1428955078125, "learning_rate": 1.6389810421846284e-11, "loss": 0.0001, "reward": 0.4952763170003891, "reward_std": 0.07443222850561142, "rewards/wrapped_prediction_reward_func": 0.4952763170003891, "step": 1995 }, { "completion_length": 755.5229385375976, "epoch": 0.4948198546466677, "grad_norm": 0.00011456532666674227, "kl": 0.1419189453125, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.43289072550833224, "reward_std": 0.05967330485582352, "rewards/wrapped_prediction_reward_func": 0.43289072550833224, "step": 2000 }, { "epoch": 0.4948198546466677, "step": 2000, "total_flos": 0.0, "train_loss": 3.165239551162813e-05, "train_runtime": 30019.0956, "train_samples_per_second": 6.396, "train_steps_per_second": 0.067 } ], "logging_steps": 5, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }