{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.21401819154628143, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 167.21875, "epoch": 0.002140181915462814, "grad_norm": 3.21875, "kl": 0.0, "learning_rate": 2.1276595744680852e-07, "loss": -0.0, "reward": 0.671875, "reward_std": 0.7394911348819733, "rewards/correctness_reward_func": 0.28125, "rewards/format_reward_func": 0.390625, "step": 1 }, { "completion_length": 151.84375, "epoch": 0.004280363830925628, "grad_norm": 3.109375, "kl": 0.0, "learning_rate": 4.2553191489361704e-07, "loss": -0.0, "reward": 0.84375, "reward_std": 0.6551200225949287, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.5, "step": 2 }, { "completion_length": 152.3125, "epoch": 0.006420545746388443, "grad_norm": 2.734375, "kl": 0.0007652118656551465, "learning_rate": 6.382978723404255e-07, "loss": 0.0, "reward": 0.734375, "reward_std": 0.6524592041969299, "rewards/correctness_reward_func": 0.21875, "rewards/format_reward_func": 0.515625, "step": 3 }, { "completion_length": 175.328125, "epoch": 0.008560727661851257, "grad_norm": 2.84375, "kl": 0.0007007253007031977, "learning_rate": 8.510638297872341e-07, "loss": 0.0, "reward": 0.84375, "reward_std": 0.7802355140447617, "rewards/correctness_reward_func": 0.40625, "rewards/format_reward_func": 0.4375, "step": 4 }, { "completion_length": 152.84375, "epoch": 0.010700909577314071, "grad_norm": 3.0625, "kl": 0.0006465155893238261, "learning_rate": 1.0638297872340427e-06, "loss": 0.0, "reward": 0.875, "reward_std": 0.6669624000787735, "rewards/correctness_reward_func": 0.3125, "rewards/format_reward_func": 0.5625, "step": 5 }, { "completion_length": 179.5, "epoch": 0.012841091492776886, "grad_norm": 3.15625, "kl": 0.0007112839666660875, "learning_rate": 1.276595744680851e-06, "loss": 0.0, "reward": 0.6875, "reward_std": 0.7003459334373474, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.4375, "step": 6 }, { "completion_length": 169.140625, "epoch": 0.0149812734082397, "grad_norm": 3.5, "kl": 0.000702101708156988, "learning_rate": 1.4893617021276596e-06, "loss": 0.0, "reward": 0.84375, "reward_std": 0.7666550725698471, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.5, "step": 7 }, { "completion_length": 146.390625, "epoch": 0.017121455323702513, "grad_norm": 3.25, "kl": 0.00070322556712199, "learning_rate": 1.7021276595744682e-06, "loss": 0.0, "reward": 0.765625, "reward_std": 0.514876589179039, "rewards/correctness_reward_func": 0.15625, "rewards/format_reward_func": 0.609375, "step": 8 }, { "completion_length": 141.40625, "epoch": 0.019261637239165328, "grad_norm": 2.875, "kl": 0.0008593319798819721, "learning_rate": 1.9148936170212767e-06, "loss": 0.0, "reward": 1.03125, "reward_std": 0.7832296937704086, "rewards/correctness_reward_func": 0.46875, "rewards/format_reward_func": 0.5625, "step": 9 }, { "completion_length": 152.21875, "epoch": 0.021401819154628143, "grad_norm": 3.71875, "kl": 0.0011522448767209426, "learning_rate": 2.1276595744680853e-06, "loss": 0.0, "reward": 0.75, "reward_std": 0.7914351224899292, "rewards/correctness_reward_func": 0.1875, "rewards/format_reward_func": 0.5625, "step": 10 }, { "completion_length": 118.953125, "epoch": 0.023542001070090957, "grad_norm": 2.75, "kl": 0.0009655868489062414, "learning_rate": 2.340425531914894e-06, "loss": 0.0, "reward": 1.3125, "reward_std": 0.6371002793312073, "rewards/correctness_reward_func": 0.53125, "rewards/format_reward_func": 0.78125, "step": 11 }, { "completion_length": 135.90625, "epoch": 0.025682182985553772, "grad_norm": 3.3125, "kl": 0.0013183261326048523, "learning_rate": 2.553191489361702e-06, "loss": 0.0, "reward": 0.890625, "reward_std": 0.6318541243672371, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.640625, "step": 12 }, { "completion_length": 141.265625, "epoch": 0.027822364901016586, "grad_norm": 2.390625, "kl": 0.0016906778037082404, "learning_rate": 2.765957446808511e-06, "loss": 0.0, "reward": 1.015625, "reward_std": 0.6712641417980194, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.765625, "step": 13 }, { "completion_length": 140.015625, "epoch": 0.0299625468164794, "grad_norm": 2.796875, "kl": 0.0024730846926104277, "learning_rate": 2.978723404255319e-06, "loss": 0.0, "reward": 1.0625, "reward_std": 0.657661646604538, "rewards/correctness_reward_func": 0.3125, "rewards/format_reward_func": 0.75, "step": 14 }, { "completion_length": 139.65625, "epoch": 0.03210272873194221, "grad_norm": 2.84375, "kl": 0.003585253667552024, "learning_rate": 3.191489361702128e-06, "loss": 0.0, "reward": 1.109375, "reward_std": 0.6227563470602036, "rewards/correctness_reward_func": 0.3125, "rewards/format_reward_func": 0.796875, "step": 15 }, { "completion_length": 140.671875, "epoch": 0.03424291064740503, "grad_norm": 2.734375, "kl": 0.004379586665891111, "learning_rate": 3.4042553191489363e-06, "loss": 0.0, "reward": 1.0, "reward_std": 0.603929728269577, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.75, "step": 16 }, { "completion_length": 118.703125, "epoch": 0.03638309256286784, "grad_norm": 3.25, "kl": 0.00417952478164807, "learning_rate": 3.6170212765957453e-06, "loss": 0.0, "reward": 1.1875, "reward_std": 0.6465111523866653, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.84375, "step": 17 }, { "completion_length": 120.28125, "epoch": 0.038523274478330656, "grad_norm": 2.484375, "kl": 0.005598044954240322, "learning_rate": 3.8297872340425535e-06, "loss": 0.0, "reward": 1.09375, "reward_std": 0.5800373703241348, "rewards/correctness_reward_func": 0.25, "rewards/format_reward_func": 0.84375, "step": 18 }, { "completion_length": 127.890625, "epoch": 0.04066345639379347, "grad_norm": 3.0, "kl": 0.005727470270358026, "learning_rate": 4.042553191489362e-06, "loss": 0.0, "reward": 1.15625, "reward_std": 0.6766257882118225, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.8125, "step": 19 }, { "completion_length": 121.703125, "epoch": 0.042803638309256285, "grad_norm": 2.109375, "kl": 0.005017031449824572, "learning_rate": 4.255319148936171e-06, "loss": 0.0, "reward": 1.203125, "reward_std": 0.38406607508659363, "rewards/correctness_reward_func": 0.28125, "rewards/format_reward_func": 0.921875, "step": 20 }, { "completion_length": 147.4375, "epoch": 0.0449438202247191, "grad_norm": 2.171875, "kl": 0.0043895336566492915, "learning_rate": 4.468085106382979e-06, "loss": 0.0, "reward": 1.203125, "reward_std": 0.568247452378273, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.859375, "step": 21 }, { "completion_length": 129.40625, "epoch": 0.047084002140181914, "grad_norm": 1.984375, "kl": 0.006280150264501572, "learning_rate": 4.680851063829788e-06, "loss": 0.0, "reward": 1.3125, "reward_std": 0.5328208804130554, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.96875, "step": 22 }, { "completion_length": 115.859375, "epoch": 0.04922418405564473, "grad_norm": 3.03125, "kl": 0.004840813227929175, "learning_rate": 4.893617021276596e-06, "loss": 0.0, "reward": 1.21875, "reward_std": 0.555421955883503, "rewards/correctness_reward_func": 0.28125, "rewards/format_reward_func": 0.9375, "step": 23 }, { "completion_length": 136.6875, "epoch": 0.051364365971107544, "grad_norm": 2.515625, "kl": 0.005280128796584904, "learning_rate": 5.106382978723404e-06, "loss": 0.0, "reward": 1.265625, "reward_std": 0.6034187823534012, "rewards/correctness_reward_func": 0.34375, "rewards/format_reward_func": 0.921875, "step": 24 }, { "completion_length": 128.9375, "epoch": 0.05350454788657036, "grad_norm": 2.453125, "kl": 0.008012514561414719, "learning_rate": 5.319148936170213e-06, "loss": 0.0, "reward": 1.71875, "reward_std": 0.9615881741046906, "rewards/correctness_reward_func": 0.8125, "rewards/format_reward_func": 0.90625, "step": 25 }, { "completion_length": 119.9375, "epoch": 0.05564472980203317, "grad_norm": 2.53125, "kl": 0.006216021254658699, "learning_rate": 5.531914893617022e-06, "loss": 0.0, "reward": 1.53125, "reward_std": 0.6540063470602036, "rewards/correctness_reward_func": 0.5625, "rewards/format_reward_func": 0.96875, "step": 26 }, { "completion_length": 143.375, "epoch": 0.05778491171749599, "grad_norm": 2.5625, "kl": 0.006388432695530355, "learning_rate": 5.744680851063831e-06, "loss": 0.0, "reward": 1.296875, "reward_std": 0.6433176919817924, "rewards/correctness_reward_func": 0.40625, "rewards/format_reward_func": 0.890625, "step": 27 }, { "completion_length": 131.15625, "epoch": 0.0599250936329588, "grad_norm": 2.453125, "kl": 0.008788998704403639, "learning_rate": 5.957446808510638e-06, "loss": 0.0, "reward": 1.546875, "reward_std": 0.6077659651637077, "rewards/correctness_reward_func": 0.59375, "rewards/format_reward_func": 0.953125, "step": 28 }, { "completion_length": 124.25, "epoch": 0.06206527554842162, "grad_norm": 1.890625, "kl": 0.0065922129433602095, "learning_rate": 6.170212765957447e-06, "loss": 0.0, "reward": 1.4375, "reward_std": 0.6141257882118225, "rewards/correctness_reward_func": 0.46875, "rewards/format_reward_func": 0.96875, "step": 29 }, { "completion_length": 128.421875, "epoch": 0.06420545746388442, "grad_norm": 2.203125, "kl": 0.008303154725581408, "learning_rate": 6.382978723404256e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.7354267686605453, "rewards/correctness_reward_func": 0.65625, "rewards/format_reward_func": 0.96875, "step": 30 }, { "completion_length": 130.25, "epoch": 0.06634563937934725, "grad_norm": 1.859375, "kl": 0.01200802018865943, "learning_rate": 6.595744680851064e-06, "loss": 0.0001, "reward": 1.703125, "reward_std": 0.5409187823534012, "rewards/correctness_reward_func": 0.71875, "rewards/format_reward_func": 0.984375, "step": 31 }, { "completion_length": 134.3125, "epoch": 0.06848582129481005, "grad_norm": 2.4375, "kl": 0.012481397716328502, "learning_rate": 6.808510638297873e-06, "loss": 0.0001, "reward": 1.5625, "reward_std": 0.5936799347400665, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.9375, "step": 32 }, { "completion_length": 140.578125, "epoch": 0.07062600321027288, "grad_norm": 2.203125, "kl": 0.011200629058293998, "learning_rate": 7.021276595744682e-06, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.5570628941059113, "rewards/correctness_reward_func": 0.5, "rewards/format_reward_func": 0.96875, "step": 33 }, { "completion_length": 126.640625, "epoch": 0.07276618512573568, "grad_norm": 2.328125, "kl": 0.011981034418568015, "learning_rate": 7.234042553191491e-06, "loss": 0.0001, "reward": 1.671875, "reward_std": 0.6420939117670059, "rewards/correctness_reward_func": 0.6875, "rewards/format_reward_func": 0.984375, "step": 34 }, { "completion_length": 141.046875, "epoch": 0.0749063670411985, "grad_norm": 2.0, "kl": 0.01834311173297465, "learning_rate": 7.446808510638298e-06, "loss": 0.0001, "reward": 1.75, "reward_std": 0.5, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 35 }, { "completion_length": 129.703125, "epoch": 0.07704654895666131, "grad_norm": 2.15625, "kl": 0.02186211384832859, "learning_rate": 7.659574468085107e-06, "loss": 0.0001, "reward": 1.859375, "reward_std": 0.6324251294136047, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.984375, "step": 36 }, { "completion_length": 147.5, "epoch": 0.07918673087212413, "grad_norm": 2.109375, "kl": 0.018696403596550226, "learning_rate": 7.872340425531916e-06, "loss": 0.0001, "reward": 1.78125, "reward_std": 0.6237945631146431, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.90625, "step": 37 }, { "completion_length": 138.984375, "epoch": 0.08132691278758694, "grad_norm": 2.375, "kl": 0.02475315798074007, "learning_rate": 8.085106382978723e-06, "loss": 0.0001, "reward": 1.78125, "reward_std": 0.5818375647068024, "rewards/correctness_reward_func": 0.78125, "rewards/format_reward_func": 1.0, "step": 38 }, { "completion_length": 138.875, "epoch": 0.08346709470304976, "grad_norm": 2.21875, "kl": 0.02588808024302125, "learning_rate": 8.297872340425532e-06, "loss": 0.0001, "reward": 1.53125, "reward_std": 0.46814728528261185, "rewards/correctness_reward_func": 0.59375, "rewards/format_reward_func": 0.9375, "step": 39 }, { "completion_length": 132.46875, "epoch": 0.08560727661851257, "grad_norm": 2.5, "kl": 0.03315168898552656, "learning_rate": 8.510638297872341e-06, "loss": 0.0002, "reward": 1.609375, "reward_std": 0.6949251294136047, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 0.984375, "step": 40 }, { "completion_length": 143.25, "epoch": 0.08774745853397539, "grad_norm": 2.34375, "kl": 0.027149769477546215, "learning_rate": 8.72340425531915e-06, "loss": 0.0001, "reward": 1.75, "reward_std": 0.6540063470602036, "rewards/correctness_reward_func": 0.78125, "rewards/format_reward_func": 0.96875, "step": 41 }, { "completion_length": 149.1875, "epoch": 0.0898876404494382, "grad_norm": 2.734375, "kl": 0.033755607437342405, "learning_rate": 8.936170212765958e-06, "loss": 0.0002, "reward": 1.890625, "reward_std": 0.9243821352720261, "rewards/correctness_reward_func": 0.9375, "rewards/format_reward_func": 0.953125, "step": 42 }, { "completion_length": 140.765625, "epoch": 0.09202782236490102, "grad_norm": 1.65625, "kl": 0.03690116386860609, "learning_rate": 9.148936170212767e-06, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.33183756470680237, "rewards/correctness_reward_func": 0.71875, "rewards/format_reward_func": 1.0, "step": 43 }, { "completion_length": 119.640625, "epoch": 0.09416800428036383, "grad_norm": 2.984375, "kl": 0.042608937714248896, "learning_rate": 9.361702127659576e-06, "loss": 0.0002, "reward": 2.0, "reward_std": 0.6443375647068024, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 1.0, "step": 44 }, { "completion_length": 133.3125, "epoch": 0.09630818619582665, "grad_norm": 2.5625, "kl": 0.05142616247758269, "learning_rate": 9.574468085106385e-06, "loss": 0.0003, "reward": 1.5625, "reward_std": 0.6292316764593124, "rewards/correctness_reward_func": 0.59375, "rewards/format_reward_func": 0.96875, "step": 45 }, { "completion_length": 137.921875, "epoch": 0.09844836811128946, "grad_norm": 4.65625, "kl": 0.05740292742848396, "learning_rate": 9.787234042553192e-06, "loss": 0.0003, "reward": 1.5625, "reward_std": 0.39597851037979126, "rewards/correctness_reward_func": 0.65625, "rewards/format_reward_func": 0.90625, "step": 46 }, { "completion_length": 139.5625, "epoch": 0.10058855002675228, "grad_norm": 2.96875, "kl": 0.05779360141605139, "learning_rate": 1e-05, "loss": 0.0003, "reward": 1.75, "reward_std": 0.6911139190196991, "rewards/correctness_reward_func": 0.96875, "rewards/format_reward_func": 0.78125, "step": 47 }, { "completion_length": 162.4375, "epoch": 0.10272873194221509, "grad_norm": 2.75, "kl": 0.057580760680139065, "learning_rate": 9.99986012530635e-06, "loss": 0.0003, "reward": 1.734375, "reward_std": 0.6755875647068024, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.984375, "step": 48 }, { "completion_length": 117.859375, "epoch": 0.10486891385767791, "grad_norm": 1.625, "kl": 0.05675748083740473, "learning_rate": 9.999440509051367e-06, "loss": 0.0003, "reward": 2.03125, "reward_std": 0.30542195588350296, "rewards/correctness_reward_func": 1.0625, "rewards/format_reward_func": 0.96875, "step": 49 }, { "completion_length": 112.28125, "epoch": 0.10700909577314072, "grad_norm": 3.203125, "kl": 0.05932612717151642, "learning_rate": 9.998741174712534e-06, "loss": 0.0003, "reward": 1.96875, "reward_std": 0.5953208804130554, "rewards/correctness_reward_func": 1.0625, "rewards/format_reward_func": 0.90625, "step": 50 }, { "completion_length": 111.375, "epoch": 0.10914927768860354, "grad_norm": 2.75, "kl": 0.08433620352298021, "learning_rate": 9.997762161417517e-06, "loss": 0.0004, "reward": 1.953125, "reward_std": 0.5074251294136047, "rewards/correctness_reward_func": 0.96875, "rewards/format_reward_func": 0.984375, "step": 51 }, { "completion_length": 123.78125, "epoch": 0.11128945960406635, "grad_norm": 3.015625, "kl": 0.06248955149203539, "learning_rate": 9.996503523941994e-06, "loss": 0.0003, "reward": 2.125, "reward_std": 0.4858439266681671, "rewards/correctness_reward_func": 1.125, "rewards/format_reward_func": 1.0, "step": 52 }, { "completion_length": 119.46875, "epoch": 0.11342964151952915, "grad_norm": 2.625, "kl": 0.061760940589010715, "learning_rate": 9.994965332706574e-06, "loss": 0.0003, "reward": 2.0625, "reward_std": 0.5, "rewards/correctness_reward_func": 1.0625, "rewards/format_reward_func": 1.0, "step": 53 }, { "completion_length": 127.265625, "epoch": 0.11556982343499198, "grad_norm": 1.96875, "kl": 0.05814269371330738, "learning_rate": 9.993147673772869e-06, "loss": 0.0003, "reward": 1.90625, "reward_std": 0.3125, "rewards/correctness_reward_func": 0.90625, "rewards/format_reward_func": 1.0, "step": 54 }, { "completion_length": 125.84375, "epoch": 0.11771000535045478, "grad_norm": 2.875, "kl": 0.08824395015835762, "learning_rate": 9.991050648838676e-06, "loss": 0.0004, "reward": 2.03125, "reward_std": 0.7261751294136047, "rewards/correctness_reward_func": 1.03125, "rewards/format_reward_func": 1.0, "step": 55 }, { "completion_length": 121.390625, "epoch": 0.1198501872659176, "grad_norm": 2.796875, "kl": 0.06532257050275803, "learning_rate": 9.98867437523228e-06, "loss": 0.0003, "reward": 1.625, "reward_std": 0.46650634706020355, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 56 }, { "completion_length": 129.5625, "epoch": 0.12199036918138041, "grad_norm": 3.328125, "kl": 0.06475972291082144, "learning_rate": 9.986018985905901e-06, "loss": 0.0003, "reward": 2.015625, "reward_std": 0.5409187823534012, "rewards/correctness_reward_func": 1.03125, "rewards/format_reward_func": 0.984375, "step": 57 }, { "completion_length": 133.15625, "epoch": 0.12413055109684323, "grad_norm": 2.0625, "kl": 0.05745814461261034, "learning_rate": 9.983084629428244e-06, "loss": 0.0003, "reward": 1.984375, "reward_std": 0.42558756470680237, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.984375, "step": 58 }, { "completion_length": 124.671875, "epoch": 0.12627073301230604, "grad_norm": 2.9375, "kl": 0.07612972147762775, "learning_rate": 9.979871469976197e-06, "loss": 0.0004, "reward": 1.59375, "reward_std": 0.6346687823534012, "rewards/correctness_reward_func": 0.59375, "rewards/format_reward_func": 1.0, "step": 59 }, { "completion_length": 109.921875, "epoch": 0.12841091492776885, "grad_norm": 3.0, "kl": 0.06931339204311371, "learning_rate": 9.976379687325633e-06, "loss": 0.0003, "reward": 1.96875, "reward_std": 0.6540063470602036, "rewards/correctness_reward_func": 0.96875, "rewards/format_reward_func": 1.0, "step": 60 }, { "completion_length": 125.53125, "epoch": 0.13055109684323168, "grad_norm": 2.546875, "kl": 0.05608335882425308, "learning_rate": 9.972609476841368e-06, "loss": 0.0003, "reward": 2.125, "reward_std": 0.5915063470602036, "rewards/correctness_reward_func": 1.125, "rewards/format_reward_func": 1.0, "step": 61 }, { "completion_length": 127.8125, "epoch": 0.1326912787586945, "grad_norm": 2.59375, "kl": 0.14377161115407944, "learning_rate": 9.968561049466214e-06, "loss": 0.0007, "reward": 1.84375, "reward_std": 0.5290063321590424, "rewards/correctness_reward_func": 0.84375, "rewards/format_reward_func": 1.0, "step": 62 }, { "completion_length": 124.75, "epoch": 0.1348314606741573, "grad_norm": 1.6328125, "kl": 0.06179473642259836, "learning_rate": 9.964234631709188e-06, "loss": 0.0003, "reward": 1.90625, "reward_std": 0.2596687823534012, "rewards/correctness_reward_func": 0.90625, "rewards/format_reward_func": 1.0, "step": 63 }, { "completion_length": 125.515625, "epoch": 0.1369716425896201, "grad_norm": 2.875, "kl": 0.08027161657810211, "learning_rate": 9.959630465632833e-06, "loss": 0.0004, "reward": 2.15625, "reward_std": 0.6011751294136047, "rewards/correctness_reward_func": 1.15625, "rewards/format_reward_func": 1.0, "step": 64 }, { "completion_length": 114.890625, "epoch": 0.13911182450508294, "grad_norm": 2.90625, "kl": 0.13545920699834824, "learning_rate": 9.954748808839675e-06, "loss": 0.0007, "reward": 1.859375, "reward_std": 0.4826504588127136, "rewards/correctness_reward_func": 0.875, "rewards/format_reward_func": 0.984375, "step": 65 }, { "completion_length": 121.125, "epoch": 0.14125200642054575, "grad_norm": 1.9921875, "kl": 0.07298032753169537, "learning_rate": 9.949589934457815e-06, "loss": 0.0004, "reward": 2.03125, "reward_std": 0.3846687823534012, "rewards/correctness_reward_func": 1.03125, "rewards/format_reward_func": 1.0, "step": 66 }, { "completion_length": 122.625, "epoch": 0.14339218833600856, "grad_norm": 3.359375, "kl": 0.10324916429817677, "learning_rate": 9.944154131125643e-06, "loss": 0.0005, "reward": 2.078125, "reward_std": 0.5409187823534012, "rewards/correctness_reward_func": 1.09375, "rewards/format_reward_func": 0.984375, "step": 67 }, { "completion_length": 133.40625, "epoch": 0.14553237025147137, "grad_norm": 2.28125, "kl": 0.10124836303293705, "learning_rate": 9.938441702975689e-06, "loss": 0.0005, "reward": 1.71875, "reward_std": 0.40400634706020355, "rewards/correctness_reward_func": 0.71875, "rewards/format_reward_func": 1.0, "step": 68 }, { "completion_length": 141.984375, "epoch": 0.1476725521669342, "grad_norm": 2.921875, "kl": 0.08171628974378109, "learning_rate": 9.932452969617607e-06, "loss": 0.0004, "reward": 1.96875, "reward_std": 0.41325797885656357, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.96875, "step": 69 }, { "completion_length": 125.609375, "epoch": 0.149812734082397, "grad_norm": 2.53125, "kl": 0.08117420598864555, "learning_rate": 9.926188266120297e-06, "loss": 0.0004, "reward": 1.8125, "reward_std": 0.41367512941360474, "rewards/correctness_reward_func": 0.8125, "rewards/format_reward_func": 1.0, "step": 70 }, { "completion_length": 122.296875, "epoch": 0.15195291599785982, "grad_norm": 2.875, "kl": 0.08432869054377079, "learning_rate": 9.91964794299315e-06, "loss": 0.0004, "reward": 2.03125, "reward_std": 0.5818375647068024, "rewards/correctness_reward_func": 1.03125, "rewards/format_reward_func": 1.0, "step": 71 }, { "completion_length": 136.359375, "epoch": 0.15409309791332262, "grad_norm": 2.296875, "kl": 0.10322786308825016, "learning_rate": 9.912832366166443e-06, "loss": 0.0005, "reward": 1.5625, "reward_std": 0.26933756470680237, "rewards/correctness_reward_func": 0.5625, "rewards/format_reward_func": 1.0, "step": 72 }, { "completion_length": 113.921875, "epoch": 0.15623327982878546, "grad_norm": 3.84375, "kl": 0.13962376862764359, "learning_rate": 9.905741916970863e-06, "loss": 0.0007, "reward": 2.09375, "reward_std": 0.5290063470602036, "rewards/correctness_reward_func": 1.09375, "rewards/format_reward_func": 1.0, "step": 73 }, { "completion_length": 120.5625, "epoch": 0.15837346174424827, "grad_norm": 3.125, "kl": 0.10202482901513577, "learning_rate": 9.898376992116179e-06, "loss": 0.0005, "reward": 1.796875, "reward_std": 0.5892626941204071, "rewards/correctness_reward_func": 0.8125, "rewards/format_reward_func": 0.984375, "step": 74 }, { "completion_length": 132.390625, "epoch": 0.16051364365971107, "grad_norm": 2.875, "kl": 0.09189064055681229, "learning_rate": 9.890738003669029e-06, "loss": 0.0005, "reward": 2.125, "reward_std": 0.6636751294136047, "rewards/correctness_reward_func": 1.125, "rewards/format_reward_func": 1.0, "step": 75 }, { "completion_length": 127.546875, "epoch": 0.16265382557517388, "grad_norm": 3.3125, "kl": 0.09288744628429413, "learning_rate": 9.882825379029883e-06, "loss": 0.0005, "reward": 1.84375, "reward_std": 0.7068375498056412, "rewards/correctness_reward_func": 0.84375, "rewards/format_reward_func": 1.0, "step": 76 }, { "completion_length": 116.90625, "epoch": 0.1647940074906367, "grad_norm": 2.59375, "kl": 0.10366734117269516, "learning_rate": 9.874639560909118e-06, "loss": 0.0005, "reward": 1.625, "reward_std": 0.375, "rewards/correctness_reward_func": 0.625, "rewards/format_reward_func": 1.0, "step": 77 }, { "completion_length": 145.9375, "epoch": 0.16693418940609953, "grad_norm": 3.125, "kl": 0.14030247181653976, "learning_rate": 9.866181007302258e-06, "loss": 0.0007, "reward": 1.640625, "reward_std": 0.43525634706020355, "rewards/correctness_reward_func": 0.65625, "rewards/format_reward_func": 0.984375, "step": 78 }, { "completion_length": 114.1875, "epoch": 0.16907437132156233, "grad_norm": 4.15625, "kl": 0.10448622144758701, "learning_rate": 9.857450191464337e-06, "loss": 0.0005, "reward": 1.96875, "reward_std": 0.7790063321590424, "rewards/correctness_reward_func": 0.96875, "rewards/format_reward_func": 1.0, "step": 79 }, { "completion_length": 135.78125, "epoch": 0.17121455323702514, "grad_norm": 2.421875, "kl": 0.08989986591041088, "learning_rate": 9.848447601883436e-06, "loss": 0.0004, "reward": 1.40625, "reward_std": 0.3582531735301018, "rewards/correctness_reward_func": 0.4375, "rewards/format_reward_func": 0.96875, "step": 80 }, { "completion_length": 141.953125, "epoch": 0.17335473515248795, "grad_norm": 2.953125, "kl": 0.07188372500240803, "learning_rate": 9.839173742253334e-06, "loss": 0.0004, "reward": 1.96875, "reward_std": 0.6733439117670059, "rewards/correctness_reward_func": 1.0, "rewards/format_reward_func": 0.96875, "step": 81 }, { "completion_length": 118.359375, "epoch": 0.17549491706795078, "grad_norm": 1.7109375, "kl": 0.07351219840347767, "learning_rate": 9.829629131445342e-06, "loss": 0.0004, "reward": 2.21875, "reward_std": 0.27900634706020355, "rewards/correctness_reward_func": 1.21875, "rewards/format_reward_func": 1.0, "step": 82 }, { "completion_length": 127.109375, "epoch": 0.1776350989834136, "grad_norm": 1.359375, "kl": 0.07365736179053783, "learning_rate": 9.819814303479268e-06, "loss": 0.0004, "reward": 2.1875, "reward_std": 0.26933756470680237, "rewards/correctness_reward_func": 1.1875, "rewards/format_reward_func": 1.0, "step": 83 }, { "completion_length": 139.078125, "epoch": 0.1797752808988764, "grad_norm": 2.484375, "kl": 0.07714031636714935, "learning_rate": 9.80972980749353e-06, "loss": 0.0004, "reward": 1.90625, "reward_std": 0.6540063470602036, "rewards/correctness_reward_func": 0.90625, "rewards/format_reward_func": 1.0, "step": 84 }, { "completion_length": 128.390625, "epoch": 0.1819154628143392, "grad_norm": 1.4453125, "kl": 0.056750121526420116, "learning_rate": 9.799376207714446e-06, "loss": 0.0003, "reward": 2.125, "reward_std": 0.25, "rewards/correctness_reward_func": 1.15625, "rewards/format_reward_func": 0.96875, "step": 85 }, { "completion_length": 160.234375, "epoch": 0.18405564472980204, "grad_norm": 1.9296875, "kl": 0.06542882043868303, "learning_rate": 9.788754083424654e-06, "loss": 0.0003, "reward": 1.734375, "reward_std": 0.42558756470680237, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 0.984375, "step": 86 }, { "completion_length": 120.546875, "epoch": 0.18619582664526485, "grad_norm": 2.703125, "kl": 0.14421744085848331, "learning_rate": 9.777864028930705e-06, "loss": 0.0007, "reward": 2.140625, "reward_std": 0.6324251294136047, "rewards/correctness_reward_func": 1.15625, "rewards/format_reward_func": 0.984375, "step": 87 }, { "completion_length": 137.296875, "epoch": 0.18833600856072766, "grad_norm": 2.59375, "kl": 0.0669497437775135, "learning_rate": 9.766706653529814e-06, "loss": 0.0003, "reward": 2.078125, "reward_std": 0.7763455435633659, "rewards/correctness_reward_func": 1.09375, "rewards/format_reward_func": 0.984375, "step": 88 }, { "completion_length": 145.4375, "epoch": 0.19047619047619047, "grad_norm": 2.609375, "kl": 0.07120031677186489, "learning_rate": 9.755282581475769e-06, "loss": 0.0004, "reward": 1.59375, "reward_std": 0.7261751294136047, "rewards/correctness_reward_func": 0.59375, "rewards/format_reward_func": 1.0, "step": 89 }, { "completion_length": 137.59375, "epoch": 0.1926163723916533, "grad_norm": 2.484375, "kl": 0.07096913084387779, "learning_rate": 9.743592451944e-06, "loss": 0.0004, "reward": 1.828125, "reward_std": 0.6324251145124435, "rewards/correctness_reward_func": 0.84375, "rewards/format_reward_func": 0.984375, "step": 90 }, { "completion_length": 117.359375, "epoch": 0.1947565543071161, "grad_norm": 2.046875, "kl": 0.06865348853170872, "learning_rate": 9.731636918995821e-06, "loss": 0.0003, "reward": 2.3125, "reward_std": 0.39433756470680237, "rewards/correctness_reward_func": 1.3125, "rewards/format_reward_func": 1.0, "step": 91 }, { "completion_length": 130.734375, "epoch": 0.19689673622257892, "grad_norm": 2.453125, "kl": 0.08213793113827705, "learning_rate": 9.719416651541839e-06, "loss": 0.0004, "reward": 2.1875, "reward_std": 0.5818375647068024, "rewards/correctness_reward_func": 1.21875, "rewards/format_reward_func": 0.96875, "step": 92 }, { "completion_length": 143.015625, "epoch": 0.19903691813804172, "grad_norm": 1.2578125, "kl": 0.07020723912864923, "learning_rate": 9.706932333304518e-06, "loss": 0.0004, "reward": 1.78125, "reward_std": 0.2596687823534012, "rewards/correctness_reward_func": 0.78125, "rewards/format_reward_func": 1.0, "step": 93 }, { "completion_length": 119.96875, "epoch": 0.20117710005350456, "grad_norm": 2.375, "kl": 0.07560589909553528, "learning_rate": 9.694184662779931e-06, "loss": 0.0004, "reward": 2.125, "reward_std": 0.46650633215904236, "rewards/correctness_reward_func": 1.125, "rewards/format_reward_func": 1.0, "step": 94 }, { "completion_length": 139.21875, "epoch": 0.20331728196896737, "grad_norm": 2.078125, "kl": 0.09026694297790527, "learning_rate": 9.681174353198687e-06, "loss": 0.0005, "reward": 2.1875, "reward_std": 0.5386751443147659, "rewards/correctness_reward_func": 1.1875, "rewards/format_reward_func": 1.0, "step": 95 }, { "completion_length": 151.328125, "epoch": 0.20545746388443017, "grad_norm": 2.25, "kl": 0.06551774125546217, "learning_rate": 9.667902132486009e-06, "loss": 0.0003, "reward": 1.75, "reward_std": 0.4471687823534012, "rewards/correctness_reward_func": 0.75, "rewards/format_reward_func": 1.0, "step": 96 }, { "completion_length": 138.640625, "epoch": 0.20759764579989298, "grad_norm": 2.40625, "kl": 0.07146297954022884, "learning_rate": 9.654368743221022e-06, "loss": 0.0004, "reward": 1.796875, "reward_std": 0.6604816764593124, "rewards/correctness_reward_func": 0.8125, "rewards/format_reward_func": 0.984375, "step": 97 }, { "completion_length": 140.9375, "epoch": 0.20973782771535582, "grad_norm": 2.1875, "kl": 0.09298614785075188, "learning_rate": 9.640574942595195e-06, "loss": 0.0005, "reward": 2.09375, "reward_std": 0.5818375647068024, "rewards/correctness_reward_func": 1.09375, "rewards/format_reward_func": 1.0, "step": 98 }, { "completion_length": 151.171875, "epoch": 0.21187800963081863, "grad_norm": 2.5625, "kl": 0.06539157032966614, "learning_rate": 9.626521502369984e-06, "loss": 0.0003, "reward": 2.3125, "reward_std": 0.6443375647068024, "rewards/correctness_reward_func": 1.34375, "rewards/format_reward_func": 0.96875, "step": 99 }, { "completion_length": 154.21875, "epoch": 0.21401819154628143, "grad_norm": 2.171875, "kl": 0.07382548321038485, "learning_rate": 9.612209208833648e-06, "loss": 0.0004, "reward": 1.9375, "reward_std": 0.6292316764593124, "rewards/correctness_reward_func": 0.96875, "rewards/format_reward_func": 0.96875, "step": 100 } ], "logging_steps": 1, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }