diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14345 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998757609640949, + "eval_steps": 100, + "global_step": 4527, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 407.66875, + "epoch": 0.0011043469858229456, + "grad_norm": 1.3796405034281456, + "kl": 9.243488311767578e-05, + "learning_rate": 2.2075055187637973e-07, + "loss": 0.0, + "reward": 0.7898407633416354, + "reward_std": 0.5020190233539324, + "rewards/accuracy_reward": 0.175, + "rewards/cosine_scaled_reward": -0.08515923985396512, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.2250000026077032, + "step": 5 + }, + { + "completion_length": 403.64375, + "epoch": 0.002208693971645891, + "grad_norm": 2.5191525617733994, + "kl": 0.00025566108524799347, + "learning_rate": 4.4150110375275946e-07, + "loss": 0.0, + "reward": 0.5968398815020919, + "reward_std": 0.5149940982650151, + "rewards/accuracy_reward": 0.125, + "rewards/cosine_scaled_reward": -0.1469101178692654, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.18750000279396772, + "step": 10 + }, + { + "completion_length": 385.18125, + "epoch": 0.0033130409574688366, + "grad_norm": 1.3862240766673408, + "kl": 0.0002525851130485535, + "learning_rate": 6.622516556291392e-07, + "loss": 0.0, + "reward": 0.5968430628068745, + "reward_std": 0.5985216822278744, + "rewards/accuracy_reward": 0.15625, + "rewards/cosine_scaled_reward": -0.12399027492501773, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.17708333674818277, + "step": 15 + }, + { + "completion_length": 400.81875, + "epoch": 0.004417387943291782, + "grad_norm": 1.7133179531751626, + "kl": 0.0004843875765800476, + "learning_rate": 8.830022075055189e-07, + "loss": 0.0, + "reward": 0.64714968377084, + "reward_std": 0.47575741023320006, + "rewards/accuracy_reward": 0.11875, + "rewards/cosine_scaled_reward": -0.1341003203036962, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.1687500037252903, + "step": 20 + }, + { + "completion_length": 440.61875, + "epoch": 0.005521734929114728, + "grad_norm": 1.3020436787261798, + "kl": 0.0021125122904777525, + "learning_rate": 1.1037527593818985e-06, + "loss": 0.0001, + "reward": 0.6775904539041221, + "reward_std": 0.521566571767471, + "rewards/accuracy_reward": 0.125, + "rewards/cosine_scaled_reward": -0.1244928854459431, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.21458333693444728, + "step": 25 + }, + { + "completion_length": 358.81875, + "epoch": 0.006626081914937673, + "grad_norm": 2.383431595729729, + "kl": 0.0027086704969406126, + "learning_rate": 1.3245033112582784e-06, + "loss": 0.0001, + "reward": 0.6904986225068569, + "reward_std": 0.5199786387616768, + "rewards/accuracy_reward": 0.1, + "rewards/cosine_scaled_reward": -0.11158471030648798, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.15833333544433117, + "step": 30 + }, + { + "completion_length": 312.475, + "epoch": 0.007730428900760619, + "grad_norm": 1.8980957623315589, + "kl": 0.00699460506439209, + "learning_rate": 1.545253863134658e-06, + "loss": 0.0003, + "reward": 0.8459697065874934, + "reward_std": 0.49007231930154377, + "rewards/accuracy_reward": 0.15625, + "rewards/cosine_scaled_reward": -0.08736362864729016, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.10833333507180214, + "step": 35 + }, + { + "completion_length": 317.73125, + "epoch": 0.008834775886583565, + "grad_norm": 1.9533540981598545, + "kl": 0.012215614318847656, + "learning_rate": 1.7660044150110378e-06, + "loss": 0.0005, + "reward": 0.7698885165620595, + "reward_std": 0.4332339205837343, + "rewards/accuracy_reward": 0.09375, + "rewards/cosine_scaled_reward": -0.13219481345731765, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.10833333525806665, + "step": 40 + }, + { + "completion_length": 223.4125, + "epoch": 0.00993912287240651, + "grad_norm": 2.2672189417696456, + "kl": 0.019639110565185545, + "learning_rate": 1.9867549668874175e-06, + "loss": 0.0008, + "reward": 0.797611591219902, + "reward_std": 0.3280314706848003, + "rewards/accuracy_reward": 0.06875, + "rewards/cosine_scaled_reward": -0.13572174331638961, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.03958333414047956, + "step": 45 + }, + { + "completion_length": 276.6375, + "epoch": 0.011043469858229456, + "grad_norm": 2.365582947188283, + "kl": 0.03506050109863281, + "learning_rate": 2.207505518763797e-06, + "loss": 0.0014, + "reward": 0.9014913145452738, + "reward_std": 0.4327640982926823, + "rewards/accuracy_reward": 0.10625, + "rewards/cosine_scaled_reward": -0.07350869019464881, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.08750000186264514, + "step": 50 + }, + { + "completion_length": 188.775, + "epoch": 0.012147816844052401, + "grad_norm": 3.410151636466289, + "kl": 0.045893096923828126, + "learning_rate": 2.4282560706401767e-06, + "loss": 0.0018, + "reward": 0.8858188761398196, + "reward_std": 0.3367013673152542, + "rewards/accuracy_reward": 0.05625, + "rewards/cosine_scaled_reward": -0.09334779935888946, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.08541666828095913, + "step": 55 + }, + { + "completion_length": 156.75, + "epoch": 0.013252163829875346, + "grad_norm": 3.64480765148819, + "kl": 0.07016792297363281, + "learning_rate": 2.6490066225165567e-06, + "loss": 0.0028, + "reward": 0.8576473254710436, + "reward_std": 0.26497103955189233, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.0631860019522719, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.058333334140479566, + "step": 60 + }, + { + "completion_length": 132.20625, + "epoch": 0.014356510815698293, + "grad_norm": 4.132077532204708, + "kl": 0.1404022216796875, + "learning_rate": 2.8697571743929364e-06, + "loss": 0.0056, + "reward": 0.9789868280291557, + "reward_std": 0.31287062716583025, + "rewards/accuracy_reward": 0.08125, + "rewards/cosine_scaled_reward": -0.0022631677449680863, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.056250000931322576, + "step": 65 + }, + { + "completion_length": 111.775, + "epoch": 0.015460857801521238, + "grad_norm": 3.774544126186504, + "kl": 0.45338897705078124, + "learning_rate": 3.090507726269316e-06, + "loss": 0.0182, + "reward": 0.915225807391107, + "reward_std": 0.27416729260148714, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.024357524031074718, + "rewards/format_reward": 0.85625, + "rewards/reasoning_steps_reward": 0.03958333432674408, + "step": 70 + }, + { + "completion_length": 90.04375, + "epoch": 0.016565204787344183, + "grad_norm": 4.161283635522782, + "kl": 0.2106475830078125, + "learning_rate": 3.311258278145696e-06, + "loss": 0.0084, + "reward": 0.9585830196738243, + "reward_std": 0.292374527291031, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": 0.0002496805100236088, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.020833333767950536, + "step": 75 + }, + { + "completion_length": 153.38125, + "epoch": 0.01766955177316713, + "grad_norm": 3.3838750614323034, + "kl": 0.21140708923339843, + "learning_rate": 3.5320088300220757e-06, + "loss": 0.0085, + "reward": 0.8901195518672467, + "reward_std": 0.3666377069861483, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": -0.016130454646190628, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.06250000093132257, + "step": 80 + }, + { + "completion_length": 108.075, + "epoch": 0.018773898758990076, + "grad_norm": 3.2335960830389885, + "kl": 0.11846466064453125, + "learning_rate": 3.752759381898455e-06, + "loss": 0.0047, + "reward": 1.0230381244793534, + "reward_std": 0.3194487606411712, + "rewards/accuracy_reward": 0.0875, + "rewards/cosine_scaled_reward": 0.014704790979158134, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.04583333432674408, + "step": 85 + }, + { + "completion_length": 112.8875, + "epoch": 0.01987824574481302, + "grad_norm": 2.981290303972626, + "kl": 0.14231414794921876, + "learning_rate": 3.973509933774835e-06, + "loss": 0.0057, + "reward": 1.0851823196280748, + "reward_std": 0.305687127640158, + "rewards/accuracy_reward": 0.11875, + "rewards/cosine_scaled_reward": 0.03934898309526034, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.05208333432674408, + "step": 90 + }, + { + "completion_length": 105.1125, + "epoch": 0.020982592730635966, + "grad_norm": 3.41324876981118, + "kl": 0.1306793212890625, + "learning_rate": 4.1942604856512145e-06, + "loss": 0.0052, + "reward": 1.0669921234250068, + "reward_std": 0.31615011730809783, + "rewards/accuracy_reward": 0.08125, + "rewards/cosine_scaled_reward": -0.008007883114623837, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.06875000167638064, + "step": 95 + }, + { + "completion_length": 96.00625, + "epoch": 0.022086939716458913, + "grad_norm": 3.3620993390968845, + "kl": 0.1841644287109375, + "learning_rate": 4.415011037527594e-06, + "loss": 0.0074, + "reward": 1.043765520118177, + "reward_std": 0.25984670983216346, + "rewards/accuracy_reward": 0.05625, + "rewards/cosine_scaled_reward": -0.01456781585002318, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.07083333544433117, + "step": 100 + }, + { + "epoch": 0.022086939716458913, + "eval_completion_length": 93.125, + "eval_kl": 0.1815234375, + "eval_loss": 0.007249877788126469, + "eval_reward": 1.1368126523494722, + "eval_reward_std": 0.30647377895191313, + "eval_rewards/accuracy_reward": 0.08, + "eval_rewards/cosine_scaled_reward": 0.008479312220588326, + "eval_rewards/format_reward": 0.945, + "eval_rewards/reasoning_steps_reward": 0.10333333551883697, + "eval_runtime": 45.45, + "eval_samples_per_second": 2.178, + "eval_steps_per_second": 0.55, + "step": 100 + }, + { + "completion_length": 119.2375, + "epoch": 0.023191286702281856, + "grad_norm": 2.8049680767356135, + "kl": 0.1911853790283203, + "learning_rate": 4.635761589403974e-06, + "loss": 0.0076, + "reward": 1.1845005745068193, + "reward_std": 0.40934712939670137, + "rewards/accuracy_reward": 0.0875, + "rewards/cosine_scaled_reward": 0.005333909482578747, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.1854166705161333, + "step": 105 + }, + { + "completion_length": 82.05625, + "epoch": 0.024295633688104803, + "grad_norm": 3.456560596923405, + "kl": 0.2972686767578125, + "learning_rate": 4.856512141280353e-06, + "loss": 0.0119, + "reward": 1.2734145127236842, + "reward_std": 0.2678998214521016, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.009918814865523018, + "rewards/format_reward": 0.95625, + "rewards/reasoning_steps_reward": 0.28333334121853115, + "step": 110 + }, + { + "completion_length": 54.4625, + "epoch": 0.02539998067392775, + "grad_norm": 3.5364458340049385, + "kl": 0.471929931640625, + "learning_rate": 5.077262693156734e-06, + "loss": 0.0189, + "reward": 1.3385284006595612, + "reward_std": 0.18102660190961614, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": 0.003111723146867007, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.3229166755452752, + "step": 115 + }, + { + "completion_length": 54.4125, + "epoch": 0.026504327659750693, + "grad_norm": 4.342426566312151, + "kl": 0.61019287109375, + "learning_rate": 5.2980132450331135e-06, + "loss": 0.0244, + "reward": 1.3336276397109033, + "reward_std": 0.20837920447120267, + "rewards/accuracy_reward": 0.05625, + "rewards/cosine_scaled_reward": 0.021127634699223564, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.3125000087544322, + "step": 120 + }, + { + "completion_length": 64.14375, + "epoch": 0.02760867464557364, + "grad_norm": 3.210503848172606, + "kl": 0.5955078125, + "learning_rate": 5.518763796909493e-06, + "loss": 0.0238, + "reward": 1.3687469862401485, + "reward_std": 0.26018318940004975, + "rewards/accuracy_reward": 0.0625, + "rewards/cosine_scaled_reward": 0.016663649416295813, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.3520833417773247, + "step": 125 + }, + { + "completion_length": 64.01875, + "epoch": 0.028713021631396586, + "grad_norm": 3.7132153189677544, + "kl": 0.568804931640625, + "learning_rate": 5.739514348785873e-06, + "loss": 0.0228, + "reward": 1.2947323210537434, + "reward_std": 0.24608583817171165, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.007351013895822689, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.3708333432674408, + "step": 130 + }, + { + "completion_length": 48.625, + "epoch": 0.02981736861721953, + "grad_norm": 2.6304505181783546, + "kl": 0.53724365234375, + "learning_rate": 5.960264900662252e-06, + "loss": 0.0215, + "reward": 1.3914526164531709, + "reward_std": 0.12061821918068745, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.006464047048939392, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.3791666755452752, + "step": 135 + }, + { + "completion_length": 67.4, + "epoch": 0.030921715603042476, + "grad_norm": 3.1538444400935526, + "kl": 0.4556884765625, + "learning_rate": 6.181015452538632e-06, + "loss": 0.0182, + "reward": 1.4109491214156151, + "reward_std": 0.16968962437640583, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.01821754773845896, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.4291666761040688, + "step": 140 + }, + { + "completion_length": 97.26875, + "epoch": 0.03202606258886542, + "grad_norm": 2.5494271727394686, + "kl": 0.479034423828125, + "learning_rate": 6.4017660044150125e-06, + "loss": 0.0192, + "reward": 1.5318490117788315, + "reward_std": 0.27640023065378044, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.04731766675249673, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.5916666815057396, + "step": 145 + }, + { + "completion_length": 211.43125, + "epoch": 0.033130409574688366, + "grad_norm": 1.5190327305356153, + "kl": 0.474810791015625, + "learning_rate": 6.622516556291392e-06, + "loss": 0.019, + "reward": 1.6260522678494453, + "reward_std": 0.23128953371724492, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.1364477440190967, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.7875000171363353, + "step": 150 + }, + { + "completion_length": 414.16875, + "epoch": 0.034234756560511316, + "grad_norm": 1.6713240655364676, + "kl": 0.221484375, + "learning_rate": 6.843267108167772e-06, + "loss": 0.0089, + "reward": 1.6096204966306686, + "reward_std": 0.5072136571725423, + "rewards/accuracy_reward": 0.0875, + "rewards/cosine_scaled_reward": -0.17579616815783083, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.9041666768491268, + "step": 155 + }, + { + "completion_length": 224.45, + "epoch": 0.03533910354633426, + "grad_norm": 1.6799839469624172, + "kl": 0.284002685546875, + "learning_rate": 7.064017660044151e-06, + "loss": 0.0114, + "reward": 1.6045206032693387, + "reward_std": 0.41629061991989147, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.16422939775511622, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.8812500163912773, + "step": 160 + }, + { + "completion_length": 143.5375, + "epoch": 0.0364434505321572, + "grad_norm": 2.749038664091265, + "kl": 0.40633544921875, + "learning_rate": 7.28476821192053e-06, + "loss": 0.0163, + "reward": 1.6239365682005882, + "reward_std": 0.30945953201444354, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14481342723593116, + "rewards/format_reward": 0.85625, + "rewards/reasoning_steps_reward": 0.9062500089406967, + "step": 165 + }, + { + "completion_length": 79.20625, + "epoch": 0.03754779751798015, + "grad_norm": 1.9950235960081237, + "kl": 0.58839111328125, + "learning_rate": 7.50551876379691e-06, + "loss": 0.0235, + "reward": 1.7936675041913985, + "reward_std": 0.24646314584424545, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.047999159130267796, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.8729166803881526, + "step": 170 + }, + { + "completion_length": 72.99375, + "epoch": 0.038652144503803096, + "grad_norm": 3.1334250495896403, + "kl": 0.7208740234375, + "learning_rate": 7.726269315673288e-06, + "loss": 0.0288, + "reward": 1.8457185290753841, + "reward_std": 0.1848987685256361, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.058448143280111255, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9541666734963655, + "step": 175 + }, + { + "completion_length": 68.33125, + "epoch": 0.03975649148962604, + "grad_norm": 1.8399927140885233, + "kl": 0.9168701171875, + "learning_rate": 7.94701986754967e-06, + "loss": 0.0367, + "reward": 1.8285610511898995, + "reward_std": 0.1804211751697949, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06310561551945285, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.9854166701436042, + "step": 180 + }, + { + "completion_length": 62.3625, + "epoch": 0.04086083847544899, + "grad_norm": 1.7243816581634572, + "kl": 0.999951171875, + "learning_rate": 8.16777041942605e-06, + "loss": 0.04, + "reward": 1.7962127968668937, + "reward_std": 0.2584745195626965, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.03503720639273524, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.9687500029802323, + "step": 185 + }, + { + "completion_length": 43.825, + "epoch": 0.04196518546127193, + "grad_norm": 1.9090704424835, + "kl": 1.1904296875, + "learning_rate": 8.388520971302429e-06, + "loss": 0.0476, + "reward": 1.9285470694303513, + "reward_std": 0.07938597783086151, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.019369596161413937, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 0.9729166701436043, + "step": 190 + }, + { + "completion_length": 43.8, + "epoch": 0.043069532447094876, + "grad_norm": 0.8384814834445764, + "kl": 1.274560546875, + "learning_rate": 8.609271523178809e-06, + "loss": 0.051, + "reward": 1.9545519351959229, + "reward_std": 0.07418545563377847, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.018364711850881576, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.9729166686534881, + "step": 195 + }, + { + "completion_length": 48.35625, + "epoch": 0.044173879432917826, + "grad_norm": 2.236940540903145, + "kl": 1.300146484375, + "learning_rate": 8.830022075055188e-06, + "loss": 0.052, + "reward": 1.9158101230859756, + "reward_std": 0.07812084475554001, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03210651960689574, + "rewards/format_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9479166716337204, + "step": 200 + }, + { + "epoch": 0.044173879432917826, + "eval_completion_length": 71.615, + "eval_kl": 1.23375, + "eval_loss": 0.04937754571437836, + "eval_reward": 1.8474071335792541, + "eval_reward_std": 0.18131415246985852, + "eval_rewards/accuracy_reward": 0.01, + "eval_rewards/cosine_scaled_reward": -0.040926196351647376, + "eval_rewards/format_reward": 0.97, + "eval_rewards/reasoning_steps_reward": 0.9083333444595337, + "eval_runtime": 36.3868, + "eval_samples_per_second": 2.721, + "eval_steps_per_second": 0.687, + "step": 200 + }, + { + "completion_length": 74.2, + "epoch": 0.04527822641874077, + "grad_norm": 1.1539830590686253, + "kl": 1.27861328125, + "learning_rate": 9.050772626931568e-06, + "loss": 0.0511, + "reward": 1.868411859869957, + "reward_std": 0.13951143043577757, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.04408814936177805, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.9250000035390258, + "step": 205 + }, + { + "completion_length": 38.5375, + "epoch": 0.04638257340456371, + "grad_norm": 2.3094120876632154, + "kl": 2.0333984375, + "learning_rate": 9.271523178807948e-06, + "loss": 0.0813, + "reward": 1.9079508751630783, + "reward_std": 0.14040698215430894, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.014965799322817474, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9666666693985462, + "step": 210 + }, + { + "completion_length": 36.73125, + "epoch": 0.04748692039038666, + "grad_norm": 1.9626719196766829, + "kl": 1.74697265625, + "learning_rate": 9.492273730684327e-06, + "loss": 0.0699, + "reward": 1.824728435277939, + "reward_std": 0.20264510232354951, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.019021555897779763, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9125000078231096, + "step": 215 + }, + { + "completion_length": 46.425, + "epoch": 0.048591267376209606, + "grad_norm": 2.344462570648517, + "kl": 1.8123046875, + "learning_rate": 9.713024282560707e-06, + "loss": 0.0725, + "reward": 1.7400818899273873, + "reward_std": 0.3166843029595839, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.02450146197807044, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9208333410322667, + "step": 220 + }, + { + "completion_length": 36.45625, + "epoch": 0.04969561436203255, + "grad_norm": 1.6108117585836246, + "kl": 1.7595703125, + "learning_rate": 9.933774834437086e-06, + "loss": 0.0704, + "reward": 1.7901365123689175, + "reward_std": 0.22408495279232737, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.016113495687022805, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.8312500113621354, + "step": 225 + }, + { + "completion_length": 36.65, + "epoch": 0.0507999613478555, + "grad_norm": 3.236305451533758, + "kl": 1.82919921875, + "learning_rate": 1.0154525386313468e-05, + "loss": 0.0732, + "reward": 1.8323896206915378, + "reward_std": 0.21802660732319054, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.017610373883508147, + "rewards/format_reward": 0.91875, + "rewards/reasoning_steps_reward": 0.9312500074505806, + "step": 230 + }, + { + "completion_length": 45.64375, + "epoch": 0.05190430833367844, + "grad_norm": 1.6334788003351493, + "kl": 1.7169921875, + "learning_rate": 1.0375275938189846e-05, + "loss": 0.0687, + "reward": 1.8946375951170922, + "reward_std": 0.11254046880385431, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.015779063804075122, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9854166671633721, + "step": 235 + }, + { + "completion_length": 63.425, + "epoch": 0.053008655319501385, + "grad_norm": 1.7113105745171684, + "kl": 1.695556640625, + "learning_rate": 1.0596026490066227e-05, + "loss": 0.0678, + "reward": 1.9150223009288312, + "reward_std": 0.10123766471187992, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01831102555152029, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.9895833343267441, + "step": 240 + }, + { + "completion_length": 93.68125, + "epoch": 0.054113002305324336, + "grad_norm": 0.4404451775440966, + "kl": 1.57080078125, + "learning_rate": 1.0816777041942605e-05, + "loss": 0.0628, + "reward": 1.8356807470321654, + "reward_std": 0.2136607704902417, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03931924531934783, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.9687500029802323, + "step": 245 + }, + { + "completion_length": 56.9125, + "epoch": 0.05521734929114728, + "grad_norm": 4.754826808249897, + "kl": 2.864794921875, + "learning_rate": 1.1037527593818986e-05, + "loss": 0.1146, + "reward": 1.9538603499531746, + "reward_std": 0.028102499651504333, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01488963805604726, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.993750000745058, + "step": 250 + }, + { + "completion_length": 85.1875, + "epoch": 0.05632169627697022, + "grad_norm": 1.9425038218319501, + "kl": 2.82373046875, + "learning_rate": 1.1258278145695364e-05, + "loss": 0.1127, + "reward": 1.6320011641830205, + "reward_std": 0.3420275276679604, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.040915494039654735, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.8229166699573398, + "step": 255 + }, + { + "completion_length": 51.23125, + "epoch": 0.05742604326279317, + "grad_norm": 1.6999510907770932, + "kl": 1.645703125, + "learning_rate": 1.1479028697571745e-05, + "loss": 0.0658, + "reward": 1.874811889231205, + "reward_std": 0.12289988842621824, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02310477099381387, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 0.9291666693985462, + "step": 260 + }, + { + "completion_length": 150.65, + "epoch": 0.058530390248616115, + "grad_norm": 1.442258073406476, + "kl": 1.566796875, + "learning_rate": 1.1699779249448125e-05, + "loss": 0.0627, + "reward": 1.7884906940162182, + "reward_std": 0.22153634292044444, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.019842630508355795, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9333333425223828, + "step": 265 + }, + { + "completion_length": 115.4, + "epoch": 0.05963473723443906, + "grad_norm": 0.6973653373093713, + "kl": 481691.4131835938, + "learning_rate": 1.1920529801324505e-05, + "loss": 19216.3781, + "reward": 1.8304102931171655, + "reward_std": 0.1441515963528218, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.021673032961552964, + "rewards/format_reward": 0.9, + "rewards/reasoning_steps_reward": 0.9520833369344472, + "step": 270 + }, + { + "completion_length": 78.175, + "epoch": 0.06073908422026201, + "grad_norm": 0.7996143804584955, + "kl": 1.68466796875, + "learning_rate": 1.2141280353200884e-05, + "loss": 0.0674, + "reward": 1.9026772230863571, + "reward_std": 0.1005777153201052, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.022322767600417136, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9937500014901162, + "step": 275 + }, + { + "completion_length": 94.39375, + "epoch": 0.06184343120608495, + "grad_norm": 0.46301803821702214, + "kl": 1.720703125, + "learning_rate": 1.2362030905077264e-05, + "loss": 0.0688, + "reward": 1.909028697013855, + "reward_std": 0.1098573448281968, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02013796616811305, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9916666686534882, + "step": 280 + }, + { + "completion_length": 35.9875, + "epoch": 0.0629477781919079, + "grad_norm": 25.392403138510254, + "kl": 6.09052734375, + "learning_rate": 1.2582781456953644e-05, + "loss": 0.2435, + "reward": 1.9458073616027831, + "reward_std": 0.05808448990646866, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01877596784615889, + "rewards/format_reward": 0.9875, + "rewards/reasoning_steps_reward": 0.9770833350718021, + "step": 285 + }, + { + "completion_length": 44.14375, + "epoch": 0.06405212517773085, + "grad_norm": 5.742884239841623, + "kl": 1.8162109375, + "learning_rate": 1.2803532008830025e-05, + "loss": 0.0727, + "reward": 1.942259357869625, + "reward_std": 0.051475054543698204, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02232399402419105, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.9895833343267441, + "step": 290 + }, + { + "completion_length": 70.83125, + "epoch": 0.06515647216355379, + "grad_norm": 1.1878621439792336, + "kl": 1.75224609375, + "learning_rate": 1.3024282560706403e-05, + "loss": 0.0701, + "reward": 1.8855205789208411, + "reward_std": 0.14350247889015008, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0311461063567549, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9854166686534882, + "step": 295 + }, + { + "completion_length": 59.16875, + "epoch": 0.06626081914937673, + "grad_norm": 1.0737008694745849, + "kl": 2.21201171875, + "learning_rate": 1.3245033112582784e-05, + "loss": 0.0886, + "reward": 1.8838034845888614, + "reward_std": 0.10267033483396518, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.032863204437308016, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.991666667163372, + "step": 300 + }, + { + "epoch": 0.06626081914937673, + "eval_completion_length": 37.235, + "eval_kl": 1.8078125, + "eval_loss": 0.07229267060756683, + "eval_reward": 1.9764094233512879, + "eval_reward_std": 0.015016918147157412, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.013590602073818445, + "eval_rewards/format_reward": 0.99, + "eval_rewards/reasoning_steps_reward": 1.0, + "eval_runtime": 19.7868, + "eval_samples_per_second": 5.003, + "eval_steps_per_second": 1.263, + "step": 300 + }, + { + "completion_length": 112.075, + "epoch": 0.06736516613519968, + "grad_norm": 38.86453958076354, + "kl": 116.337109375, + "learning_rate": 1.3465783664459162e-05, + "loss": 4.6466, + "reward": 1.7255545184016228, + "reward_std": 0.3258556753062294, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05569549936335534, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.9750000014901161, + "step": 305 + }, + { + "completion_length": 91.34375, + "epoch": 0.06846951312102263, + "grad_norm": 0.7032767211156884, + "kl": 2.12666015625, + "learning_rate": 1.3686534216335543e-05, + "loss": 0.0851, + "reward": 1.8290522865951062, + "reward_std": 0.19914081794595404, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03136437909561209, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9854166671633721, + "step": 310 + }, + { + "completion_length": 35.0125, + "epoch": 0.06957386010684558, + "grad_norm": 0.1219839633736872, + "kl": 3.073046875, + "learning_rate": 1.3907284768211921e-05, + "loss": 0.1228, + "reward": 1.9570835500955581, + "reward_std": 0.04304394810787926, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.017916415364015847, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 1.0, + "step": 315 + }, + { + "completion_length": 32.275, + "epoch": 0.07067820709266852, + "grad_norm": 1.104264851050662, + "kl": 3.7556640625, + "learning_rate": 1.4128035320088303e-05, + "loss": 0.1502, + "reward": 1.968562737107277, + "reward_std": 0.026956520293606446, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012687235441990196, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 1.0, + "step": 320 + }, + { + "completion_length": 132.08125, + "epoch": 0.07178255407849146, + "grad_norm": 68555.41271806235, + "kl": 1126.325, + "learning_rate": 1.434878587196468e-05, + "loss": 44.9841, + "reward": 1.8718097068369388, + "reward_std": 0.05580442189639143, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.015690282598370686, + "rewards/format_reward": 0.8875, + "rewards/reasoning_steps_reward": 1.0, + "step": 325 + }, + { + "completion_length": 213.45625, + "epoch": 0.0728869010643144, + "grad_norm": 3.503371973098148, + "kl": 44.011328125, + "learning_rate": 1.456953642384106e-05, + "loss": 1.757, + "reward": 1.7566166341304779, + "reward_std": 0.21126712449513435, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03296670269337483, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.995833334326744, + "step": 330 + }, + { + "completion_length": 68.475, + "epoch": 0.07399124805013735, + "grad_norm": 1.079221435802786, + "kl": 9.17109375, + "learning_rate": 1.479028697571744e-05, + "loss": 0.368, + "reward": 1.8921391651034356, + "reward_std": 0.1115680442419034, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.022444166260538623, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9895833350718022, + "step": 335 + }, + { + "completion_length": 35.54375, + "epoch": 0.0750955950359603, + "grad_norm": 0.3296732185602028, + "kl": 1.96572265625, + "learning_rate": 1.501103752759382e-05, + "loss": 0.0786, + "reward": 1.9763855755329132, + "reward_std": 0.015541732574638445, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.013197748665697872, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.995833334326744, + "step": 340 + }, + { + "completion_length": 38.39375, + "epoch": 0.07619994202178325, + "grad_norm": 0.804238046719128, + "kl": 2.0283203125, + "learning_rate": 1.52317880794702e-05, + "loss": 0.0812, + "reward": 1.9684513732790947, + "reward_std": 0.026967421242807176, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012798593926709146, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.9875000007450581, + "step": 345 + }, + { + "completion_length": 48.86875, + "epoch": 0.07730428900760619, + "grad_norm": 1.9342787529877643, + "kl": 2.15478515625, + "learning_rate": 1.5452538631346577e-05, + "loss": 0.0862, + "reward": 1.915003441274166, + "reward_std": 0.10254102655635507, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.024579859105870128, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.995833334326744, + "step": 350 + }, + { + "completion_length": 238.325, + "epoch": 0.07840863599342913, + "grad_norm": 7.756150900770026, + "kl": 4.5904296875, + "learning_rate": 1.567328918322296e-05, + "loss": 0.1833, + "reward": 1.5095329130068422, + "reward_std": 0.4169476901159214, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06338373832768411, + "rewards/format_reward": 0.69375, + "rewards/reasoning_steps_reward": 0.8791666779667139, + "step": 355 + }, + { + "completion_length": 147.71875, + "epoch": 0.07951298297925208, + "grad_norm": 0.1004225405142809, + "kl": 2.135546875, + "learning_rate": 1.589403973509934e-05, + "loss": 0.0854, + "reward": 1.7320448141545057, + "reward_std": 0.2472891275290749, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04920516336569562, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.9375000055879354, + "step": 360 + }, + { + "completion_length": 73.8125, + "epoch": 0.08061732996507502, + "grad_norm": 3.8314243673640505, + "kl": 2.2732421875, + "learning_rate": 1.6114790286975718e-05, + "loss": 0.0909, + "reward": 1.858565354347229, + "reward_std": 0.1464589938717836, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03518462204374373, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9687500059604645, + "step": 365 + }, + { + "completion_length": 46.4625, + "epoch": 0.08172167695089798, + "grad_norm": 1.148009265152921, + "kl": 2.7, + "learning_rate": 1.63355408388521e-05, + "loss": 0.108, + "reward": 1.935795644670725, + "reward_std": 0.07739904040663532, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.020454342185985297, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 0.9875000014901161, + "step": 370 + }, + { + "completion_length": 28.63125, + "epoch": 0.08282602393672092, + "grad_norm": 0.018613804390782206, + "kl": 2.887109375, + "learning_rate": 1.6556291390728477e-05, + "loss": 0.1155, + "reward": 1.9873796686530114, + "reward_std": 0.005896346227382309, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.008453596872277558, + "rewards/format_reward": 1.0, + "rewards/reasoning_steps_reward": 0.995833333581686, + "step": 375 + }, + { + "completion_length": 28.8375, + "epoch": 0.08393037092254386, + "grad_norm": 1.6371785454339474, + "kl": 2.7423828125, + "learning_rate": 1.6777041942604858e-05, + "loss": 0.1097, + "reward": 1.9645642668008805, + "reward_std": 0.020640570133218718, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.008352343272417784, + "rewards/format_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9729166701436043, + "step": 380 + }, + { + "completion_length": 27.5, + "epoch": 0.08503471790836681, + "grad_norm": 2.9035413740067773, + "kl": 2.6583984375, + "learning_rate": 1.699779249448124e-05, + "loss": 0.1063, + "reward": 1.9090838402509689, + "reward_std": 0.04726645071750681, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.007582775689661503, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.9229166693985462, + "step": 385 + }, + { + "completion_length": 25.5, + "epoch": 0.08613906489418975, + "grad_norm": 10.067463916774571, + "kl": 2.821484375, + "learning_rate": 1.7218543046357617e-05, + "loss": 0.1128, + "reward": 1.4473052226414438, + "reward_std": 0.2710021964079715, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.006861429521813988, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.6416666749864817, + "step": 390 + }, + { + "completion_length": 25.33125, + "epoch": 0.0872434118800127, + "grad_norm": 1.1120951826184378, + "kl": 2.9173828125, + "learning_rate": 1.7439293598234e-05, + "loss": 0.1167, + "reward": 1.6058874435722827, + "reward_std": 0.04730093894213496, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.006612581602530554, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.6312500178813935, + "step": 395 + }, + { + "completion_length": 260.55625, + "epoch": 0.08834775886583565, + "grad_norm": 18.757375828870764, + "kl": 2208.165234375, + "learning_rate": 1.7660044150110377e-05, + "loss": 88.441, + "reward": 1.2055384639650584, + "reward_std": 0.23063606465893827, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09237821163842455, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.78541667945683, + "step": 400 + }, + { + "epoch": 0.08834775886583565, + "eval_completion_length": 217.92, + "eval_kl": 87.934375, + "eval_loss": 3.5442821979522705, + "eval_reward": 1.2175334417819976, + "eval_reward_std": 0.365319043637719, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.07746655503287911, + "eval_rewards/format_reward": 0.455, + "eval_rewards/reasoning_steps_reward": 0.8400000214576722, + "eval_runtime": 99.5826, + "eval_samples_per_second": 0.994, + "eval_steps_per_second": 0.251, + "step": 400 + }, + { + "completion_length": 625.7875, + "epoch": 0.0894521058516586, + "grad_norm": 19.106502016916846, + "kl": 12.63349609375, + "learning_rate": 1.7880794701986758e-05, + "loss": 0.5055, + "reward": 0.651996704749763, + "reward_std": 0.4102951940265484, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21883663166663608, + "rewards/format_reward": 0.08125, + "rewards/reasoning_steps_reward": 0.7895833376795054, + "step": 405 + }, + { + "completion_length": 161.35, + "epoch": 0.09055645283748154, + "grad_norm": 0.8426631626860688, + "kl": 2.6265625, + "learning_rate": 1.8101545253863136e-05, + "loss": 0.1051, + "reward": 1.346696252003312, + "reward_std": 0.32623073370778, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08663710475666449, + "rewards/format_reward": 0.68125, + "rewards/reasoning_steps_reward": 0.7520833492279053, + "step": 410 + }, + { + "completion_length": 27.78125, + "epoch": 0.09166079982330448, + "grad_norm": 2.026191681702149, + "kl": 2.80498046875, + "learning_rate": 1.8322295805739517e-05, + "loss": 0.1122, + "reward": 1.942487709224224, + "reward_std": 0.02961592679930618, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0075122533366084095, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.9562500052154064, + "step": 415 + }, + { + "completion_length": 23.975, + "epoch": 0.09276514680912742, + "grad_norm": 0.31172284735643946, + "kl": 4.0859375, + "learning_rate": 1.8543046357615895e-05, + "loss": 0.1634, + "reward": 1.5779291547834873, + "reward_std": 0.22977396088535898, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.005404202520730905, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.8833333443850279, + "step": 420 + }, + { + "completion_length": 26.6, + "epoch": 0.09386949379495037, + "grad_norm": 10734.08122919461, + "kl": 104.6666015625, + "learning_rate": 1.8763796909492276e-05, + "loss": 4.1821, + "reward": 1.5809004239737987, + "reward_std": 0.1068729208822333, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0065996065095532686, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 1.0, + "step": 425 + }, + { + "completion_length": 83.06875, + "epoch": 0.09497384078077332, + "grad_norm": 123.88060508402012, + "kl": 6.474365234375, + "learning_rate": 1.8984547461368654e-05, + "loss": 0.2591, + "reward": 1.6652553364634515, + "reward_std": 0.282259418636113, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07224468101048842, + "rewards/format_reward": 0.7375, + "rewards/reasoning_steps_reward": 1.0, + "step": 430 + }, + { + "completion_length": 106.91875, + "epoch": 0.09607818776659627, + "grad_norm": 0.940290808549546, + "kl": 842.04267578125, + "learning_rate": 1.9205298013245036e-05, + "loss": 33.6922, + "reward": 1.4902156308293342, + "reward_std": 0.16858022491724114, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12228436931036413, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 1.0, + "step": 435 + }, + { + "completion_length": 31.5625, + "epoch": 0.09718253475241921, + "grad_norm": 15.254329720774644, + "kl": 28.6966796875, + "learning_rate": 1.9426048565121414e-05, + "loss": 1.1475, + "reward": 1.6264034517109394, + "reward_std": 0.31542411341342813, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.013179889318416826, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.8833333440124989, + "step": 440 + }, + { + "completion_length": 36.19375, + "epoch": 0.09828688173824215, + "grad_norm": 1.7786785678688244, + "kl": 3.790625, + "learning_rate": 1.9646799116997795e-05, + "loss": 0.1516, + "reward": 1.6109677419066428, + "reward_std": 0.3635523966368055, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01611559497541748, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.864583345502615, + "step": 445 + }, + { + "completion_length": 28.9125, + "epoch": 0.0993912287240651, + "grad_norm": 13.049933334291598, + "kl": 4.665234375, + "learning_rate": 1.9867549668874173e-05, + "loss": 0.1867, + "reward": 1.9556236922740937, + "reward_std": 0.05575240566577122, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0068762671202421185, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 1.0, + "step": 450 + }, + { + "completion_length": 48.9, + "epoch": 0.10049557570988804, + "grad_norm": 12.750419334393301, + "kl": 3.6001953125, + "learning_rate": 1.9999988107104428e-05, + "loss": 0.144, + "reward": 1.8631265118718148, + "reward_std": 0.1681165755485381, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.016040142456768082, + "rewards/format_reward": 0.8875, + "rewards/reasoning_steps_reward": 0.99166666790843, + "step": 455 + }, + { + "completion_length": 99.2625, + "epoch": 0.101599922695711, + "grad_norm": 3.6354989719273636, + "kl": 149.4564453125, + "learning_rate": 1.9999854312354064e-05, + "loss": 5.9617, + "reward": 1.8252798458561301, + "reward_std": 0.17444125837337196, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01847012363432441, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9687500018626451, + "step": 460 + }, + { + "completion_length": 135.56875, + "epoch": 0.10270426968153394, + "grad_norm": 5.2304357492626625, + "kl": 4.8642578125, + "learning_rate": 1.999957185872951e-05, + "loss": 0.1946, + "reward": 1.7171886287629605, + "reward_std": 0.3744886555035009, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02239466998144053, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.9395833402872086, + "step": 465 + }, + { + "completion_length": 78.5125, + "epoch": 0.10380861666735688, + "grad_norm": 91.64606863523989, + "kl": 70.5404296875, + "learning_rate": 1.999914075042975e-05, + "loss": 2.8153, + "reward": 1.6547522293403745, + "reward_std": 0.2920411152263114, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.055664407834410665, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9291666738688946, + "step": 470 + }, + { + "completion_length": 64.2375, + "epoch": 0.10491296365317983, + "grad_norm": 48.311980555436286, + "kl": 4.6033203125, + "learning_rate": 1.9998560993863682e-05, + "loss": 0.1843, + "reward": 1.6796498108655213, + "reward_std": 0.2776311224031815, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04535014413995668, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9437500044703484, + "step": 475 + }, + { + "completion_length": 36.9, + "epoch": 0.10601731063900277, + "grad_norm": 5.921542778461717, + "kl": 4.1017578125, + "learning_rate": 1.999783259765003e-05, + "loss": 0.164, + "reward": 1.7877669408917427, + "reward_std": 0.2045195282747045, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.030983004180598073, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.9375000033527613, + "step": 480 + }, + { + "completion_length": 152.48125, + "epoch": 0.10712165762482571, + "grad_norm": 20.142481995600914, + "kl": 5.8130859375, + "learning_rate": 1.9996955572617202e-05, + "loss": 0.2326, + "reward": 1.4146922817453742, + "reward_std": 0.396951551019356, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08530769811477512, + "rewards/format_reward": 0.6375, + "rewards/reasoning_steps_reward": 0.8625000085681677, + "step": 485 + }, + { + "completion_length": 239.51875, + "epoch": 0.10822600461064867, + "grad_norm": 139.2698059408305, + "kl": 15.3798828125, + "learning_rate": 1.999592993180315e-05, + "loss": 0.6159, + "reward": 1.1587502604350448, + "reward_std": 0.6047080957883736, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12041640711249783, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.760416679084301, + "step": 490 + }, + { + "completion_length": 110.30625, + "epoch": 0.10933035159647161, + "grad_norm": 3.7802819605372977, + "kl": 2.846484375, + "learning_rate": 1.9994755690455154e-05, + "loss": 0.1139, + "reward": 1.3858223337680102, + "reward_std": 0.4451230512106122, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08501100512221456, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.758333345130086, + "step": 495 + }, + { + "completion_length": 419.45, + "epoch": 0.11043469858229456, + "grad_norm": 1.4189717430862432, + "kl": 1.2078369140625, + "learning_rate": 1.9993432866029604e-05, + "loss": 0.0483, + "reward": 1.3895617920905352, + "reward_std": 0.3321596068039071, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22502154029498342, + "rewards/format_reward": 0.63125, + "rewards/reasoning_steps_reward": 0.9833333343267441, + "step": 500 + }, + { + "epoch": 0.11043469858229456, + "eval_completion_length": 66.62, + "eval_kl": 2.2115625, + "eval_loss": 0.08852547407150269, + "eval_reward": 1.8721801257133484, + "eval_reward_std": 0.14555715662660076, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.041153202150017026, + "eval_rewards/format_reward": 0.915, + "eval_rewards/reasoning_steps_reward": 0.9983333337306977, + "eval_runtime": 32.0208, + "eval_samples_per_second": 3.092, + "eval_steps_per_second": 0.781, + "step": 500 + }, + { + "completion_length": 33.025, + "epoch": 0.1115390455681175, + "grad_norm": 1.314290920645204, + "kl": 2.6916015625, + "learning_rate": 1.9991961478191753e-05, + "loss": 0.1077, + "reward": 1.9229176357388496, + "reward_std": 0.09401306777253922, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012499059329275041, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 0.9729166716337204, + "step": 505 + }, + { + "completion_length": 28.95625, + "epoch": 0.11264339255394044, + "grad_norm": 13.466340801725543, + "kl": 4.073046875, + "learning_rate": 1.99903415488154e-05, + "loss": 0.163, + "reward": 1.955730925500393, + "reward_std": 0.05362616253169108, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.006769094028277323, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 1.0, + "step": 510 + }, + { + "completion_length": 32.275, + "epoch": 0.11374773953976339, + "grad_norm": 4.387812745130587, + "kl": 3.749609375, + "learning_rate": 1.998857310198259e-05, + "loss": 0.15, + "reward": 1.9408951826393603, + "reward_std": 0.07298454180927365, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.00910481644968968, + "rewards/format_reward": 0.95625, + "rewards/reasoning_steps_reward": 0.99375, + "step": 515 + }, + { + "completion_length": 208.75625, + "epoch": 0.11485208652558634, + "grad_norm": 15.261102221631818, + "kl": 4.52294921875, + "learning_rate": 1.998665616398323e-05, + "loss": 0.1808, + "reward": 0.8959554025903345, + "reward_std": 0.28214400556153124, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08737794174230658, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.4645833414047956, + "step": 520 + }, + { + "completion_length": 946.95625, + "epoch": 0.11595643351140929, + "grad_norm": 4.064474873038914, + "kl": 7.4697265625, + "learning_rate": 1.9984590763314722e-05, + "loss": 0.299, + "reward": 0.4448224641382694, + "reward_std": 0.1930072069000744, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1822608746260812, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.6145833436399698, + "step": 525 + }, + { + "completion_length": 867.95, + "epoch": 0.11706078049723223, + "grad_norm": 0.8715946488428576, + "kl": 1.211572265625, + "learning_rate": 1.998237693068153e-05, + "loss": 0.0485, + "reward": 1.1276234179735183, + "reward_std": 0.33161048383626623, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1411265860995627, + "rewards/format_reward": 0.26875, + "rewards/reasoning_steps_reward": 1.0, + "step": 530 + }, + { + "completion_length": 1018.55625, + "epoch": 0.11816512748305517, + "grad_norm": 0.5739914958239132, + "kl": 1.079345703125, + "learning_rate": 1.9980014698994722e-05, + "loss": 0.0432, + "reward": 1.2353254936635494, + "reward_std": 0.24865906643667585, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.45217450708150864, + "rewards/format_reward": 0.6875, + "rewards/reasoning_steps_reward": 1.0, + "step": 535 + }, + { + "completion_length": 853.175, + "epoch": 0.11926947446887812, + "grad_norm": 1.0404924783317373, + "kl": 1.39716796875, + "learning_rate": 1.997750410337147e-05, + "loss": 0.0558, + "reward": 1.4958539374172688, + "reward_std": 0.1754556493193377, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.404146053083241, + "rewards/format_reward": 0.9, + "rewards/reasoning_steps_reward": 1.0, + "step": 540 + }, + { + "completion_length": 54.7, + "epoch": 0.12037382145470106, + "grad_norm": 1.3971817042757162, + "kl": 4.2869140625, + "learning_rate": 1.997484518113456e-05, + "loss": 0.1714, + "reward": 1.8733646899461747, + "reward_std": 0.15371908817323857, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.024551973055349664, + "rewards/format_reward": 0.91875, + "rewards/reasoning_steps_reward": 0.9791666693985462, + "step": 545 + }, + { + "completion_length": 32.7, + "epoch": 0.12147816844052402, + "grad_norm": 3.3861272960337225, + "kl": 3.658203125, + "learning_rate": 1.9972037971811802e-05, + "loss": 0.1464, + "reward": 1.7629884868860244, + "reward_std": 0.24611905133260734, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.009928178068366832, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.8979166731238365, + "step": 550 + }, + { + "completion_length": 32.2375, + "epoch": 0.12258251542634696, + "grad_norm": 0.9537515740920277, + "kl": 3.62529296875, + "learning_rate": 1.9969082517135463e-05, + "loss": 0.145, + "reward": 1.707100809639087, + "reward_std": 0.36074760046503795, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.009565857070265337, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.8791666701436043, + "step": 555 + }, + { + "completion_length": 29.175, + "epoch": 0.1236868624121699, + "grad_norm": 1.8079021090047303, + "kl": 3.3318359375, + "learning_rate": 1.9965978861041637e-05, + "loss": 0.1333, + "reward": 1.8566195629537106, + "reward_std": 0.16747138476275722, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.00796372244367376, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9333333380520343, + "step": 560 + }, + { + "completion_length": 28.85, + "epoch": 0.12479120939799285, + "grad_norm": 0.30956855309501174, + "kl": 3.42373046875, + "learning_rate": 1.99627270496696e-05, + "loss": 0.137, + "reward": 1.8421029239892959, + "reward_std": 0.13506208243761647, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.007897024205885828, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.8687500126659871, + "step": 565 + }, + { + "completion_length": 26.525, + "epoch": 0.1258955563838158, + "grad_norm": 0.47918362299607187, + "kl": 4.195703125, + "learning_rate": 1.995932713136112e-05, + "loss": 0.168, + "reward": 1.5204951745280142, + "reward_std": 0.31988839789064516, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.006588119767866374, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.7770833345130086, + "step": 570 + }, + { + "completion_length": 29.475, + "epoch": 0.12699990336963873, + "grad_norm": 1.7107660050085856, + "kl": 3.25537109375, + "learning_rate": 1.9955779156659735e-05, + "loss": 0.1302, + "reward": 1.9459566242992878, + "reward_std": 0.06541404174095078, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.008209979979437777, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.9791666671633721, + "step": 575 + }, + { + "completion_length": 28.4, + "epoch": 0.1281042503554617, + "grad_norm": 3.0620288960322344, + "kl": 3.51279296875, + "learning_rate": 1.9952083178310002e-05, + "loss": 0.1406, + "reward": 1.823703521117568, + "reward_std": 0.19747704482369954, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.007546431059017778, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9562500011175871, + "step": 580 + }, + { + "completion_length": 24.125, + "epoch": 0.12920859734128462, + "grad_norm": 1.5920134602595593, + "kl": 4.5791015625, + "learning_rate": 1.994823925125672e-05, + "loss": 0.1832, + "reward": 1.687876349347016, + "reward_std": 0.304932097128949, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.005873611301694837, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8812500063329936, + "step": 585 + }, + { + "completion_length": 38.7125, + "epoch": 0.13031294432710758, + "grad_norm": 1.7803721516558564, + "kl": 5.0361328125, + "learning_rate": 1.994424743264412e-05, + "loss": 0.2014, + "reward": 1.4074853049299691, + "reward_std": 0.4336934615795144, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.030014701202208015, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.8500000024214387, + "step": 590 + }, + { + "completion_length": 90.8625, + "epoch": 0.13141729131293053, + "grad_norm": 1.3605634584420858, + "kl": 4.99658203125, + "learning_rate": 1.9940107781814976e-05, + "loss": 0.1999, + "reward": 1.2846552881412208, + "reward_std": 0.5878636933018242, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08409471668419428, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 0.7562500022351741, + "step": 595 + }, + { + "completion_length": 430.75, + "epoch": 0.13252163829875346, + "grad_norm": 0.42608770036732224, + "kl": 4.250341796875, + "learning_rate": 1.993582036030978e-05, + "loss": 0.17, + "reward": 1.0411737323971466, + "reward_std": 0.5796436283727416, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08382627054525074, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.5812500020489096, + "step": 600 + }, + { + "epoch": 0.13252163829875346, + "eval_completion_length": 531.12, + "eval_kl": 5.138125, + "eval_loss": 0.20564807951450348, + "eval_reward": 1.5188868433237075, + "eval_reward_std": 0.39198793584240776, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.017779812179505826, + "eval_rewards/format_reward": 0.785, + "eval_rewards/reasoning_steps_reward": 0.75166668176651, + "eval_runtime": 102.3024, + "eval_samples_per_second": 0.968, + "eval_steps_per_second": 0.244, + "step": 600 + }, + { + "completion_length": 936.55, + "epoch": 0.13362598528457642, + "grad_norm": 0.5683802140587957, + "kl": 2.01455078125, + "learning_rate": 1.993138523186578e-05, + "loss": 0.0805, + "reward": 0.327013082918711, + "reward_std": 0.42701971883070655, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16673692046315408, + "rewards/format_reward": 0.1375, + "rewards/reasoning_steps_reward": 0.35625000596046447, + "step": 605 + }, + { + "completion_length": 1024.0, + "epoch": 0.13473033227039935, + "grad_norm": 0.4839523012096288, + "kl": 0.7296875, + "learning_rate": 1.9926802462416054e-05, + "loss": 0.0292, + "reward": 0.41157908397726717, + "reward_std": 0.28430348377587505, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.23425425429013558, + "rewards/format_reward": 0.00625, + "rewards/reasoning_steps_reward": 0.6395833477377891, + "step": 610 + }, + { + "completion_length": 879.0, + "epoch": 0.1358346792562223, + "grad_norm": 0.8739388757124811, + "kl": 3.1473388671875, + "learning_rate": 1.9922072120088537e-05, + "loss": 0.1259, + "reward": 0.8592522375285625, + "reward_std": 0.1283191536087543, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0803310930976295, + "rewards/format_reward": 0.00625, + "rewards/reasoning_steps_reward": 0.9333333414047956, + "step": 615 + }, + { + "completion_length": 956.5125, + "epoch": 0.13693902624204526, + "grad_norm": 0.3409618950357477, + "kl": 2.539794921875, + "learning_rate": 1.991719427520499e-05, + "loss": 0.1016, + "reward": 0.6374624267220497, + "reward_std": 0.23787620406874338, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.24378757532394957, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.8687500182539225, + "step": 620 + }, + { + "completion_length": 951.675, + "epoch": 0.1380433732278682, + "grad_norm": 0.5311158252207424, + "kl": 1.86796875, + "learning_rate": 1.9912169000279952e-05, + "loss": 0.0747, + "reward": -0.04920477559790015, + "reward_std": 0.21609443256020314, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.35337144320365044, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3041666731238365, + "step": 625 + }, + { + "completion_length": 1019.275, + "epoch": 0.13914772021369115, + "grad_norm": 0.4779187904211677, + "kl": 1.165673828125, + "learning_rate": 1.9906996370019692e-05, + "loss": 0.0466, + "reward": 0.08100474406965077, + "reward_std": 0.22535915609259974, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1669119239784777, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.24791667144745588, + "step": 630 + }, + { + "completion_length": 742.05, + "epoch": 0.14025206719951408, + "grad_norm": 2.860219052376377, + "kl": 3.076318359375, + "learning_rate": 1.990167646132107e-05, + "loss": 0.123, + "reward": 0.5310021251440048, + "reward_std": 0.3654556166498878, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.20649788190494292, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.7062500109896064, + "step": 635 + }, + { + "completion_length": 154.4625, + "epoch": 0.14135641418533704, + "grad_norm": 0.20831523609247937, + "kl": 3.63935546875, + "learning_rate": 1.9896209353270394e-05, + "loss": 0.1455, + "reward": 1.8390323543921112, + "reward_std": 0.1871601128950715, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.029717639734735714, + "rewards/format_reward": 0.91875, + "rewards/reasoning_steps_reward": 0.9500000027939677, + "step": 640 + }, + { + "completion_length": 759.0375, + "epoch": 0.14246076117115997, + "grad_norm": 0.4107178938373751, + "kl": 2.780615234375, + "learning_rate": 1.989059512714227e-05, + "loss": 0.1112, + "reward": 1.2166819516569376, + "reward_std": 0.3681499962562157, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.060401382704731076, + "rewards/format_reward": 0.4, + "rewards/reasoning_steps_reward": 0.8770833380520344, + "step": 645 + }, + { + "completion_length": 1020.01875, + "epoch": 0.14356510815698292, + "grad_norm": 0.5118018262308587, + "kl": 1.021142578125, + "learning_rate": 1.988483386639836e-05, + "loss": 0.0409, + "reward": 0.7364203490898944, + "reward_std": 0.2831057722181868, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04066298486868618, + "rewards/format_reward": 0.01875, + "rewards/reasoning_steps_reward": 0.7583333430811763, + "step": 650 + }, + { + "completion_length": 1023.7625, + "epoch": 0.14466945514280588, + "grad_norm": 0.5971199868423984, + "kl": 0.971728515625, + "learning_rate": 1.9878925656686167e-05, + "loss": 0.0389, + "reward": 0.5211033774306998, + "reward_std": 0.2769409292843193, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.27264662481029517, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.7812500074505806, + "step": 655 + }, + { + "completion_length": 1024.0, + "epoch": 0.1457738021286288, + "grad_norm": 0.7257973266644143, + "kl": 0.5168212890625, + "learning_rate": 1.9872870585837757e-05, + "loss": 0.0207, + "reward": 0.5945031743496656, + "reward_std": 0.24122693912358956, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22424682592973114, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.8062500102445483, + "step": 660 + }, + { + "completion_length": 1024.0, + "epoch": 0.14687814911445177, + "grad_norm": 0.6690686208159938, + "kl": 0.796337890625, + "learning_rate": 1.9866668743868437e-05, + "loss": 0.0318, + "reward": 0.9177674036473036, + "reward_std": 0.13023071596617228, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07806593386630993, + "rewards/format_reward": 0.04375, + "rewards/reasoning_steps_reward": 0.9520833373069764, + "step": 665 + }, + { + "completion_length": 987.1375, + "epoch": 0.1479824961002747, + "grad_norm": 0.6120508069402077, + "kl": 1.7907470703125, + "learning_rate": 1.9860320222975435e-05, + "loss": 0.0716, + "reward": 1.260640586912632, + "reward_std": 0.40482770588496353, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09769274116843008, + "rewards/format_reward": 0.39375, + "rewards/reasoning_steps_reward": 0.964583334326744, + "step": 670 + }, + { + "completion_length": 640.525, + "epoch": 0.14908684308609765, + "grad_norm": 0.710860699058398, + "kl": 3.9996826171875, + "learning_rate": 1.9853825117536522e-05, + "loss": 0.16, + "reward": 1.6751317463815212, + "reward_std": 0.3215458321869846, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14778491355245932, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.972916667163372, + "step": 675 + }, + { + "completion_length": 852.1375, + "epoch": 0.1501911900719206, + "grad_norm": 0.4777776180383065, + "kl": 3.6158203125, + "learning_rate": 1.9847183524108614e-05, + "loss": 0.1446, + "reward": 1.468020135909319, + "reward_std": 0.44469789950890115, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12572985703882295, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 0.9812500029802322, + "step": 680 + }, + { + "completion_length": 1007.66875, + "epoch": 0.15129553705774354, + "grad_norm": 0.5985977128324016, + "kl": 1.750341796875, + "learning_rate": 1.9840395541426333e-05, + "loss": 0.07, + "reward": 0.7763801473192871, + "reward_std": 0.4260402750223875, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12778652228880674, + "rewards/format_reward": 0.11875, + "rewards/reasoning_steps_reward": 0.78541667945683, + "step": 685 + }, + { + "completion_length": 1024.0, + "epoch": 0.1523998840435665, + "grad_norm": 0.9348256931543566, + "kl": 1.45712890625, + "learning_rate": 1.983346127040053e-05, + "loss": 0.0583, + "reward": 0.7624955659732222, + "reward_std": 0.3992441566209891, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1333377712897345, + "rewards/format_reward": 0.1125, + "rewards/reasoning_steps_reward": 0.7833333471789956, + "step": 690 + }, + { + "completion_length": 875.2375, + "epoch": 0.15350423102938943, + "grad_norm": 0.6791189480472896, + "kl": 2.89599609375, + "learning_rate": 1.9826380814116795e-05, + "loss": 0.1157, + "reward": 0.8852463798597455, + "reward_std": 0.6508712698566341, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.27933695799438285, + "rewards/format_reward": 0.35, + "rewards/reasoning_steps_reward": 0.8145833453163505, + "step": 695 + }, + { + "completion_length": 923.7, + "epoch": 0.15460857801521238, + "grad_norm": 0.3922171537250086, + "kl": 2.5765625, + "learning_rate": 1.9819154277833938e-05, + "loss": 0.1031, + "reward": 1.0286956165917218, + "reward_std": 0.5952219057944603, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1317210498600616, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.7916666803881526, + "step": 700 + }, + { + "epoch": 0.15460857801521238, + "eval_completion_length": 1024.0, + "eval_kl": 3.66328125, + "eval_loss": 0.14716078341007233, + "eval_reward": 1.1705710649490357, + "eval_reward_std": 0.47160317164845766, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.03109560369513929, + "eval_rewards/format_reward": 0.4, + "eval_rewards/reasoning_steps_reward": 0.8016666793823242, + "eval_runtime": 196.5133, + "eval_samples_per_second": 0.504, + "eval_steps_per_second": 0.127, + "step": 700 + }, + { + "completion_length": 923.7, + "epoch": 0.1557129250010353, + "grad_norm": 2.476456769332757, + "kl": 3.134033203125, + "learning_rate": 1.9811781768982392e-05, + "loss": 0.1254, + "reward": 1.064253362873569, + "reward_std": 0.5311597665102454, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03574663798935944, + "rewards/format_reward": 0.33125, + "rewards/reasoning_steps_reward": 0.7687500152736902, + "step": 705 + }, + { + "completion_length": 973.85, + "epoch": 0.15681727198685827, + "grad_norm": 0.9110518537976159, + "kl": 2.82705078125, + "learning_rate": 1.980426339716264e-05, + "loss": 0.1131, + "reward": 0.8556666751392186, + "reward_std": 0.45775549400859744, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12349999214638956, + "rewards/format_reward": 0.20625, + "rewards/reasoning_steps_reward": 0.7729166857898235, + "step": 710 + }, + { + "completion_length": 929.9875, + "epoch": 0.15792161897268123, + "grad_norm": 0.689638744596231, + "kl": 2.47158203125, + "learning_rate": 1.9796599274143586e-05, + "loss": 0.0988, + "reward": 1.0453462563455105, + "reward_std": 0.4044397716068488, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04632041496806778, + "rewards/format_reward": 0.23125, + "rewards/reasoning_steps_reward": 0.8604166802018881, + "step": 715 + }, + { + "completion_length": 757.775, + "epoch": 0.15902596595850416, + "grad_norm": 2.6619811581457093, + "kl": 13.9373046875, + "learning_rate": 1.9788789513860875e-05, + "loss": 0.5578, + "reward": 1.106494550034404, + "reward_std": 0.5601302272752037, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21433878369862214, + "rewards/format_reward": 0.39375, + "rewards/reasoning_steps_reward": 0.9270833387970925, + "step": 720 + }, + { + "completion_length": 834.575, + "epoch": 0.1601303129443271, + "grad_norm": 1.2429717954269097, + "kl": 4.67158203125, + "learning_rate": 1.9780834232415214e-05, + "loss": 0.1868, + "reward": 1.011640521325171, + "reward_std": 0.4418655791791025, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2654428183857817, + "rewards/format_reward": 0.325, + "rewards/reasoning_steps_reward": 0.9520833365619182, + "step": 725 + }, + { + "completion_length": 899.575, + "epoch": 0.16123465993015004, + "grad_norm": 0.7171746700695241, + "kl": 2.8833984375, + "learning_rate": 1.9772733548070647e-05, + "loss": 0.1154, + "reward": 0.9335492581129075, + "reward_std": 0.3853685567474713, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22061740965000354, + "rewards/format_reward": 0.21875, + "rewards/reasoning_steps_reward": 0.9354166708886623, + "step": 730 + }, + { + "completion_length": 938.2375, + "epoch": 0.162339006915973, + "grad_norm": 0.530387948893096, + "kl": 2.845263671875, + "learning_rate": 1.9764487581252787e-05, + "loss": 0.1138, + "reward": 0.9676836218684912, + "reward_std": 0.4040184532976127, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11981637928402052, + "rewards/format_reward": 0.19375, + "rewards/reasoning_steps_reward": 0.8937500083819032, + "step": 735 + }, + { + "completion_length": 923.7125, + "epoch": 0.16344335390179596, + "grad_norm": 0.7148455898281986, + "kl": 2.9392578125, + "learning_rate": 1.975609645454704e-05, + "loss": 0.1176, + "reward": 1.004165413416922, + "reward_std": 0.4944089779272872, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05000126257946249, + "rewards/format_reward": 0.2625, + "rewards/reasoning_steps_reward": 0.79166667945683, + "step": 740 + }, + { + "completion_length": 869.825, + "epoch": 0.1645477008876189, + "grad_norm": 0.5326753926199363, + "kl": 2.82470703125, + "learning_rate": 1.9747560292696763e-05, + "loss": 0.113, + "reward": 0.832844705414027, + "reward_std": 0.5525396721786819, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13590529205976054, + "rewards/format_reward": 0.29375, + "rewards/reasoning_steps_reward": 0.6750000108033418, + "step": 745 + }, + { + "completion_length": 951.2625, + "epoch": 0.16565204787344184, + "grad_norm": 0.9976724610545183, + "kl": 3.01611328125, + "learning_rate": 1.9738879222601425e-05, + "loss": 0.1207, + "reward": 0.6285201878286898, + "reward_std": 0.6893505120096052, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2298131546471268, + "rewards/format_reward": 0.26875, + "rewards/reasoning_steps_reward": 0.5895833455026149, + "step": 750 + }, + { + "completion_length": 935.625, + "epoch": 0.16675639485926477, + "grad_norm": 0.4851659837433616, + "kl": 2.586865234375, + "learning_rate": 1.9730053373314722e-05, + "loss": 0.1035, + "reward": 0.4737007636576891, + "reward_std": 0.6648764016776113, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2950492393341847, + "rewards/format_reward": 0.25, + "rewards/reasoning_steps_reward": 0.5187500091269612, + "step": 755 + }, + { + "completion_length": 898.625, + "epoch": 0.16786074184508773, + "grad_norm": 0.4416112762647842, + "kl": 2.737841796875, + "learning_rate": 1.9721082876042644e-05, + "loss": 0.1095, + "reward": 0.6276166431605816, + "reward_std": 0.6592218722492362, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2723833565658424, + "rewards/format_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.5875000094994902, + "step": 760 + }, + { + "completion_length": 986.3875, + "epoch": 0.16896508883091066, + "grad_norm": 0.5212659541587674, + "kl": 1.964501953125, + "learning_rate": 1.9711967864141542e-05, + "loss": 0.0786, + "reward": 0.5854208903387189, + "reward_std": 0.4578703532402869, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07707911343604792, + "rewards/format_reward": 0.14375, + "rewards/reasoning_steps_reward": 0.5187500098720192, + "step": 765 + }, + { + "completion_length": 1011.4625, + "epoch": 0.17006943581673362, + "grad_norm": 0.4202393392619023, + "kl": 1.484814453125, + "learning_rate": 1.970270847311612e-05, + "loss": 0.0594, + "reward": 0.5946683968533761, + "reward_std": 0.3890162902807788, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09074827631484368, + "rewards/format_reward": 0.09375, + "rewards/reasoning_steps_reward": 0.5916666772216559, + "step": 770 + }, + { + "completion_length": 995.0375, + "epoch": 0.17117378280255657, + "grad_norm": 0.5057925969173489, + "kl": 1.631982421875, + "learning_rate": 1.9693304840617456e-05, + "loss": 0.0652, + "reward": 0.5266901765018701, + "reward_std": 0.44898259460460394, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2420598310069181, + "rewards/format_reward": 0.1125, + "rewards/reasoning_steps_reward": 0.6562500141561032, + "step": 775 + }, + { + "completion_length": 973.85, + "epoch": 0.1722781297883795, + "grad_norm": 0.4342246003094615, + "kl": 2.336279296875, + "learning_rate": 1.968375710644093e-05, + "loss": 0.0935, + "reward": 0.8064871094655246, + "reward_std": 0.6480218000418972, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.19559622975066304, + "rewards/format_reward": 0.24375, + "rewards/reasoning_steps_reward": 0.7583333445712924, + "step": 780 + }, + { + "completion_length": 823.4, + "epoch": 0.17338247677420246, + "grad_norm": 0.2840592190798346, + "kl": 3.2484375, + "learning_rate": 1.9674065412524147e-05, + "loss": 0.13, + "reward": 1.2522860381752252, + "reward_std": 0.5435375596192898, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.020630629758670693, + "rewards/format_reward": 0.45, + "rewards/reasoning_steps_reward": 0.8229166774079204, + "step": 785 + }, + { + "completion_length": 958.7125, + "epoch": 0.1744868237600254, + "grad_norm": 0.5066655198721257, + "kl": 2.073095703125, + "learning_rate": 1.9664229902944833e-05, + "loss": 0.0829, + "reward": 0.7657026316504926, + "reward_std": 0.46858742368640377, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06346403961651959, + "rewards/format_reward": 0.18125, + "rewards/reasoning_steps_reward": 0.6479166788980365, + "step": 790 + }, + { + "completion_length": 1011.4625, + "epoch": 0.17559117074584835, + "grad_norm": 0.5090355806565158, + "kl": 1.276513671875, + "learning_rate": 1.9654250723918706e-05, + "loss": 0.0511, + "reward": 0.6036186209297739, + "reward_std": 0.30241919725158367, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03596471712298808, + "rewards/format_reward": 0.05625, + "rewards/reasoning_steps_reward": 0.5833333427086472, + "step": 795 + }, + { + "completion_length": 1024.0, + "epoch": 0.1766955177316713, + "grad_norm": 0.362393508654969, + "kl": 1.162890625, + "learning_rate": 1.9644128023797273e-05, + "loss": 0.0465, + "reward": 0.5545021136291325, + "reward_std": 0.3219915485853562, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.299664559494704, + "rewards/format_reward": 0.0625, + "rewards/reasoning_steps_reward": 0.79166667945683, + "step": 800 + }, + { + "epoch": 0.1766955177316713, + "eval_completion_length": 1024.0, + "eval_kl": 1.004609375, + "eval_loss": 0.040289442986249924, + "eval_reward": 0.5231576064229011, + "eval_reward_std": 0.1656639602733776, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.45517572939395906, + "eval_rewards/format_reward": 0.035, + "eval_rewards/reasoning_steps_reward": 0.943333340883255, + "eval_runtime": 202.7136, + "eval_samples_per_second": 0.488, + "eval_steps_per_second": 0.123, + "step": 800 + }, + { + "completion_length": 1018.0875, + "epoch": 0.17779986471749423, + "grad_norm": 0.44983968909591454, + "kl": 1.289599609375, + "learning_rate": 1.9633861953065648e-05, + "loss": 0.0516, + "reward": 0.5429016770794988, + "reward_std": 0.2576578710861213, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.4425149895250797, + "rewards/format_reward": 0.08125, + "rewards/reasoning_steps_reward": 0.9041666716337204, + "step": 805 + }, + { + "completion_length": 979.1375, + "epoch": 0.1789042117033172, + "grad_norm": 0.4467769812510504, + "kl": 1.9100830078125, + "learning_rate": 1.9623452664340305e-05, + "loss": 0.0763, + "reward": 0.7711514856666326, + "reward_std": 0.38121095038736713, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.3892651722300798, + "rewards/format_reward": 0.1875, + "rewards/reasoning_steps_reward": 0.9666666701436043, + "step": 810 + }, + { + "completion_length": 936.3375, + "epoch": 0.18000855868914012, + "grad_norm": 0.4199249856681596, + "kl": 2.6292236328125, + "learning_rate": 1.9612900312366815e-05, + "loss": 0.1052, + "reward": 0.7745890522375702, + "reward_std": 0.5395248577739948, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.34207761539146303, + "rewards/format_reward": 0.28125, + "rewards/reasoning_steps_reward": 0.8354166736826301, + "step": 815 + }, + { + "completion_length": 998.925, + "epoch": 0.18111290567496308, + "grad_norm": 0.38807419895689016, + "kl": 2.0396484375, + "learning_rate": 1.9602205054017534e-05, + "loss": 0.0815, + "reward": 0.38755306117236615, + "reward_std": 0.6135149325666134, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.30203027836978436, + "rewards/format_reward": 0.1625, + "rewards/reasoning_steps_reward": 0.5270833438262343, + "step": 820 + }, + { + "completion_length": 964.6375, + "epoch": 0.182217252660786, + "grad_norm": 0.34646502449060457, + "kl": 2.594287109375, + "learning_rate": 1.9591367048289297e-05, + "loss": 0.1038, + "reward": 0.5837174264714122, + "reward_std": 0.6725190826080507, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13503257725387813, + "rewards/format_reward": 0.23125, + "rewards/reasoning_steps_reward": 0.4875000072643161, + "step": 825 + }, + { + "completion_length": 920.1, + "epoch": 0.18332159964660896, + "grad_norm": 0.30414365620585365, + "kl": 2.875, + "learning_rate": 1.9580386456301014e-05, + "loss": 0.115, + "reward": 0.8815148666501045, + "reward_std": 0.6505038747184699, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04973513415825437, + "rewards/format_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.6187500094994902, + "step": 830 + }, + { + "completion_length": 834.025, + "epoch": 0.18442594663243192, + "grad_norm": 0.40396097917811025, + "kl": 2.9671875, + "learning_rate": 1.9569263441291312e-05, + "loss": 0.1188, + "reward": 1.1630100145936013, + "reward_std": 0.5103294208552143, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03282331913396774, + "rewards/format_reward": 0.3625, + "rewards/reasoning_steps_reward": 0.8333333428949118, + "step": 835 + }, + { + "completion_length": 976.95, + "epoch": 0.18553029361825485, + "grad_norm": 0.6142914802044704, + "kl": 2.892529296875, + "learning_rate": 1.9557998168616087e-05, + "loss": 0.1157, + "reward": 0.991350544989109, + "reward_std": 0.4822408523090417, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.058649450994562355, + "rewards/format_reward": 0.19375, + "rewards/reasoning_steps_reward": 0.8562500078231097, + "step": 840 + }, + { + "completion_length": 911.1625, + "epoch": 0.1866346406040778, + "grad_norm": 0.3716875526820613, + "kl": 3.598046875, + "learning_rate": 1.9546590805746054e-05, + "loss": 0.144, + "reward": 1.1998552225530148, + "reward_std": 0.4323180656544537, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03139477769100267, + "rewards/format_reward": 0.31875, + "rewards/reasoning_steps_reward": 0.9125000059604644, + "step": 845 + }, + { + "completion_length": 931.4, + "epoch": 0.18773898758990074, + "grad_norm": 0.32469474685255123, + "kl": 2.931396484375, + "learning_rate": 1.9535041522264256e-05, + "loss": 0.1173, + "reward": 1.1232962097972632, + "reward_std": 0.501405765369418, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03087046154323616, + "rewards/format_reward": 0.29375, + "rewards/reasoning_steps_reward": 0.8604166770353914, + "step": 850 + }, + { + "completion_length": 854.675, + "epoch": 0.1888433345757237, + "grad_norm": 0.28376486622584113, + "kl": 2.97490234375, + "learning_rate": 1.9523350489863545e-05, + "loss": 0.1189, + "reward": 1.1701628059148788, + "reward_std": 0.544858716159706, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03192052699450869, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.833333345130086, + "step": 855 + }, + { + "completion_length": 873.25, + "epoch": 0.18994768156154665, + "grad_norm": 0.305720363616131, + "kl": 3.108154296875, + "learning_rate": 1.951151788234402e-05, + "loss": 0.1243, + "reward": 1.1185833937488496, + "reward_std": 0.5606642366210508, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03349993971351069, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7708333427086472, + "step": 860 + }, + { + "completion_length": 1011.4625, + "epoch": 0.19105202854736958, + "grad_norm": 0.36378063097811875, + "kl": 1.690185546875, + "learning_rate": 1.949954387561046e-05, + "loss": 0.0676, + "reward": 0.7097708626213717, + "reward_std": 0.36822339960053796, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03189580502767057, + "rewards/format_reward": 0.1, + "rewards/reasoning_steps_reward": 0.6416666757315397, + "step": 865 + }, + { + "completion_length": 1015.04375, + "epoch": 0.19215637553319254, + "grad_norm": 0.3489339505523048, + "kl": 1.650439453125, + "learning_rate": 1.9487428647669688e-05, + "loss": 0.066, + "reward": 0.8181800896301865, + "reward_std": 0.36886287455181443, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07140324611464166, + "rewards/format_reward": 0.10625, + "rewards/reasoning_steps_reward": 0.7833333414047956, + "step": 870 + }, + { + "completion_length": 937.45, + "epoch": 0.19326072251901547, + "grad_norm": 0.48239314974054465, + "kl": 2.27412109375, + "learning_rate": 1.947517237862795e-05, + "loss": 0.091, + "reward": 0.9640216436237097, + "reward_std": 0.44529842740666936, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16931168984156103, + "rewards/format_reward": 0.225, + "rewards/reasoning_steps_reward": 0.908333340100944, + "step": 875 + }, + { + "completion_length": 810.875, + "epoch": 0.19436506950483842, + "grad_norm": 0.3776754403626219, + "kl": 3.48330078125, + "learning_rate": 1.9462775250688208e-05, + "loss": 0.1394, + "reward": 1.3459553118795156, + "reward_std": 0.5377076888135889, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04571135753940325, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.9041666727513075, + "step": 880 + }, + { + "completion_length": 1009.91875, + "epoch": 0.19546941649066138, + "grad_norm": 0.4583496847925809, + "kl": 2.01220703125, + "learning_rate": 1.9450237448147463e-05, + "loss": 0.0805, + "reward": 0.6594697997083131, + "reward_std": 0.4720977840166597, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02803020708306576, + "rewards/format_reward": 0.14375, + "rewards/reasoning_steps_reward": 0.5437500100582838, + "step": 885 + }, + { + "completion_length": 998.925, + "epoch": 0.1965737634764843, + "grad_norm": 0.37896192292415093, + "kl": 2.0322265625, + "learning_rate": 1.943755915739399e-05, + "loss": 0.0813, + "reward": 0.608962860464817, + "reward_std": 0.450826744859296, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03270381114562042, + "rewards/format_reward": 0.1375, + "rewards/reasoning_steps_reward": 0.5041666772216559, + "step": 890 + }, + { + "completion_length": 1008.71875, + "epoch": 0.19767811046230727, + "grad_norm": 0.35794014020876924, + "kl": 2.0822265625, + "learning_rate": 1.9424740566904572e-05, + "loss": 0.0832, + "reward": 0.6830778570845724, + "reward_std": 0.5404627276671817, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10233881335589104, + "rewards/format_reward": 0.15625, + "rewards/reasoning_steps_reward": 0.6291666775941849, + "step": 895 + }, + { + "completion_length": 948.8, + "epoch": 0.1987824574481302, + "grad_norm": 0.39271082198189455, + "kl": 2.01015625, + "learning_rate": 1.9411781867241718e-05, + "loss": 0.0804, + "reward": 0.844484331086278, + "reward_std": 0.39959267702070067, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22634900148259476, + "rewards/format_reward": 0.19375, + "rewards/reasoning_steps_reward": 0.8770833415910602, + "step": 900 + }, + { + "epoch": 0.1987824574481302, + "eval_completion_length": 1024.0, + "eval_kl": 0.763046875, + "eval_loss": 0.03052530251443386, + "eval_reward": 0.5850898969173431, + "eval_reward_std": 0.2078262207657099, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.31157676726579664, + "eval_rewards/format_reward": 0.015, + "eval_rewards/reasoning_steps_reward": 0.8816666746139527, + "eval_runtime": 202.2443, + "eval_samples_per_second": 0.49, + "eval_steps_per_second": 0.124, + "step": 900 + }, + { + "completion_length": 1022.2875, + "epoch": 0.19988680443395315, + "grad_norm": 0.48408716939512914, + "kl": 0.7494140625, + "learning_rate": 1.9398683251050796e-05, + "loss": 0.03, + "reward": 0.5665433191694319, + "reward_std": 0.19808940038928996, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.3292900139465928, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.8833333384245634, + "step": 905 + }, + { + "completion_length": 1024.0, + "epoch": 0.20099115141977608, + "grad_norm": 0.42990941163217694, + "kl": 0.6109130859375, + "learning_rate": 1.93854449130572e-05, + "loss": 0.0244, + "reward": 0.5857137320563197, + "reward_std": 0.10294231597799808, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.385119604319334, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.9583333371207118, + "step": 910 + }, + { + "completion_length": 1024.0, + "epoch": 0.20209549840559904, + "grad_norm": 0.48720013231599374, + "kl": 0.610107421875, + "learning_rate": 1.937206705006344e-05, + "loss": 0.0244, + "reward": 0.594690283946693, + "reward_std": 0.18519932519702706, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.369893048517406, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.9333333369344473, + "step": 915 + }, + { + "completion_length": 1024.0, + "epoch": 0.203199845391422, + "grad_norm": 0.5696400107146373, + "kl": 0.5218994140625, + "learning_rate": 1.9358549860946217e-05, + "loss": 0.0209, + "reward": 0.812476817984134, + "reward_std": 0.24139399882988072, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1666898537427187, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.9104166731238366, + "step": 920 + }, + { + "completion_length": 1024.0, + "epoch": 0.20430419237724493, + "grad_norm": 0.541602324631897, + "kl": 0.5464599609375, + "learning_rate": 1.934489354665347e-05, + "loss": 0.0219, + "reward": 1.0158553715795278, + "reward_std": 0.28890128862985875, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07789463058797992, + "rewards/format_reward": 0.1625, + "rewards/reasoning_steps_reward": 0.9312500067055225, + "step": 925 + }, + { + "completion_length": 1024.0, + "epoch": 0.20540853936306788, + "grad_norm": 0.5321993292267261, + "kl": 0.5557861328125, + "learning_rate": 1.9331098310201392e-05, + "loss": 0.0222, + "reward": 1.2741701494902373, + "reward_std": 0.44738651754014425, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05916318525414681, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.9208333365619182, + "step": 930 + }, + { + "completion_length": 1024.0, + "epoch": 0.2065128863488908, + "grad_norm": 0.5959692383930701, + "kl": 0.47041015625, + "learning_rate": 1.9317164356671395e-05, + "loss": 0.0188, + "reward": 1.6727484971284867, + "reward_std": 0.2997322161420016, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06266818092990434, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.9791666679084301, + "step": 935 + }, + { + "completion_length": 713.875, + "epoch": 0.20761723333471377, + "grad_norm": 1.1057415650471873, + "kl": 2.516064453125, + "learning_rate": 1.930309189320709e-05, + "loss": 0.1006, + "reward": 1.9278814405202866, + "reward_std": 0.07678677760886785, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.040868553338805215, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 1.0, + "step": 940 + }, + { + "completion_length": 986.3875, + "epoch": 0.20872158032053673, + "grad_norm": 0.5228184716949067, + "kl": 1.52724609375, + "learning_rate": 1.9288881129011177e-05, + "loss": 0.0611, + "reward": 1.8507322192192077, + "reward_std": 0.1548274521872372, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.053434445448510816, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.9291666721925139, + "step": 945 + }, + { + "completion_length": 1024.0, + "epoch": 0.20982592730635966, + "grad_norm": 0.5117019454936371, + "kl": 0.4581787109375, + "learning_rate": 1.9274532275342355e-05, + "loss": 0.0183, + "reward": 1.6492023468017578, + "reward_std": 0.26452546955042633, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1466309867319069, + "rewards/format_reward": 0.9, + "rewards/reasoning_steps_reward": 0.8958333406597376, + "step": 950 + }, + { + "completion_length": 1024.0, + "epoch": 0.2109302742921826, + "grad_norm": 0.48557359205206774, + "kl": 0.3712890625, + "learning_rate": 1.9260045545512174e-05, + "loss": 0.0149, + "reward": 1.5456451624631882, + "reward_std": 0.35966640640981495, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1064381811331259, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.8895833430811763, + "step": 955 + }, + { + "completion_length": 1024.0, + "epoch": 0.21203462127800554, + "grad_norm": 0.5356381874366991, + "kl": 0.329052734375, + "learning_rate": 1.9245421154881873e-05, + "loss": 0.0132, + "reward": 1.7083046436309814, + "reward_std": 0.2587324011943565, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06461202607351879, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9604166708886623, + "step": 960 + }, + { + "completion_length": 1024.0, + "epoch": 0.2131389682638285, + "grad_norm": 0.5603446707923614, + "kl": 0.3440185546875, + "learning_rate": 1.9230659320859157e-05, + "loss": 0.0138, + "reward": 1.7261480644345284, + "reward_std": 0.2795285307271115, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.07176859906758182, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.9416666723787784, + "step": 965 + }, + { + "completion_length": 1024.0, + "epoch": 0.21424331524965143, + "grad_norm": 0.6759689326469629, + "kl": 0.391845703125, + "learning_rate": 1.9215760262894982e-05, + "loss": 0.0157, + "reward": 1.8084782645106317, + "reward_std": 0.17105812441823218, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09152174164628377, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9687500022351742, + "step": 970 + }, + { + "completion_length": 1002.6375, + "epoch": 0.21534766223547439, + "grad_norm": 0.6017399141547436, + "kl": 0.9856201171875, + "learning_rate": 1.9200724202480305e-05, + "loss": 0.0394, + "reward": 1.5970855988562107, + "reward_std": 0.35718659692502114, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1570810692050145, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.9479166727513075, + "step": 975 + }, + { + "completion_length": 1008.3375, + "epoch": 0.21645200922129734, + "grad_norm": 0.4897056955691348, + "kl": 0.69998779296875, + "learning_rate": 1.9185551363142754e-05, + "loss": 0.028, + "reward": 1.4942862942814827, + "reward_std": 0.37678084987601324, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.10779703845037147, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9645833373069763, + "step": 980 + }, + { + "completion_length": 1024.0, + "epoch": 0.21755635620712027, + "grad_norm": 0.4944414791078527, + "kl": 0.30677490234375, + "learning_rate": 1.9170241970443344e-05, + "loss": 0.0123, + "reward": 1.6588552303612232, + "reward_std": 0.2802548832620232, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.08281143896747381, + "rewards/format_reward": 0.74375, + "rewards/reasoning_steps_reward": 0.9791666693985462, + "step": 985 + }, + { + "completion_length": 1024.0, + "epoch": 0.21866070319294323, + "grad_norm": 0.47055042970737043, + "kl": 0.31396484375, + "learning_rate": 1.9154796251973092e-05, + "loss": 0.0126, + "reward": 1.707213106751442, + "reward_std": 0.23285328571801073, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.11153689179336652, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9375000059604645, + "step": 990 + }, + { + "completion_length": 1024.0, + "epoch": 0.21976505017876616, + "grad_norm": 0.4722092126957342, + "kl": 0.27503662109375, + "learning_rate": 1.9139214437349663e-05, + "loss": 0.011, + "reward": 1.6616327054798603, + "reward_std": 0.2845032803234062, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13211730096081736, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.9000000070780516, + "step": 995 + }, + { + "completion_length": 1024.0, + "epoch": 0.22086939716458912, + "grad_norm": 0.4694447156979849, + "kl": 0.2855224609375, + "learning_rate": 1.9123496758213926e-05, + "loss": 0.0114, + "reward": 1.7685628682374954, + "reward_std": 0.23412906796729657, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10018712454620982, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9375000063329935, + "step": 1000 + }, + { + "epoch": 0.22086939716458912, + "eval_completion_length": 1024.0, + "eval_kl": 0.29111328125, + "eval_loss": 0.011653742752969265, + "eval_reward": 1.8491823887825012, + "eval_reward_std": 0.25383697646204384, + "eval_rewards/accuracy_reward": 0.02, + "eval_rewards/cosine_scaled_reward": -0.06248428151942789, + "eval_rewards/format_reward": 0.95, + "eval_rewards/reasoning_steps_reward": 0.9416666758060456, + "eval_runtime": 203.8275, + "eval_samples_per_second": 0.486, + "eval_steps_per_second": 0.123, + "step": 1000 + }, + { + "completion_length": 1024.0, + "epoch": 0.22197374415041207, + "grad_norm": 0.44007384237619535, + "kl": 0.28228759765625, + "learning_rate": 1.9107643448226536e-05, + "loss": 0.0113, + "reward": 1.8567358702421188, + "reward_std": 0.15496021461585768, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.07451412830560003, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 0.9562500029802322, + "step": 1005 + }, + { + "completion_length": 1024.0, + "epoch": 0.223078091136235, + "grad_norm": 0.4713086951763063, + "kl": 0.28775634765625, + "learning_rate": 1.909165474306445e-05, + "loss": 0.0115, + "reward": 1.851528912782669, + "reward_std": 0.14237245887197786, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10680442866578232, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 0.9833333350718021, + "step": 1010 + }, + { + "completion_length": 1024.0, + "epoch": 0.22418243812205796, + "grad_norm": 0.4478755129652751, + "kl": 0.30277099609375, + "learning_rate": 1.9075530880417422e-05, + "loss": 0.0121, + "reward": 1.717442861199379, + "reward_std": 0.2557602992928878, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1242238087579608, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.9479166723787784, + "step": 1015 + }, + { + "completion_length": 1024.0, + "epoch": 0.2252867851078809, + "grad_norm": 0.4532749764094587, + "kl": 0.3177490234375, + "learning_rate": 1.905927209998447e-05, + "loss": 0.0127, + "reward": 1.622461923956871, + "reward_std": 0.363123452578111, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1129547476026346, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.9041666753590107, + "step": 1020 + }, + { + "completion_length": 1024.0, + "epoch": 0.22639113209370385, + "grad_norm": 0.45238116128275235, + "kl": 0.34163818359375, + "learning_rate": 1.9042878643470313e-05, + "loss": 0.0137, + "reward": 1.7687511250376702, + "reward_std": 0.2220757791714277, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08958221631328342, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.8770833378657699, + "step": 1025 + }, + { + "completion_length": 1024.0, + "epoch": 0.22749547907952677, + "grad_norm": 0.4650153137737356, + "kl": 0.34112548828125, + "learning_rate": 1.9026350754581782e-05, + "loss": 0.0137, + "reward": 1.8675057023763657, + "reward_std": 0.150637792609632, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.08666095893859165, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.9666666693985462, + "step": 1030 + }, + { + "completion_length": 1024.0, + "epoch": 0.22859982606534973, + "grad_norm": 0.4321232746728849, + "kl": 0.312451171875, + "learning_rate": 1.900968867902419e-05, + "loss": 0.0125, + "reward": 1.8571926668286323, + "reward_std": 0.14679434172503533, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10114066474925494, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 0.9833333343267441, + "step": 1035 + }, + { + "completion_length": 1024.0, + "epoch": 0.2297041730511727, + "grad_norm": 0.4339868879357326, + "kl": 0.30401611328125, + "learning_rate": 1.8992892664497693e-05, + "loss": 0.0122, + "reward": 1.7687968090176582, + "reward_std": 0.22350005892512853, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07911985855171225, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9854166671633721, + "step": 1040 + }, + { + "completion_length": 1024.0, + "epoch": 0.23080852003699562, + "grad_norm": 0.4536285695606801, + "kl": 0.2967041015625, + "learning_rate": 1.897596296069358e-05, + "loss": 0.0119, + "reward": 1.8167687579989433, + "reward_std": 0.1868845313145357, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08323125535480358, + "rewards/format_reward": 0.9125, + "rewards/reasoning_steps_reward": 0.9875000014901161, + "step": 1045 + }, + { + "completion_length": 1024.0, + "epoch": 0.23191286702281858, + "grad_norm": 0.4735990601666979, + "kl": 0.316552734375, + "learning_rate": 1.8958899819290592e-05, + "loss": 0.0127, + "reward": 1.7116306245326995, + "reward_std": 0.2591278723456526, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10086938191234367, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9750000022351741, + "step": 1050 + }, + { + "completion_length": 1024.0, + "epoch": 0.2330172140086415, + "grad_norm": 0.4603152932618283, + "kl": 0.30628662109375, + "learning_rate": 1.8941703493951163e-05, + "loss": 0.0122, + "reward": 1.8179652035236358, + "reward_std": 0.21623797937058953, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.0778681310959655, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.9145833380520344, + "step": 1055 + }, + { + "completion_length": 1024.0, + "epoch": 0.23412156099446446, + "grad_norm": 0.4076470105342933, + "kl": 0.32257080078125, + "learning_rate": 1.892437424031766e-05, + "loss": 0.0129, + "reward": 1.8207226276397706, + "reward_std": 0.19692284195543835, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06886071387561969, + "rewards/format_reward": 0.99375, + "rewards/reasoning_steps_reward": 0.8958333365619182, + "step": 1060 + }, + { + "completion_length": 1024.0, + "epoch": 0.23522590798028742, + "grad_norm": 0.40747101665458285, + "kl": 0.2858154296875, + "learning_rate": 1.890691231600856e-05, + "loss": 0.0114, + "reward": 1.8809554889798163, + "reward_std": 0.14877327984722796, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.07529450277797878, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.968750000745058, + "step": 1065 + }, + { + "completion_length": 1024.0, + "epoch": 0.23633025496611035, + "grad_norm": 0.44323089226801765, + "kl": 0.28681640625, + "learning_rate": 1.8889317980614653e-05, + "loss": 0.0115, + "reward": 1.7944041058421134, + "reward_std": 0.22159042263956508, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13892922517989065, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9833333350718021, + "step": 1070 + }, + { + "completion_length": 1024.0, + "epoch": 0.2374346019519333, + "grad_norm": 0.4913757009884322, + "kl": 0.30711669921875, + "learning_rate": 1.8871591495695156e-05, + "loss": 0.0123, + "reward": 1.6569852642714977, + "reward_std": 0.30601046779338503, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1638480743567925, + "rewards/format_reward": 0.8875, + "rewards/reasoning_steps_reward": 0.9270833348855376, + "step": 1075 + }, + { + "completion_length": 1024.0, + "epoch": 0.23853894893775623, + "grad_norm": 0.4985806443874965, + "kl": 0.32496337890625, + "learning_rate": 1.8853733124773837e-05, + "loss": 0.013, + "reward": 1.7661071710288525, + "reward_std": 0.2830719207166112, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12139282901771367, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.9250000029802322, + "step": 1080 + }, + { + "completion_length": 1024.0, + "epoch": 0.2396432959235792, + "grad_norm": 0.4792956594110829, + "kl": 0.29970703125, + "learning_rate": 1.8835743133335096e-05, + "loss": 0.012, + "reward": 1.808322674036026, + "reward_std": 0.23723325279643176, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.07084399787709117, + "rewards/format_reward": 0.9125, + "rewards/reasoning_steps_reward": 0.9541666693985462, + "step": 1085 + }, + { + "completion_length": 1024.0, + "epoch": 0.24074764290940212, + "grad_norm": 0.43172903038425636, + "kl": 0.2791015625, + "learning_rate": 1.8817621788820017e-05, + "loss": 0.0112, + "reward": 1.8203510470688342, + "reward_std": 0.2552833634043054, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.07131561395945027, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.9291666701436043, + "step": 1090 + }, + { + "completion_length": 1024.0, + "epoch": 0.24185198989522508, + "grad_norm": 0.4484582517601397, + "kl": 0.31998291015625, + "learning_rate": 1.8799369360622394e-05, + "loss": 0.0128, + "reward": 1.7547560043632984, + "reward_std": 0.28648674785072215, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.0994106627011206, + "rewards/format_reward": 0.91875, + "rewards/reasoning_steps_reward": 0.9166666720062494, + "step": 1095 + }, + { + "completion_length": 1024.0, + "epoch": 0.24295633688104804, + "grad_norm": 0.41476974658385424, + "kl": 0.33250732421875, + "learning_rate": 1.8780986120084715e-05, + "loss": 0.0133, + "reward": 1.6285200668498874, + "reward_std": 0.3249174021591898, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16106326215958688, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9145833356305957, + "step": 1100 + }, + { + "epoch": 0.24295633688104804, + "eval_completion_length": 1024.0, + "eval_kl": 0.45419921875, + "eval_loss": 0.018182678148150444, + "eval_reward": 1.4785543692111969, + "eval_reward_std": 0.5918275532126427, + "eval_rewards/accuracy_reward": 0.025, + "eval_rewards/cosine_scaled_reward": -0.1331122925132513, + "eval_rewards/format_reward": 0.745, + "eval_rewards/reasoning_steps_reward": 0.8416666769981385, + "eval_runtime": 203.0342, + "eval_samples_per_second": 0.488, + "eval_steps_per_second": 0.123, + "step": 1100 + }, + { + "completion_length": 1024.0, + "epoch": 0.24406068386687096, + "grad_norm": 0.3429126222185335, + "kl": 0.373504638671875, + "learning_rate": 1.876247234049416e-05, + "loss": 0.0149, + "reward": 1.4890433787368238, + "reward_std": 0.5072989727195818, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.10262328688113484, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.891666672937572, + "step": 1105 + }, + { + "completion_length": 1024.0, + "epoch": 0.24516503085269392, + "grad_norm": 0.4247791062153765, + "kl": 0.275592041015625, + "learning_rate": 1.8743828297078485e-05, + "loss": 0.011, + "reward": 1.3021938862279057, + "reward_std": 0.49335026524204295, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14988945015938954, + "rewards/format_reward": 0.53125, + "rewards/reasoning_steps_reward": 0.9145833373069763, + "step": 1110 + }, + { + "completion_length": 1020.725, + "epoch": 0.24626937783851685, + "grad_norm": 0.8127107356104201, + "kl": 0.56708984375, + "learning_rate": 1.8725054267001992e-05, + "loss": 0.0227, + "reward": 1.5503739204257727, + "reward_std": 0.44013373394700467, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.14962608254988935, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.9625000026077032, + "step": 1115 + }, + { + "completion_length": 1024.0, + "epoch": 0.2473737248243398, + "grad_norm": 0.3549989094358324, + "kl": 0.28006591796875, + "learning_rate": 1.8706150529361355e-05, + "loss": 0.0112, + "reward": 1.8609877035021782, + "reward_std": 0.4224522696546046, + "rewards/accuracy_reward": 0.08125, + "rewards/cosine_scaled_reward": -0.07651229500770569, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.962500000745058, + "step": 1120 + }, + { + "completion_length": 1020.66875, + "epoch": 0.24847807181016277, + "grad_norm": 0.3992396902478467, + "kl": 0.3288330078125, + "learning_rate": 1.8687117365181514e-05, + "loss": 0.0132, + "reward": 1.8732830002903937, + "reward_std": 0.22004793924934346, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.07671700548453372, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.962500000745058, + "step": 1125 + }, + { + "completion_length": 1024.0, + "epoch": 0.2495824187959857, + "grad_norm": 0.3259551611857864, + "kl": 0.3834228515625, + "learning_rate": 1.8667955057411454e-05, + "loss": 0.0153, + "reward": 1.8463864415884017, + "reward_std": 0.2847652018404915, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.08486354988708626, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9687500014901161, + "step": 1130 + }, + { + "completion_length": 1024.0, + "epoch": 0.25068676578180865, + "grad_norm": 0.3444343510959895, + "kl": 0.302099609375, + "learning_rate": 1.864866389092005e-05, + "loss": 0.0121, + "reward": 1.8804628476500511, + "reward_std": 0.41146394111856355, + "rewards/accuracy_reward": 0.08125, + "rewards/cosine_scaled_reward": -0.05287047996171168, + "rewards/format_reward": 0.9, + "rewards/reasoning_steps_reward": 0.9520833358168602, + "step": 1135 + }, + { + "completion_length": 1024.0, + "epoch": 0.2517911127676316, + "grad_norm": 0.41868127372045477, + "kl": 0.39306640625, + "learning_rate": 1.8629244152491773e-05, + "loss": 0.0157, + "reward": 1.7143469981849193, + "reward_std": 0.39155905476345654, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.04190299968176987, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9250000044703484, + "step": 1140 + }, + { + "completion_length": 1024.0, + "epoch": 0.2528954597534545, + "grad_norm": 0.3927845753079056, + "kl": 0.43892822265625, + "learning_rate": 1.860969613082249e-05, + "loss": 0.0175, + "reward": 1.4373045616783202, + "reward_std": 0.5264668684656499, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.05227877427241765, + "rewards/format_reward": 0.65625, + "rewards/reasoning_steps_reward": 0.8083333402872086, + "step": 1145 + }, + { + "completion_length": 1024.0, + "epoch": 0.25399980673927747, + "grad_norm": 0.38745171247473803, + "kl": 0.44339599609375, + "learning_rate": 1.8590020116515116e-05, + "loss": 0.0177, + "reward": 1.3909062273800372, + "reward_std": 0.5424190352443474, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.08617710751132109, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.8395833402872086, + "step": 1150 + }, + { + "completion_length": 1024.0, + "epoch": 0.2551041537251004, + "grad_norm": 0.36149961111425516, + "kl": 0.393505859375, + "learning_rate": 1.8570216402075326e-05, + "loss": 0.0157, + "reward": 1.5060852129012345, + "reward_std": 0.5054815013281768, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.11474812921951524, + "rewards/format_reward": 0.69375, + "rewards/reasoning_steps_reward": 0.8958333402872085, + "step": 1155 + }, + { + "completion_length": 1024.0, + "epoch": 0.2562085007109234, + "grad_norm": 0.23940946232188295, + "kl": 0.29405517578125, + "learning_rate": 1.8550285281907198e-05, + "loss": 0.0118, + "reward": 1.5001007352024316, + "reward_std": 0.4472006390285969, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1623992755456129, + "rewards/format_reward": 0.74375, + "rewards/reasoning_steps_reward": 0.9125000035390258, + "step": 1160 + }, + { + "completion_length": 1024.0, + "epoch": 0.25731284769674634, + "grad_norm": 0.23736459900896167, + "kl": 0.245904541015625, + "learning_rate": 1.8530227052308843e-05, + "loss": 0.0098, + "reward": 1.6085085548460483, + "reward_std": 0.42973450673271146, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13315811949432826, + "rewards/format_reward": 0.76875, + "rewards/reasoning_steps_reward": 0.9604166686534882, + "step": 1165 + }, + { + "completion_length": 1024.0, + "epoch": 0.25841719468256924, + "grad_norm": 0.26537961711313995, + "kl": 0.2716796875, + "learning_rate": 1.8510042011467978e-05, + "loss": 0.0109, + "reward": 1.6423254296183587, + "reward_std": 0.3275948438240448, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14100789964140859, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.9583333365619182, + "step": 1170 + }, + { + "completion_length": 1024.0, + "epoch": 0.2595215416683922, + "grad_norm": 0.2407532396886285, + "kl": 0.304412841796875, + "learning_rate": 1.848973045945753e-05, + "loss": 0.0122, + "reward": 1.5989655748941005, + "reward_std": 0.35892669553431916, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13436777089991664, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.9270833365619182, + "step": 1175 + }, + { + "completion_length": 1024.0, + "epoch": 0.26062588865421515, + "grad_norm": 0.25225574352603947, + "kl": 0.382373046875, + "learning_rate": 1.8469292698231137e-05, + "loss": 0.0153, + "reward": 1.4464009982533752, + "reward_std": 0.5317513575919293, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13693233868325477, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.8770833374932409, + "step": 1180 + }, + { + "completion_length": 1024.0, + "epoch": 0.2617302356400381, + "grad_norm": 0.2156684661256207, + "kl": 0.29520263671875, + "learning_rate": 1.8448729031618687e-05, + "loss": 0.0118, + "reward": 1.4996001317165792, + "reward_std": 0.3996583673519126, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.133733198023765, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8833333352580667, + "step": 1185 + }, + { + "completion_length": 1024.0, + "epoch": 0.26283458262586107, + "grad_norm": 0.2338480894666337, + "kl": 0.25845947265625, + "learning_rate": 1.8428039765321783e-05, + "loss": 0.0103, + "reward": 1.557172004878521, + "reward_std": 0.4314892937021796, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13449467101017945, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9041666699573397, + "step": 1190 + }, + { + "completion_length": 1024.0, + "epoch": 0.26393892961168397, + "grad_norm": 0.23739045779401974, + "kl": 0.382818603515625, + "learning_rate": 1.840722520690921e-05, + "loss": 0.0153, + "reward": 1.3562876941636204, + "reward_std": 0.6140932853326376, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13746229895623402, + "rewards/format_reward": 0.64375, + "rewards/reasoning_steps_reward": 0.8500000033527613, + "step": 1195 + }, + { + "completion_length": 1024.0, + "epoch": 0.2650432765975069, + "grad_norm": 0.18812254182518015, + "kl": 0.41695556640625, + "learning_rate": 1.838628566581236e-05, + "loss": 0.0167, + "reward": 1.2772326513193548, + "reward_std": 0.6012849385821027, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14985067906891344, + "rewards/format_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8583333371207118, + "step": 1200 + }, + { + "epoch": 0.2650432765975069, + "eval_completion_length": 1024.0, + "eval_kl": 0.2277880859375, + "eval_loss": 0.009126170538365841, + "eval_reward": 1.237759041786194, + "eval_reward_std": 0.5342784489318728, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.14890761934220792, + "eval_rewards/format_reward": 0.475, + "eval_rewards/reasoning_steps_reward": 0.9116666686534881, + "eval_runtime": 202.9615, + "eval_samples_per_second": 0.488, + "eval_steps_per_second": 0.123, + "step": 1200 + }, + { + "completion_length": 1024.0, + "epoch": 0.2661476235833299, + "grad_norm": 0.1396938684266728, + "kl": 0.1735382080078125, + "learning_rate": 1.8365221453320625e-05, + "loss": 0.0069, + "reward": 1.159564550407231, + "reward_std": 0.5056648141493497, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13418545336462556, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9187500011175871, + "step": 1205 + }, + { + "completion_length": 1024.0, + "epoch": 0.26725197056915284, + "grad_norm": 0.17009112400745297, + "kl": 0.148822021484375, + "learning_rate": 1.8344032882576784e-05, + "loss": 0.006, + "reward": 1.7244970690459014, + "reward_std": 0.2821242625116156, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.056752927817615276, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.9500000007450581, + "step": 1210 + }, + { + "completion_length": 1020.8625, + "epoch": 0.2683563175549758, + "grad_norm": 0.08738103322845646, + "kl": 0.14678955078125, + "learning_rate": 1.8322720268572333e-05, + "loss": 0.0059, + "reward": 1.88584890589118, + "reward_std": 0.15781026161916997, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.020401104938173376, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.968750000745058, + "step": 1215 + }, + { + "completion_length": 1021.76875, + "epoch": 0.2694606645407987, + "grad_norm": 0.17622717031700735, + "kl": 0.150677490234375, + "learning_rate": 1.83012839281428e-05, + "loss": 0.006, + "reward": 1.6834479916840792, + "reward_std": 0.3111886321689781, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.026968683502354908, + "rewards/format_reward": 0.7375, + "rewards/reasoning_steps_reward": 0.972916667163372, + "step": 1220 + }, + { + "completion_length": 1022.925, + "epoch": 0.27056501152662166, + "grad_norm": 0.11918594376170559, + "kl": 0.285870361328125, + "learning_rate": 1.827972417996306e-05, + "loss": 0.0114, + "reward": 1.5191262364387512, + "reward_std": 0.3489377611604709, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04962377124284103, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 0.9562500014901161, + "step": 1225 + }, + { + "completion_length": 1024.0, + "epoch": 0.2716693585124446, + "grad_norm": 0.06285065134173069, + "kl": 0.13602294921875, + "learning_rate": 1.8258041344542567e-05, + "loss": 0.0054, + "reward": 0.9662943309172988, + "reward_std": 0.10380094405736599, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.021205670808194556, + "rewards/format_reward": 0.025, + "rewards/reasoning_steps_reward": 0.9625000018626452, + "step": 1230 + }, + { + "completion_length": 1024.0, + "epoch": 0.27277370549826757, + "grad_norm": 0.07686648446244543, + "kl": 0.116314697265625, + "learning_rate": 1.823623574422061e-05, + "loss": 0.0046, + "reward": 0.944898908957839, + "reward_std": 0.07661853031105466, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.019684416962354588, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.964583334326744, + "step": 1235 + }, + { + "completion_length": 1020.00625, + "epoch": 0.27387805248409053, + "grad_norm": 0.07754923602159525, + "kl": 0.1475494384765625, + "learning_rate": 1.821430770316151e-05, + "loss": 0.0059, + "reward": 0.9660956308245658, + "reward_std": 0.1536989317713818, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.021404369897666697, + "rewards/format_reward": 0.0375, + "rewards/reasoning_steps_reward": 0.9500000022351742, + "step": 1240 + }, + { + "completion_length": 1024.0, + "epoch": 0.27498239946991343, + "grad_norm": 0.11285092638491938, + "kl": 0.1615203857421875, + "learning_rate": 1.8192257547349805e-05, + "loss": 0.0065, + "reward": 0.9706789815798402, + "reward_std": 0.14492510877234963, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.037654361013386504, + "rewards/format_reward": 0.05, + "rewards/reasoning_steps_reward": 0.9583333354443312, + "step": 1245 + }, + { + "completion_length": 1022.925, + "epoch": 0.2760867464557364, + "grad_norm": 0.13307393716537338, + "kl": 0.3531829833984375, + "learning_rate": 1.817008560458541e-05, + "loss": 0.0141, + "reward": 0.7812707336619497, + "reward_std": 0.3402592138071441, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09789594054198006, + "rewards/format_reward": 0.05, + "rewards/reasoning_steps_reward": 0.829166672565043, + "step": 1250 + }, + { + "completion_length": 1024.0, + "epoch": 0.27719109344155934, + "grad_norm": 0.25658627816669477, + "kl": 0.487921142578125, + "learning_rate": 1.814779220447872e-05, + "loss": 0.0195, + "reward": 0.6302142185159028, + "reward_std": 0.5167911179111343, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14686911877834063, + "rewards/format_reward": 0.05625, + "rewards/reasoning_steps_reward": 0.7208333412185312, + "step": 1255 + }, + { + "completion_length": 1024.0, + "epoch": 0.2782954404273823, + "grad_norm": 0.5529738552955684, + "kl": 0.373065185546875, + "learning_rate": 1.8125377678445755e-05, + "loss": 0.0149, + "reward": 0.9013677610084414, + "reward_std": 0.4689164909107603, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10071557244591531, + "rewards/format_reward": 0.18125, + "rewards/reasoning_steps_reward": 0.8208333399146795, + "step": 1260 + }, + { + "completion_length": 1023.875, + "epoch": 0.2793997874132052, + "grad_norm": 3.3418210634211905, + "kl": 2.01922607421875, + "learning_rate": 1.8102842359703177e-05, + "loss": 0.0809, + "reward": 1.1432768110185862, + "reward_std": 0.44278795822719985, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09005652678351908, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.8583333399146795, + "step": 1265 + }, + { + "completion_length": 1024.0, + "epoch": 0.28050413439902816, + "grad_norm": 4.521061137528278, + "kl": 1.19075927734375, + "learning_rate": 1.8080186583263386e-05, + "loss": 0.0476, + "reward": 0.5847210302948952, + "reward_std": 0.41035552892026317, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2277789770560048, + "rewards/format_reward": 0.05625, + "rewards/reasoning_steps_reward": 0.7562500078231096, + "step": 1270 + }, + { + "completion_length": 1024.0, + "epoch": 0.2816084813848511, + "grad_norm": 30.812440821129783, + "kl": 250.4185028076172, + "learning_rate": 1.8057410685929505e-05, + "loss": 10.0457, + "reward": 0.5216961699537933, + "reward_std": 0.43772149360393087, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22413716676055628, + "rewards/format_reward": 0.04375, + "rewards/reasoning_steps_reward": 0.702083344757557, + "step": 1275 + }, + { + "completion_length": 1006.43125, + "epoch": 0.2827128283706741, + "grad_norm": 2.740052403563944, + "kl": 2.95443115234375, + "learning_rate": 1.8034515006290398e-05, + "loss": 0.1182, + "reward": 0.6823957259068265, + "reward_std": 0.3967391650963691, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13010428047346068, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.7437500119209289, + "step": 1280 + }, + { + "completion_length": 1022.3875, + "epoch": 0.28381717535649703, + "grad_norm": 39.81394935188374, + "kl": 2.14736328125, + "learning_rate": 1.8011499884715616e-05, + "loss": 0.086, + "reward": 0.8438511086627841, + "reward_std": 0.30578292898626386, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.07281556303551043, + "rewards/format_reward": 0.05625, + "rewards/reasoning_steps_reward": 0.8541666775941849, + "step": 1285 + }, + { + "completion_length": 1024.0, + "epoch": 0.28492152234231993, + "grad_norm": 3.137457380908233, + "kl": 2.644873046875, + "learning_rate": 1.7988365663350352e-05, + "loss": 0.1059, + "reward": 0.9859581716358662, + "reward_std": 0.22302457209725618, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06195849884697964, + "rewards/format_reward": 0.09375, + "rewards/reasoning_steps_reward": 0.9541666701436042, + "step": 1290 + }, + { + "completion_length": 1021.1, + "epoch": 0.2860258693281429, + "grad_norm": 0.29805685695358275, + "kl": 0.4004791259765625, + "learning_rate": 1.7965112686110346e-05, + "loss": 0.016, + "reward": 0.9928273539990187, + "reward_std": 0.24035521575530083, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04467265058710836, + "rewards/format_reward": 0.1125, + "rewards/reasoning_steps_reward": 0.9250000055879355, + "step": 1295 + }, + { + "completion_length": 1024.0, + "epoch": 0.28713021631396585, + "grad_norm": 1.5460480759475044, + "kl": 0.447198486328125, + "learning_rate": 1.7941741298676777e-05, + "loss": 0.0179, + "reward": 0.933686813339591, + "reward_std": 0.21179361512779452, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03922985791132305, + "rewards/format_reward": 0.05625, + "rewards/reasoning_steps_reward": 0.9166666753590107, + "step": 1300 + }, + { + "epoch": 0.28713021631396585, + "eval_completion_length": 1019.175, + "eval_kl": 0.6944921875, + "eval_loss": 0.027912691235542297, + "eval_reward": 0.8045022475719452, + "eval_reward_std": 0.29673283290376273, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.0638310891322908, + "eval_rewards/format_reward": 0.05, + "eval_rewards/reasoning_steps_reward": 0.8183333414793015, + "eval_runtime": 202.212, + "eval_samples_per_second": 0.49, + "eval_steps_per_second": 0.124, + "step": 1300 + }, + { + "completion_length": 1024.0, + "epoch": 0.2882345632997888, + "grad_norm": 131.321337688551, + "kl": 65.63963623046875, + "learning_rate": 1.7918251848491118e-05, + "loss": 2.6278, + "reward": 0.7914097828324884, + "reward_std": 0.35510434115416273, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0794235574917593, + "rewards/format_reward": 0.0875, + "rewards/reasoning_steps_reward": 0.7833333387970924, + "step": 1305 + }, + { + "completion_length": 999.3, + "epoch": 0.28933891028561176, + "grad_norm": 5.088107026581073, + "kl": 16.488323974609376, + "learning_rate": 1.7894644684749983e-05, + "loss": 0.6609, + "reward": 0.7671722872182727, + "reward_std": 0.4012905497775364, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0869943821081847, + "rewards/format_reward": 0.0875, + "rewards/reasoning_steps_reward": 0.7666666701436042, + "step": 1310 + }, + { + "completion_length": 984.25, + "epoch": 0.29044325727143466, + "grad_norm": 5.662270180234321, + "kl": 5.150537109375, + "learning_rate": 1.7870920158399918e-05, + "loss": 0.2062, + "reward": 0.7325123744085431, + "reward_std": 0.5204162761657812, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09040429510343415, + "rewards/format_reward": 0.0875, + "rewards/reasoning_steps_reward": 0.7354166703298688, + "step": 1315 + }, + { + "completion_length": 983.725, + "epoch": 0.2915476042572576, + "grad_norm": 3.8531905036661738, + "kl": 2.1231201171875, + "learning_rate": 1.7847078622132202e-05, + "loss": 0.085, + "reward": 0.6666571330279112, + "reward_std": 0.4020955947952075, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08959287195330035, + "rewards/format_reward": 0.025, + "rewards/reasoning_steps_reward": 0.7312500027939677, + "step": 1320 + }, + { + "completion_length": 1024.0, + "epoch": 0.2926519512430806, + "grad_norm": 1.4713601895366548, + "kl": 4.474505615234375, + "learning_rate": 1.7823120430377593e-05, + "loss": 0.1791, + "reward": 0.8719107124954462, + "reward_std": 0.20605486250725563, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04475596445258816, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.9041666679084301, + "step": 1325 + }, + { + "completion_length": 1024.0, + "epoch": 0.29375629822890353, + "grad_norm": 2.001254416658288, + "kl": 234.38853149414064, + "learning_rate": 1.7799045939301063e-05, + "loss": 9.3666, + "reward": 0.9464922484010458, + "reward_std": 0.36131374030945834, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0597577509677194, + "rewards/format_reward": 0.11875, + "rewards/reasoning_steps_reward": 0.8875000007450581, + "step": 1330 + }, + { + "completion_length": 1024.0, + "epoch": 0.2948606452147265, + "grad_norm": 0.01825257511989942, + "kl": 0.88497314453125, + "learning_rate": 1.7774855506796497e-05, + "loss": 0.0355, + "reward": 0.9531695555895567, + "reward_std": 0.0772957038158438, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.025997111127071548, + "rewards/format_reward": 0.025, + "rewards/reasoning_steps_reward": 0.9541666679084301, + "step": 1335 + }, + { + "completion_length": 1024.0, + "epoch": 0.2959649922005494, + "grad_norm": 0.5453957093609844, + "kl": 0.524688720703125, + "learning_rate": 1.775054949248138e-05, + "loss": 0.021, + "reward": 0.9354104410856963, + "reward_std": 0.13244989554941639, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.029172901513811668, + "rewards/format_reward": 0.025, + "rewards/reasoning_steps_reward": 0.9395833350718021, + "step": 1340 + }, + { + "completion_length": 1024.0, + "epoch": 0.29706933918637235, + "grad_norm": 2.4380567469335372, + "kl": 1.2219635009765626, + "learning_rate": 1.7726128257691447e-05, + "loss": 0.0489, + "reward": 0.9105375189334154, + "reward_std": 0.16619428564858935, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04987915392703144, + "rewards/format_reward": 0.0375, + "rewards/reasoning_steps_reward": 0.9229166686534882, + "step": 1345 + }, + { + "completion_length": 1024.0, + "epoch": 0.2981736861721953, + "grad_norm": 1.4015232004301508, + "kl": 0.3145111083984375, + "learning_rate": 1.770159216547532e-05, + "loss": 0.0126, + "reward": 0.9414736803621053, + "reward_std": 0.21337720166779944, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05227632584610546, + "rewards/format_reward": 0.0625, + "rewards/reasoning_steps_reward": 0.9312500022351742, + "step": 1350 + }, + { + "completion_length": 1024.0, + "epoch": 0.29927803315801826, + "grad_norm": 0.6098020893731024, + "kl": 0.9691864013671875, + "learning_rate": 1.7676941580589097e-05, + "loss": 0.0388, + "reward": 1.0848730199038983, + "reward_std": 0.43680297569849247, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05471031935303472, + "rewards/format_reward": 0.2375, + "rewards/reasoning_steps_reward": 0.9020833350718022, + "step": 1355 + }, + { + "completion_length": 1024.0, + "epoch": 0.3003823801438412, + "grad_norm": 1.3473870887142767, + "kl": 0.3821868896484375, + "learning_rate": 1.7652176869490933e-05, + "loss": 0.0153, + "reward": 1.5648357531055808, + "reward_std": 0.34688837417397733, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0351642573474237, + "rewards/format_reward": 0.6625, + "rewards/reasoning_steps_reward": 0.9375000013038516, + "step": 1360 + }, + { + "completion_length": 1024.0, + "epoch": 0.3014867271296641, + "grad_norm": 2.650927188181098, + "kl": 34.706756591796875, + "learning_rate": 1.76272984003356e-05, + "loss": 1.3914, + "reward": 1.7366739973425864, + "reward_std": 0.3717085005620106, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0674926791811572, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.89791666790843, + "step": 1365 + }, + { + "completion_length": 1024.0, + "epoch": 0.3025910741154871, + "grad_norm": 1.1784705594565221, + "kl": 1.138018798828125, + "learning_rate": 1.7602306542969006e-05, + "loss": 0.0455, + "reward": 1.6846371553838253, + "reward_std": 0.3902606235532744, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08202951550820217, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.8854166684672237, + "step": 1370 + }, + { + "completion_length": 1024.0, + "epoch": 0.30369542110131004, + "grad_norm": 1.069729400209864, + "kl": 0.304193115234375, + "learning_rate": 1.7577201668922702e-05, + "loss": 0.0122, + "reward": 1.725393744930625, + "reward_std": 0.2859493938201467, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07460626843894716, + "rewards/format_reward": 0.9125, + "rewards/reasoning_steps_reward": 0.8875000020489097, + "step": 1375 + }, + { + "completion_length": 1024.0, + "epoch": 0.304799768087133, + "grad_norm": 11.760360030790453, + "kl": 4.318035888671875, + "learning_rate": 1.7551984151408363e-05, + "loss": 0.173, + "reward": 1.8561431474983692, + "reward_std": 0.20155350471841302, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.035523528978228566, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9416666679084301, + "step": 1380 + }, + { + "completion_length": 1024.0, + "epoch": 0.3059041150729559, + "grad_norm": 0.7829414634076728, + "kl": 0.50015869140625, + "learning_rate": 1.7526654365312222e-05, + "loss": 0.0201, + "reward": 1.807436482422054, + "reward_std": 0.21474389426180096, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.038396865250979316, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9208333337679505, + "step": 1385 + }, + { + "completion_length": 1024.0, + "epoch": 0.30700846205877885, + "grad_norm": 0.3526252407355771, + "kl": 0.42960205078125, + "learning_rate": 1.750121268718951e-05, + "loss": 0.0172, + "reward": 1.7724484391510487, + "reward_std": 0.3056804820618481, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06088490542335308, + "rewards/format_reward": 0.9125, + "rewards/reasoning_steps_reward": 0.9208333365619182, + "step": 1390 + }, + { + "completion_length": 1024.0, + "epoch": 0.3081128090446018, + "grad_norm": 0.8538212226535026, + "kl": 0.447174072265625, + "learning_rate": 1.7475659495258864e-05, + "loss": 0.0179, + "reward": 1.1615405725315213, + "reward_std": 0.3998776563397314, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1238761033207993, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.8229166688397527, + "step": 1395 + }, + { + "completion_length": 1024.0, + "epoch": 0.30921715603042477, + "grad_norm": 0.22647138061338717, + "kl": 0.1839111328125, + "learning_rate": 1.7449995169396693e-05, + "loss": 0.0074, + "reward": 0.598058795183897, + "reward_std": 0.45160960222401625, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17277454470386147, + "rewards/format_reward": 0.13125, + "rewards/reasoning_steps_reward": 0.6395833432674408, + "step": 1400 + }, + { + "epoch": 0.30921715603042477, + "eval_completion_length": 1024.0, + "eval_kl": 0.114072265625, + "eval_loss": 0.004564680624753237, + "eval_reward": 0.152089421749115, + "eval_reward_std": 0.1883784442592878, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.23457725435495377, + "eval_rewards/format_reward": 0.03, + "eval_rewards/reasoning_steps_reward": 0.3566666767001152, + "eval_runtime": 206.7244, + "eval_samples_per_second": 0.479, + "eval_steps_per_second": 0.121, + "step": 1400 + }, + { + "completion_length": 1024.0, + "epoch": 0.3103215030162477, + "grad_norm": 0.1914184381458074, + "kl": 0.1725677490234375, + "learning_rate": 1.7424220091131536e-05, + "loss": 0.0069, + "reward": 0.0531884940341115, + "reward_std": 0.23593127114654636, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.20722817881032823, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.18541666995733977, + "step": 1405 + }, + { + "completion_length": 1019.25, + "epoch": 0.3114258500020706, + "grad_norm": 0.18859472672382957, + "kl": 0.270391845703125, + "learning_rate": 1.739833464363838e-05, + "loss": 0.0108, + "reward": 0.07014747215434909, + "reward_std": 0.2072645182282585, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.19651920032940212, + "rewards/format_reward": 0.09375, + "rewards/reasoning_steps_reward": 0.17291667088866233, + "step": 1410 + }, + { + "completion_length": 1024.0, + "epoch": 0.3125301969878936, + "grad_norm": 0.4524235162500008, + "kl": 0.26759033203125, + "learning_rate": 1.7372339211732988e-05, + "loss": 0.0107, + "reward": 0.767735379934311, + "reward_std": 0.3956183263140701, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09684796158981043, + "rewards/format_reward": 0.25, + "rewards/reasoning_steps_reward": 0.6145833486691117, + "step": 1415 + }, + { + "completion_length": 1024.0, + "epoch": 0.31363454397371654, + "grad_norm": 0.29966595716364663, + "kl": 0.2554931640625, + "learning_rate": 1.734623418186615e-05, + "loss": 0.0102, + "reward": 0.9944251235574484, + "reward_std": 0.3010030138277216, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13890820817177882, + "rewards/format_reward": 0.1875, + "rewards/reasoning_steps_reward": 0.9395833373069763, + "step": 1420 + }, + { + "completion_length": 1024.0, + "epoch": 0.3147388909595395, + "grad_norm": 0.34966167035721935, + "kl": 0.2528564453125, + "learning_rate": 1.7320019942117954e-05, + "loss": 0.0101, + "reward": 1.0877137396484613, + "reward_std": 0.30549203366972505, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13311959272250534, + "rewards/format_reward": 0.23125, + "rewards/reasoning_steps_reward": 0.977083333581686, + "step": 1425 + }, + { + "completion_length": 1024.0, + "epoch": 0.31584323794536245, + "grad_norm": 0.32149028086929365, + "kl": 0.30032958984375, + "learning_rate": 1.729369688219202e-05, + "loss": 0.012, + "reward": 1.3974668875336647, + "reward_std": 0.3835971452834201, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13378311347041744, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 1.0, + "step": 1430 + }, + { + "completion_length": 1024.0, + "epoch": 0.31694758493118536, + "grad_norm": 0.458686181420835, + "kl": 0.269873046875, + "learning_rate": 1.7267265393409684e-05, + "loss": 0.0108, + "reward": 1.7222345098853111, + "reward_std": 0.2661111972852268, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11526548723049927, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 1.0, + "step": 1435 + }, + { + "completion_length": 1024.0, + "epoch": 0.3180519319170083, + "grad_norm": 0.4300493124515638, + "kl": 0.28525390625, + "learning_rate": 1.7240725868704218e-05, + "loss": 0.0114, + "reward": 1.9266040623188019, + "reward_std": 0.16214110484579577, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.0754792769512278, + "rewards/format_reward": 0.9875, + "rewards/reasoning_steps_reward": 0.995833333581686, + "step": 1440 + }, + { + "completion_length": 1024.0, + "epoch": 0.31915627890283127, + "grad_norm": 0.42969892888824873, + "kl": 0.2861572265625, + "learning_rate": 1.7214078702614946e-05, + "loss": 0.0114, + "reward": 2.032954090833664, + "reward_std": 0.19149340623989702, + "rewards/accuracy_reward": 0.05625, + "rewards/cosine_scaled_reward": -0.012879242049530148, + "rewards/format_reward": 1.0, + "rewards/reasoning_steps_reward": 0.989583333581686, + "step": 1445 + }, + { + "completion_length": 1024.0, + "epoch": 0.3202606258886542, + "grad_norm": 0.49367663303009623, + "kl": 0.27318115234375, + "learning_rate": 1.7187324291281423e-05, + "loss": 0.0109, + "reward": 1.9512878715991975, + "reward_std": 0.1059367892348746, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.040378800382313784, + "rewards/format_reward": 0.9875, + "rewards/reasoning_steps_reward": 0.9916666686534882, + "step": 1450 + }, + { + "completion_length": 1024.0, + "epoch": 0.3213649728744772, + "grad_norm": 0.3778408957712114, + "kl": 0.32586669921875, + "learning_rate": 1.71604630324375e-05, + "loss": 0.013, + "reward": 1.9965640038251877, + "reward_std": 0.22204069324834563, + "rewards/accuracy_reward": 0.05, + "rewards/cosine_scaled_reward": -0.001352671076892875, + "rewards/format_reward": 0.9875, + "rewards/reasoning_steps_reward": 0.9604166693985462, + "step": 1455 + }, + { + "completion_length": 1024.0, + "epoch": 0.3224693198603001, + "grad_norm": 0.39152943132673057, + "kl": 0.3145263671875, + "learning_rate": 1.7133495325405448e-05, + "loss": 0.0126, + "reward": 1.9360227391123772, + "reward_std": 0.2746580336162879, + "rewards/accuracy_reward": 0.05, + "rewards/cosine_scaled_reward": -0.013977252056065481, + "rewards/format_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9000000044703483, + "step": 1460 + }, + { + "completion_length": 1024.0, + "epoch": 0.32357366684612304, + "grad_norm": 0.5058134430224523, + "kl": 0.33702392578125, + "learning_rate": 1.7106421571090003e-05, + "loss": 0.0135, + "reward": 1.85144801735878, + "reward_std": 0.2626168250788396, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.033968651014947684, + "rewards/format_reward": 1.0, + "rewards/reasoning_steps_reward": 0.8604166712611914, + "step": 1465 + }, + { + "completion_length": 1024.0, + "epoch": 0.324678013831946, + "grad_norm": 0.45638817493378453, + "kl": 0.34488525390625, + "learning_rate": 1.7079242171972417e-05, + "loss": 0.0138, + "reward": 1.705393605818972, + "reward_std": 0.25409652892012674, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.04668973356310744, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.8895833380520344, + "step": 1470 + }, + { + "completion_length": 1024.0, + "epoch": 0.32578236081776896, + "grad_norm": 0.31595014246293646, + "kl": 0.23759765625, + "learning_rate": 1.705195753210446e-05, + "loss": 0.0095, + "reward": 1.4653432246297597, + "reward_std": 0.33772912265012567, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.072156780314981, + "rewards/format_reward": 0.56875, + "rewards/reasoning_steps_reward": 0.962500001117587, + "step": 1475 + }, + { + "completion_length": 1024.0, + "epoch": 0.3268867078035919, + "grad_norm": 0.3387234465375613, + "kl": 0.21739501953125, + "learning_rate": 1.7024568057102423e-05, + "loss": 0.0087, + "reward": 1.2381744548678397, + "reward_std": 0.3804714563237212, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.07015887794332229, + "rewards/format_reward": 0.33125, + "rewards/reasoning_steps_reward": 0.964583334326744, + "step": 1480 + }, + { + "completion_length": 1024.0, + "epoch": 0.3279910547894148, + "grad_norm": 0.3635919256435814, + "kl": 0.290997314453125, + "learning_rate": 1.6997074154141097e-05, + "loss": 0.0116, + "reward": 1.2281988142058253, + "reward_std": 0.522791172178404, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.08638452111918013, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.8833333363756537, + "step": 1485 + }, + { + "completion_length": 1024.0, + "epoch": 0.3290954017752378, + "grad_norm": 0.31289390902077713, + "kl": 0.413104248046875, + "learning_rate": 1.69694762319477e-05, + "loss": 0.0165, + "reward": 1.2244404914788902, + "reward_std": 0.5740925252861416, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.05264285028388258, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.745833340473473, + "step": 1490 + }, + { + "completion_length": 1019.89375, + "epoch": 0.33019974876106073, + "grad_norm": 0.2543210545589851, + "kl": 0.35384521484375, + "learning_rate": 1.694177470079581e-05, + "loss": 0.0142, + "reward": 1.278636990953237, + "reward_std": 0.653353753479314, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.08802968083273299, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.7729166712611913, + "step": 1495 + }, + { + "completion_length": 1024.0, + "epoch": 0.3313040957468837, + "grad_norm": 0.2000090354174927, + "kl": 0.285101318359375, + "learning_rate": 1.6913969972499272e-05, + "loss": 0.0114, + "reward": 1.4027412496507168, + "reward_std": 0.6077661401930528, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12017541860113852, + "rewards/format_reward": 0.69375, + "rewards/reasoning_steps_reward": 0.8291666712611914, + "step": 1500 + }, + { + "epoch": 0.3313040957468837, + "eval_completion_length": 1024.0, + "eval_kl": 3.11359375, + "eval_loss": 0.1255832314491272, + "eval_reward": 1.5576036548614502, + "eval_reward_std": 0.4220656427741051, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.13572968773543834, + "eval_rewards/format_reward": 0.79, + "eval_rewards/reasoning_steps_reward": 0.9033333349227906, + "eval_runtime": 202.5651, + "eval_samples_per_second": 0.489, + "eval_steps_per_second": 0.123, + "step": 1500 + }, + { + "completion_length": 1024.0, + "epoch": 0.33240844273270664, + "grad_norm": 0.18666628132889443, + "kl": 0.2407806396484375, + "learning_rate": 1.688606246040607e-05, + "loss": 0.0096, + "reward": 1.5353901420719922, + "reward_std": 0.3581777959698229, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1312765258422587, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.8791666684672237, + "step": 1505 + }, + { + "completion_length": 1024.0, + "epoch": 0.33351278971852955, + "grad_norm": 0.15057205371474208, + "kl": 0.2106201171875, + "learning_rate": 1.6858052579392182e-05, + "loss": 0.0084, + "reward": 1.6789043765515088, + "reward_std": 0.3242158696106344, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09817895484156906, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.9083333365619183, + "step": 1510 + }, + { + "completion_length": 1024.0, + "epoch": 0.3346171367043525, + "grad_norm": 0.15908647576995252, + "kl": 0.26219482421875, + "learning_rate": 1.682994074585541e-05, + "loss": 0.0105, + "reward": 1.6573340767994522, + "reward_std": 0.33736471326956236, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.09058259856828954, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.8979166688397526, + "step": 1515 + }, + { + "completion_length": 1024.0, + "epoch": 0.33572148369017546, + "grad_norm": 0.15174865851665464, + "kl": 0.321112060546875, + "learning_rate": 1.6801727377709195e-05, + "loss": 0.0128, + "reward": 1.6820120507851244, + "reward_std": 0.3425358484266326, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10132128554250812, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9208333363756538, + "step": 1520 + }, + { + "completion_length": 1024.0, + "epoch": 0.3368258306759984, + "grad_norm": 1.60136854765388, + "kl": 0.7475982666015625, + "learning_rate": 1.6773412894376404e-05, + "loss": 0.0299, + "reward": 1.664339251909405, + "reward_std": 0.36049082253084636, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08149407562686975, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.9270833354443312, + "step": 1525 + }, + { + "completion_length": 1019.6375, + "epoch": 0.3379301776618213, + "grad_norm": 749.4573047515579, + "kl": 22.88638916015625, + "learning_rate": 1.674499771678309e-05, + "loss": 0.9147, + "reward": 1.6466619638726114, + "reward_std": 0.3481345805644196, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07000470066104754, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.9416666677221656, + "step": 1530 + }, + { + "completion_length": 1024.0, + "epoch": 0.3390345246476443, + "grad_norm": 0.21658147441883935, + "kl": 0.3058441162109375, + "learning_rate": 1.6716482267352234e-05, + "loss": 0.0123, + "reward": 1.7539634361863137, + "reward_std": 0.3016778468489065, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06686989046866074, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.9520833343267441, + "step": 1535 + }, + { + "completion_length": 1012.55, + "epoch": 0.34013887163346723, + "grad_norm": 461.5916103955411, + "kl": 40.589141845703125, + "learning_rate": 1.6687866969997483e-05, + "loss": 1.6231, + "reward": 1.6885167896049098, + "reward_std": 0.3414273451831832, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07606654057235573, + "rewards/format_reward": 0.85625, + "rewards/reasoning_steps_reward": 0.9083333358168602, + "step": 1540 + }, + { + "completion_length": 1023.0, + "epoch": 0.3412432186192902, + "grad_norm": 5.713861291488285, + "kl": 1.279107666015625, + "learning_rate": 1.665915225011681e-05, + "loss": 0.0512, + "reward": 1.5734828183427454, + "reward_std": 0.4893728211319285, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0848505121160997, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.8770833380520344, + "step": 1545 + }, + { + "completion_length": 1017.9, + "epoch": 0.34234756560511315, + "grad_norm": 44.99030930635304, + "kl": 6.491766357421875, + "learning_rate": 1.663033853458624e-05, + "loss": 0.2599, + "reward": 1.565668173879385, + "reward_std": 0.4733093095805089, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0822484963136958, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.8229166740551591, + "step": 1550 + }, + { + "completion_length": 995.39375, + "epoch": 0.34345191259093605, + "grad_norm": 30.752879788425222, + "kl": 1.7755828857421876, + "learning_rate": 1.660142625175346e-05, + "loss": 0.071, + "reward": 1.2021763553842901, + "reward_std": 0.407320237372187, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10199031746979018, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.4916666788980365, + "step": 1555 + }, + { + "completion_length": 1016.3625, + "epoch": 0.344556259576759, + "grad_norm": 4.863397224735704, + "kl": 3.19599609375, + "learning_rate": 1.6572415831431466e-05, + "loss": 0.1281, + "reward": 1.0388813458383084, + "reward_std": 0.4170527165522799, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10278532494285172, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.3229166742414236, + "step": 1560 + }, + { + "completion_length": 1016.2, + "epoch": 0.34566060656258196, + "grad_norm": 2.693390293032911, + "kl": 1.61478271484375, + "learning_rate": 1.6543307704892196e-05, + "loss": 0.0647, + "reward": 1.0753739975392818, + "reward_std": 0.3687583804799942, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.085042677965248, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.3291666736826301, + "step": 1565 + }, + { + "completion_length": 1024.0, + "epoch": 0.3467649535484049, + "grad_norm": 1.2274816870977052, + "kl": 0.848553466796875, + "learning_rate": 1.6514102304860077e-05, + "loss": 0.034, + "reward": 1.0539750020951033, + "reward_std": 0.43142651255548115, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0876916709530633, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.3166666740551591, + "step": 1570 + }, + { + "completion_length": 1024.0, + "epoch": 0.3478693005342279, + "grad_norm": 0.2539185949163488, + "kl": 0.2436279296875, + "learning_rate": 1.6484800065505627e-05, + "loss": 0.0097, + "reward": 1.1815047591924668, + "reward_std": 0.44734334169734213, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05182858537846187, + "rewards/format_reward": 0.725, + "rewards/reasoning_steps_reward": 0.5083333438262343, + "step": 1575 + }, + { + "completion_length": 1024.0, + "epoch": 0.3489736475200508, + "grad_norm": 0.2697024562923024, + "kl": 78.62169189453125, + "learning_rate": 1.6455401422438984e-05, + "loss": 3.1515, + "reward": 1.8654019482433797, + "reward_std": 0.13044934055097884, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06168138112694806, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9770833380520344, + "step": 1580 + }, + { + "completion_length": 1024.0, + "epoch": 0.35007799450587374, + "grad_norm": 0.6487746875720865, + "kl": 0.379437255859375, + "learning_rate": 1.6425906812703435e-05, + "loss": 0.0152, + "reward": 1.7737618699669837, + "reward_std": 0.2546435080017545, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.08457146059081425, + "rewards/format_reward": 0.85625, + "rewards/reasoning_steps_reward": 0.995833334326744, + "step": 1585 + }, + { + "completion_length": 1024.0, + "epoch": 0.3511823414916967, + "grad_norm": 0.16041873093251072, + "kl": 0.50980224609375, + "learning_rate": 1.6396316674768914e-05, + "loss": 0.0204, + "reward": 1.61807621717453, + "reward_std": 0.3785809577530017, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.11317378490348347, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.9312500022351742, + "step": 1590 + }, + { + "completion_length": 1024.0, + "epoch": 0.35228668847751965, + "grad_norm": 0.16080044982174893, + "kl": 0.220855712890625, + "learning_rate": 1.6366631448525486e-05, + "loss": 0.0088, + "reward": 1.6727639326825738, + "reward_std": 0.4138571529452747, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10848606025974732, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9125000037252903, + "step": 1595 + }, + { + "completion_length": 1024.0, + "epoch": 0.3533910354633426, + "grad_norm": 0.15122244016755843, + "kl": 0.25135498046875, + "learning_rate": 1.6336851575276814e-05, + "loss": 0.0101, + "reward": 1.5872814737260341, + "reward_std": 0.4882094876725205, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1023018532214337, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.8583333350718021, + "step": 1600 + }, + { + "epoch": 0.3533910354633426, + "eval_completion_length": 1024.0, + "eval_kl": 0.257392578125, + "eval_loss": 0.010239919647574425, + "eval_reward": 1.5994415652751923, + "eval_reward_std": 0.5397925276041496, + "eval_rewards/accuracy_reward": 0.005, + "eval_rewards/cosine_scaled_reward": -0.0838917788118124, + "eval_rewards/format_reward": 0.815, + "eval_rewards/reasoning_steps_reward": 0.8633333361148834, + "eval_runtime": 204.9068, + "eval_samples_per_second": 0.483, + "eval_steps_per_second": 0.122, + "step": 1600 + }, + { + "completion_length": 1024.0, + "epoch": 0.3544953824491655, + "grad_norm": 0.15278881164591143, + "kl": 0.215399169921875, + "learning_rate": 1.630697749773359e-05, + "loss": 0.0086, + "reward": 1.6932586930692195, + "reward_std": 0.4409494582350817, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.06715795926138526, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.9104166686534881, + "step": 1605 + }, + { + "completion_length": 1024.0, + "epoch": 0.35559972943498847, + "grad_norm": 0.12847749575672446, + "kl": 0.308056640625, + "learning_rate": 1.627700966000696e-05, + "loss": 0.0123, + "reward": 1.5046606879681348, + "reward_std": 0.5595257567225417, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0911726443493535, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.8333333367481828, + "step": 1610 + }, + { + "completion_length": 1024.0, + "epoch": 0.3567040764208114, + "grad_norm": 0.12461377449215352, + "kl": 0.287921142578125, + "learning_rate": 1.6246948507601915e-05, + "loss": 0.0115, + "reward": 1.4486139392480255, + "reward_std": 0.5760282400807227, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07846938786969986, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.8208333348855377, + "step": 1615 + }, + { + "completion_length": 1024.0, + "epoch": 0.3578084234066344, + "grad_norm": 0.03891215118141946, + "kl": 0.10467529296875, + "learning_rate": 1.621679448741067e-05, + "loss": 0.0042, + "reward": 1.7575967930257321, + "reward_std": 0.3055588886391433, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01323653027502587, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.977083333581686, + "step": 1620 + }, + { + "completion_length": 1024.0, + "epoch": 0.35891277039245734, + "grad_norm": 0.0617309421173978, + "kl": 0.173211669921875, + "learning_rate": 1.618654804770603e-05, + "loss": 0.007, + "reward": 1.7544544816017151, + "reward_std": 0.3281066141825704, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.033045521087115046, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.9187500014901161, + "step": 1625 + }, + { + "completion_length": 1024.0, + "epoch": 0.36001711737828024, + "grad_norm": 0.11475484692618709, + "kl": 0.14339599609375, + "learning_rate": 1.615620963813471e-05, + "loss": 0.0057, + "reward": 1.8531198611482977, + "reward_std": 0.16819539086868646, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.026046801151460387, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9479166677221655, + "step": 1630 + }, + { + "completion_length": 1024.0, + "epoch": 0.3611214643641032, + "grad_norm": 1.7098540572139886, + "kl": 0.311505126953125, + "learning_rate": 1.6125779709710668e-05, + "loss": 0.0125, + "reward": 1.846768843382597, + "reward_std": 0.21533012690188116, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02406449381742277, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9458333350718021, + "step": 1635 + }, + { + "completion_length": 1024.0, + "epoch": 0.36222581134992615, + "grad_norm": 0.10500065061318996, + "kl": 0.172625732421875, + "learning_rate": 1.6095258714808373e-05, + "loss": 0.0069, + "reward": 1.829521244764328, + "reward_std": 0.2395985798266338, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03297876436563456, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.937500000745058, + "step": 1640 + }, + { + "completion_length": 1024.0, + "epoch": 0.3633301583357491, + "grad_norm": 0.12252448809449357, + "kl": 0.18194580078125, + "learning_rate": 1.606464710715612e-05, + "loss": 0.0073, + "reward": 1.8227525861933827, + "reward_std": 0.21451298040622077, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03141408516330557, + "rewards/format_reward": 0.9125, + "rewards/reasoning_steps_reward": 0.9416666684672237, + "step": 1645 + }, + { + "completion_length": 1024.0, + "epoch": 0.364434505321572, + "grad_norm": 0.7445776320336748, + "kl": 0.30133056640625, + "learning_rate": 1.603394534182925e-05, + "loss": 0.012, + "reward": 1.6621078178286552, + "reward_std": 0.27496500448987715, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05872552102773625, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.895833333954215, + "step": 1650 + }, + { + "completion_length": 1022.99375, + "epoch": 0.36553885230739497, + "grad_norm": 0.1955073245601333, + "kl": 0.409490966796875, + "learning_rate": 1.600315387524339e-05, + "loss": 0.0164, + "reward": 1.3730967482551932, + "reward_std": 0.65049932035663, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0977365891343652, + "rewards/format_reward": 0.65, + "rewards/reasoning_steps_reward": 0.8208333378657698, + "step": 1655 + }, + { + "completion_length": 1024.0, + "epoch": 0.3666431992932179, + "grad_norm": 0.08757147668498837, + "kl": 0.2857421875, + "learning_rate": 1.5972273165147697e-05, + "loss": 0.0115, + "reward": 1.3178067412227392, + "reward_std": 0.6414556607540363, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0717766029327322, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.8458333369344473, + "step": 1660 + }, + { + "completion_length": 1024.0, + "epoch": 0.3677475462790409, + "grad_norm": 0.10077064026422362, + "kl": 0.194622802734375, + "learning_rate": 1.5941303670618018e-05, + "loss": 0.0078, + "reward": 1.501453479193151, + "reward_std": 0.47270524825935356, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.040213195209798866, + "rewards/format_reward": 0.61875, + "rewards/reasoning_steps_reward": 0.9166666686534881, + "step": 1665 + }, + { + "completion_length": 1024.0, + "epoch": 0.36885189326486384, + "grad_norm": 0.09456211756292313, + "kl": 0.209967041015625, + "learning_rate": 1.591024585205007e-05, + "loss": 0.0084, + "reward": 1.5002569787204265, + "reward_std": 0.47431263369280713, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.051826363683358065, + "rewards/format_reward": 0.6625, + "rewards/reasoning_steps_reward": 0.8895833346992731, + "step": 1670 + }, + { + "completion_length": 1024.0, + "epoch": 0.36995624025068674, + "grad_norm": 0.10501416195016813, + "kl": 0.20081787109375, + "learning_rate": 1.587910017115262e-05, + "loss": 0.008, + "reward": 1.4864585481584072, + "reward_std": 0.5479732289919411, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0489581265635934, + "rewards/format_reward": 0.64375, + "rewards/reasoning_steps_reward": 0.8916666679084301, + "step": 1675 + }, + { + "completion_length": 1024.0, + "epoch": 0.3710605872365097, + "grad_norm": 0.07587020227662776, + "kl": 0.210357666015625, + "learning_rate": 1.5847867090940602e-05, + "loss": 0.0084, + "reward": 1.6044495470821858, + "reward_std": 0.4506511878987567, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05596711239659271, + "rewards/format_reward": 0.74375, + "rewards/reasoning_steps_reward": 0.9166666686534881, + "step": 1680 + }, + { + "completion_length": 1024.0, + "epoch": 0.37216493422233266, + "grad_norm": 0.13633268674501928, + "kl": 0.245611572265625, + "learning_rate": 1.5816547075728227e-05, + "loss": 0.0098, + "reward": 1.5000795137137175, + "reward_std": 0.4664591760686562, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06450380514434073, + "rewards/format_reward": 0.6625, + "rewards/reasoning_steps_reward": 0.9020833361893892, + "step": 1685 + }, + { + "completion_length": 1024.0, + "epoch": 0.3732692812081556, + "grad_norm": 0.1289716587802781, + "kl": 0.2935333251953125, + "learning_rate": 1.5785140591122107e-05, + "loss": 0.0117, + "reward": 1.4516793651506306, + "reward_std": 0.4440517393491973, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08790395214164164, + "rewards/format_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.8520833365619183, + "step": 1690 + }, + { + "completion_length": 1024.0, + "epoch": 0.37437362819397857, + "grad_norm": 0.08366432741697126, + "kl": 0.2215484619140625, + "learning_rate": 1.57536481040143e-05, + "loss": 0.0089, + "reward": 1.4857848590239882, + "reward_std": 0.5149718122185731, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05796512944652932, + "rewards/format_reward": 0.6375, + "rewards/reasoning_steps_reward": 0.9062500024214387, + "step": 1695 + }, + { + "completion_length": 1024.0, + "epoch": 0.37547797517980147, + "grad_norm": 0.12095767601763074, + "kl": 0.2228729248046875, + "learning_rate": 1.57220700825754e-05, + "loss": 0.0089, + "reward": 1.5507157089188695, + "reward_std": 0.5246146993404182, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.051367602551181335, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.8895833346992731, + "step": 1700 + }, + { + "epoch": 0.37547797517980147, + "eval_completion_length": 1024.0, + "eval_kl": 0.376298828125, + "eval_loss": 0.015111659653484821, + "eval_reward": 1.4007956439256668, + "eval_reward_std": 0.5410330631426041, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.07253768887836486, + "eval_rewards/format_reward": 0.635, + "eval_rewards/reasoning_steps_reward": 0.8383333373069763, + "eval_runtime": 203.5513, + "eval_samples_per_second": 0.486, + "eval_steps_per_second": 0.123, + "step": 1700 + }, + { + "completion_length": 1024.0, + "epoch": 0.37658232216562443, + "grad_norm": 0.1043657866474859, + "kl": 0.22974853515625, + "learning_rate": 1.5690406996247557e-05, + "loss": 0.0092, + "reward": 1.4823718063533307, + "reward_std": 0.5257471238997823, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05096150914614554, + "rewards/format_reward": 0.64375, + "rewards/reasoning_steps_reward": 0.8895833350718021, + "step": 1705 + }, + { + "completion_length": 1021.3, + "epoch": 0.3776866691514474, + "grad_norm": 0.1450103124132454, + "kl": 222.7259979248047, + "learning_rate": 1.5658659315737505e-05, + "loss": 8.9174, + "reward": 1.3179884374141693, + "reward_std": 0.6270647759190069, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.08826154011912876, + "rewards/format_reward": 0.58125, + "rewards/reasoning_steps_reward": 0.8187500039115548, + "step": 1710 + }, + { + "completion_length": 1024.0, + "epoch": 0.37879101613727034, + "grad_norm": 0.4320971163586487, + "kl": 2.19267578125, + "learning_rate": 1.5626827513009565e-05, + "loss": 0.0878, + "reward": 1.3571282140910625, + "reward_std": 0.6224009027850116, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10328842537855962, + "rewards/format_reward": 0.6375, + "rewards/reasoning_steps_reward": 0.822916672565043, + "step": 1715 + }, + { + "completion_length": 1024.0, + "epoch": 0.3798953631230933, + "grad_norm": 0.10636800980618685, + "kl": 1.1803497314453124, + "learning_rate": 1.5594912061278627e-05, + "loss": 0.0472, + "reward": 1.4040614984929563, + "reward_std": 0.45519052888548683, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08552179490507114, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.820833338983357, + "step": 1720 + }, + { + "completion_length": 1019.55, + "epoch": 0.3809997101089162, + "grad_norm": 2.383636335628273, + "kl": 0.4132568359375, + "learning_rate": 1.5562913435003113e-05, + "loss": 0.0165, + "reward": 1.562475298345089, + "reward_std": 0.47447351760647505, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05210799162014155, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.8583333352580667, + "step": 1725 + }, + { + "completion_length": 1024.0, + "epoch": 0.38210405709473916, + "grad_norm": 14.647822546025663, + "kl": 0.928717041015625, + "learning_rate": 1.5530832109877932e-05, + "loss": 0.0372, + "reward": 1.7276832605712116, + "reward_std": 0.3270437012415641, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03065001876966562, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9208333371207118, + "step": 1730 + }, + { + "completion_length": 1024.0, + "epoch": 0.3832084040805621, + "grad_norm": 9.751639098465139, + "kl": 8.048651123046875, + "learning_rate": 1.5498668562827397e-05, + "loss": 0.3212, + "reward": 1.7738482117652894, + "reward_std": 0.29785865079848006, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.026151732992730103, + "rewards/format_reward": 0.85625, + "rewards/reasoning_steps_reward": 0.9437500014901161, + "step": 1735 + }, + { + "completion_length": 1024.0, + "epoch": 0.38431275106638507, + "grad_norm": 29.60838556403236, + "kl": 1.1584747314453125, + "learning_rate": 1.5466423271998144e-05, + "loss": 0.0463, + "reward": 1.8169302485883236, + "reward_std": 0.2547533855103211, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.022653027176784236, + "rewards/format_reward": 0.8875, + "rewards/reasoning_steps_reward": 0.9520833358168602, + "step": 1740 + }, + { + "completion_length": 1024.0, + "epoch": 0.38541709805220803, + "grad_norm": 1.493106786519115, + "kl": 1.753643798828125, + "learning_rate": 1.5434096716752023e-05, + "loss": 0.0703, + "reward": 1.736639281362295, + "reward_std": 0.2719473097446098, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0321106720060925, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.9375000029802323, + "step": 1745 + }, + { + "completion_length": 1012.91875, + "epoch": 0.38652144503803093, + "grad_norm": 0.2769839530384672, + "kl": 3.68170166015625, + "learning_rate": 1.5401689377658962e-05, + "loss": 0.1478, + "reward": 1.6126683823764325, + "reward_std": 0.44903155848760434, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.031081585782339972, + "rewards/format_reward": 0.71875, + "rewards/reasoning_steps_reward": 0.9250000022351742, + "step": 1750 + }, + { + "completion_length": 1024.0, + "epoch": 0.3876257920238539, + "grad_norm": 0.901459553106272, + "kl": 2.446429443359375, + "learning_rate": 1.536920173648984e-05, + "loss": 0.0978, + "reward": 1.5782335847616196, + "reward_std": 0.393158163732096, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03218308122377493, + "rewards/format_reward": 0.675, + "rewards/reasoning_steps_reward": 0.9354166679084301, + "step": 1755 + }, + { + "completion_length": 1019.58125, + "epoch": 0.38873013900967684, + "grad_norm": 11.114883143058288, + "kl": 2.0950164794921875, + "learning_rate": 1.53366342762093e-05, + "loss": 0.0839, + "reward": 1.3569506576284767, + "reward_std": 0.4249624714701895, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08054934682236307, + "rewards/format_reward": 0.525, + "rewards/reasoning_steps_reward": 0.9125000055879354, + "step": 1760 + }, + { + "completion_length": 1019.2375, + "epoch": 0.3898344859954998, + "grad_norm": 2.9423721339278655, + "kl": 1.0611572265625, + "learning_rate": 1.5303987480968607e-05, + "loss": 0.0425, + "reward": 1.4560162041336298, + "reward_std": 0.5089367198333548, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06690047315787524, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.9354166716337204, + "step": 1765 + }, + { + "completion_length": 1020.55625, + "epoch": 0.39093883298132276, + "grad_norm": 1.1833839877582364, + "kl": 6.795245361328125, + "learning_rate": 1.5271261836098403e-05, + "loss": 0.2715, + "reward": 1.4157525110989808, + "reward_std": 0.5201515946177097, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07174750183366996, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.9000000050291419, + "step": 1770 + }, + { + "completion_length": 1019.3375, + "epoch": 0.39204317996714566, + "grad_norm": 2.2476471036032795, + "kl": 1.54696044921875, + "learning_rate": 1.5238457828101531e-05, + "loss": 0.062, + "reward": 1.6591543201357126, + "reward_std": 0.3638195994876696, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.040845706693653484, + "rewards/format_reward": 0.76875, + "rewards/reasoning_steps_reward": 0.9312500029802322, + "step": 1775 + }, + { + "completion_length": 1005.8125, + "epoch": 0.3931475269529686, + "grad_norm": 0.5559651125273167, + "kl": 3.158740234375, + "learning_rate": 1.520557594464579e-05, + "loss": 0.1264, + "reward": 1.5916038572788238, + "reward_std": 0.39986591760807644, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04172948705709132, + "rewards/format_reward": 0.68125, + "rewards/reasoning_steps_reward": 0.9520833365619182, + "step": 1780 + }, + { + "completion_length": 1024.0, + "epoch": 0.3942518739387916, + "grad_norm": 0.7921314371486445, + "kl": 0.52529296875, + "learning_rate": 1.5172616674556673e-05, + "loss": 0.021, + "reward": 1.6518889758735895, + "reward_std": 0.40095153995195004, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03561103413817364, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.9250000020489096, + "step": 1785 + }, + { + "completion_length": 1024.0, + "epoch": 0.39535622092461453, + "grad_norm": 198.98423683140666, + "kl": 11.310357666015625, + "learning_rate": 1.5139580507810118e-05, + "loss": 0.4515, + "reward": 1.6622562702745198, + "reward_std": 0.3482717312890372, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03566040461216744, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9479166693985462, + "step": 1790 + }, + { + "completion_length": 1024.0, + "epoch": 0.39646056791043743, + "grad_norm": 0.24822742846638876, + "kl": 7.70560302734375, + "learning_rate": 1.510646793552522e-05, + "loss": 0.3082, + "reward": 1.687724581360817, + "reward_std": 0.36951222630941627, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.037275422150722194, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.9500000029802322, + "step": 1795 + }, + { + "completion_length": 1017.25, + "epoch": 0.3975649148962604, + "grad_norm": 4.99816542290855, + "kl": 3.644091796875, + "learning_rate": 1.5073279449956916e-05, + "loss": 0.1455, + "reward": 1.5804516039788723, + "reward_std": 0.4355189655091465, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05288173892857344, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.9270833369344473, + "step": 1800 + }, + { + "epoch": 0.3975649148962604, + "eval_completion_length": 1024.0, + "eval_kl": 1.33404296875, + "eval_loss": 0.05353707820177078, + "eval_reward": 1.6259092497825622, + "eval_reward_std": 0.44313232216529286, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.05075742355262264, + "eval_rewards/format_reward": 0.735, + "eval_rewards/reasoning_steps_reward": 0.9416666674613953, + "eval_runtime": 200.6778, + "eval_samples_per_second": 0.493, + "eval_steps_per_second": 0.125, + "step": 1800 + }, + { + "completion_length": 1016.4625, + "epoch": 0.39866926188208335, + "grad_norm": 0.8216823589718119, + "kl": 1.52921142578125, + "learning_rate": 1.5040015544488689e-05, + "loss": 0.0613, + "reward": 1.580186554789543, + "reward_std": 0.4473959618730078, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05106344955359532, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.9312500037252903, + "step": 1805 + }, + { + "completion_length": 1024.0, + "epoch": 0.3997736088679063, + "grad_norm": 0.27497954194074575, + "kl": 1.869049072265625, + "learning_rate": 1.5006676713625217e-05, + "loss": 0.0748, + "reward": 1.7112540045753122, + "reward_std": 0.3451619381986461, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04707933118999676, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.9520833350718021, + "step": 1810 + }, + { + "completion_length": 1022.7875, + "epoch": 0.40087795585372926, + "grad_norm": 0.1932843975023837, + "kl": 0.42769775390625, + "learning_rate": 1.4973263452985023e-05, + "loss": 0.0171, + "reward": 1.7407132534310221, + "reward_std": 0.2509838029066316, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.038453425541553087, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9416666677221656, + "step": 1815 + }, + { + "completion_length": 1024.0, + "epoch": 0.40198230283955216, + "grad_norm": 13.375482247028472, + "kl": 5.430291748046875, + "learning_rate": 1.493977625929312e-05, + "loss": 0.2177, + "reward": 1.8173677779734134, + "reward_std": 0.25813394066904605, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.024298889722177818, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.9479166686534881, + "step": 1820 + }, + { + "completion_length": 1024.0, + "epoch": 0.4030866498253751, + "grad_norm": 0.09709265350974022, + "kl": 0.277154541015625, + "learning_rate": 1.4906215630373606e-05, + "loss": 0.0111, + "reward": 1.8708434641361236, + "reward_std": 0.18241405415017767, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.010406546228927028, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.9750000014901161, + "step": 1825 + }, + { + "completion_length": 1024.0, + "epoch": 0.4041909968111981, + "grad_norm": 0.9087598758466838, + "kl": 0.90994873046875, + "learning_rate": 1.4872582065142285e-05, + "loss": 0.0363, + "reward": 1.7198083013296128, + "reward_std": 0.36085402057337035, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04060836032874136, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.9354166686534882, + "step": 1830 + }, + { + "completion_length": 1024.0, + "epoch": 0.40529534379702103, + "grad_norm": 19.71139802721317, + "kl": 1.30902099609375, + "learning_rate": 1.4838876063599234e-05, + "loss": 0.0524, + "reward": 1.633510524313897, + "reward_std": 0.41481399162443894, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.058156153634925545, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9104166697710753, + "step": 1835 + }, + { + "completion_length": 1024.0, + "epoch": 0.406399690782844, + "grad_norm": 0.20736892182413091, + "kl": 2.954833984375, + "learning_rate": 1.480509812682138e-05, + "loss": 0.118, + "reward": 1.6065174978226424, + "reward_std": 0.4605361075816063, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06014917756460818, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.8791666697710753, + "step": 1840 + }, + { + "completion_length": 1024.0, + "epoch": 0.4075040377686669, + "grad_norm": 0.12490512211432611, + "kl": 1.48941650390625, + "learning_rate": 1.4771248756955042e-05, + "loss": 0.0597, + "reward": 1.339280641824007, + "reward_std": 0.6381314028223641, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08363602570825605, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.7979166705161334, + "step": 1845 + }, + { + "completion_length": 1024.0, + "epoch": 0.40860838475448985, + "grad_norm": 0.2179059890414028, + "kl": 0.3721527099609375, + "learning_rate": 1.4737328457208471e-05, + "loss": 0.0149, + "reward": 1.6670774094760419, + "reward_std": 0.4513689050038465, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04958925420360174, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9041666708886623, + "step": 1850 + }, + { + "completion_length": 1018.66875, + "epoch": 0.4097127317403128, + "grad_norm": 0.79455251719469, + "kl": 0.32587890625, + "learning_rate": 1.4703337731844374e-05, + "loss": 0.0131, + "reward": 1.7391955329105258, + "reward_std": 0.3125075527260378, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03372114356534439, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.9229166684672236, + "step": 1855 + }, + { + "completion_length": 1018.4625, + "epoch": 0.41081707872613576, + "grad_norm": 1.1295492282165498, + "kl": 0.2130218505859375, + "learning_rate": 1.4669277086172406e-05, + "loss": 0.0086, + "reward": 1.7530269030481578, + "reward_std": 0.22123022362970915, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.036556454593664967, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.9208333356305957, + "step": 1860 + }, + { + "completion_length": 1021.48125, + "epoch": 0.4119214257119587, + "grad_norm": 0.10472402689364621, + "kl": 0.4535247802734375, + "learning_rate": 1.4635147026541674e-05, + "loss": 0.0182, + "reward": 1.853439299762249, + "reward_std": 0.2057221634101097, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.021560727976611814, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9500000007450581, + "step": 1865 + }, + { + "completion_length": 1011.59375, + "epoch": 0.4130257726977816, + "grad_norm": 0.060145090948972385, + "kl": 1.786566162109375, + "learning_rate": 1.4600948060333187e-05, + "loss": 0.0715, + "reward": 1.6970781801268457, + "reward_std": 0.37393540782250057, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05500518054032, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9145833365619183, + "step": 1870 + }, + { + "completion_length": 1018.5125, + "epoch": 0.4141301196836046, + "grad_norm": 0.07160628265153375, + "kl": 0.150506591796875, + "learning_rate": 1.4566680695952333e-05, + "loss": 0.0061, + "reward": 1.849437363818288, + "reward_std": 0.17379919528120807, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.021395997072977478, + "rewards/format_reward": 0.91875, + "rewards/reasoning_steps_reward": 0.9520833335816861, + "step": 1875 + }, + { + "completion_length": 1017.94375, + "epoch": 0.41523446666942754, + "grad_norm": 0.3061351568984248, + "kl": 0.1790069580078125, + "learning_rate": 1.4532345442821323e-05, + "loss": 0.0072, + "reward": 1.8633771307766438, + "reward_std": 0.1917227172017192, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01787287338374881, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9562500014901161, + "step": 1880 + }, + { + "completion_length": 1011.39375, + "epoch": 0.4163388136552505, + "grad_norm": 0.13980206314174293, + "kl": 0.840350341796875, + "learning_rate": 1.4497942811371592e-05, + "loss": 0.0336, + "reward": 1.811388997361064, + "reward_std": 0.18153300940950884, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02819433979511814, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.9583333352580666, + "step": 1885 + }, + { + "completion_length": 1014.46875, + "epoch": 0.41744316064107345, + "grad_norm": 37.89468511386642, + "kl": 3.224212646484375, + "learning_rate": 1.4463473313036241e-05, + "loss": 0.129, + "reward": 1.7037913450971245, + "reward_std": 0.3241650226414777, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03787532943442784, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.92916666790843, + "step": 1890 + }, + { + "completion_length": 999.16875, + "epoch": 0.41854750762689635, + "grad_norm": 0.43770064535885594, + "kl": 1.079486083984375, + "learning_rate": 1.4428937460242417e-05, + "loss": 0.0432, + "reward": 1.4503749491646887, + "reward_std": 0.5806084466651555, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08087505426301504, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.8625000054016709, + "step": 1895 + }, + { + "completion_length": 1012.8125, + "epoch": 0.4196518546127193, + "grad_norm": 0.031069888519680837, + "kl": 0.6485260009765625, + "learning_rate": 1.4394335766403703e-05, + "loss": 0.0259, + "reward": 1.5874255585018546, + "reward_std": 0.530018257148754, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0500744489670069, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8875000035390258, + "step": 1900 + }, + { + "epoch": 0.4196518546127193, + "eval_completion_length": 1015.89, + "eval_kl": 0.38810546875, + "eval_loss": 0.015597357414662838, + "eval_reward": 1.6550358521938324, + "eval_reward_std": 0.4101494722440839, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.049964163267577535, + "eval_rewards/format_reward": 0.805, + "eval_rewards/reasoning_steps_reward": 0.9000000047683716, + "eval_runtime": 200.6466, + "eval_samples_per_second": 0.493, + "eval_steps_per_second": 0.125, + "step": 1900 + }, + { + "completion_length": 1001.575, + "epoch": 0.42075620159854227, + "grad_norm": 2.0124246714328873, + "kl": 1.036248779296875, + "learning_rate": 1.4359668745912472e-05, + "loss": 0.0414, + "reward": 1.6629710331559182, + "reward_std": 0.39515268294885003, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03286230957592693, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9083333373069763, + "step": 1905 + }, + { + "completion_length": 957.78125, + "epoch": 0.4218605485843652, + "grad_norm": 7.0968415445601, + "kl": 161.5555908203125, + "learning_rate": 1.4324936914132255e-05, + "loss": 6.4648, + "reward": 0.8477631491608918, + "reward_std": 0.5642138687988535, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16473685780432845, + "rewards/format_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.5750000026077032, + "step": 1910 + }, + { + "completion_length": 840.0625, + "epoch": 0.4229648955701881, + "grad_norm": 1.6032662050525144, + "kl": 2.6, + "learning_rate": 1.4290140787390083e-05, + "loss": 0.104, + "reward": 0.001272639585658908, + "reward_std": 0.3248372384929098, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22581069722946268, + "rewards/format_reward": 0.05, + "rewards/reasoning_steps_reward": 0.1770833356305957, + "step": 1915 + }, + { + "completion_length": 914.06875, + "epoch": 0.4240692425560111, + "grad_norm": 4.201448980739016, + "kl": 1.5188720703125, + "learning_rate": 1.4255280882968787e-05, + "loss": 0.0607, + "reward": 0.18680339390411974, + "reward_std": 0.412913748028177, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15694661159250245, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.2750000048428774, + "step": 1920 + }, + { + "completion_length": 951.16875, + "epoch": 0.42517358954183404, + "grad_norm": 1.209578741785896, + "kl": 3.43743896484375, + "learning_rate": 1.4220357719099338e-05, + "loss": 0.1374, + "reward": 0.3251433074765373, + "reward_std": 0.4896274733968312, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11235669811055686, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.36875000689178705, + "step": 1925 + }, + { + "completion_length": 1000.65, + "epoch": 0.426277936527657, + "grad_norm": 1.7450178375722907, + "kl": 0.916650390625, + "learning_rate": 1.4185371814953116e-05, + "loss": 0.0367, + "reward": 0.3472979475278407, + "reward_std": 0.4390462203696188, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21311872491462508, + "rewards/format_reward": 0.09375, + "rewards/reasoning_steps_reward": 0.4666666740551591, + "step": 1930 + }, + { + "completion_length": 1012.9375, + "epoch": 0.42738228351347995, + "grad_norm": 0.6809658317149088, + "kl": 2.3167236328125, + "learning_rate": 1.415032369063422e-05, + "loss": 0.0926, + "reward": 0.45425672866404054, + "reward_std": 0.4167201625125017, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.31449327804148197, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.7000000111758709, + "step": 1935 + }, + { + "completion_length": 1019.725, + "epoch": 0.42848663049930286, + "grad_norm": 6.584644008036467, + "kl": 1.9494140625, + "learning_rate": 1.41152138671717e-05, + "loss": 0.0781, + "reward": 0.7717248608358205, + "reward_std": 0.47345116818096356, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.23452514785294626, + "rewards/format_reward": 0.1625, + "rewards/reasoning_steps_reward": 0.8437500104308129, + "step": 1940 + }, + { + "completion_length": 1010.575, + "epoch": 0.4295909774851258, + "grad_norm": 0.9124464101651549, + "kl": 0.98031005859375, + "learning_rate": 1.408004286651185e-05, + "loss": 0.0392, + "reward": 0.9319220932200551, + "reward_std": 0.3040256727119527, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10974457962370252, + "rewards/format_reward": 0.15, + "rewards/reasoning_steps_reward": 0.8916666749864817, + "step": 1945 + }, + { + "completion_length": 1007.06875, + "epoch": 0.43069532447094877, + "grad_norm": 0.8174851082205585, + "kl": 1.955194091796875, + "learning_rate": 1.4044811211510419e-05, + "loss": 0.0784, + "reward": 0.9499879771843552, + "reward_std": 0.41973948137037664, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07292869019135821, + "rewards/format_reward": 0.20625, + "rewards/reasoning_steps_reward": 0.8166666731238366, + "step": 1950 + }, + { + "completion_length": 1011.7875, + "epoch": 0.4317996714567717, + "grad_norm": 9.563287846750592, + "kl": 1.788214111328125, + "learning_rate": 1.4009519425924858e-05, + "loss": 0.0714, + "reward": 0.9242004107683897, + "reward_std": 0.49032472132910243, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06954959582892287, + "rewards/format_reward": 0.23125, + "rewards/reasoning_steps_reward": 0.7625000070780515, + "step": 1955 + }, + { + "completion_length": 1014.0375, + "epoch": 0.4329040184425947, + "grad_norm": 1.1172252423176203, + "kl": 1.506072998046875, + "learning_rate": 1.3974168034406524e-05, + "loss": 0.0602, + "reward": 1.0922672674059868, + "reward_std": 0.6774374388254956, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05981607586077189, + "rewards/format_reward": 0.3625, + "rewards/reasoning_steps_reward": 0.7895833428949117, + "step": 1960 + }, + { + "completion_length": 1015.45, + "epoch": 0.4340083654284176, + "grad_norm": 8.662740397902532, + "kl": 2.05098876953125, + "learning_rate": 1.3938757562492873e-05, + "loss": 0.082, + "reward": 1.074470814689994, + "reward_std": 0.6158715720244914, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.056779195064882514, + "rewards/format_reward": 0.35625, + "rewards/reasoning_steps_reward": 0.7750000122934579, + "step": 1965 + }, + { + "completion_length": 1014.45625, + "epoch": 0.43511271241424054, + "grad_norm": 0.3773294016373424, + "kl": 3.3776611328125, + "learning_rate": 1.3903288536599668e-05, + "loss": 0.1351, + "reward": 0.9905229835072532, + "reward_std": 0.5584212095652674, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0615603672684756, + "rewards/format_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.7395833427086472, + "step": 1970 + }, + { + "completion_length": 1020.29375, + "epoch": 0.4362170594000635, + "grad_norm": 0.4442185959316331, + "kl": 1.775543212890625, + "learning_rate": 1.3867761484013135e-05, + "loss": 0.071, + "reward": 1.1137715804390609, + "reward_std": 0.5982852473727007, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04664510243914037, + "rewards/format_reward": 0.39375, + "rewards/reasoning_steps_reward": 0.7666666774079204, + "step": 1975 + }, + { + "completion_length": 1024.0, + "epoch": 0.43732140638588646, + "grad_norm": 21.47234145361546, + "kl": 2.4731292724609375, + "learning_rate": 1.3832176932882136e-05, + "loss": 0.0989, + "reward": 1.2862569394987076, + "reward_std": 0.5667873994000729, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.051243075182765094, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.8187500070780516, + "step": 1980 + }, + { + "completion_length": 1019.7625, + "epoch": 0.4384257533717094, + "grad_norm": 1.527622097864932, + "kl": 1.788006591796875, + "learning_rate": 1.3796535412210301e-05, + "loss": 0.0715, + "reward": 1.2694264559075237, + "reward_std": 0.5048920283968528, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04515689611821472, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.8145833436399699, + "step": 1985 + }, + { + "completion_length": 1024.0, + "epoch": 0.4395301003575323, + "grad_norm": 4.3138407765683855, + "kl": 0.601263427734375, + "learning_rate": 1.3760837451848193e-05, + "loss": 0.0241, + "reward": 1.3374655573628842, + "reward_std": 0.6142874645546271, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.037534464124564695, + "rewards/format_reward": 0.575, + "rewards/reasoning_steps_reward": 0.8000000083819032, + "step": 1990 + }, + { + "completion_length": 1015.325, + "epoch": 0.4406344473433553, + "grad_norm": 0.22980488757849943, + "kl": 0.72313232421875, + "learning_rate": 1.3725083582485397e-05, + "loss": 0.0289, + "reward": 1.4313670295290648, + "reward_std": 0.5948537336438051, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.016549649066297432, + "rewards/format_reward": 0.6375, + "rewards/reasoning_steps_reward": 0.8104166738688946, + "step": 1995 + }, + { + "completion_length": 1024.0, + "epoch": 0.44173879432917823, + "grad_norm": 10.31697817811464, + "kl": 2.43072509765625, + "learning_rate": 1.3689274335642653e-05, + "loss": 0.0972, + "reward": 1.577266044355929, + "reward_std": 0.5029176145569266, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.022733943232353226, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8500000040978193, + "step": 2000 + }, + { + "epoch": 0.44173879432917823, + "eval_completion_length": 1019.28, + "eval_kl": 0.4328515625, + "eval_loss": 0.017405448481440544, + "eval_reward": 1.6899439215660095, + "eval_reward_std": 0.3815716141184316, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.003389413725956274, + "eval_rewards/format_reward": 0.8, + "eval_rewards/reasoning_steps_reward": 0.8933333384990693, + "eval_runtime": 202.9245, + "eval_samples_per_second": 0.488, + "eval_steps_per_second": 0.123, + "step": 2000 + }, + { + "completion_length": 1018.48125, + "epoch": 0.4428431413150012, + "grad_norm": 0.37629734086009897, + "kl": 0.5832275390625, + "learning_rate": 1.3653410243663953e-05, + "loss": 0.0233, + "reward": 1.7200924716889858, + "reward_std": 0.2776235666715309, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.015324184401330853, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9229166697710752, + "step": 2005 + }, + { + "completion_length": 1020.01875, + "epoch": 0.44394748830082414, + "grad_norm": 0.28952601144092716, + "kl": 1.030401611328125, + "learning_rate": 1.3617491839708614e-05, + "loss": 0.0413, + "reward": 1.639422894269228, + "reward_std": 0.3915938691733004, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012660458464461044, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.8895833376795054, + "step": 2010 + }, + { + "completion_length": 1024.0, + "epoch": 0.44505183528664705, + "grad_norm": 0.09617188273592023, + "kl": 0.60234375, + "learning_rate": 1.3581519657743365e-05, + "loss": 0.0242, + "reward": 1.7725027503445745, + "reward_std": 0.24438665603962306, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012913908507604788, + "rewards/format_reward": 0.85625, + "rewards/reasoning_steps_reward": 0.9291666682809592, + "step": 2015 + }, + { + "completion_length": 1024.0, + "epoch": 0.44615618227247, + "grad_norm": 0.18225728128628407, + "kl": 0.27862548828125, + "learning_rate": 1.3545494232534406e-05, + "loss": 0.0111, + "reward": 1.7568111419677734, + "reward_std": 0.29651416725586444, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.007772182382834103, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.9208333363756538, + "step": 2020 + }, + { + "completion_length": 1016.925, + "epoch": 0.44726052925829296, + "grad_norm": 0.2648706944585046, + "kl": 0.615509033203125, + "learning_rate": 1.3509416099639456e-05, + "loss": 0.0246, + "reward": 1.8220582745969296, + "reward_std": 0.17748434675852423, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.007108392053859802, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9666666693985462, + "step": 2025 + }, + { + "completion_length": 1018.525, + "epoch": 0.4483648762441159, + "grad_norm": 0.6277482412805114, + "kl": 0.555120849609375, + "learning_rate": 1.3473285795399792e-05, + "loss": 0.0222, + "reward": 1.7602945683524012, + "reward_std": 0.2339164051840612, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012622114044086174, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.9416666684672237, + "step": 2030 + }, + { + "completion_length": 1024.0, + "epoch": 0.4494692232299388, + "grad_norm": 0.3638572386307533, + "kl": 0.827734375, + "learning_rate": 1.3437103856932266e-05, + "loss": 0.0331, + "reward": 1.7036781013011932, + "reward_std": 0.3036784950886727, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02757192025987365, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.9250000013038516, + "step": 2035 + }, + { + "completion_length": 1018.1125, + "epoch": 0.4505735702157618, + "grad_norm": 0.28945935173584225, + "kl": 0.39534912109375, + "learning_rate": 1.3400870822121348e-05, + "loss": 0.0158, + "reward": 1.7693685671314596, + "reward_std": 0.29658813936011086, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.009798096649282684, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.9104166690260171, + "step": 2040 + }, + { + "completion_length": 1024.0, + "epoch": 0.45167791720158473, + "grad_norm": 0.30395883309793953, + "kl": 0.496514892578125, + "learning_rate": 1.3364587229611095e-05, + "loss": 0.0198, + "reward": 1.5488340200856328, + "reward_std": 0.34414923763941657, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.044915972299531856, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.9250000067055225, + "step": 2045 + }, + { + "completion_length": 1023.36875, + "epoch": 0.4527822641874077, + "grad_norm": 0.37563339825606884, + "kl": 1.257757568359375, + "learning_rate": 1.332825361879717e-05, + "loss": 0.0503, + "reward": 1.0923855936154723, + "reward_std": 0.5553531399730645, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10553107349589368, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.8229166757315397, + "step": 2050 + }, + { + "completion_length": 1020.39375, + "epoch": 0.45388661117323065, + "grad_norm": 0.17545325218642227, + "kl": 0.552960205078125, + "learning_rate": 1.3291870529818809e-05, + "loss": 0.0221, + "reward": 1.3102701783180237, + "reward_std": 0.5236696774121044, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08556316045970505, + "rewards/format_reward": 0.55, + "rewards/reasoning_steps_reward": 0.8458333417773247, + "step": 2055 + }, + { + "completion_length": 1024.0, + "epoch": 0.45499095815905355, + "grad_norm": 2.065649559488245, + "kl": 0.834759521484375, + "learning_rate": 1.3255438503550796e-05, + "loss": 0.0333, + "reward": 1.69286774341017, + "reward_std": 0.3293037013057784, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.027965584968657708, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.9208333367481828, + "step": 2060 + }, + { + "completion_length": 1024.0, + "epoch": 0.4560953051448765, + "grad_norm": 0.660593118562221, + "kl": 0.329058837890625, + "learning_rate": 1.3218958081595426e-05, + "loss": 0.0131, + "reward": 1.6265041932463646, + "reward_std": 0.41342474384632055, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.031829144536436614, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.9020833386108279, + "step": 2065 + }, + { + "completion_length": 1023.1875, + "epoch": 0.45719965213069946, + "grad_norm": 0.1196002841795942, + "kl": 0.259063720703125, + "learning_rate": 1.3182429806274442e-05, + "loss": 0.0103, + "reward": 1.7037229581736029, + "reward_std": 0.3075523629013503, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.025443704352724694, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9416666693985463, + "step": 2070 + }, + { + "completion_length": 1024.0, + "epoch": 0.4583039991165224, + "grad_norm": 0.14409380240615186, + "kl": 0.298175048828125, + "learning_rate": 1.3145854220620981e-05, + "loss": 0.0119, + "reward": 1.6938567931763828, + "reward_std": 0.3266446697782669, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03322654654869552, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9395833369344473, + "step": 2075 + }, + { + "completion_length": 1024.0, + "epoch": 0.4594083461023454, + "grad_norm": 0.12617090323900998, + "kl": 0.24708251953125, + "learning_rate": 1.3109231868371511e-05, + "loss": 0.0099, + "reward": 1.7247996438294648, + "reward_std": 0.3449321190650778, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.029367040466587467, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.9479166708886624, + "step": 2080 + }, + { + "completion_length": 1024.0, + "epoch": 0.4605126930881683, + "grad_norm": 4.119605221827958, + "kl": 0.795184326171875, + "learning_rate": 1.3072563293957725e-05, + "loss": 0.0318, + "reward": 1.557416939828545, + "reward_std": 0.37313184138067185, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.048833064705115704, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.9000000039115548, + "step": 2085 + }, + { + "completion_length": 1020.1125, + "epoch": 0.46161704007399124, + "grad_norm": 0.14784198259771675, + "kl": 0.735406494140625, + "learning_rate": 1.3035849042498462e-05, + "loss": 0.0294, + "reward": 1.5002798398956656, + "reward_std": 0.5547551644277462, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06638682496321166, + "rewards/format_reward": 0.6625, + "rewards/reasoning_steps_reward": 0.904166673310101, + "step": 2090 + }, + { + "completion_length": 1020.9, + "epoch": 0.4627213870598142, + "grad_norm": 0.32720942030685624, + "kl": 0.608282470703125, + "learning_rate": 1.299908965979161e-05, + "loss": 0.0244, + "reward": 1.4782994251698256, + "reward_std": 0.500833890357893, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05920058094643536, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9125000052154064, + "step": 2095 + }, + { + "completion_length": 1024.0, + "epoch": 0.46382573404563715, + "grad_norm": 0.1310566495840646, + "kl": 0.917279052734375, + "learning_rate": 1.2962285692305964e-05, + "loss": 0.0367, + "reward": 1.6021131692454218, + "reward_std": 0.4008885342831661, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.037470159548001904, + "rewards/format_reward": 0.74375, + "rewards/reasoning_steps_reward": 0.8958333371207118, + "step": 2100 + }, + { + "epoch": 0.46382573404563715, + "eval_completion_length": 1024.0, + "eval_kl": 0.28365234375, + "eval_loss": 0.011327223852276802, + "eval_reward": 1.6594515788555144, + "eval_reward_std": 0.38401263874957065, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.035548398063092464, + "eval_rewards/format_reward": 0.765, + "eval_rewards/reasoning_steps_reward": 0.9300000059604645, + "eval_runtime": 203.5966, + "eval_samples_per_second": 0.486, + "eval_steps_per_second": 0.123, + "step": 2100 + }, + { + "completion_length": 1024.0, + "epoch": 0.4649300810314601, + "grad_norm": 1.3191955643061455, + "kl": 0.275543212890625, + "learning_rate": 1.2925437687173144e-05, + "loss": 0.011, + "reward": 1.684731831587851, + "reward_std": 0.25950010808546153, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04651815479469405, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.9375000039115549, + "step": 2105 + }, + { + "completion_length": 1024.0, + "epoch": 0.466034428017283, + "grad_norm": 0.09016927229701084, + "kl": 0.257794189453125, + "learning_rate": 1.2888546192179417e-05, + "loss": 0.0103, + "reward": 1.745303137972951, + "reward_std": 0.2667829909493776, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02761352810288713, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9354166686534882, + "step": 2110 + }, + { + "completion_length": 1024.0, + "epoch": 0.46713877500310597, + "grad_norm": 0.20047174524395592, + "kl": 0.311578369140625, + "learning_rate": 1.2851611755757587e-05, + "loss": 0.0125, + "reward": 1.741257084161043, + "reward_std": 0.34543239968699024, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.029576247078188088, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9333333380520343, + "step": 2115 + }, + { + "completion_length": 1019.73125, + "epoch": 0.4682431219889289, + "grad_norm": 0.17615770549711435, + "kl": 0.356341552734375, + "learning_rate": 1.2814634926978831e-05, + "loss": 0.0142, + "reward": 1.6373960416764022, + "reward_std": 0.3339324926538552, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0480206228675911, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.9229166708886624, + "step": 2120 + }, + { + "completion_length": 1019.475, + "epoch": 0.4693474689747519, + "grad_norm": 13.564334122866743, + "kl": 1.46416015625, + "learning_rate": 1.2777616255544527e-05, + "loss": 0.0586, + "reward": 1.5654781736433505, + "reward_std": 0.45222095985685656, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0761884901760709, + "rewards/format_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9541666701436042, + "step": 2125 + }, + { + "completion_length": 1020.94375, + "epoch": 0.47045181596057484, + "grad_norm": 29.85489728983082, + "kl": 41.50612182617188, + "learning_rate": 1.2740556291778096e-05, + "loss": 1.6594, + "reward": 1.48430804759264, + "reward_std": 0.39297982692303035, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09485862025956919, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 0.9666666690260172, + "step": 2130 + }, + { + "completion_length": 1024.0, + "epoch": 0.47155616294639774, + "grad_norm": 0.9013678435103032, + "kl": 3.813409423828125, + "learning_rate": 1.2703455586616811e-05, + "loss": 0.1528, + "reward": 1.6619893133640289, + "reward_std": 0.3401134827206249, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05259404219436874, + "rewards/format_reward": 0.71875, + "rewards/reasoning_steps_reward": 0.995833333581686, + "step": 2135 + }, + { + "completion_length": 1024.0, + "epoch": 0.4726605099322207, + "grad_norm": 0.786336979289747, + "kl": 2.127301025390625, + "learning_rate": 1.2666314691603615e-05, + "loss": 0.085, + "reward": 1.7701308561488986, + "reward_std": 0.24104950832922895, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03611916397267123, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.9875, + "step": 2140 + }, + { + "completion_length": 1024.0, + "epoch": 0.47376485691804365, + "grad_norm": 1.0553226951559378, + "kl": 1.034063720703125, + "learning_rate": 1.2629134158878919e-05, + "loss": 0.0413, + "reward": 1.7154738694429397, + "reward_std": 0.3030771045820302, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04494282233531521, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9729166686534881, + "step": 2145 + }, + { + "completion_length": 1024.0, + "epoch": 0.4748692039038666, + "grad_norm": 0.3393903797287573, + "kl": 3.99031982421875, + "learning_rate": 1.259191454117239e-05, + "loss": 0.1597, + "reward": 1.7812194317579269, + "reward_std": 0.2781567809738135, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.041697263186324565, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.9791666686534881, + "step": 2150 + }, + { + "completion_length": 1024.0, + "epoch": 0.47597355088968957, + "grad_norm": 6.589061393274526, + "kl": 1.5100830078125, + "learning_rate": 1.255465639179473e-05, + "loss": 0.0604, + "reward": 1.7989778753370047, + "reward_std": 0.23155034703924002, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03852214975868264, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9750000026077033, + "step": 2155 + }, + { + "completion_length": 1024.0, + "epoch": 0.47707789787551247, + "grad_norm": 0.8010593946661915, + "kl": 3.04891357421875, + "learning_rate": 1.2517360264629463e-05, + "loss": 0.122, + "reward": 1.7094581590034068, + "reward_std": 0.34085092952595913, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.046791872511084874, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.9375000035390257, + "step": 2160 + }, + { + "completion_length": 1024.0, + "epoch": 0.4781822448613354, + "grad_norm": 2.217307019793859, + "kl": 1.355609130859375, + "learning_rate": 1.24800267141247e-05, + "loss": 0.0542, + "reward": 1.8146669074892998, + "reward_std": 0.26201679514249465, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.022833122030738194, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9625000022351742, + "step": 2165 + }, + { + "completion_length": 1024.0, + "epoch": 0.4792865918471584, + "grad_norm": 0.6114018111973071, + "kl": 1.562957763671875, + "learning_rate": 1.2442656295284879e-05, + "loss": 0.0625, + "reward": 1.6614520654082299, + "reward_std": 0.3890801830509247, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02813129380491546, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.9145833371207118, + "step": 2170 + }, + { + "completion_length": 1024.0, + "epoch": 0.48039093883298134, + "grad_norm": 5.905753794579747, + "kl": 2.620941162109375, + "learning_rate": 1.2405249563662539e-05, + "loss": 0.1048, + "reward": 1.6438979797065258, + "reward_std": 0.4486811731098953, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.031102037969344565, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.9187500044703484, + "step": 2175 + }, + { + "completion_length": 1024.0, + "epoch": 0.48149528581880424, + "grad_norm": 5.462849225057963, + "kl": 2.09735107421875, + "learning_rate": 1.2367807075350036e-05, + "loss": 0.0839, + "reward": 1.6195215459913015, + "reward_std": 0.40022268557599433, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04714512059364893, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.9041666707023979, + "step": 2180 + }, + { + "completion_length": 1024.0, + "epoch": 0.4825996328046272, + "grad_norm": 2.9029705830904895, + "kl": 2.64107666015625, + "learning_rate": 1.23303293869713e-05, + "loss": 0.1057, + "reward": 1.4394451253116132, + "reward_std": 0.5855226124516661, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09180486189869157, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.8312500044703484, + "step": 2185 + }, + { + "completion_length": 1024.0, + "epoch": 0.48370397979045016, + "grad_norm": 0.5313710417788244, + "kl": 2.38533935546875, + "learning_rate": 1.2292817055673543e-05, + "loss": 0.0955, + "reward": 1.5141951335594057, + "reward_std": 0.611786913189178, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0962215375395317, + "rewards/format_reward": 0.7375, + "rewards/reasoning_steps_reward": 0.8729166721925139, + "step": 2190 + }, + { + "completion_length": 1024.0, + "epoch": 0.4848083267762731, + "grad_norm": 1.5676422562430876, + "kl": 2.0924560546875, + "learning_rate": 1.2255270639118984e-05, + "loss": 0.0837, + "reward": 1.4455404764972628, + "reward_std": 0.475542880991236, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11070954198064556, + "rewards/format_reward": 0.73125, + "rewards/reasoning_steps_reward": 0.8250000048428774, + "step": 2195 + }, + { + "completion_length": 1024.0, + "epoch": 0.48591267376209607, + "grad_norm": 1.1085617605053268, + "kl": 1.0922119140625, + "learning_rate": 1.2217690695476551e-05, + "loss": 0.0436, + "reward": 1.4205092269927264, + "reward_std": 0.5992176422125908, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1315741269034561, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.8520833374932408, + "step": 2200 + }, + { + "epoch": 0.48591267376209607, + "eval_completion_length": 1024.0, + "eval_kl": 0.831904296875, + "eval_loss": 0.032701995223760605, + "eval_reward": 1.5383737568557263, + "eval_reward_std": 0.39792034816321575, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.03329294507166196, + "eval_rewards/format_reward": 0.765, + "eval_rewards/reasoning_steps_reward": 0.8066666719317436, + "eval_runtime": 206.2216, + "eval_samples_per_second": 0.48, + "eval_steps_per_second": 0.121, + "step": 2200 + }, + { + "completion_length": 1024.0, + "epoch": 0.48701702074791897, + "grad_norm": 0.9801168345465376, + "kl": 0.500738525390625, + "learning_rate": 1.2180077783413601e-05, + "loss": 0.02, + "reward": 1.6020004270598291, + "reward_std": 0.5047344055276994, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.025082948844874407, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.8520833380520344, + "step": 2205 + }, + { + "completion_length": 1024.0, + "epoch": 0.48812136773374193, + "grad_norm": 1.7123348809738306, + "kl": 0.643328857421875, + "learning_rate": 1.21424324620876e-05, + "loss": 0.0257, + "reward": 1.5895938023924827, + "reward_std": 0.43823493779657385, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.033322910089100335, + "rewards/format_reward": 0.76875, + "rewards/reasoning_steps_reward": 0.8541666703298688, + "step": 2210 + }, + { + "completion_length": 1024.0, + "epoch": 0.4892257147195649, + "grad_norm": 0.42456634301355384, + "kl": 0.2675048828125, + "learning_rate": 1.2104755291137797e-05, + "loss": 0.0107, + "reward": 1.7852548621594906, + "reward_std": 0.3036327707303144, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.008495195002069522, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.9125000014901161, + "step": 2215 + }, + { + "completion_length": 1024.0, + "epoch": 0.49033006170538784, + "grad_norm": 44.884356387747495, + "kl": 1.102850341796875, + "learning_rate": 1.2067046830676947e-05, + "loss": 0.0441, + "reward": 1.6169834925793112, + "reward_std": 0.4143550125787215, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.030933218055542964, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.8729166686534882, + "step": 2220 + }, + { + "completion_length": 1024.0, + "epoch": 0.4914344086912108, + "grad_norm": 1.6254530181590954, + "kl": 0.957958984375, + "learning_rate": 1.2029307641282935e-05, + "loss": 0.0383, + "reward": 1.5575453802943229, + "reward_std": 0.4768017131381782, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07787129828080311, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.8604166692122817, + "step": 2225 + }, + { + "completion_length": 1024.0, + "epoch": 0.4925387556770337, + "grad_norm": 3.8378763559078193, + "kl": 1.298052978515625, + "learning_rate": 1.1991538283990483e-05, + "loss": 0.0519, + "reward": 1.6807304440066217, + "reward_std": 0.29701345783856825, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0380195384103331, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9312500037252903, + "step": 2230 + }, + { + "completion_length": 1024.0, + "epoch": 0.49364310266285666, + "grad_norm": 9.71139970165291, + "kl": 4.29759521484375, + "learning_rate": 1.1953739320282778e-05, + "loss": 0.1719, + "reward": 1.7868030063807965, + "reward_std": 0.27771961602904865, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0048636200283453945, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9291666693985462, + "step": 2235 + }, + { + "completion_length": 1024.0, + "epoch": 0.4947474496486796, + "grad_norm": 1.4786352397643705, + "kl": 1.40592041015625, + "learning_rate": 1.191591131208315e-05, + "loss": 0.0563, + "reward": 1.6962168462574483, + "reward_std": 0.3573447995016267, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.005866468011367943, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.9395833402872086, + "step": 2240 + }, + { + "completion_length": 1024.0, + "epoch": 0.4958517966345026, + "grad_norm": 12.15263252691778, + "kl": 5.4029541015625, + "learning_rate": 1.1878054821746703e-05, + "loss": 0.2161, + "reward": 1.6230487048625946, + "reward_std": 0.38453564145783614, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.014451281244680559, + "rewards/format_reward": 0.71875, + "rewards/reasoning_steps_reward": 0.9187500052154064, + "step": 2245 + }, + { + "completion_length": 1024.0, + "epoch": 0.49695614362032553, + "grad_norm": 0.7557319167485542, + "kl": 1.6591064453125, + "learning_rate": 1.1840170412051957e-05, + "loss": 0.0663, + "reward": 1.702598787844181, + "reward_std": 0.4023397401074874, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01406788526670084, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.9104166738688946, + "step": 2250 + }, + { + "completion_length": 1024.0, + "epoch": 0.49806049060614843, + "grad_norm": 1.569834132336579, + "kl": 3.63101806640625, + "learning_rate": 1.1802258646192486e-05, + "loss": 0.1451, + "reward": 1.7320811052806675, + "reward_std": 0.29090855971943486, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01583560020637833, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9354166699573397, + "step": 2255 + }, + { + "completion_length": 1024.0, + "epoch": 0.4991648375919714, + "grad_norm": 13.15568121526605, + "kl": 1.659033203125, + "learning_rate": 1.1764320087768546e-05, + "loss": 0.0664, + "reward": 1.6248051080852748, + "reward_std": 0.45652774178926164, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.033528259821559915, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.8770833371207118, + "step": 2260 + }, + { + "completion_length": 1024.0, + "epoch": 0.5002691845777943, + "grad_norm": 0.7170030879216972, + "kl": 2.472637939453125, + "learning_rate": 1.1726355300778693e-05, + "loss": 0.099, + "reward": 1.6203329667448998, + "reward_std": 0.5256475808244773, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.027583728811373477, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.8541666708886624, + "step": 2265 + }, + { + "completion_length": 1024.0, + "epoch": 0.5013735315636173, + "grad_norm": 3.390023769493416, + "kl": 2.3892578125, + "learning_rate": 1.1688364849611395e-05, + "loss": 0.0957, + "reward": 1.5784560879692435, + "reward_std": 0.41030325355160924, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.036127270559586575, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.8520833376795054, + "step": 2270 + }, + { + "completion_length": 1024.0, + "epoch": 0.5024778785494403, + "grad_norm": 3.831774678363149, + "kl": 1.948974609375, + "learning_rate": 1.1650349299036656e-05, + "loss": 0.078, + "reward": 1.6070521710440517, + "reward_std": 0.3782993628408235, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.015864501821948807, + "rewards/format_reward": 0.775, + "rewards/reasoning_steps_reward": 0.847916672565043, + "step": 2275 + }, + { + "completion_length": 1024.0, + "epoch": 0.5035822255352632, + "grad_norm": 1.3332468495023275, + "kl": 3.39307861328125, + "learning_rate": 1.1612309214197599e-05, + "loss": 0.1357, + "reward": 1.618955060839653, + "reward_std": 0.44067184482020993, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.014378260358787998, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.8458333371207118, + "step": 2280 + }, + { + "completion_length": 1024.0, + "epoch": 0.5046865725210862, + "grad_norm": 5.409150362417238, + "kl": 1.684649658203125, + "learning_rate": 1.1574245160602085e-05, + "loss": 0.0673, + "reward": 1.4961256569251418, + "reward_std": 0.43631368919288605, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01429101258823664, + "rewards/format_reward": 0.64375, + "rewards/reasoning_steps_reward": 0.8666666716337204, + "step": 2285 + }, + { + "completion_length": 1024.0, + "epoch": 0.505790919506909, + "grad_norm": 10.3090725611453, + "kl": 2.255633544921875, + "learning_rate": 1.153615770411429e-05, + "loss": 0.0902, + "reward": 1.5355156451463698, + "reward_std": 0.4401967245209562, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01656769531377904, + "rewards/format_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.8645833389833569, + "step": 2290 + }, + { + "completion_length": 1024.0, + "epoch": 0.506895266492732, + "grad_norm": 4.742828920657206, + "kl": 3.08935546875, + "learning_rate": 1.1498047410946307e-05, + "loss": 0.1236, + "reward": 1.5172470673918723, + "reward_std": 0.402690345170231, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.014002943886953289, + "rewards/format_reward": 0.64375, + "rewards/reasoning_steps_reward": 0.8875000052154064, + "step": 2295 + }, + { + "completion_length": 1024.0, + "epoch": 0.5079996134785549, + "grad_norm": 3.7991396033177476, + "kl": 2.6480224609375, + "learning_rate": 1.1459914847649716e-05, + "loss": 0.1059, + "reward": 1.6665501791983843, + "reward_std": 0.35762373491728566, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.010533173413750773, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.8833333371207118, + "step": 2300 + }, + { + "epoch": 0.5079996134785549, + "eval_completion_length": 1024.0, + "eval_kl": 3.185244140625, + "eval_loss": 0.12666305899620056, + "eval_reward": 1.7388547444343567, + "eval_reward_std": 0.3053203289665726, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.02781194020019029, + "eval_rewards/format_reward": 0.85, + "eval_rewards/reasoning_steps_reward": 0.9166666674613952, + "eval_runtime": 207.715, + "eval_samples_per_second": 0.477, + "eval_steps_per_second": 0.12, + "step": 2300 + }, + { + "completion_length": 1024.0, + "epoch": 0.5091039604643779, + "grad_norm": 1.49588954242709, + "kl": 2.38382568359375, + "learning_rate": 1.1421760581107164e-05, + "loss": 0.0953, + "reward": 1.6487581813707948, + "reward_std": 0.2917481830695891, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04082516024231779, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.8583333348855376, + "step": 2305 + }, + { + "completion_length": 1024.0, + "epoch": 0.5102083074502008, + "grad_norm": 1.8780330623608932, + "kl": 1.966387939453125, + "learning_rate": 1.1383585178523955e-05, + "loss": 0.0787, + "reward": 1.60753201469779, + "reward_std": 0.42931269481578627, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.046634629848381334, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.8541666697710752, + "step": 2310 + }, + { + "completion_length": 1024.0, + "epoch": 0.5113126544360238, + "grad_norm": 2.96432937885032, + "kl": 2.16142578125, + "learning_rate": 1.1345389207419588e-05, + "loss": 0.0865, + "reward": 1.5378418434411287, + "reward_std": 0.5287294987143696, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.051741475899098076, + "rewards/format_reward": 0.74375, + "rewards/reasoning_steps_reward": 0.8395833408460021, + "step": 2315 + }, + { + "completion_length": 1024.0, + "epoch": 0.5124170014218468, + "grad_norm": 2.4681930761511444, + "kl": 2.180352783203125, + "learning_rate": 1.1307173235619342e-05, + "loss": 0.0872, + "reward": 1.3009003968909383, + "reward_std": 0.543780626336303, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07618293184367815, + "rewards/format_reward": 0.575, + "rewards/reasoning_steps_reward": 0.8020833391696215, + "step": 2320 + }, + { + "completion_length": 1024.0, + "epoch": 0.5135213484076697, + "grad_norm": 3.113537344897485, + "kl": 1.70172119140625, + "learning_rate": 1.126893783124583e-05, + "loss": 0.0681, + "reward": 1.558167396299541, + "reward_std": 0.44948710563394345, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.07308261062056545, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8750000052154064, + "step": 2325 + }, + { + "completion_length": 1024.0, + "epoch": 0.5146256953934927, + "grad_norm": 4.243060777071984, + "kl": 1.395947265625, + "learning_rate": 1.1230683562710549e-05, + "loss": 0.0559, + "reward": 1.6064410168677568, + "reward_std": 0.408382396842228, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04564233084790885, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.8708333348855376, + "step": 2330 + }, + { + "completion_length": 1024.0, + "epoch": 0.5157300423793156, + "grad_norm": 2.446852880740469, + "kl": 2.1044677734375, + "learning_rate": 1.1192410998705432e-05, + "loss": 0.0841, + "reward": 1.667079577036202, + "reward_std": 0.3678033341785465, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03500375264862328, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8895833365619182, + "step": 2335 + }, + { + "completion_length": 1018.425, + "epoch": 0.5168343893651385, + "grad_norm": 3.310047995806835, + "kl": 4.138531494140625, + "learning_rate": 1.1154120708194398e-05, + "loss": 0.1654, + "reward": 1.1680033100768923, + "reward_std": 0.5004193522453079, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09033003134591695, + "rewards/format_reward": 0.45, + "rewards/reasoning_steps_reward": 0.8083333427086472, + "step": 2340 + }, + { + "completion_length": 1024.0, + "epoch": 0.5179387363509614, + "grad_norm": 0.6777064303206283, + "kl": 0.7271484375, + "learning_rate": 1.1115813260404889e-05, + "loss": 0.0291, + "reward": 1.2690949118230492, + "reward_std": 0.5361405726755037, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.037155097232880505, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.7875000109896064, + "step": 2345 + }, + { + "completion_length": 1024.0, + "epoch": 0.5190430833367844, + "grad_norm": 1.6889816167510485, + "kl": 0.479376220703125, + "learning_rate": 1.1077489224819402e-05, + "loss": 0.0192, + "reward": 1.5625366240739822, + "reward_std": 0.47994972608394165, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.014546713606068806, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.858333339355886, + "step": 2350 + }, + { + "completion_length": 1024.0, + "epoch": 0.5201474303226074, + "grad_norm": 1.0697187024840797, + "kl": 0.394384765625, + "learning_rate": 1.1039149171167046e-05, + "loss": 0.0158, + "reward": 1.4704681444913148, + "reward_std": 0.5033119401799923, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.02328185227161157, + "rewards/format_reward": 0.65, + "rewards/reasoning_steps_reward": 0.8312500044703484, + "step": 2355 + }, + { + "completion_length": 1024.0, + "epoch": 0.5212517773084303, + "grad_norm": 0.25919087539939206, + "kl": 0.17598876953125, + "learning_rate": 1.1000793669415035e-05, + "loss": 0.007, + "reward": 1.5186623342335224, + "reward_std": 0.38529679665589356, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.02092099927031086, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.995833333581686, + "step": 2360 + }, + { + "completion_length": 1024.0, + "epoch": 0.5223561242942533, + "grad_norm": 0.21925652598328885, + "kl": 0.139227294921875, + "learning_rate": 1.0962423289760254e-05, + "loss": 0.0056, + "reward": 1.4680943846702577, + "reward_std": 0.37131147203908765, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.038155618403834524, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 1.0, + "step": 2365 + }, + { + "completion_length": 1024.0, + "epoch": 0.5234604712800762, + "grad_norm": 0.43901396796754966, + "kl": 0.248681640625, + "learning_rate": 1.0924038602620757e-05, + "loss": 0.01, + "reward": 1.8161877155303956, + "reward_std": 0.2932082125398665, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.0838122889137594, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 1.0, + "step": 2370 + }, + { + "completion_length": 1024.0, + "epoch": 0.5245648182658992, + "grad_norm": 0.4594563192454749, + "kl": 0.282012939453125, + "learning_rate": 1.0885640178627291e-05, + "loss": 0.0113, + "reward": 1.9489604651927948, + "reward_std": 0.15323295153175423, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.03228953526704572, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.9812500014901161, + "step": 2375 + }, + { + "completion_length": 1024.0, + "epoch": 0.5256691652517221, + "grad_norm": 1.4188978226513564, + "kl": 0.5095947265625, + "learning_rate": 1.0847228588614821e-05, + "loss": 0.0204, + "reward": 1.813528909534216, + "reward_std": 0.1997750176507907, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.06980442710773786, + "rewards/format_reward": 0.9875, + "rewards/reasoning_steps_reward": 0.889583338610828, + "step": 2380 + }, + { + "completion_length": 1024.0, + "epoch": 0.526773512237545, + "grad_norm": 1.5094835244677218, + "kl": 1.997509765625, + "learning_rate": 1.0808804403614044e-05, + "loss": 0.0799, + "reward": 1.6416274465620517, + "reward_std": 0.277405764979585, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.04378921372735931, + "rewards/format_reward": 0.975, + "rewards/reasoning_steps_reward": 0.7041666742414237, + "step": 2385 + }, + { + "completion_length": 1024.0, + "epoch": 0.5278778592233679, + "grad_norm": 1.417871425108259, + "kl": 0.92401123046875, + "learning_rate": 1.0770368194842886e-05, + "loss": 0.037, + "reward": 1.751328294724226, + "reward_std": 0.21636201266960597, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.025755050106090492, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 0.8145833384245634, + "step": 2390 + }, + { + "completion_length": 1024.0, + "epoch": 0.5289822062091909, + "grad_norm": 0.3460517954325989, + "kl": 0.739208984375, + "learning_rate": 1.073192053369802e-05, + "loss": 0.0296, + "reward": 1.7935828588902951, + "reward_std": 0.310901246771391, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.010583810009211447, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.8979166697710752, + "step": 2395 + }, + { + "completion_length": 1024.0, + "epoch": 0.5300865531950139, + "grad_norm": 0.299553336858581, + "kl": 0.269195556640625, + "learning_rate": 1.0693461991746389e-05, + "loss": 0.0108, + "reward": 1.7330624889582396, + "reward_std": 0.3259334075613879, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.03985418364172801, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.9416666716337204, + "step": 2400 + }, + { + "epoch": 0.5300865531950139, + "eval_completion_length": 1024.0, + "eval_kl": 0.40162109375, + "eval_loss": 0.015542779117822647, + "eval_reward": 1.6480663681030274, + "eval_reward_std": 0.3486202434706502, + "eval_rewards/accuracy_reward": 0.01, + "eval_rewards/cosine_scaled_reward": -0.041933644004166125, + "eval_rewards/format_reward": 0.835, + "eval_rewards/reasoning_steps_reward": 0.8450000047683716, + "eval_runtime": 203.8327, + "eval_samples_per_second": 0.486, + "eval_steps_per_second": 0.123, + "step": 2400 + }, + { + "completion_length": 1024.0, + "epoch": 0.5311909001808368, + "grad_norm": 1.1046026648755514, + "kl": 0.2391357421875, + "learning_rate": 1.0654993140716665e-05, + "loss": 0.0096, + "reward": 1.7627467274665833, + "reward_std": 0.2496938494945425, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.060169946975656786, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.8854166742414236, + "step": 2405 + }, + { + "completion_length": 1024.0, + "epoch": 0.5322952471666598, + "grad_norm": 0.5282380418655942, + "kl": 0.385986328125, + "learning_rate": 1.0616514552490791e-05, + "loss": 0.0154, + "reward": 1.7804573088884355, + "reward_std": 0.2669147708988021, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.0695426897440484, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9187500055879354, + "step": 2410 + }, + { + "completion_length": 1024.0, + "epoch": 0.5333995941524827, + "grad_norm": 2.4670630667481435, + "kl": 1.252880859375, + "learning_rate": 1.0578026799095464e-05, + "loss": 0.0501, + "reward": 1.6643784150481225, + "reward_std": 0.3057770044339122, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06687159015400539, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.8500000052154064, + "step": 2415 + }, + { + "completion_length": 1024.0, + "epoch": 0.5345039411383057, + "grad_norm": 0.9935269786883532, + "kl": 1.50946044921875, + "learning_rate": 1.0539530452693625e-05, + "loss": 0.0604, + "reward": 1.4255976286716758, + "reward_std": 0.438153853449694, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11815237666669418, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.7000000068917871, + "step": 2420 + }, + { + "completion_length": 1024.0, + "epoch": 0.5356082881241286, + "grad_norm": 2.1780659206470374, + "kl": 0.464599609375, + "learning_rate": 1.0501026085575967e-05, + "loss": 0.0186, + "reward": 1.3587639363482595, + "reward_std": 0.4793480883206939, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10165273098900798, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.7479166768491268, + "step": 2425 + }, + { + "completion_length": 1024.0, + "epoch": 0.5367126351099516, + "grad_norm": 0.26711816143900485, + "kl": 0.23184814453125, + "learning_rate": 1.046251427015241e-05, + "loss": 0.0093, + "reward": 1.3165842306800186, + "reward_std": 0.5014927750421976, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.08758243720367317, + "rewards/format_reward": 0.6, + "rewards/reasoning_steps_reward": 0.7854166770353913, + "step": 2430 + }, + { + "completion_length": 1024.0, + "epoch": 0.5378169820957744, + "grad_norm": 1.9867481118253592, + "kl": 0.311883544921875, + "learning_rate": 1.0423995578943615e-05, + "loss": 0.0125, + "reward": 1.5306999891996385, + "reward_std": 0.48772922792995815, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.09430001441578498, + "rewards/format_reward": 0.73125, + "rewards/reasoning_steps_reward": 0.8812500104308129, + "step": 2435 + }, + { + "completion_length": 1024.0, + "epoch": 0.5389213290815974, + "grad_norm": 4.140607916865834, + "kl": 1.620135498046875, + "learning_rate": 1.0385470584572449e-05, + "loss": 0.0648, + "reward": 1.6088348772376775, + "reward_std": 0.4257907736697234, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12241512268665247, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.9250000029802322, + "step": 2440 + }, + { + "completion_length": 1024.0, + "epoch": 0.5400256760674204, + "grad_norm": 0.8670307079372768, + "kl": 0.9895751953125, + "learning_rate": 1.0346939859755481e-05, + "loss": 0.0396, + "reward": 1.6652016706764698, + "reward_std": 0.3854114227630362, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10563166871434078, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.9270833343267441, + "step": 2445 + }, + { + "completion_length": 1024.0, + "epoch": 0.5411300230532433, + "grad_norm": 1.1343950295235299, + "kl": 0.3840087890625, + "learning_rate": 1.0308403977294476e-05, + "loss": 0.0154, + "reward": 1.8509787783026694, + "reward_std": 0.18647551744506927, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.08652122293206048, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 0.968750000745058, + "step": 2450 + }, + { + "completion_length": 1024.0, + "epoch": 0.5422343700390663, + "grad_norm": 1.0687106066072793, + "kl": 0.62066650390625, + "learning_rate": 1.0269863510067872e-05, + "loss": 0.0248, + "reward": 1.8417707242071628, + "reward_std": 0.2083289371990759, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.06447928030538605, + "rewards/format_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9625000022351742, + "step": 2455 + }, + { + "completion_length": 1024.0, + "epoch": 0.5433387170248892, + "grad_norm": 0.7622623459030559, + "kl": 0.68385009765625, + "learning_rate": 1.023131903102226e-05, + "loss": 0.0274, + "reward": 1.772145263105631, + "reward_std": 0.3072871359312558, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.07993807349630515, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.9145833348855377, + "step": 2460 + }, + { + "completion_length": 1024.0, + "epoch": 0.5444430640107122, + "grad_norm": 3.600110628598048, + "kl": 2.00994873046875, + "learning_rate": 1.0192771113163875e-05, + "loss": 0.0804, + "reward": 1.2840112496167422, + "reward_std": 0.5591359864323749, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2159887515474111, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.7937500040978194, + "step": 2465 + }, + { + "completion_length": 1024.0, + "epoch": 0.5455474109965351, + "grad_norm": 2.75418080869431, + "kl": 1.002532958984375, + "learning_rate": 1.0154220329550076e-05, + "loss": 0.0401, + "reward": 1.2907778739929199, + "reward_std": 0.5905611860769568, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13838879884569905, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.8729166677221656, + "step": 2470 + }, + { + "completion_length": 1020.225, + "epoch": 0.5466517579823581, + "grad_norm": 1.6118829429672292, + "kl": 1.244329833984375, + "learning_rate": 1.0115667253280817e-05, + "loss": 0.0498, + "reward": 1.0045916791073979, + "reward_std": 0.6294532329367939, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12249164624954574, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7270833354443311, + "step": 2475 + }, + { + "completion_length": 1024.0, + "epoch": 0.5477561049681811, + "grad_norm": 1.6057608879936789, + "kl": 1.879241943359375, + "learning_rate": 1.0077112457490143e-05, + "loss": 0.0752, + "reward": 1.0946636897511781, + "reward_std": 0.506724919876433, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13658630648133113, + "rewards/format_reward": 0.425, + "rewards/reasoning_steps_reward": 0.8000000026077032, + "step": 2480 + }, + { + "completion_length": 1024.0, + "epoch": 0.5488604519540039, + "grad_norm": 0.5646928134790875, + "kl": 0.6290374755859375, + "learning_rate": 1.0038556515337654e-05, + "loss": 0.0252, + "reward": 1.3780475069768727, + "reward_std": 0.4772715052065905, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1386191618919838, + "rewards/format_reward": 0.61875, + "rewards/reasoning_steps_reward": 0.8916666675359011, + "step": 2485 + }, + { + "completion_length": 1024.0, + "epoch": 0.5499647989398269, + "grad_norm": 0.8859860742230482, + "kl": 0.401104736328125, + "learning_rate": 1e-05, + "loss": 0.0161, + "reward": 1.7356179803609848, + "reward_std": 0.3069496586394962, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12479868032969535, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.941666667163372, + "step": 2490 + }, + { + "completion_length": 1024.0, + "epoch": 0.5510691459256498, + "grad_norm": 1.4769301041413523, + "kl": 0.3043701171875, + "learning_rate": 9.961443484662349e-06, + "loss": 0.0122, + "reward": 1.7910444140434265, + "reward_std": 0.28909274661323253, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.11312224400317064, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9229166679084301, + "step": 2495 + }, + { + "completion_length": 1024.0, + "epoch": 0.5521734929114728, + "grad_norm": 0.5303430541917586, + "kl": 0.1882080078125, + "learning_rate": 9.92288754250986e-06, + "loss": 0.0075, + "reward": 1.9123993963003159, + "reward_std": 0.24396189967519605, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.10635060318236356, + "rewards/format_reward": 0.98125, + "rewards/reasoning_steps_reward": 0.9937500014901162, + "step": 2500 + }, + { + "epoch": 0.5521734929114728, + "eval_completion_length": 1024.0, + "eval_kl": 0.21314453125, + "eval_loss": 0.008531954139471054, + "eval_reward": 1.8809818363189696, + "eval_reward_std": 0.17097413605777548, + "eval_rewards/accuracy_reward": 0.025, + "eval_rewards/cosine_scaled_reward": -0.11235150025226176, + "eval_rewards/format_reward": 0.99, + "eval_rewards/reasoning_steps_reward": 0.9783333337306976, + "eval_runtime": 203.3411, + "eval_samples_per_second": 0.487, + "eval_steps_per_second": 0.123, + "step": 2500 + }, + { + "completion_length": 1024.0, + "epoch": 0.5532778398972957, + "grad_norm": 0.3634886750349178, + "kl": 0.218756103515625, + "learning_rate": 9.884332746719186e-06, + "loss": 0.0088, + "reward": 1.8593285992741584, + "reward_std": 0.1559050077528809, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.11567139578983188, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 1.0, + "step": 2505 + }, + { + "completion_length": 1024.0, + "epoch": 0.5543821868831187, + "grad_norm": 0.5051986020364817, + "kl": 0.266455078125, + "learning_rate": 9.845779670449926e-06, + "loss": 0.0107, + "reward": 1.8667958162724971, + "reward_std": 0.22305318891121714, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.0561208424041979, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9729166675359011, + "step": 2510 + }, + { + "completion_length": 1024.0, + "epoch": 0.5554865338689416, + "grad_norm": 1.1588379380142202, + "kl": 0.49033203125, + "learning_rate": 9.807228886836128e-06, + "loss": 0.0196, + "reward": 1.6591941472142935, + "reward_std": 0.35607906174791426, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.061639188357457894, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.877083339355886, + "step": 2515 + }, + { + "completion_length": 1024.0, + "epoch": 0.5565908808547646, + "grad_norm": 1.0057531712372425, + "kl": 0.37701416015625, + "learning_rate": 9.768680968977743e-06, + "loss": 0.0151, + "reward": 1.6011904481798411, + "reward_std": 0.3233839514392457, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06130954009086054, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8500000039115548, + "step": 2520 + }, + { + "completion_length": 1022.8625, + "epoch": 0.5576952278405876, + "grad_norm": 2.7751702000639926, + "kl": 1.606396484375, + "learning_rate": 9.730136489932133e-06, + "loss": 0.0642, + "reward": 1.186481614317745, + "reward_std": 0.7302230117369618, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.14476838221307844, + "rewards/format_reward": 0.63125, + "rewards/reasoning_steps_reward": 0.668750005401671, + "step": 2525 + }, + { + "completion_length": 1024.0, + "epoch": 0.5587995748264104, + "grad_norm": 1.753321795976581, + "kl": 0.9668701171875, + "learning_rate": 9.691596022705527e-06, + "loss": 0.0387, + "reward": 1.1860073703341185, + "reward_std": 0.570556367138488, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.11399263035709736, + "rewards/format_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.7062500042840838, + "step": 2530 + }, + { + "completion_length": 1024.0, + "epoch": 0.5599039218122334, + "grad_norm": 2.3586733201486587, + "kl": 1.7131591796875, + "learning_rate": 9.653060140244524e-06, + "loss": 0.0685, + "reward": 0.7566529627889395, + "reward_std": 0.45097853311744984, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.17043037730327343, + "rewards/format_reward": 0.175, + "rewards/reasoning_steps_reward": 0.7458333386108279, + "step": 2535 + }, + { + "completion_length": 1024.0, + "epoch": 0.5610082687980563, + "grad_norm": 2.4868110206825675, + "kl": 0.817779541015625, + "learning_rate": 9.614529415427556e-06, + "loss": 0.0327, + "reward": 0.7670935079455375, + "reward_std": 0.35415494777553247, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.16623983170138673, + "rewards/format_reward": 0.04375, + "rewards/reasoning_steps_reward": 0.8645833404734731, + "step": 2540 + }, + { + "completion_length": 1024.0, + "epoch": 0.5621126157838793, + "grad_norm": 8.994673033795959, + "kl": 1.3493743896484376, + "learning_rate": 9.576004421056389e-06, + "loss": 0.0541, + "reward": 0.7187479682266712, + "reward_std": 0.24309571331159532, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.18541869991458954, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.8666666701436043, + "step": 2545 + }, + { + "completion_length": 1024.0, + "epoch": 0.5632169627697022, + "grad_norm": 1.9064717191281668, + "kl": 0.844708251953125, + "learning_rate": 9.537485729847594e-06, + "loss": 0.0338, + "reward": 0.6888711890205741, + "reward_std": 0.25518429378644214, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2194621481387003, + "rewards/format_reward": 0.00625, + "rewards/reasoning_steps_reward": 0.9020833350718022, + "step": 2550 + }, + { + "completion_length": 1024.0, + "epoch": 0.5643213097555252, + "grad_norm": 6.497313362493659, + "kl": 1.571319580078125, + "learning_rate": 9.498973914424035e-06, + "loss": 0.0629, + "reward": 0.6698607638478279, + "reward_std": 0.27875330625029165, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21972257128509226, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.8583333335816861, + "step": 2555 + }, + { + "completion_length": 1024.0, + "epoch": 0.5654256567413481, + "grad_norm": 1.5891156033669043, + "kl": 3.031695556640625, + "learning_rate": 9.460469547306375e-06, + "loss": 0.1212, + "reward": 0.7087379619479179, + "reward_std": 0.3350365572499868, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.15584536720998585, + "rewards/format_reward": 0.025, + "rewards/reasoning_steps_reward": 0.8208333358168602, + "step": 2560 + }, + { + "completion_length": 1024.0, + "epoch": 0.5665300037271711, + "grad_norm": 5.078097996420085, + "kl": 1.387432861328125, + "learning_rate": 9.421973200904538e-06, + "loss": 0.0555, + "reward": 0.6076501269359141, + "reward_std": 0.3604350686266116, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.20693321338949316, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.7770833358168602, + "step": 2565 + }, + { + "completion_length": 1024.0, + "epoch": 0.5676343507129941, + "grad_norm": 12.377672982659016, + "kl": 1.8506378173828124, + "learning_rate": 9.38348544750921e-06, + "loss": 0.0741, + "reward": 0.6430217208398972, + "reward_std": 0.3532645304381731, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.23406161246239207, + "rewards/format_reward": 0.04375, + "rewards/reasoning_steps_reward": 0.8333333343267441, + "step": 2570 + }, + { + "completion_length": 1024.0, + "epoch": 0.568738697698817, + "grad_norm": 1.8285393227548976, + "kl": 1.123150634765625, + "learning_rate": 9.345006859283338e-06, + "loss": 0.045, + "reward": 0.7630160832777619, + "reward_std": 0.3213708248760668, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.19323392115911703, + "rewards/format_reward": 0.05625, + "rewards/reasoning_steps_reward": 0.8750000027939677, + "step": 2575 + }, + { + "completion_length": 1024.0, + "epoch": 0.5698430446846399, + "grad_norm": 3.9635852553643303, + "kl": 0.58538818359375, + "learning_rate": 9.306538008253611e-06, + "loss": 0.0234, + "reward": 0.7108107196167112, + "reward_std": 0.24838708396055154, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21835594929289073, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.9166666705161333, + "step": 2580 + }, + { + "completion_length": 1024.0, + "epoch": 0.5709473916704628, + "grad_norm": 0.21806865311979415, + "kl": 0.7177490234375, + "learning_rate": 9.268079466301978e-06, + "loss": 0.0287, + "reward": 0.7812843410298228, + "reward_std": 0.2617356756320078, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1874656603875337, + "rewards/format_reward": 0.025, + "rewards/reasoning_steps_reward": 0.9312500005587936, + "step": 2585 + }, + { + "completion_length": 1024.0, + "epoch": 0.5720517386562858, + "grad_norm": 6.301133014500009, + "kl": 0.8725372314453125, + "learning_rate": 9.229631805157116e-06, + "loss": 0.0349, + "reward": 0.7710512263700366, + "reward_std": 0.2632641014934052, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.18936544355237855, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.9229166684672236, + "step": 2590 + }, + { + "completion_length": 1024.0, + "epoch": 0.5731560856421087, + "grad_norm": 8.909114886388627, + "kl": 0.811669921875, + "learning_rate": 9.19119559638596e-06, + "loss": 0.0325, + "reward": 0.8804346274584531, + "reward_std": 0.34509176830179056, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16331537813821342, + "rewards/format_reward": 0.10625, + "rewards/reasoning_steps_reward": 0.9250000014901161, + "step": 2595 + }, + { + "completion_length": 1024.0, + "epoch": 0.5742604326279317, + "grad_norm": 1.244644274672843, + "kl": 1.816058349609375, + "learning_rate": 9.15277141138518e-06, + "loss": 0.0727, + "reward": 0.8285146844573319, + "reward_std": 0.3990332660243439, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.10690198320662603, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.8291666699573398, + "step": 2600 + }, + { + "epoch": 0.5742604326279317, + "eval_completion_length": 1024.0, + "eval_kl": 1.531572265625, + "eval_loss": 0.0615340955555439, + "eval_reward": 0.8556969636678695, + "eval_reward_std": 0.4192620050907135, + "eval_rewards/accuracy_reward": 0.03, + "eval_rewards/cosine_scaled_reward": -0.14596970692276956, + "eval_rewards/format_reward": 0.1, + "eval_rewards/reasoning_steps_reward": 0.871666669845581, + "eval_runtime": 203.5802, + "eval_samples_per_second": 0.486, + "eval_steps_per_second": 0.123, + "step": 2600 + }, + { + "completion_length": 1024.0, + "epoch": 0.5753647796137547, + "grad_norm": 3.507048504579022, + "kl": 1.900311279296875, + "learning_rate": 9.114359821372714e-06, + "loss": 0.076, + "reward": 0.7871741138980723, + "reward_std": 0.38461359875800555, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1753258895187173, + "rewards/format_reward": 0.15, + "rewards/reasoning_steps_reward": 0.8062500018626452, + "step": 2605 + }, + { + "completion_length": 1024.0, + "epoch": 0.5764691265995776, + "grad_norm": 2.5214326423125244, + "kl": 1.04107666015625, + "learning_rate": 9.075961397379247e-06, + "loss": 0.0417, + "reward": 0.8862119485624135, + "reward_std": 0.4803070175581524, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13878805282874965, + "rewards/format_reward": 0.1625, + "rewards/reasoning_steps_reward": 0.8500000033527613, + "step": 2610 + }, + { + "completion_length": 1024.0, + "epoch": 0.5775734735854006, + "grad_norm": 3.307341729299716, + "kl": 2.38963623046875, + "learning_rate": 9.037576710239748e-06, + "loss": 0.0955, + "reward": 0.841398511081934, + "reward_std": 0.4593913863740454, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.2002681548066903, + "rewards/format_reward": 0.2, + "rewards/reasoning_steps_reward": 0.8291666697710752, + "step": 2615 + }, + { + "completion_length": 1024.0, + "epoch": 0.5786778205712235, + "grad_norm": 1.1047461484931345, + "kl": 1.93094482421875, + "learning_rate": 8.999206330584969e-06, + "loss": 0.0773, + "reward": 0.8279006748460234, + "reward_std": 0.4829202242734027, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.16584932910191127, + "rewards/format_reward": 0.19375, + "rewards/reasoning_steps_reward": 0.7625000022351742, + "step": 2620 + }, + { + "completion_length": 1024.0, + "epoch": 0.5797821675570465, + "grad_norm": 1.065156980266071, + "kl": 583.0839782714844, + "learning_rate": 8.960850828832958e-06, + "loss": 23.2767, + "reward": 0.8661070578498766, + "reward_std": 0.4774655077977513, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.17139294502558186, + "rewards/format_reward": 0.15625, + "rewards/reasoning_steps_reward": 0.8437500035390257, + "step": 2625 + }, + { + "completion_length": 1024.0, + "epoch": 0.5808865145428693, + "grad_norm": 4.568392600149099, + "kl": 4.222686767578125, + "learning_rate": 8.9225107751806e-06, + "loss": 0.1693, + "reward": 0.8945860045030714, + "reward_std": 0.3841389437075122, + "rewards/accuracy_reward": 0.05625, + "rewards/cosine_scaled_reward": -0.17208065872546285, + "rewards/format_reward": 0.15, + "rewards/reasoning_steps_reward": 0.8604166731238365, + "step": 2630 + }, + { + "completion_length": 1024.0, + "epoch": 0.5819908615286923, + "grad_norm": 1.981971937068967, + "kl": 0.79658203125, + "learning_rate": 8.884186739595114e-06, + "loss": 0.0319, + "reward": 0.7690370593219995, + "reward_std": 0.3609249549546803, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.21221294170245528, + "rewards/format_reward": 0.225, + "rewards/reasoning_steps_reward": 0.7437500122934579, + "step": 2635 + }, + { + "completion_length": 1024.0, + "epoch": 0.5830952085145152, + "grad_norm": 2.7805804505170175, + "kl": 1.688623046875, + "learning_rate": 8.845879291805605e-06, + "loss": 0.0675, + "reward": 1.1183765586465597, + "reward_std": 0.6193717653281056, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.15454010646790267, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.7541666772216559, + "step": 2640 + }, + { + "completion_length": 1024.0, + "epoch": 0.5841995555003382, + "grad_norm": 2.1901160477872876, + "kl": 2.324169921875, + "learning_rate": 8.807589001294571e-06, + "loss": 0.093, + "reward": 1.1031825848389416, + "reward_std": 0.5278661086966168, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.24473407644545658, + "rewards/format_reward": 0.73125, + "rewards/reasoning_steps_reward": 0.591666679084301, + "step": 2645 + }, + { + "completion_length": 1024.0, + "epoch": 0.5853039024861612, + "grad_norm": 5.1317753880312935, + "kl": 2.041021728515625, + "learning_rate": 8.769316437289456e-06, + "loss": 0.0817, + "reward": 1.161124537140131, + "reward_std": 0.6632860126585001, + "rewards/accuracy_reward": 0.05, + "rewards/cosine_scaled_reward": -0.21387546394253149, + "rewards/format_reward": 0.69375, + "rewards/reasoning_steps_reward": 0.631250012665987, + "step": 2650 + }, + { + "completion_length": 1024.0, + "epoch": 0.5864082494719841, + "grad_norm": 2.978675995841801, + "kl": 1.001495361328125, + "learning_rate": 8.731062168754174e-06, + "loss": 0.0401, + "reward": 1.357415765337646, + "reward_std": 0.47671173595408617, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.1863342406024458, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.7125000091269612, + "step": 2655 + }, + { + "completion_length": 1024.0, + "epoch": 0.5875125964578071, + "grad_norm": 2.0018445118420565, + "kl": 1.871514892578125, + "learning_rate": 8.692826764380662e-06, + "loss": 0.0748, + "reward": 1.3236260378733278, + "reward_std": 0.5231166693600244, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.20970730545814148, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.7083333449438214, + "step": 2660 + }, + { + "completion_length": 1024.0, + "epoch": 0.58861694344363, + "grad_norm": 2.1407022480159283, + "kl": 0.8106689453125, + "learning_rate": 8.654610792580415e-06, + "loss": 0.0324, + "reward": 1.5535705825313926, + "reward_std": 0.3941577763791429, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.18809608481969917, + "rewards/format_reward": 0.9, + "rewards/reasoning_steps_reward": 0.816666678711772, + "step": 2665 + }, + { + "completion_length": 1024.0, + "epoch": 0.589721290429453, + "grad_norm": 2.3049029672167585, + "kl": 1.445733642578125, + "learning_rate": 8.616414821476048e-06, + "loss": 0.0578, + "reward": 1.6248370364308358, + "reward_std": 0.4025070207238969, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.17099629567801458, + "rewards/format_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.8770833343267441, + "step": 2670 + }, + { + "completion_length": 1024.0, + "epoch": 0.5908256374152758, + "grad_norm": 0.6919174013627627, + "kl": 2.08385009765625, + "learning_rate": 8.57823941889284e-06, + "loss": 0.0834, + "reward": 1.5777956765145063, + "reward_std": 0.48239568906356, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16595432664034887, + "rewards/format_reward": 0.8875, + "rewards/reasoning_steps_reward": 0.8437500029802323, + "step": 2675 + }, + { + "completion_length": 1024.0, + "epoch": 0.5919299844010988, + "grad_norm": 0.997312984133755, + "kl": 1.877044677734375, + "learning_rate": 8.54008515235029e-06, + "loss": 0.075, + "reward": 1.5631065297173337, + "reward_std": 0.3414640254137339, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14314347317558712, + "rewards/format_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9437500011175871, + "step": 2680 + }, + { + "completion_length": 1024.0, + "epoch": 0.5930343313869217, + "grad_norm": 1.5488170229596216, + "kl": 0.488629150390625, + "learning_rate": 8.501952589053694e-06, + "loss": 0.0196, + "reward": 1.6224470026791096, + "reward_std": 0.35797781147266505, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16921966964146123, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.9291666716337204, + "step": 2685 + }, + { + "completion_length": 1024.0, + "epoch": 0.5941386783727447, + "grad_norm": 4.133285080841036, + "kl": 0.826751708984375, + "learning_rate": 8.463842295885712e-06, + "loss": 0.0331, + "reward": 1.6491547813639045, + "reward_std": 0.37279988423979377, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12792855495936237, + "rewards/format_reward": 0.9, + "rewards/reasoning_steps_reward": 0.8645833399146795, + "step": 2690 + }, + { + "completion_length": 1024.0, + "epoch": 0.5952430253585677, + "grad_norm": 1.5372371953697817, + "kl": 1.359478759765625, + "learning_rate": 8.425754839397917e-06, + "loss": 0.0544, + "reward": 1.5732461759354919, + "reward_std": 0.4112935331926565, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.18092048349790274, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.8604166703298688, + "step": 2695 + }, + { + "completion_length": 1023.89375, + "epoch": 0.5963473723443906, + "grad_norm": 0.7879978615183016, + "kl": 0.82923583984375, + "learning_rate": 8.387690785802403e-06, + "loss": 0.0332, + "reward": 1.5549915019422769, + "reward_std": 0.42069571325846483, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14709182740189136, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9083333384245634, + "step": 2700 + }, + { + "epoch": 0.5963473723443906, + "eval_completion_length": 1024.0, + "eval_kl": 0.63541015625, + "eval_loss": 0.025506604462862015, + "eval_reward": 1.5887797927856446, + "eval_reward_std": 0.43425711914896964, + "eval_rewards/accuracy_reward": 0.015, + "eval_rewards/cosine_scaled_reward": -0.1462202015519142, + "eval_rewards/format_reward": 0.81, + "eval_rewards/reasoning_steps_reward": 0.9100000047683716, + "eval_runtime": 202.0281, + "eval_samples_per_second": 0.49, + "eval_steps_per_second": 0.124, + "step": 2700 + }, + { + "completion_length": 1024.0, + "epoch": 0.5974517193302136, + "grad_norm": 0.2954926442766836, + "kl": 0.60306396484375, + "learning_rate": 8.349650700963346e-06, + "loss": 0.0241, + "reward": 1.5327742783352734, + "reward_std": 0.43929143614368515, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.15055906748748385, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.877083340473473, + "step": 2705 + }, + { + "completion_length": 1024.0, + "epoch": 0.5985560663160365, + "grad_norm": 0.40255214529511685, + "kl": 0.2451416015625, + "learning_rate": 8.311635150388607e-06, + "loss": 0.0098, + "reward": 1.6107220947742462, + "reward_std": 0.3211441752166138, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16219456993276254, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.904166679829359, + "step": 2710 + }, + { + "completion_length": 1024.0, + "epoch": 0.5996604133018595, + "grad_norm": 1.0582522044938396, + "kl": 0.6796630859375, + "learning_rate": 8.273644699221309e-06, + "loss": 0.0272, + "reward": 1.708713711425662, + "reward_std": 0.3196752316202037, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.12670296079013496, + "rewards/format_reward": 0.925, + "rewards/reasoning_steps_reward": 0.8854166727513075, + "step": 2715 + }, + { + "completion_length": 1024.0, + "epoch": 0.6007647602876824, + "grad_norm": 2.277187831822091, + "kl": 1.67298583984375, + "learning_rate": 8.235679912231456e-06, + "loss": 0.0669, + "reward": 1.528065111860633, + "reward_std": 0.4964781049713565, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11776822947686014, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.827083338983357, + "step": 2720 + }, + { + "completion_length": 1023.20625, + "epoch": 0.6018691072735053, + "grad_norm": 18.823418701689295, + "kl": 3.2572021484375, + "learning_rate": 8.197741353807515e-06, + "loss": 0.1303, + "reward": 1.1230425384826959, + "reward_std": 0.7203874601105781, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.26654079704312605, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.6895833391696214, + "step": 2725 + }, + { + "completion_length": 1011.4875, + "epoch": 0.6029734542593282, + "grad_norm": 4.2381751634354945, + "kl": 4.9869384765625, + "learning_rate": 8.159829587948048e-06, + "loss": 0.1993, + "reward": -0.06850880788988434, + "reward_std": 0.2341864599104156, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.20809214230976067, + "rewards/format_reward": 0.06875, + "rewards/reasoning_steps_reward": 0.06458333525806666, + "step": 2730 + }, + { + "completion_length": 1024.0, + "epoch": 0.6040778012451512, + "grad_norm": 96482763.82745863, + "kl": 180908570442.25247, + "learning_rate": 8.1219451782533e-06, + "loss": 7239816806.4, + "reward": -0.38960654605180023, + "reward_std": 0.10572403705027682, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.41252321302890776, + "rewards/format_reward": 0.0125, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "step": 2735 + }, + { + "completion_length": 1024.0, + "epoch": 0.6051821482309742, + "grad_norm": 3267730.5942430696, + "kl": 421346.476171875, + "learning_rate": 8.084088687916853e-06, + "loss": 16876.4047, + "reward": -0.27517261541215704, + "reward_std": 0.13037438962201123, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.30642261592438447, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "step": 2740 + }, + { + "completion_length": 1024.0, + "epoch": 0.6062864952167971, + "grad_norm": 723724.1963519381, + "kl": 60747.18715820312, + "learning_rate": 8.046260679717225e-06, + "loss": 2430.2525, + "reward": -0.1426667131279828, + "reward_std": 0.13045339315722232, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17391671401273925, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "step": 2745 + }, + { + "completion_length": 1024.0, + "epoch": 0.6073908422026201, + "grad_norm": 1441.8019454907167, + "kl": 521.3070068359375, + "learning_rate": 8.00846171600952e-06, + "loss": 20.84, + "reward": -0.2268078915774822, + "reward_std": 0.1437462084002618, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2684745579957962, + "rewards/format_reward": 0.03125, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "step": 2750 + }, + { + "completion_length": 1024.0, + "epoch": 0.608495189188443, + "grad_norm": 9.380650885562765, + "kl": 684.062841796875, + "learning_rate": 7.970692358717067e-06, + "loss": 27.3489, + "reward": -0.055707710242131725, + "reward_std": 0.30809029219599326, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.259874378569657, + "rewards/format_reward": 0.1875, + "rewards/reasoning_steps_reward": 0.01666666716337204, + "step": 2755 + }, + { + "completion_length": 1024.0, + "epoch": 0.609599536174266, + "grad_norm": 22.947054770935207, + "kl": 3.025634765625, + "learning_rate": 7.932953169323057e-06, + "loss": 0.1211, + "reward": 0.1340429156436585, + "reward_std": 0.403066113893874, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.18887375577469356, + "rewards/format_reward": 0.275, + "rewards/reasoning_steps_reward": 0.022916667349636554, + "step": 2760 + }, + { + "completion_length": 1024.0, + "epoch": 0.610703883160089, + "grad_norm": 4.1216872479716695, + "kl": 1.19234619140625, + "learning_rate": 7.895244708862204e-06, + "loss": 0.0477, + "reward": 0.19553506562951953, + "reward_std": 0.39290520365666454, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21071493784547785, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "step": 2765 + }, + { + "completion_length": 1024.0, + "epoch": 0.6118082301459118, + "grad_norm": 1.6959155942935658, + "kl": 1.01761474609375, + "learning_rate": 7.857567537912404e-06, + "loss": 0.0407, + "reward": 0.5248618606012314, + "reward_std": 0.4952483652741648, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.21680481360817794, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.21666667181998492, + "step": 2770 + }, + { + "completion_length": 1024.0, + "epoch": 0.6129125771317347, + "grad_norm": 1.5482569009814988, + "kl": 0.517236328125, + "learning_rate": 7.8199222165864e-06, + "loss": 0.0207, + "reward": 0.8010378686711193, + "reward_std": 0.49899706967407836, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.20729547403170728, + "rewards/format_reward": 0.6375, + "rewards/reasoning_steps_reward": 0.37083333916962147, + "step": 2775 + }, + { + "completion_length": 1024.0, + "epoch": 0.6140169241175577, + "grad_norm": 2.4108682922884355, + "kl": 0.65213623046875, + "learning_rate": 7.78230930452345e-06, + "loss": 0.0261, + "reward": 1.0279582727060188, + "reward_std": 0.6527682813815773, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13454173824720783, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.43125001043081285, + "step": 2780 + }, + { + "completion_length": 1024.0, + "epoch": 0.6151212711033807, + "grad_norm": 3.0357670258767997, + "kl": 0.825726318359375, + "learning_rate": 7.744729360881023e-06, + "loss": 0.033, + "reward": 0.9146947997360257, + "reward_std": 0.5863862118949328, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13530521424108882, + "rewards/format_reward": 0.61875, + "rewards/reasoning_steps_reward": 0.4250000087544322, + "step": 2785 + }, + { + "completion_length": 1024.0, + "epoch": 0.6162256180892036, + "grad_norm": 5.402255250510556, + "kl": 1.20025634765625, + "learning_rate": 7.70718294432646e-06, + "loss": 0.048, + "reward": 0.8170664728269912, + "reward_std": 0.5222204860349848, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.20168353992048652, + "rewards/format_reward": 0.525, + "rewards/reasoning_steps_reward": 0.4937500128522515, + "step": 2790 + }, + { + "completion_length": 1024.0, + "epoch": 0.6173299650750266, + "grad_norm": 2.6490412611159906, + "kl": 0.7834259033203125, + "learning_rate": 7.669670613028705e-06, + "loss": 0.0313, + "reward": 0.9308588748739567, + "reward_std": 0.5479372062931361, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21289113731472753, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.5562500149011612, + "step": 2795 + }, + { + "completion_length": 1024.0, + "epoch": 0.6184343120608495, + "grad_norm": 0.9296004266230328, + "kl": 0.679315185546875, + "learning_rate": 7.632192924649969e-06, + "loss": 0.0272, + "reward": 1.3005210721777984, + "reward_std": 0.42411895469613226, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.20572894245560747, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.668750011920929, + "step": 2800 + }, + { + "epoch": 0.6184343120608495, + "eval_completion_length": 1024.0, + "eval_kl": 0.41427734375, + "eval_loss": 0.016660606488585472, + "eval_reward": 1.3896788090467453, + "eval_reward_std": 0.3565726025402546, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.2536545431613922, + "eval_rewards/format_reward": 0.89, + "eval_rewards/reasoning_steps_reward": 0.7533333519101143, + "eval_runtime": 203.0994, + "eval_samples_per_second": 0.487, + "eval_steps_per_second": 0.123, + "step": 2800 + }, + { + "completion_length": 1024.0, + "epoch": 0.6195386590466725, + "grad_norm": 0.819894636043627, + "kl": 0.622088623046875, + "learning_rate": 7.594750436337467e-06, + "loss": 0.0249, + "reward": 1.3817081528744892, + "reward_std": 0.39700459074229, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2516251891473075, + "rewards/format_reward": 0.88125, + "rewards/reasoning_steps_reward": 0.7520833408460021, + "step": 2805 + }, + { + "completion_length": 1024.0, + "epoch": 0.6206430060324954, + "grad_norm": 24.5216205783706, + "kl": 0.28448486328125, + "learning_rate": 7.557343704715121e-06, + "loss": 0.0114, + "reward": 1.6452918566763401, + "reward_std": 0.2875926383348997, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2067914859391749, + "rewards/format_reward": 0.95625, + "rewards/reasoning_steps_reward": 0.8958333423361182, + "step": 2810 + }, + { + "completion_length": 1024.0, + "epoch": 0.6217473530183184, + "grad_norm": 0.4780364776400206, + "kl": 2.242742919921875, + "learning_rate": 7.519973285875303e-06, + "loss": 0.0896, + "reward": 1.669452325697057, + "reward_std": 0.25911546872521285, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.18888101823395118, + "rewards/format_reward": 0.93125, + "rewards/reasoning_steps_reward": 0.9270833373069763, + "step": 2815 + }, + { + "completion_length": 1024.0, + "epoch": 0.6228517000041413, + "grad_norm": 0.30599849603614737, + "kl": 0.239642333984375, + "learning_rate": 7.482639735370536e-06, + "loss": 0.0096, + "reward": 1.7530787236988545, + "reward_std": 0.22120716450335748, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1531712787807919, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9500000029802322, + "step": 2820 + }, + { + "completion_length": 1024.0, + "epoch": 0.6239560469899642, + "grad_norm": 0.3026466856046581, + "kl": 0.4430419921875, + "learning_rate": 7.445343608205273e-06, + "loss": 0.0177, + "reward": 1.7619009755551815, + "reward_std": 0.2164007437779219, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15059902559150942, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9625000029802322, + "step": 2825 + }, + { + "completion_length": 1024.0, + "epoch": 0.6250603939757872, + "grad_norm": 7.081063593030829, + "kl": 1.41685791015625, + "learning_rate": 7.408085458827612e-06, + "loss": 0.0566, + "reward": 1.679977324604988, + "reward_std": 0.3088035559238051, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13877267238276544, + "rewards/format_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9437500014901161, + "step": 2830 + }, + { + "completion_length": 1024.0, + "epoch": 0.6261647409616101, + "grad_norm": 2.1946555791360374, + "kl": 0.98446044921875, + "learning_rate": 7.37086584112108e-06, + "loss": 0.0394, + "reward": 1.635563358478248, + "reward_std": 0.3597205114591816, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11443664449179777, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.9187500059604645, + "step": 2835 + }, + { + "completion_length": 1024.0, + "epoch": 0.6272690879474331, + "grad_norm": 5.092191379211699, + "kl": 1.26165771484375, + "learning_rate": 7.333685308396383e-06, + "loss": 0.0505, + "reward": 1.437623752374202, + "reward_std": 0.564753658437985, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11445958482654532, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.8333333428949118, + "step": 2840 + }, + { + "completion_length": 1024.0, + "epoch": 0.628373434933256, + "grad_norm": 0.3518558091842223, + "kl": 0.6064208984375, + "learning_rate": 7.2965444133831905e-06, + "loss": 0.0243, + "reward": 1.6313017681241035, + "reward_std": 0.39246049159555696, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12494824056047946, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.900000006891787, + "step": 2845 + }, + { + "completion_length": 1024.0, + "epoch": 0.629477781919079, + "grad_norm": 0.4555829404371441, + "kl": 0.33995361328125, + "learning_rate": 7.2594437082219074e-06, + "loss": 0.0136, + "reward": 1.7459608260542154, + "reward_std": 0.23804061831906437, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12487251611310057, + "rewards/format_reward": 0.95, + "rewards/reasoning_steps_reward": 0.9145833410322666, + "step": 2850 + }, + { + "completion_length": 1024.0, + "epoch": 0.630582128904902, + "grad_norm": 1.0048391384575739, + "kl": 0.7876708984375, + "learning_rate": 7.222383744455477e-06, + "loss": 0.0315, + "reward": 1.6474058616906404, + "reward_std": 0.2795259444072144, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12759414308093256, + "rewards/format_reward": 0.91875, + "rewards/reasoning_steps_reward": 0.8562500108033418, + "step": 2855 + }, + { + "completion_length": 1024.0, + "epoch": 0.6316864758907249, + "grad_norm": 13.683680496200315, + "kl": 1.6222900390625, + "learning_rate": 7.185365073021171e-06, + "loss": 0.0649, + "reward": 1.6242793073877693, + "reward_std": 0.4118321215661126, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11113736048791907, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.8666666738688946, + "step": 2860 + }, + { + "completion_length": 1024.0, + "epoch": 0.6327908228765479, + "grad_norm": 15.701500203307981, + "kl": 5.002978515625, + "learning_rate": 7.148388244242414e-06, + "loss": 0.2001, + "reward": 0.950605523493141, + "reward_std": 0.5147591066779569, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.18481114405440166, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.6604166783392429, + "step": 2865 + }, + { + "completion_length": 1024.0, + "epoch": 0.6338951698623707, + "grad_norm": 7.517215117513362, + "kl": 4.4798828125, + "learning_rate": 7.111453807820587e-06, + "loss": 0.1791, + "reward": 0.7040133336733561, + "reward_std": 0.5188803709228523, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.17307000582586624, + "rewards/format_reward": 0.2625, + "rewards/reasoning_steps_reward": 0.6020833441987634, + "step": 2870 + }, + { + "completion_length": 1024.0, + "epoch": 0.6349995168481937, + "grad_norm": 2.7433463665535838, + "kl": 1.028369140625, + "learning_rate": 7.0745623128268605e-06, + "loss": 0.0411, + "reward": 0.9007511441479437, + "reward_std": 0.5913131707464345, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17008219271956476, + "rewards/format_reward": 0.35625, + "rewards/reasoning_steps_reward": 0.7145833415910602, + "step": 2875 + }, + { + "completion_length": 1024.0, + "epoch": 0.6361038638340166, + "grad_norm": 4.233552397302543, + "kl": 1.39521484375, + "learning_rate": 7.037714307694038e-06, + "loss": 0.0558, + "reward": 0.8574047698173672, + "reward_std": 0.5745913892163514, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1780118998591206, + "rewards/format_reward": 0.33125, + "rewards/reasoning_steps_reward": 0.7041666755452752, + "step": 2880 + }, + { + "completion_length": 1024.0, + "epoch": 0.6372082108198396, + "grad_norm": 7.045129815332537, + "kl": 2.99322509765625, + "learning_rate": 7.000910340208393e-06, + "loss": 0.1197, + "reward": 0.6241100358776748, + "reward_std": 0.6179765696193499, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2154732992494246, + "rewards/format_reward": 0.225, + "rewards/reasoning_steps_reward": 0.6145833460614085, + "step": 2885 + }, + { + "completion_length": 1024.0, + "epoch": 0.6383125578056625, + "grad_norm": 2.9137501903252745, + "kl": 2.52530517578125, + "learning_rate": 6.964150957501538e-06, + "loss": 0.101, + "reward": 0.6827161773107946, + "reward_std": 0.5930506098521618, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2006171574297241, + "rewards/format_reward": 0.2625, + "rewards/reasoning_steps_reward": 0.6208333447575569, + "step": 2890 + }, + { + "completion_length": 1024.0, + "epoch": 0.6394169047914855, + "grad_norm": 3.1443553762227796, + "kl": 1.66383056640625, + "learning_rate": 6.927436706042276e-06, + "loss": 0.0666, + "reward": 0.8434384000953287, + "reward_std": 0.697918272089737, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21072826540221284, + "rewards/format_reward": 0.3625, + "rewards/reasoning_steps_reward": 0.6916666748002172, + "step": 2895 + }, + { + "completion_length": 1024.0, + "epoch": 0.6405212517773085, + "grad_norm": 2.1234073498427977, + "kl": 1.063555908203125, + "learning_rate": 6.890768131628492e-06, + "loss": 0.0425, + "reward": 1.0372354218969122, + "reward_std": 0.5802448318753705, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16901458023348823, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7375000087544322, + "step": 2900 + }, + { + "epoch": 0.6405212517773085, + "eval_completion_length": 1024.0, + "eval_kl": 1.18333984375, + "eval_loss": 0.04755154624581337, + "eval_reward": 1.1212347888946532, + "eval_reward_std": 0.60481853954494, + "eval_rewards/accuracy_reward": 0.025, + "eval_rewards/cosine_scaled_reward": -0.1587652049958706, + "eval_rewards/format_reward": 0.475, + "eval_rewards/reasoning_steps_reward": 0.7800000095367432, + "eval_runtime": 203.4898, + "eval_samples_per_second": 0.487, + "eval_steps_per_second": 0.123, + "step": 2900 + }, + { + "completion_length": 1024.0, + "epoch": 0.6416255987631314, + "grad_norm": 8.761149447970329, + "kl": 1.7990478515625, + "learning_rate": 6.8541457793790204e-06, + "loss": 0.0719, + "reward": 1.0065348925068975, + "reward_std": 0.7884352072956972, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.19346511634066701, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7187500128522515, + "step": 2905 + }, + { + "completion_length": 1022.025, + "epoch": 0.6427299457489544, + "grad_norm": 37.871727195710804, + "kl": 3.3865478515625, + "learning_rate": 6.8175701937255645e-06, + "loss": 0.1355, + "reward": 0.8355722818523645, + "reward_std": 0.5863746992239612, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.2581777243176475, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7000000117346644, + "step": 2910 + }, + { + "completion_length": 1022.73125, + "epoch": 0.6438342927347772, + "grad_norm": 7.935357600252908, + "kl": 2.79166259765625, + "learning_rate": 6.781041918404578e-06, + "loss": 0.1117, + "reward": 0.8582875849679112, + "reward_std": 0.7126139059808339, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.20629576151259243, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.677083345130086, + "step": 2915 + }, + { + "completion_length": 1024.0, + "epoch": 0.6449386397206002, + "grad_norm": 1.322793615061763, + "kl": 1.676708984375, + "learning_rate": 6.744561496449208e-06, + "loss": 0.0671, + "reward": 1.1432567204814403, + "reward_std": 0.6100759642431512, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17132661846117117, + "rewards/format_reward": 0.55, + "rewards/reasoning_steps_reward": 0.7645833445712924, + "step": 2920 + }, + { + "completion_length": 1024.0, + "epoch": 0.6460429867064231, + "grad_norm": 3.0878247518849595, + "kl": 1.036236572265625, + "learning_rate": 6.708129470181197e-06, + "loss": 0.0414, + "reward": 1.1472035638988018, + "reward_std": 0.6603698913229892, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14029642865643838, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7812500115483999, + "step": 2925 + }, + { + "completion_length": 1024.0, + "epoch": 0.6471473336922461, + "grad_norm": 7.62569345930808, + "kl": 0.920257568359375, + "learning_rate": 6.671746381202835e-06, + "loss": 0.0368, + "reward": 1.327859591320157, + "reward_std": 0.5649135158251738, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14089040720136836, + "rewards/format_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8937500085681677, + "step": 2930 + }, + { + "completion_length": 1024.0, + "epoch": 0.648251680678069, + "grad_norm": 3.927819408054046, + "kl": 1.06234130859375, + "learning_rate": 6.635412770388911e-06, + "loss": 0.0425, + "reward": 1.2363583998754621, + "reward_std": 0.5483630039729178, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.17614159435615875, + "rewards/format_reward": 0.56875, + "rewards/reasoning_steps_reward": 0.8375000089406968, + "step": 2935 + }, + { + "completion_length": 1024.0, + "epoch": 0.649356027663892, + "grad_norm": 10.957938169885486, + "kl": 1.73878173828125, + "learning_rate": 6.5991291778786556e-06, + "loss": 0.0696, + "reward": 1.2223307210952044, + "reward_std": 0.5801997775997734, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.18808595033478923, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.8416666766628623, + "step": 2940 + }, + { + "completion_length": 1024.0, + "epoch": 0.650460374649715, + "grad_norm": 7.657170788292764, + "kl": 0.892596435546875, + "learning_rate": 6.562896143067734e-06, + "loss": 0.0357, + "reward": 1.2322623513638973, + "reward_std": 0.6118116105441004, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1469043156481348, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.8291666766628623, + "step": 2945 + }, + { + "completion_length": 1024.0, + "epoch": 0.6515647216355379, + "grad_norm": 8.99018306454909, + "kl": 1.005328369140625, + "learning_rate": 6.526714204600212e-06, + "loss": 0.0402, + "reward": 1.110228473204188, + "reward_std": 0.5596809437090997, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14393819727119989, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7791666759178042, + "step": 2950 + }, + { + "completion_length": 1024.0, + "epoch": 0.6526690686213609, + "grad_norm": 7.671710767110563, + "kl": 0.838983154296875, + "learning_rate": 6.490583900360543e-06, + "loss": 0.0336, + "reward": 1.3401226574555039, + "reward_std": 0.553086530593282, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13904401452746243, + "rewards/format_reward": 0.60625, + "rewards/reasoning_steps_reward": 0.8604166749864817, + "step": 2955 + }, + { + "completion_length": 1024.0, + "epoch": 0.6537734156071838, + "grad_norm": 3.292747371908634, + "kl": 0.92437744140625, + "learning_rate": 6.4545057674655954e-06, + "loss": 0.037, + "reward": 1.328625155496411, + "reward_std": 0.4291026462393347, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12137485419007135, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.8125000057742, + "step": 2960 + }, + { + "completion_length": 1024.0, + "epoch": 0.6548777625930067, + "grad_norm": 4.362719186181906, + "kl": 0.94898681640625, + "learning_rate": 6.418480342256635e-06, + "loss": 0.038, + "reward": 1.357650207653205, + "reward_std": 0.538313817546441, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13609979636312347, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.862500006146729, + "step": 2965 + }, + { + "completion_length": 1024.0, + "epoch": 0.6559821095788296, + "grad_norm": 6.047466207524628, + "kl": 1.5346435546875, + "learning_rate": 6.38250816029139e-06, + "loss": 0.0614, + "reward": 1.3210783490445466, + "reward_std": 0.5543214490637183, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13100498942367267, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 0.8270833432674408, + "step": 2970 + }, + { + "completion_length": 1024.0, + "epoch": 0.6570864565646526, + "grad_norm": 11.380391296560157, + "kl": 0.86226806640625, + "learning_rate": 6.34658975633605e-06, + "loss": 0.0345, + "reward": 1.3079326836625114, + "reward_std": 0.427550901808354, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12540064963977784, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.8083333384245635, + "step": 2975 + }, + { + "completion_length": 1024.0, + "epoch": 0.6581908035504755, + "grad_norm": 3.430002203421272, + "kl": 1.301171875, + "learning_rate": 6.310725664357349e-06, + "loss": 0.0521, + "reward": 1.3198414511978627, + "reward_std": 0.5648883628775365, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.08849188148742541, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.8270833415910601, + "step": 2980 + }, + { + "completion_length": 1024.0, + "epoch": 0.6592951505362985, + "grad_norm": 10.740243313709202, + "kl": 1.353033447265625, + "learning_rate": 6.274916417514605e-06, + "loss": 0.0542, + "reward": 1.3674539031460882, + "reward_std": 0.3986359235073905, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12837943203630858, + "rewards/format_reward": 0.6625, + "rewards/reasoning_steps_reward": 0.8270833410322667, + "step": 2985 + }, + { + "completion_length": 1024.0, + "epoch": 0.6603994975221215, + "grad_norm": 6.295784777875361, + "kl": 1.66905517578125, + "learning_rate": 6.239162548151809e-06, + "loss": 0.0667, + "reward": 1.2881278064567596, + "reward_std": 0.6237002839145134, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.1264555389279849, + "rewards/format_reward": 0.58125, + "rewards/reasoning_steps_reward": 0.8083333417773246, + "step": 2990 + }, + { + "completion_length": 1024.0, + "epoch": 0.6615038445079444, + "grad_norm": 4.214532608040704, + "kl": 1.88565673828125, + "learning_rate": 6.2034645877897e-06, + "loss": 0.0754, + "reward": 1.2514354882296175, + "reward_std": 0.5845151668967447, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.11939785096910782, + "rewards/format_reward": 0.55, + "rewards/reasoning_steps_reward": 0.7958333421498537, + "step": 2995 + }, + { + "completion_length": 1024.0, + "epoch": 0.6626081914937674, + "grad_norm": 9.193960699184903, + "kl": 2.572412109375, + "learning_rate": 6.167823067117868e-06, + "loss": 0.1029, + "reward": 1.358169614057988, + "reward_std": 0.5987432187257582, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11891372348181903, + "rewards/format_reward": 0.63125, + "rewards/reasoning_steps_reward": 0.8270833395421505, + "step": 3000 + }, + { + "epoch": 0.6626081914937674, + "eval_completion_length": 1024.0, + "eval_kl": 0.82634765625, + "eval_loss": 0.03317331522703171, + "eval_reward": 1.4329055428504944, + "eval_reward_std": 0.47894756741821765, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.15376112081110477, + "eval_rewards/format_reward": 0.69, + "eval_rewards/reasoning_steps_reward": 0.896666671037674, + "eval_runtime": 202.2678, + "eval_samples_per_second": 0.489, + "eval_steps_per_second": 0.124, + "step": 3000 + }, + { + "completion_length": 1024.0, + "epoch": 0.6637125384795903, + "grad_norm": 4.332199272935333, + "kl": 1.16727294921875, + "learning_rate": 6.132238515986868e-06, + "loss": 0.0467, + "reward": 1.2984854570306197, + "reward_std": 0.4428606638291967, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12859787809356932, + "rewards/format_reward": 0.61875, + "rewards/reasoning_steps_reward": 0.8083333378657699, + "step": 3005 + }, + { + "completion_length": 1024.0, + "epoch": 0.6648168854654133, + "grad_norm": 6.405145163446423, + "kl": 1.905499267578125, + "learning_rate": 6.096711463400333e-06, + "loss": 0.0762, + "reward": 1.4704199727624654, + "reward_std": 0.47390592549927535, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15249669990153053, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.8979166725650429, + "step": 3010 + }, + { + "completion_length": 1024.0, + "epoch": 0.6659212324512361, + "grad_norm": 2.7983690454882835, + "kl": 1.94473876953125, + "learning_rate": 6.061242437507131e-06, + "loss": 0.0778, + "reward": 1.250732819433324, + "reward_std": 0.6140466192155145, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.15968385117303113, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.8354166734963655, + "step": 3015 + }, + { + "completion_length": 1024.0, + "epoch": 0.6670255794370591, + "grad_norm": 2.84797048221281, + "kl": 2.10213623046875, + "learning_rate": 6.025831965593479e-06, + "loss": 0.0841, + "reward": 1.107512214500457, + "reward_std": 0.5237852192483843, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.19457112488453276, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.8270833421498537, + "step": 3020 + }, + { + "completion_length": 1024.0, + "epoch": 0.668129926422882, + "grad_norm": 1.7479375003128341, + "kl": 2.409619140625, + "learning_rate": 5.990480574075143e-06, + "loss": 0.0963, + "reward": 1.113942611636594, + "reward_std": 0.6643439802435751, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1798073928861413, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.8125000087544322, + "step": 3025 + }, + { + "completion_length": 1024.0, + "epoch": 0.669234273408705, + "grad_norm": 4.910115773642404, + "kl": 2.628387451171875, + "learning_rate": 5.955188788489583e-06, + "loss": 0.1052, + "reward": 1.0177073845639826, + "reward_std": 0.6183391271624714, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.18854261809028686, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.8250000083819031, + "step": 3030 + }, + { + "completion_length": 1024.0, + "epoch": 0.670338620394528, + "grad_norm": 2.5567080757004326, + "kl": 1.95263671875, + "learning_rate": 5.919957133488155e-06, + "loss": 0.078, + "reward": 0.9483737903181464, + "reward_std": 0.5370062646285078, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1974595475825481, + "rewards/format_reward": 0.35, + "rewards/reasoning_steps_reward": 0.7958333401009441, + "step": 3035 + }, + { + "completion_length": 1024.0, + "epoch": 0.6714429673803509, + "grad_norm": 53.531605237061235, + "kl": 1.660748291015625, + "learning_rate": 5.884786132828304e-06, + "loss": 0.0664, + "reward": 1.0435184644535185, + "reward_std": 0.6034316588047659, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.18773154099471867, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.8437500104308129, + "step": 3040 + }, + { + "completion_length": 1024.0, + "epoch": 0.6725473143661739, + "grad_norm": 22.074165945119677, + "kl": 4.23909912109375, + "learning_rate": 5.849676309365786e-06, + "loss": 0.1697, + "reward": 1.0926960329525173, + "reward_std": 0.6052615458262153, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.18022063072421587, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.7979166788980365, + "step": 3045 + }, + { + "completion_length": 1024.0, + "epoch": 0.6736516613519968, + "grad_norm": 95.44466344567884, + "kl": 3.39718017578125, + "learning_rate": 5.814628185046884e-06, + "loss": 0.1359, + "reward": 1.2230933974977234, + "reward_std": 0.6664319176386926, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1852399377487018, + "rewards/format_reward": 0.58125, + "rewards/reasoning_steps_reward": 0.8145833408460021, + "step": 3050 + }, + { + "completion_length": 1024.0, + "epoch": 0.6747560083378198, + "grad_norm": 6.30138216782726, + "kl": 4.382000732421875, + "learning_rate": 5.779642280900668e-06, + "loss": 0.1753, + "reward": 1.2127747944556178, + "reward_std": 0.521913470455911, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.18930854120990262, + "rewards/format_reward": 0.58125, + "rewards/reasoning_steps_reward": 0.8208333421498537, + "step": 3055 + }, + { + "completion_length": 1024.0, + "epoch": 0.6758603553236426, + "grad_norm": 3.3382749208758082, + "kl": 1.1747314453125, + "learning_rate": 5.744719117031217e-06, + "loss": 0.047, + "reward": 1.2256219832226634, + "reward_std": 0.6094286283361725, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.19521136274852324, + "rewards/format_reward": 0.6125, + "rewards/reasoning_steps_reward": 0.7958333427086473, + "step": 3060 + }, + { + "completion_length": 1024.0, + "epoch": 0.6769647023094656, + "grad_norm": 1.0005265918987625, + "kl": 1.160723876953125, + "learning_rate": 5.709859212609919e-06, + "loss": 0.0464, + "reward": 1.1743648422183468, + "reward_std": 0.4749204738356639, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.20063516062800774, + "rewards/format_reward": 0.58125, + "rewards/reasoning_steps_reward": 0.7875000093132257, + "step": 3065 + }, + { + "completion_length": 1022.075, + "epoch": 0.6780690492952886, + "grad_norm": 1.821401805667734, + "kl": 1.6499267578125, + "learning_rate": 5.675063085867747e-06, + "loss": 0.066, + "reward": 1.3006652948854025, + "reward_std": 0.5155112744258077, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.21600137041823472, + "rewards/format_reward": 0.675, + "rewards/reasoning_steps_reward": 0.8416666749864816, + "step": 3070 + }, + { + "completion_length": 1024.0, + "epoch": 0.6791733962811115, + "grad_norm": 5.981269217881959, + "kl": 1.138519287109375, + "learning_rate": 5.6403312540875325e-06, + "loss": 0.0456, + "reward": 1.1754640196362742, + "reward_std": 0.5700981944799424, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.17036931174516212, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.7708333428949118, + "step": 3075 + }, + { + "completion_length": 1024.0, + "epoch": 0.6802777432669345, + "grad_norm": 0.41069015629939837, + "kl": 0.76771240234375, + "learning_rate": 5.6056642335963e-06, + "loss": 0.0307, + "reward": 1.1382618664763868, + "reward_std": 0.5532342360922484, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12007147440890549, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7770833456888795, + "step": 3080 + }, + { + "completion_length": 1024.0, + "epoch": 0.6813820902527574, + "grad_norm": 0.45858301017395814, + "kl": 0.736260986328125, + "learning_rate": 5.571062539757582e-06, + "loss": 0.0295, + "reward": 1.1341605888563209, + "reward_std": 0.5165931562354672, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12417274994077161, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7770833443850279, + "step": 3085 + }, + { + "completion_length": 1024.0, + "epoch": 0.6824864372385804, + "grad_norm": 0.6870617586646819, + "kl": 0.554400634765625, + "learning_rate": 5.536526686963762e-06, + "loss": 0.0222, + "reward": 1.3453298162668943, + "reward_std": 0.41773030079348245, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14425351886893623, + "rewards/format_reward": 0.64375, + "rewards/reasoning_steps_reward": 0.845833346620202, + "step": 3090 + }, + { + "completion_length": 1024.0, + "epoch": 0.6835907842244033, + "grad_norm": 0.2919537380153754, + "kl": 0.2393585205078125, + "learning_rate": 5.50205718862841e-06, + "loss": 0.0096, + "reward": 1.4464579613879323, + "reward_std": 0.3893053664593026, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12229204796021804, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.8437500229105354, + "step": 3095 + }, + { + "completion_length": 1024.0, + "epoch": 0.6846951312102263, + "grad_norm": 0.8647207902345468, + "kl": 0.148114013671875, + "learning_rate": 5.467654557178679e-06, + "loss": 0.0059, + "reward": 1.5913012862205504, + "reward_std": 0.28262074058166037, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.131615390918887, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.8729166831821203, + "step": 3100 + }, + { + "epoch": 0.6846951312102263, + "eval_completion_length": 1024.0, + "eval_kl": 0.34259765625, + "eval_loss": 0.013789056800305843, + "eval_reward": 1.687197803258896, + "eval_reward_std": 0.24495098181068897, + "eval_rewards/accuracy_reward": 0.005, + "eval_rewards/cosine_scaled_reward": -0.12946887340396643, + "eval_rewards/format_reward": 0.93, + "eval_rewards/reasoning_steps_reward": 0.881666682958603, + "eval_runtime": 201.4218, + "eval_samples_per_second": 0.492, + "eval_steps_per_second": 0.124, + "step": 3100 + }, + { + "completion_length": 1024.0, + "epoch": 0.6857994781960493, + "grad_norm": 0.20802814162316793, + "kl": 0.171466064453125, + "learning_rate": 5.433319304047666e-06, + "loss": 0.0069, + "reward": 1.7369946524500848, + "reward_std": 0.22078395002754406, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15050535116752145, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 0.9250000149011612, + "step": 3105 + }, + { + "completion_length": 1024.0, + "epoch": 0.6869038251818721, + "grad_norm": 0.22469975874112494, + "kl": 0.1765625, + "learning_rate": 5.399051939666817e-06, + "loss": 0.0071, + "reward": 1.8477778874337674, + "reward_std": 0.16415738863433943, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.10013877979945392, + "rewards/format_reward": 0.9625, + "rewards/reasoning_steps_reward": 0.9729166686534881, + "step": 3110 + }, + { + "completion_length": 1024.0, + "epoch": 0.688008172167695, + "grad_norm": 0.29801419479847224, + "kl": 0.175927734375, + "learning_rate": 5.36485297345833e-06, + "loss": 0.007, + "reward": 1.815246258676052, + "reward_std": 0.18331017740074457, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.11392041069957486, + "rewards/format_reward": 0.94375, + "rewards/reasoning_steps_reward": 0.9791666708886624, + "step": 3115 + }, + { + "completion_length": 1024.0, + "epoch": 0.689112519153518, + "grad_norm": 0.29642392200106615, + "kl": 0.22320556640625, + "learning_rate": 5.330722913827594e-06, + "loss": 0.0089, + "reward": 1.831030797213316, + "reward_std": 0.12136474607978016, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09813587362295947, + "rewards/format_reward": 0.95625, + "rewards/reasoning_steps_reward": 0.9729166693985463, + "step": 3120 + }, + { + "completion_length": 1024.0, + "epoch": 0.690216866139341, + "grad_norm": 0.7212414204228248, + "kl": 0.284033203125, + "learning_rate": 5.29666226815563e-06, + "loss": 0.0114, + "reward": 1.931699000298977, + "reward_std": 0.184273699127516, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.05580101099476451, + "rewards/format_reward": 0.96875, + "rewards/reasoning_steps_reward": 0.993750000745058, + "step": 3125 + }, + { + "completion_length": 1024.0, + "epoch": 0.6913212131251639, + "grad_norm": 2.361609329654343, + "kl": 0.662255859375, + "learning_rate": 5.262671542791531e-06, + "loss": 0.0265, + "reward": 1.7818097308278085, + "reward_std": 0.2128821130763754, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.09319027925921546, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.9812500014901161, + "step": 3130 + }, + { + "completion_length": 1024.0, + "epoch": 0.6924255601109869, + "grad_norm": 12.86003879073767, + "kl": 2.10830078125, + "learning_rate": 5.228751243044961e-06, + "loss": 0.0843, + "reward": 1.560298126633279, + "reward_std": 0.4203119643294485, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10845187624436221, + "rewards/format_reward": 0.7375, + "rewards/reasoning_steps_reward": 0.9250000037252903, + "step": 3135 + }, + { + "completion_length": 1024.0, + "epoch": 0.6935299070968098, + "grad_norm": 0.7414360033034371, + "kl": 2.04703369140625, + "learning_rate": 5.194901873178622e-06, + "loss": 0.0819, + "reward": 1.4238286472856998, + "reward_std": 0.524612655222063, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13658802091204053, + "rewards/format_reward": 0.6375, + "rewards/reasoning_steps_reward": 0.9166666744276881, + "step": 3140 + }, + { + "completion_length": 1024.0, + "epoch": 0.6946342540826328, + "grad_norm": 1.948404998094558, + "kl": 0.603228759765625, + "learning_rate": 5.1611239364007694e-06, + "loss": 0.0241, + "reward": 1.5344082202762366, + "reward_std": 0.4847637562903401, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.08642512287769932, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.9333333365619183, + "step": 3145 + }, + { + "completion_length": 1024.0, + "epoch": 0.6957386010684558, + "grad_norm": 1.966165060762746, + "kl": 0.9482421875, + "learning_rate": 5.127417934857718e-06, + "loss": 0.0379, + "reward": 1.6038374023512005, + "reward_std": 0.4338164870128821, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11282926524436335, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.9104166701436043, + "step": 3150 + }, + { + "completion_length": 1024.0, + "epoch": 0.6968429480542786, + "grad_norm": 1.209511319929151, + "kl": 1.12919921875, + "learning_rate": 5.093784369626397e-06, + "loss": 0.0452, + "reward": 1.657803256250918, + "reward_std": 0.4734325369603539, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.11303008127142675, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.9145833380520344, + "step": 3155 + }, + { + "completion_length": 1024.0, + "epoch": 0.6979472950401016, + "grad_norm": 2.5216009135121897, + "kl": 1.0867431640625, + "learning_rate": 5.060223740706883e-06, + "loss": 0.0435, + "reward": 1.681024150364101, + "reward_std": 0.35479468195644587, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13772585423866984, + "rewards/format_reward": 0.89375, + "rewards/reasoning_steps_reward": 0.9125000040978193, + "step": 3160 + }, + { + "completion_length": 1024.0, + "epoch": 0.6990516420259245, + "grad_norm": 0.8886495088503168, + "kl": 1.002899169921875, + "learning_rate": 5.026736547014981e-06, + "loss": 0.0401, + "reward": 1.5900187201797962, + "reward_std": 0.3701528381861863, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14123128662467935, + "rewards/format_reward": 0.85, + "rewards/reasoning_steps_reward": 0.8750000044703483, + "step": 3165 + }, + { + "completion_length": 1024.0, + "epoch": 0.7001559890117475, + "grad_norm": 27.57217136094774, + "kl": 0.6616455078125, + "learning_rate": 4.993323286374787e-06, + "loss": 0.0265, + "reward": 1.66028910233872, + "reward_std": 0.4426726102217799, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11054423401947133, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.8895833365619182, + "step": 3170 + }, + { + "completion_length": 1024.0, + "epoch": 0.7012603359975704, + "grad_norm": 5.439057868497935, + "kl": 1.2489013671875, + "learning_rate": 4.959984455511313e-06, + "loss": 0.05, + "reward": 1.7117673270404339, + "reward_std": 0.3744364564627176, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.0944826710416237, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.9125000016763807, + "step": 3175 + }, + { + "completion_length": 1024.0, + "epoch": 0.7023646829833934, + "grad_norm": 1.2836582871647162, + "kl": 1.3407470703125, + "learning_rate": 4.926720550043089e-06, + "loss": 0.0536, + "reward": 1.5685864306986332, + "reward_std": 0.4689555343808024, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11266357623535442, + "rewards/format_reward": 0.81875, + "rewards/reasoning_steps_reward": 0.8437500027939677, + "step": 3180 + }, + { + "completion_length": 1024.0, + "epoch": 0.7034690299692163, + "grad_norm": 2.055876499575165, + "kl": 0.6889404296875, + "learning_rate": 4.893532064474787e-06, + "loss": 0.0276, + "reward": 1.6821095246821642, + "reward_std": 0.35570907073124547, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.09914047343772836, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9000000033527613, + "step": 3185 + }, + { + "completion_length": 1024.0, + "epoch": 0.7045733769550393, + "grad_norm": 0.849845748423523, + "kl": 0.6326416015625, + "learning_rate": 4.860419492189886e-06, + "loss": 0.0253, + "reward": 1.7199891421943903, + "reward_std": 0.4098593617709412, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.09042752947134432, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.941666667163372, + "step": 3190 + }, + { + "completion_length": 1024.0, + "epoch": 0.7056777239408623, + "grad_norm": 1.5168426575194247, + "kl": 0.715838623046875, + "learning_rate": 4.827383325443331e-06, + "loss": 0.0286, + "reward": 1.6638664927333593, + "reward_std": 0.4076169016363565, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1132168541662395, + "rewards/format_reward": 0.84375, + "rewards/reasoning_steps_reward": 0.9208333348855376, + "step": 3195 + }, + { + "completion_length": 1024.0, + "epoch": 0.7067820709266852, + "grad_norm": 1.096524838167028, + "kl": 0.705718994140625, + "learning_rate": 4.794424055354213e-06, + "loss": 0.0283, + "reward": 1.6534100268036127, + "reward_std": 0.34983972859299683, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1278399708433426, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.9125000040978193, + "step": 3200 + }, + { + "epoch": 0.7067820709266852, + "eval_completion_length": 1024.0, + "eval_kl": 1.029453125, + "eval_loss": 0.041368499398231506, + "eval_reward": 1.6281694555282593, + "eval_reward_std": 0.3215235733985901, + "eval_rewards/accuracy_reward": 0.005, + "eval_rewards/cosine_scaled_reward": -0.12349721165373921, + "eval_rewards/format_reward": 0.85, + "eval_rewards/reasoning_steps_reward": 0.896666671037674, + "eval_runtime": 202.4017, + "eval_samples_per_second": 0.489, + "eval_steps_per_second": 0.124, + "step": 3200 + }, + { + "completion_length": 1024.0, + "epoch": 0.7078864179125081, + "grad_norm": 0.43758151226762365, + "kl": 1.76549072265625, + "learning_rate": 4.761542171898469e-06, + "loss": 0.0706, + "reward": 1.5854908142238855, + "reward_std": 0.4213532349691377, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1353425225330284, + "rewards/format_reward": 0.83125, + "rewards/reasoning_steps_reward": 0.8833333387970924, + "step": 3205 + }, + { + "completion_length": 1024.0, + "epoch": 0.708990764898331, + "grad_norm": 4.873529816062781, + "kl": 1.70841064453125, + "learning_rate": 4.728738163901597e-06, + "loss": 0.0684, + "reward": 1.571311548165977, + "reward_std": 0.5427065275493078, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.10577178624807856, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8333333378657699, + "step": 3210 + }, + { + "completion_length": 1024.0, + "epoch": 0.710095111884154, + "grad_norm": 2.164610457649961, + "kl": 2.211083984375, + "learning_rate": 4.696012519031397e-06, + "loss": 0.0885, + "reward": 1.3987850124016403, + "reward_std": 0.6345736057992326, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14913165720063262, + "rewards/format_reward": 0.7375, + "rewards/reasoning_steps_reward": 0.7979166723787785, + "step": 3215 + }, + { + "completion_length": 1024.0, + "epoch": 0.7111994588699769, + "grad_norm": 2.5538641739468426, + "kl": 1.83671875, + "learning_rate": 4.663365723790698e-06, + "loss": 0.0735, + "reward": 1.4091109903994947, + "reward_std": 0.6236417117870587, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15755568039567153, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.7979166768491268, + "step": 3220 + }, + { + "completion_length": 1024.0, + "epoch": 0.7123038058557999, + "grad_norm": 1.350379876708449, + "kl": 0.92711181640625, + "learning_rate": 4.630798263510162e-06, + "loss": 0.0371, + "reward": 1.528088748920709, + "reward_std": 0.629235400253674, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.13024459112311887, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8083333395421505, + "step": 3225 + }, + { + "completion_length": 1024.0, + "epoch": 0.7134081528416228, + "grad_norm": 0.7771904145990121, + "kl": 0.715679931640625, + "learning_rate": 4.598310622341037e-06, + "loss": 0.0286, + "reward": 1.4236940758302807, + "reward_std": 0.3954155091239954, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16797260310559067, + "rewards/format_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.7979166796430945, + "step": 3230 + }, + { + "completion_length": 1024.0, + "epoch": 0.7145124998274458, + "grad_norm": 0.8225437506678818, + "kl": 0.7590087890625, + "learning_rate": 4.565903283247981e-06, + "loss": 0.0304, + "reward": 1.4585623749531806, + "reward_std": 0.5462490013138449, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15185430651181378, + "rewards/format_reward": 0.8, + "rewards/reasoning_steps_reward": 0.7979166800156235, + "step": 3235 + }, + { + "completion_length": 1024.0, + "epoch": 0.7156168468132688, + "grad_norm": 6.890597605713054, + "kl": 1.92825927734375, + "learning_rate": 4.533576728001858e-06, + "loss": 0.0772, + "reward": 1.270110378577374, + "reward_std": 0.48464135241520123, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.12988962087547407, + "rewards/format_reward": 0.6625, + "rewards/reasoning_steps_reward": 0.7062500072643161, + "step": 3240 + }, + { + "completion_length": 1024.0, + "epoch": 0.7167211937990917, + "grad_norm": 7.0282697170862685, + "kl": 1.8246063232421874, + "learning_rate": 4.501331437172606e-06, + "loss": 0.073, + "reward": 1.0211014951419202, + "reward_std": 0.5092876911558051, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13723184492700966, + "rewards/format_reward": 0.53125, + "rewards/reasoning_steps_reward": 0.6208333423361182, + "step": 3245 + }, + { + "completion_length": 1024.0, + "epoch": 0.7178255407849147, + "grad_norm": 18.009744755911356, + "kl": 1.937615966796875, + "learning_rate": 4.469167890122073e-06, + "loss": 0.0775, + "reward": 0.9181118378648534, + "reward_std": 0.48384187065748846, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15897149957600049, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.6020833395421505, + "step": 3250 + }, + { + "completion_length": 1024.0, + "epoch": 0.7189298877707375, + "grad_norm": 11.741470309490381, + "kl": 1.74998779296875, + "learning_rate": 4.437086564996891e-06, + "loss": 0.07, + "reward": 0.7014416160909605, + "reward_std": 0.4629500539504988, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.19439171630911006, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.5208333391696215, + "step": 3255 + }, + { + "completion_length": 1024.0, + "epoch": 0.7200342347565605, + "grad_norm": 3.4613430082652017, + "kl": 0.99676513671875, + "learning_rate": 4.405087938721376e-06, + "loss": 0.0399, + "reward": 0.7130648781159834, + "reward_std": 0.4935263180799666, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1994351311448554, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.5375000078231096, + "step": 3260 + }, + { + "completion_length": 1024.0, + "epoch": 0.7211385817423834, + "grad_norm": 6.852820579545043, + "kl": 1.585101318359375, + "learning_rate": 4.373172486990436e-06, + "loss": 0.0634, + "reward": 0.5891364488064028, + "reward_std": 0.5846462953391892, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2025302262120931, + "rewards/format_reward": 0.28125, + "rewards/reasoning_steps_reward": 0.5104166753590107, + "step": 3265 + }, + { + "completion_length": 1024.0, + "epoch": 0.7222429287282064, + "grad_norm": 7.2550760579621345, + "kl": 1.471478271484375, + "learning_rate": 4.341340684262498e-06, + "loss": 0.0589, + "reward": 0.9112151099252515, + "reward_std": 0.5459779361841356, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15128489717026242, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.5812500080093741, + "step": 3270 + }, + { + "completion_length": 1024.0, + "epoch": 0.7233472757140293, + "grad_norm": 1.943678687735993, + "kl": 1.039892578125, + "learning_rate": 4.309593003752446e-06, + "loss": 0.0416, + "reward": 1.2046927298419177, + "reward_std": 0.5959788782405667, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16405727104865947, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.7312500104308128, + "step": 3275 + }, + { + "completion_length": 1024.0, + "epoch": 0.7244516226998523, + "grad_norm": 3.426760417270529, + "kl": 1.6177978515625, + "learning_rate": 4.277929917424602e-06, + "loss": 0.0647, + "reward": 1.2421426644548774, + "reward_std": 0.5169150336907478, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17869066282291896, + "rewards/format_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.7333333460614085, + "step": 3280 + }, + { + "completion_length": 1024.0, + "epoch": 0.7255559696856753, + "grad_norm": 3.12832092985529, + "kl": 1.526617431640625, + "learning_rate": 4.246351895985702e-06, + "loss": 0.0611, + "reward": 1.3539482331834733, + "reward_std": 0.5149914252076997, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16480177526245826, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.8062500070780516, + "step": 3285 + }, + { + "completion_length": 1024.0, + "epoch": 0.7266603166714982, + "grad_norm": 2.271368859704475, + "kl": 1.709259033203125, + "learning_rate": 4.214859408877899e-06, + "loss": 0.0683, + "reward": 1.2573978632688523, + "reward_std": 0.6140790973193362, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16968547967262565, + "rewards/format_reward": 0.66875, + "rewards/reasoning_steps_reward": 0.745833345875144, + "step": 3290 + }, + { + "completion_length": 1024.0, + "epoch": 0.7277646636573212, + "grad_norm": 1.525812788718264, + "kl": 1.299114990234375, + "learning_rate": 4.183452924271776e-06, + "loss": 0.052, + "reward": 1.341607284604106, + "reward_std": 0.5649268690554891, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15422605765979824, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.7770833449438215, + "step": 3295 + }, + { + "completion_length": 1024.0, + "epoch": 0.728869010643144, + "grad_norm": 9.383515179336834, + "kl": 2.305963134765625, + "learning_rate": 4.152132909059402e-06, + "loss": 0.0923, + "reward": 1.2610476991161703, + "reward_std": 0.5714556918683229, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.18270230072084814, + "rewards/format_reward": 0.7, + "rewards/reasoning_steps_reward": 0.7375000089406967, + "step": 3300 + }, + { + "epoch": 0.728869010643144, + "eval_completion_length": 1024.0, + "eval_kl": 1.10443359375, + "eval_loss": 0.044205810874700546, + "eval_reward": 1.5034566915035248, + "eval_reward_std": 0.48886050406843423, + "eval_rewards/accuracy_reward": 0.01, + "eval_rewards/cosine_scaled_reward": -0.15654332179576158, + "eval_rewards/format_reward": 0.805, + "eval_rewards/reasoning_steps_reward": 0.8450000095367431, + "eval_runtime": 203.426, + "eval_samples_per_second": 0.487, + "eval_steps_per_second": 0.123, + "step": 3300 + }, + { + "completion_length": 1024.0, + "epoch": 0.729973357628967, + "grad_norm": 2.618356452427954, + "kl": 1.448486328125, + "learning_rate": 4.120899828847385e-06, + "loss": 0.058, + "reward": 1.4807198433205486, + "reward_std": 0.4224258393329364, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1880301550409058, + "rewards/format_reward": 0.80625, + "rewards/reasoning_steps_reward": 0.8625000074505806, + "step": 3305 + }, + { + "completion_length": 1024.0, + "epoch": 0.7310777046147899, + "grad_norm": 16.51623283552961, + "kl": 2.0116455078125, + "learning_rate": 4.089754147949935e-06, + "loss": 0.0806, + "reward": 1.6333741225302219, + "reward_std": 0.3553940106343362, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1457925467126188, + "rewards/format_reward": 0.86875, + "rewards/reasoning_steps_reward": 0.8979166746139526, + "step": 3310 + }, + { + "completion_length": 1024.0, + "epoch": 0.7321820516006129, + "grad_norm": 0.5974081751959402, + "kl": 0.870172119140625, + "learning_rate": 4.058696329381987e-06, + "loss": 0.0348, + "reward": 1.6305226560682058, + "reward_std": 0.34538418338706833, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.15281066768729942, + "rewards/format_reward": 0.8625, + "rewards/reasoning_steps_reward": 0.902083340473473, + "step": 3315 + }, + { + "completion_length": 1024.0, + "epoch": 0.7332863985864359, + "grad_norm": 2.29446866118963, + "kl": 1.221923828125, + "learning_rate": 4.027726834852303e-06, + "loss": 0.0489, + "reward": 1.5875793149694801, + "reward_std": 0.477928023977438, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14158735829405486, + "rewards/format_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9041666734963656, + "step": 3320 + }, + { + "completion_length": 1024.0, + "epoch": 0.7343907455722588, + "grad_norm": 2.8765044679308747, + "kl": 1.35091552734375, + "learning_rate": 3.996846124756609e-06, + "loss": 0.0541, + "reward": 1.4872914606705308, + "reward_std": 0.411738165695715, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.16687520117702662, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.8604166748002171, + "step": 3325 + }, + { + "completion_length": 1024.0, + "epoch": 0.7354950925580818, + "grad_norm": 1.4728439317575581, + "kl": 2.201837158203125, + "learning_rate": 3.966054658170754e-06, + "loss": 0.0881, + "reward": 1.3737801656650845, + "reward_std": 0.5207441252474382, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17830317884217947, + "rewards/format_reward": 0.7125, + "rewards/reasoning_steps_reward": 0.8395833428949118, + "step": 3330 + }, + { + "completion_length": 1024.0, + "epoch": 0.7365994395439047, + "grad_norm": 0.6119173500048468, + "kl": 0.925311279296875, + "learning_rate": 3.93535289284388e-06, + "loss": 0.037, + "reward": 1.5270693870261312, + "reward_std": 0.49105042346992606, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1458472884342882, + "rewards/format_reward": 0.75625, + "rewards/reasoning_steps_reward": 0.9041666742414236, + "step": 3335 + }, + { + "completion_length": 1024.0, + "epoch": 0.7377037865297277, + "grad_norm": 0.5654163389180057, + "kl": 0.505694580078125, + "learning_rate": 3.904741285191629e-06, + "loss": 0.0202, + "reward": 1.5745669988915325, + "reward_std": 0.4111982766771689, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14834966310299932, + "rewards/format_reward": 0.825, + "rewards/reasoning_steps_reward": 0.8979166766628623, + "step": 3340 + }, + { + "completion_length": 1024.0, + "epoch": 0.7388081335155506, + "grad_norm": 2.6402255771621412, + "kl": 1.411602783203125, + "learning_rate": 3.874220290289337e-06, + "loss": 0.0565, + "reward": 1.5241700040176511, + "reward_std": 0.5233304193599906, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13833000464364886, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.8562500102445483, + "step": 3345 + }, + { + "completion_length": 1024.0, + "epoch": 0.7399124805013735, + "grad_norm": 0.6883781397562108, + "kl": 0.793212890625, + "learning_rate": 3.8437903618652895e-06, + "loss": 0.0318, + "reward": 1.5530781971290708, + "reward_std": 0.3981657780946989, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10733847725496162, + "rewards/format_reward": 0.79375, + "rewards/reasoning_steps_reward": 0.860416678711772, + "step": 3350 + }, + { + "completion_length": 1024.0, + "epoch": 0.7410168274871964, + "grad_norm": 27.700580105645894, + "kl": 0.977423095703125, + "learning_rate": 3.8134519522939693e-06, + "loss": 0.0391, + "reward": 1.5121993293985725, + "reward_std": 0.4588557916787977, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14196734559372998, + "rewards/format_reward": 0.7875, + "rewards/reasoning_steps_reward": 0.8541666759178043, + "step": 3355 + }, + { + "completion_length": 1024.0, + "epoch": 0.7421211744730194, + "grad_norm": 2.2978020557582335, + "kl": 1.358447265625, + "learning_rate": 3.7832055125893318e-06, + "loss": 0.0544, + "reward": 1.4460453005507587, + "reward_std": 0.6020016885802761, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1414546983760374, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.8187500098720193, + "step": 3360 + }, + { + "completion_length": 1024.0, + "epoch": 0.7432255214588424, + "grad_norm": 0.4062793871357252, + "kl": 0.942327880859375, + "learning_rate": 3.753051492398089e-06, + "loss": 0.0377, + "reward": 1.6512468622997403, + "reward_std": 0.397758130193688, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.10083647546125576, + "rewards/format_reward": 0.8375, + "rewards/reasoning_steps_reward": 0.889583339355886, + "step": 3365 + }, + { + "completion_length": 1024.0, + "epoch": 0.7443298684446653, + "grad_norm": 1.0151769195066895, + "kl": 1.735552978515625, + "learning_rate": 3.7229903399930423e-06, + "loss": 0.0694, + "reward": 1.4343198793707415, + "reward_std": 0.4978488245993503, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14901345187099652, + "rewards/format_reward": 0.7375, + "rewards/reasoning_steps_reward": 0.8333333402872085, + "step": 3370 + }, + { + "completion_length": 1021.4625, + "epoch": 0.7454342154304883, + "grad_norm": 2.398157240549169, + "kl": 1.697705078125, + "learning_rate": 3.6930225022664136e-06, + "loss": 0.0679, + "reward": 1.379991829302162, + "reward_std": 0.5659466957542463, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16167483936878851, + "rewards/format_reward": 0.70625, + "rewards/reasoning_steps_reward": 0.8229166742414236, + "step": 3375 + }, + { + "completion_length": 1024.0, + "epoch": 0.7465385624163112, + "grad_norm": 2.8096952907954975, + "kl": 1.368798828125, + "learning_rate": 3.6631484247231896e-06, + "loss": 0.0547, + "reward": 1.3315429392270743, + "reward_std": 0.6065680258263455, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.16220706112799235, + "rewards/format_reward": 0.69375, + "rewards/reasoning_steps_reward": 0.7937500094994903, + "step": 3380 + }, + { + "completion_length": 1024.0, + "epoch": 0.7476429094021342, + "grad_norm": 10.578165514570484, + "kl": 1.167584228515625, + "learning_rate": 3.6333685514745165e-06, + "loss": 0.0467, + "reward": 1.3127060623912257, + "reward_std": 0.5424171181814017, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1456272709775476, + "rewards/format_reward": 0.65625, + "rewards/reasoning_steps_reward": 0.8020833404734731, + "step": 3385 + }, + { + "completion_length": 1024.0, + "epoch": 0.7487472563879571, + "grad_norm": 0.6269448458862071, + "kl": 1.054925537109375, + "learning_rate": 3.6036833252310887e-06, + "loss": 0.0422, + "reward": 1.5182505875825882, + "reward_std": 0.43510723081126346, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12758274595544208, + "rewards/format_reward": 0.7625, + "rewards/reasoning_steps_reward": 0.8833333406597376, + "step": 3390 + }, + { + "completion_length": 1024.0, + "epoch": 0.7498516033737801, + "grad_norm": 3.0610939003327315, + "kl": 1.247955322265625, + "learning_rate": 3.574093187296568e-06, + "loss": 0.0499, + "reward": 1.3740362918004394, + "reward_std": 0.5503279601005489, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12804705323942472, + "rewards/format_reward": 0.625, + "rewards/reasoning_steps_reward": 0.8583333391696215, + "step": 3395 + }, + { + "completion_length": 1024.0, + "epoch": 0.7509559503596029, + "grad_norm": 1.4397719595687892, + "kl": 0.642218017578125, + "learning_rate": 3.544598577561016e-06, + "loss": 0.0257, + "reward": 1.5150041854009033, + "reward_std": 0.5328479968578904, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.09332914527039975, + "rewards/format_reward": 0.675, + "rewards/reasoning_steps_reward": 0.9145833414047957, + "step": 3400 + }, + { + "epoch": 0.7509559503596029, + "eval_completion_length": 1024.0, + "eval_kl": 1.03607421875, + "eval_loss": 0.041635580360889435, + "eval_reward": 1.522298811674118, + "eval_reward_std": 0.45481792830396445, + "eval_rewards/accuracy_reward": 0.025, + "eval_rewards/cosine_scaled_reward": -0.0977012000605464, + "eval_rewards/format_reward": 0.73, + "eval_rewards/reasoning_steps_reward": 0.8650000083446503, + "eval_runtime": 205.7972, + "eval_samples_per_second": 0.481, + "eval_steps_per_second": 0.121, + "step": 3400 + }, + { + "completion_length": 1020.38125, + "epoch": 0.7520602973454259, + "grad_norm": 1.4135755886673778, + "kl": 0.83087158203125, + "learning_rate": 3.515199934494373e-06, + "loss": 0.0332, + "reward": 1.5240996377862757, + "reward_std": 0.5872578708353103, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.07590036567416973, + "rewards/format_reward": 0.71875, + "rewards/reasoning_steps_reward": 0.8500000078231096, + "step": 3405 + }, + { + "completion_length": 1024.0, + "epoch": 0.7531646443312489, + "grad_norm": 1.440038114167272, + "kl": 1.372210693359375, + "learning_rate": 3.4858976951399237e-06, + "loss": 0.0549, + "reward": 1.447450523695443, + "reward_std": 0.5360916848971101, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.11921614635066362, + "rewards/format_reward": 0.725, + "rewards/reasoning_steps_reward": 0.8291666748002171, + "step": 3410 + }, + { + "completion_length": 1023.81875, + "epoch": 0.7542689913170718, + "grad_norm": 6.136632317031085, + "kl": 1.036328125, + "learning_rate": 3.4566922951078086e-06, + "loss": 0.0415, + "reward": 1.3465895116969477, + "reward_std": 0.6256579857220459, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13049382427416276, + "rewards/format_reward": 0.65625, + "rewards/reasoning_steps_reward": 0.8083333387970925, + "step": 3415 + }, + { + "completion_length": 1024.0, + "epoch": 0.7553733383028948, + "grad_norm": 1.6173327075684585, + "kl": 1.124951171875, + "learning_rate": 3.427584168568535e-06, + "loss": 0.045, + "reward": 1.369578105956316, + "reward_std": 0.5765885033455561, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12625523412593792, + "rewards/format_reward": 0.65, + "rewards/reasoning_steps_reward": 0.8270833423361182, + "step": 3420 + }, + { + "completion_length": 1024.0, + "epoch": 0.7564776852887177, + "grad_norm": 3.4475234051324466, + "kl": 1.0495849609375, + "learning_rate": 3.398573748246544e-06, + "loss": 0.042, + "reward": 1.4252166913822293, + "reward_std": 0.4993854798289249, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.11644997615949251, + "rewards/format_reward": 0.65625, + "rewards/reasoning_steps_reward": 0.8791666707023978, + "step": 3425 + }, + { + "completion_length": 1024.0, + "epoch": 0.7575820322745407, + "grad_norm": 3.1336201942373756, + "kl": 1.468182373046875, + "learning_rate": 3.3696614654137637e-06, + "loss": 0.0587, + "reward": 1.265179492533207, + "reward_std": 0.6175256297712621, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12440384250803618, + "rewards/format_reward": 0.525, + "rewards/reasoning_steps_reward": 0.8583333404734731, + "step": 3430 + }, + { + "completion_length": 1024.0, + "epoch": 0.7586863792603636, + "grad_norm": 31.463751192253017, + "kl": 1.829046630859375, + "learning_rate": 3.3408477498831917e-06, + "loss": 0.0732, + "reward": 1.123897957149893, + "reward_std": 0.4925203584745759, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1281853732885793, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.8020833419635892, + "step": 3435 + }, + { + "completion_length": 1024.0, + "epoch": 0.7597907262461866, + "grad_norm": 2.3970625271113373, + "kl": 1.119512939453125, + "learning_rate": 3.3121330300025222e-06, + "loss": 0.0448, + "reward": 1.135027837054804, + "reward_std": 0.5514261644682847, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1274721680150833, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.8312500065192581, + "step": 3440 + }, + { + "completion_length": 1023.64375, + "epoch": 0.7608950732320094, + "grad_norm": 1.6382197678473562, + "kl": 1.080792236328125, + "learning_rate": 3.2835177326477675e-06, + "loss": 0.0432, + "reward": 1.072135358909145, + "reward_std": 0.5708221859607875, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.10911463984884903, + "rewards/format_reward": 0.40625, + "rewards/reasoning_steps_reward": 0.7687500072643161, + "step": 3445 + }, + { + "completion_length": 1024.0, + "epoch": 0.7619994202178324, + "grad_norm": 7.773948160603541, + "kl": 1.0737548828125, + "learning_rate": 3.2550022832169125e-06, + "loss": 0.043, + "reward": 1.1452186492097098, + "reward_std": 0.531599986756919, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.10269802667899057, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.8041666718199849, + "step": 3450 + }, + { + "completion_length": 1024.0, + "epoch": 0.7631037672036554, + "grad_norm": 7.451659626325706, + "kl": 1.38333740234375, + "learning_rate": 3.2265871056235974e-06, + "loss": 0.0553, + "reward": 1.0934795890934765, + "reward_std": 0.5327975626219995, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.11485373933683149, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.7895833387970924, + "step": 3455 + }, + { + "completion_length": 1024.0, + "epoch": 0.7642081141894783, + "grad_norm": 4.642606409507648, + "kl": 1.634930419921875, + "learning_rate": 3.1982726222908046e-06, + "loss": 0.0655, + "reward": 1.1026644762256184, + "reward_std": 0.4953272847373228, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.09733552844481892, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.812500007264316, + "step": 3460 + }, + { + "completion_length": 1024.0, + "epoch": 0.7653124611753013, + "grad_norm": 3.6705444957957836, + "kl": 0.82242431640625, + "learning_rate": 3.170059254144593e-06, + "loss": 0.0329, + "reward": 1.031765272654593, + "reward_std": 0.5869754700732301, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.11823472948744893, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7562500076368451, + "step": 3465 + }, + { + "completion_length": 1024.0, + "epoch": 0.7664168081611242, + "grad_norm": 10.804721272461153, + "kl": 1.247711181640625, + "learning_rate": 3.1419474206078203e-06, + "loss": 0.0499, + "reward": 0.9572328898822888, + "reward_std": 0.5374889099814026, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14068377380608582, + "rewards/format_reward": 0.35, + "rewards/reasoning_steps_reward": 0.7416666751727462, + "step": 3470 + }, + { + "completion_length": 1024.0, + "epoch": 0.7675211551469472, + "grad_norm": 12.231601833459766, + "kl": 1.7123046875, + "learning_rate": 3.113937539593931e-06, + "loss": 0.0685, + "reward": 0.9976887149776303, + "reward_std": 0.4858238591014924, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1293946221430815, + "rewards/format_reward": 0.40625, + "rewards/reasoning_steps_reward": 0.7208333428949117, + "step": 3475 + }, + { + "completion_length": 1024.0, + "epoch": 0.7686255021327701, + "grad_norm": 40.059144629731655, + "kl": 2.3020477294921875, + "learning_rate": 3.086030027500728e-06, + "loss": 0.0921, + "reward": 0.886852508764423, + "reward_std": 0.5351808949055339, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14648083351930835, + "rewards/format_reward": 0.33125, + "rewards/reasoning_steps_reward": 0.6958333432674408, + "step": 3480 + }, + { + "completion_length": 1024.0, + "epoch": 0.7697298491185931, + "grad_norm": 6.123342241192849, + "kl": 1.230517578125, + "learning_rate": 3.058225299204195e-06, + "loss": 0.0492, + "reward": 0.7911023863998707, + "reward_std": 0.46425032978731906, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13181428615062032, + "rewards/format_reward": 0.275, + "rewards/reasoning_steps_reward": 0.6479166723787785, + "step": 3485 + }, + { + "completion_length": 1024.0, + "epoch": 0.7708341961044161, + "grad_norm": 7.471941635569114, + "kl": 2.369635009765625, + "learning_rate": 3.0305237680523046e-06, + "loss": 0.0947, + "reward": 0.9569121636566706, + "reward_std": 0.5213623499432287, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14517117467548815, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.7083333432674408, + "step": 3490 + }, + { + "completion_length": 1024.0, + "epoch": 0.7719385430902389, + "grad_norm": 5.404185333488702, + "kl": 1.592156982421875, + "learning_rate": 3.002925845858905e-06, + "loss": 0.0637, + "reward": 0.9760491037741303, + "reward_std": 0.5136939262922169, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14686756571754814, + "rewards/format_reward": 0.39375, + "rewards/reasoning_steps_reward": 0.7229166744276881, + "step": 3495 + }, + { + "completion_length": 1019.0, + "epoch": 0.7730428900760619, + "grad_norm": 50.5244563754164, + "kl": 2.205517578125, + "learning_rate": 2.9754319428975796e-06, + "loss": 0.0883, + "reward": 0.7373021919673193, + "reward_std": 0.4538879245365024, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14186448440450478, + "rewards/format_reward": 0.2875, + "rewards/reasoning_steps_reward": 0.5854166746139526, + "step": 3500 + }, + { + "epoch": 0.7730428900760619, + "eval_completion_length": 1023.54, + "eval_kl": 1.16181640625, + "eval_loss": 0.04671285301446915, + "eval_reward": 0.856374105066061, + "eval_reward_std": 0.5129354545962996, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.15029256213456393, + "eval_rewards/format_reward": 0.3, + "eval_rewards/reasoning_steps_reward": 0.7066666767001152, + "eval_runtime": 204.9165, + "eval_samples_per_second": 0.483, + "eval_steps_per_second": 0.122, + "step": 3500 + }, + { + "completion_length": 1024.0, + "epoch": 0.7741472370618848, + "grad_norm": 7.70175417588847, + "kl": 1.191650390625, + "learning_rate": 2.948042467895544e-06, + "loss": 0.0477, + "reward": 0.7259443550603464, + "reward_std": 0.4864570820616791, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1490556534990901, + "rewards/format_reward": 0.25625, + "rewards/reasoning_steps_reward": 0.6187500074505806, + "step": 3505 + }, + { + "completion_length": 1024.0, + "epoch": 0.7752515840477078, + "grad_norm": 9.742286641987212, + "kl": 1.1121826171875, + "learning_rate": 2.920757828027586e-06, + "loss": 0.0445, + "reward": 0.8020869728876278, + "reward_std": 0.49714295062603925, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1333296972559765, + "rewards/format_reward": 0.25, + "rewards/reasoning_steps_reward": 0.6729166770353914, + "step": 3510 + }, + { + "completion_length": 1024.0, + "epoch": 0.7763559310335307, + "grad_norm": 9.596508493581142, + "kl": 1.29034423828125, + "learning_rate": 2.893578428909998e-06, + "loss": 0.0516, + "reward": 0.7311014153528959, + "reward_std": 0.48636515507059813, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1293152589641977, + "rewards/format_reward": 0.26875, + "rewards/reasoning_steps_reward": 0.5854166768491268, + "step": 3515 + }, + { + "completion_length": 1024.0, + "epoch": 0.7774602780193537, + "grad_norm": 14.081251027714208, + "kl": 1.638250732421875, + "learning_rate": 2.8665046745945555e-06, + "loss": 0.0655, + "reward": 0.8628839520883048, + "reward_std": 0.5482322660624049, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14336604780110065, + "rewards/format_reward": 0.34375, + "rewards/reasoning_steps_reward": 0.6625000078231096, + "step": 3520 + }, + { + "completion_length": 1024.0, + "epoch": 0.7785646250051766, + "grad_norm": 45.591594718848036, + "kl": 1.7972900390625, + "learning_rate": 2.839536967562504e-06, + "loss": 0.0718, + "reward": 0.7295638629118912, + "reward_std": 0.45584265950456027, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15585280851228162, + "rewards/format_reward": 0.25, + "rewards/reasoning_steps_reward": 0.6291666755452752, + "step": 3525 + }, + { + "completion_length": 1024.0, + "epoch": 0.7796689719909996, + "grad_norm": 2.437142557750893, + "kl": 1.276025390625, + "learning_rate": 2.8126757087185797e-06, + "loss": 0.051, + "reward": 0.8280719975009561, + "reward_std": 0.5058557034655677, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14276133371167815, + "rewards/format_reward": 0.275, + "rewards/reasoning_steps_reward": 0.6895833410322666, + "step": 3530 + }, + { + "completion_length": 1024.0, + "epoch": 0.7807733189768226, + "grad_norm": 12.311576928951354, + "kl": 2.1121337890625, + "learning_rate": 2.7859212973850535e-06, + "loss": 0.0846, + "reward": 0.8593616144207772, + "reward_std": 0.42143602522992296, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15522172061318998, + "rewards/format_reward": 0.34375, + "rewards/reasoning_steps_reward": 0.670833339355886, + "step": 3535 + }, + { + "completion_length": 1024.0, + "epoch": 0.7818776659626455, + "grad_norm": 11.646125532907798, + "kl": 1.4277587890625, + "learning_rate": 2.759274131295787e-06, + "loss": 0.0571, + "reward": 0.8780025206928258, + "reward_std": 0.42374305160010406, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12408081038593081, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.620833339355886, + "step": 3540 + }, + { + "completion_length": 1024.0, + "epoch": 0.7829820129484684, + "grad_norm": 39.017400560487744, + "kl": 1.5975341796875, + "learning_rate": 2.732734606590318e-06, + "loss": 0.0639, + "reward": 0.5735942371771671, + "reward_std": 0.37694009622591695, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14932243004150222, + "rewards/format_reward": 0.19375, + "rewards/reasoning_steps_reward": 0.5229166757315398, + "step": 3545 + }, + { + "completion_length": 1024.0, + "epoch": 0.7840863599342913, + "grad_norm": 2.6124122163287264, + "kl": 1.3592529296875, + "learning_rate": 2.7063031178079847e-06, + "loss": 0.0544, + "reward": 0.6471456294239033, + "reward_std": 0.39274544618529034, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15493770116772793, + "rewards/format_reward": 0.225, + "rewards/reasoning_steps_reward": 0.5645833417773247, + "step": 3550 + }, + { + "completion_length": 1024.0, + "epoch": 0.7851907069201143, + "grad_norm": 6.2513767416820345, + "kl": 1.19891357421875, + "learning_rate": 2.679980057882049e-06, + "loss": 0.048, + "reward": 0.7008785362413619, + "reward_std": 0.45641955096180026, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11995480215264251, + "rewards/format_reward": 0.25625, + "rewards/reasoning_steps_reward": 0.5458333419635892, + "step": 3555 + }, + { + "completion_length": 1024.0, + "epoch": 0.7862950539059372, + "grad_norm": 2.9178799008139684, + "kl": 0.7404937744140625, + "learning_rate": 2.6537658181338534e-06, + "loss": 0.0296, + "reward": 0.7164174870704301, + "reward_std": 0.44081706466859033, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.11691585160442627, + "rewards/format_reward": 0.25, + "rewards/reasoning_steps_reward": 0.570833340100944, + "step": 3560 + }, + { + "completion_length": 1024.0, + "epoch": 0.7873994008917602, + "grad_norm": 4.960918513864824, + "kl": 0.7436981201171875, + "learning_rate": 2.6276607882670135e-06, + "loss": 0.0297, + "reward": 0.7412137555482332, + "reward_std": 0.46463715256686555, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14003624991455582, + "rewards/format_reward": 0.28125, + "rewards/reasoning_steps_reward": 0.6000000102445483, + "step": 3565 + }, + { + "completion_length": 1024.0, + "epoch": 0.7885037478775831, + "grad_norm": 10.639247963083111, + "kl": 0.886761474609375, + "learning_rate": 2.60166535636162e-06, + "loss": 0.0355, + "reward": 0.8172663133533206, + "reward_std": 0.45502894536784877, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14523369194939734, + "rewards/format_reward": 0.275, + "rewards/reasoning_steps_reward": 0.6750000080093741, + "step": 3570 + }, + { + "completion_length": 1024.0, + "epoch": 0.7896080948634061, + "grad_norm": 28.08455167991598, + "kl": 1.941729736328125, + "learning_rate": 2.5757799088684654e-06, + "loss": 0.0777, + "reward": 0.843649728321634, + "reward_std": 0.5577885125540888, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.16468361105535223, + "rewards/format_reward": 0.3, + "rewards/reasoning_steps_reward": 0.7020833423361182, + "step": 3575 + }, + { + "completion_length": 1024.0, + "epoch": 0.7907124418492291, + "grad_norm": 3.970425059446987, + "kl": 1.81851806640625, + "learning_rate": 2.5500048306033065e-06, + "loss": 0.0727, + "reward": 0.9449512905091979, + "reward_std": 0.5703302607609657, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15088205109277625, + "rewards/format_reward": 0.3375, + "rewards/reasoning_steps_reward": 0.7458333436399698, + "step": 3580 + }, + { + "completion_length": 1024.0, + "epoch": 0.791816788835052, + "grad_norm": 2.682816536500783, + "kl": 1.6467498779296874, + "learning_rate": 2.5243405047411353e-06, + "loss": 0.0659, + "reward": 0.9549707896774635, + "reward_std": 0.5791340425539602, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.17836254732101225, + "rewards/format_reward": 0.34375, + "rewards/reasoning_steps_reward": 0.7833333445712924, + "step": 3585 + }, + { + "completion_length": 1024.0, + "epoch": 0.7929211358208749, + "grad_norm": 1.9396200301781, + "kl": 1.76396484375, + "learning_rate": 2.498787312810492e-06, + "loss": 0.0706, + "reward": 1.0381604361347854, + "reward_std": 0.7050543383204058, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1305895757221151, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.7875000135973096, + "step": 3590 + }, + { + "completion_length": 1024.0, + "epoch": 0.7940254828066978, + "grad_norm": 1.2512973833054954, + "kl": 1.017486572265625, + "learning_rate": 2.4733456346877817e-06, + "loss": 0.0407, + "reward": 0.9714192368090153, + "reward_std": 0.5311943096166942, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.16399744372538408, + "rewards/format_reward": 0.33125, + "rewards/reasoning_steps_reward": 0.7979166781529784, + "step": 3595 + }, + { + "completion_length": 1024.0, + "epoch": 0.7951298297925208, + "grad_norm": 4.205148920710968, + "kl": 2.4321533203125, + "learning_rate": 2.448015848591638e-06, + "loss": 0.0973, + "reward": 0.9355880039744079, + "reward_std": 0.5558298278599978, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1706620103039313, + "rewards/format_reward": 0.35, + "rewards/reasoning_steps_reward": 0.7562500150874257, + "step": 3600 + }, + { + "epoch": 0.7951298297925208, + "eval_completion_length": 1024.0, + "eval_kl": 3.12923828125, + "eval_loss": 0.1257171779870987, + "eval_reward": 1.0359005191922188, + "eval_reward_std": 0.5016864457726479, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.18409949489636346, + "eval_rewards/format_reward": 0.425, + "eval_rewards/reasoning_steps_reward": 0.7950000166893005, + "eval_runtime": 201.9601, + "eval_samples_per_second": 0.49, + "eval_steps_per_second": 0.124, + "step": 3600 + }, + { + "completion_length": 1024.0, + "epoch": 0.7962341767783437, + "grad_norm": 2.9772870979661126, + "kl": 2.625927734375, + "learning_rate": 2.4227983310772963e-06, + "loss": 0.1052, + "reward": 0.941523305606097, + "reward_std": 0.5259468010030105, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.19806003746198259, + "rewards/format_reward": 0.3625, + "rewards/reasoning_steps_reward": 0.7770833464339375, + "step": 3605 + }, + { + "completion_length": 1024.0, + "epoch": 0.7973385237641667, + "grad_norm": 2.731744844543052, + "kl": 2.591387939453125, + "learning_rate": 2.3976934570309974e-06, + "loss": 0.1037, + "reward": 0.896416311757639, + "reward_std": 0.5306536434363807, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.2119170312071219, + "rewards/format_reward": 0.3625, + "rewards/reasoning_steps_reward": 0.7395833466202021, + "step": 3610 + }, + { + "completion_length": 1024.0, + "epoch": 0.7984428707499897, + "grad_norm": 7.5385515356990185, + "kl": 1.553973388671875, + "learning_rate": 2.3727015996644043e-06, + "loss": 0.0622, + "reward": 0.8575988472090103, + "reward_std": 0.5232200874595037, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16323449706978863, + "rewards/format_reward": 0.34375, + "rewards/reasoning_steps_reward": 0.6770833473652601, + "step": 3615 + }, + { + "completion_length": 1024.0, + "epoch": 0.7995472177358126, + "grad_norm": 9.021182502422391, + "kl": 1.311651611328125, + "learning_rate": 2.3478231305090694e-06, + "loss": 0.0524, + "reward": 0.9543035954702646, + "reward_std": 0.5737970527330617, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15819641145644708, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.7250000117346644, + "step": 3620 + }, + { + "completion_length": 1024.0, + "epoch": 0.8006515647216356, + "grad_norm": 4.472787988182606, + "kl": 1.0048095703125, + "learning_rate": 2.3230584194109074e-06, + "loss": 0.0402, + "reward": 0.9397510398179293, + "reward_std": 0.5379055161934957, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.17691563499101903, + "rewards/format_reward": 0.3625, + "rewards/reasoning_steps_reward": 0.7479166820645332, + "step": 3625 + }, + { + "completion_length": 1024.0, + "epoch": 0.8017559117074585, + "grad_norm": 4.113018900515695, + "kl": 1.157135009765625, + "learning_rate": 2.298407834524682e-06, + "loss": 0.0463, + "reward": 0.9861817553406581, + "reward_std": 0.4714244429342216, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.16590158142935252, + "rewards/format_reward": 0.4, + "rewards/reasoning_steps_reward": 0.7395833438262344, + "step": 3630 + }, + { + "completion_length": 1024.0, + "epoch": 0.8028602586932815, + "grad_norm": 5.611788601802546, + "kl": 1.5865966796875, + "learning_rate": 2.2738717423085543e-06, + "loss": 0.0635, + "reward": 1.0154616593383252, + "reward_std": 0.5189789967440447, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13870502139761812, + "rewards/format_reward": 0.425, + "rewards/reasoning_steps_reward": 0.729166678339243, + "step": 3635 + }, + { + "completion_length": 1024.0, + "epoch": 0.8039646056791043, + "grad_norm": 4.863709299234087, + "kl": 1.793304443359375, + "learning_rate": 2.2494505075186234e-06, + "loss": 0.0718, + "reward": 1.0299670369713567, + "reward_std": 0.6288951909449679, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1679496320008184, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.7541666803881526, + "step": 3640 + }, + { + "completion_length": 1024.0, + "epoch": 0.8050689526649273, + "grad_norm": 3.0389529591387983, + "kl": 2.23280029296875, + "learning_rate": 2.2251444932035094e-06, + "loss": 0.0893, + "reward": 0.9538824698538519, + "reward_std": 0.5754079432575964, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.18570087201660498, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.7645833479240537, + "step": 3645 + }, + { + "completion_length": 1024.0, + "epoch": 0.8061732996507502, + "grad_norm": 4.261038530209944, + "kl": 2.365380859375, + "learning_rate": 2.200954060698941e-06, + "loss": 0.0947, + "reward": 1.0910434238612652, + "reward_std": 0.5977108788116311, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.18395657959727033, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7937500169500709, + "step": 3650 + }, + { + "completion_length": 1024.0, + "epoch": 0.8072776466365732, + "grad_norm": 11.856530569543205, + "kl": 1.533966064453125, + "learning_rate": 2.176879569622409e-06, + "loss": 0.0613, + "reward": 0.9856115130707621, + "reward_std": 0.5456289411027683, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22063849223195575, + "rewards/format_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.7687500143423677, + "step": 3655 + }, + { + "completion_length": 1024.0, + "epoch": 0.8083819936223962, + "grad_norm": 3.2452205346158083, + "kl": 1.236077880859375, + "learning_rate": 2.1529213778677993e-06, + "loss": 0.0494, + "reward": 0.8988889244385063, + "reward_std": 0.6373270594252972, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.194861083329306, + "rewards/format_reward": 0.33125, + "rewards/reasoning_steps_reward": 0.7312500124797225, + "step": 3660 + }, + { + "completion_length": 1024.0, + "epoch": 0.8094863406082191, + "grad_norm": 12.439028568659024, + "kl": 0.76240234375, + "learning_rate": 2.1290798416000857e-06, + "loss": 0.0305, + "reward": 1.0351363457739353, + "reward_std": 0.5226037830223504, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.18361366387798625, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.793750013038516, + "step": 3665 + }, + { + "completion_length": 1024.0, + "epoch": 0.8105906875940421, + "grad_norm": 0.6611045827209621, + "kl": 0.63751220703125, + "learning_rate": 2.1053553152500204e-06, + "loss": 0.0255, + "reward": 1.0747706493362785, + "reward_std": 0.5134325242426712, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.16481269130890724, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.839583345875144, + "step": 3670 + }, + { + "completion_length": 1024.0, + "epoch": 0.811695034579865, + "grad_norm": 0.943299396325815, + "kl": 0.6408935546875, + "learning_rate": 2.081748151508883e-06, + "loss": 0.0256, + "reward": 0.9957553435117006, + "reward_std": 0.49763372070156037, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.18341132483328693, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.8041666820645332, + "step": 3675 + }, + { + "completion_length": 1024.0, + "epoch": 0.812799381565688, + "grad_norm": 0.6683933277746549, + "kl": 0.687762451171875, + "learning_rate": 2.0582587013232268e-06, + "loss": 0.0275, + "reward": 1.0955076265148818, + "reward_std": 0.5081618877709844, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.18157571223564445, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.8458333477377892, + "step": 3680 + }, + { + "completion_length": 1024.0, + "epoch": 0.8139037285515108, + "grad_norm": 1.0847188860596124, + "kl": 0.6183135986328125, + "learning_rate": 2.0348873138896563e-06, + "loss": 0.0247, + "reward": 1.1622585462406279, + "reward_std": 0.5471403273332044, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1877414623158984, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.8312500134110451, + "step": 3685 + }, + { + "completion_length": 1024.0, + "epoch": 0.8150080755373338, + "grad_norm": 1.5112078670527096, + "kl": 0.8150054931640625, + "learning_rate": 2.0116343366496493e-06, + "loss": 0.0326, + "reward": 1.1093786764889955, + "reward_std": 0.48941911092260854, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.19687132386607117, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.8562500154599547, + "step": 3690 + }, + { + "completion_length": 1024.0, + "epoch": 0.8161124225231567, + "grad_norm": 11.427469945106585, + "kl": 1.845751953125, + "learning_rate": 1.988500115284385e-06, + "loss": 0.0738, + "reward": 1.3140614761039615, + "reward_std": 0.5058772598677024, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.17135519032599406, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.9166666734963655, + "step": 3695 + }, + { + "completion_length": 1024.0, + "epoch": 0.8172167695089797, + "grad_norm": 1.7741207108559547, + "kl": 0.9895751953125, + "learning_rate": 1.9654849937096033e-06, + "loss": 0.0396, + "reward": 1.33723512776196, + "reward_std": 0.42841523091628914, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.17526487102441024, + "rewards/format_reward": 0.58125, + "rewards/reasoning_steps_reward": 0.9125000096857547, + "step": 3700 + }, + { + "epoch": 0.8172167695089797, + "eval_completion_length": 1024.0, + "eval_kl": 1.0012890625, + "eval_loss": 0.040022626519203186, + "eval_reward": 1.4169398856163025, + "eval_reward_std": 0.4867195551842451, + "eval_rewards/accuracy_reward": 0.01, + "eval_rewards/cosine_scaled_reward": -0.15139344856142997, + "eval_rewards/format_reward": 0.61, + "eval_rewards/reasoning_steps_reward": 0.9483333396911621, + "eval_runtime": 203.2325, + "eval_samples_per_second": 0.487, + "eval_steps_per_second": 0.123, + "step": 3700 + }, + { + "completion_length": 1024.0, + "epoch": 0.8183211164948027, + "grad_norm": 12.054030744773742, + "kl": 1.897283935546875, + "learning_rate": 1.942589314070494e-06, + "loss": 0.0759, + "reward": 1.3313264921307564, + "reward_std": 0.46922859043552306, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14575683891016525, + "rewards/format_reward": 0.525, + "rewards/reasoning_steps_reward": 0.9395833417773247, + "step": 3705 + }, + { + "completion_length": 1024.0, + "epoch": 0.8194254634806256, + "grad_norm": 3.7069945069091936, + "kl": 1.22840576171875, + "learning_rate": 1.9198134167366156e-06, + "loss": 0.0492, + "reward": 1.4052705839276314, + "reward_std": 0.403043683465512, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1197294215176953, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.9562500081956387, + "step": 3710 + }, + { + "completion_length": 1024.0, + "epoch": 0.8205298104664486, + "grad_norm": 2.768907455782949, + "kl": 2.87457275390625, + "learning_rate": 1.897157640296825e-06, + "loss": 0.1152, + "reward": 1.4017615104094148, + "reward_std": 0.3942436770506902, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11490515425248304, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.9291666714474559, + "step": 3715 + }, + { + "completion_length": 1024.0, + "epoch": 0.8216341574522715, + "grad_norm": 6.158506141748816, + "kl": 1.50045166015625, + "learning_rate": 1.8746223215542482e-06, + "loss": 0.06, + "reward": 1.4124196864664555, + "reward_std": 0.43862366709727213, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13133030838798732, + "rewards/format_reward": 0.5875, + "rewards/reasoning_steps_reward": 0.9437500080093741, + "step": 3720 + }, + { + "completion_length": 1024.0, + "epoch": 0.8227385044380945, + "grad_norm": 4.954257940316965, + "kl": 2.506939697265625, + "learning_rate": 1.8522077955212791e-06, + "loss": 0.1003, + "reward": 1.2474359845742584, + "reward_std": 0.409401986663579, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1900640235922765, + "rewards/format_reward": 0.53125, + "rewards/reasoning_steps_reward": 0.9062500108033419, + "step": 3725 + }, + { + "completion_length": 1024.0, + "epoch": 0.8238428514239174, + "grad_norm": 3.384994347227523, + "kl": 2.5660552978515625, + "learning_rate": 1.8299143954145926e-06, + "loss": 0.1026, + "reward": 1.2905212937039323, + "reward_std": 0.5092491055722348, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.19489537274930627, + "rewards/format_reward": 0.56875, + "rewards/reasoning_steps_reward": 0.8979166783392429, + "step": 3730 + }, + { + "completion_length": 1024.0, + "epoch": 0.8249471984097403, + "grad_norm": 16.989052594983022, + "kl": 2.394305419921875, + "learning_rate": 1.8077424526501964e-06, + "loss": 0.0958, + "reward": 1.2323162292130292, + "reward_std": 0.43714046496825176, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17601710449671373, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.8708333456888795, + "step": 3735 + }, + { + "completion_length": 1024.0, + "epoch": 0.8260515453955632, + "grad_norm": 3.3697087191377157, + "kl": 2.461981201171875, + "learning_rate": 1.7856922968384926e-06, + "loss": 0.0985, + "reward": 1.3591851346194743, + "reward_std": 0.4494941798264335, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.16789820450503612, + "rewards/format_reward": 0.59375, + "rewards/reasoning_steps_reward": 0.9270833421498537, + "step": 3740 + }, + { + "completion_length": 1024.0, + "epoch": 0.8271558923813862, + "grad_norm": 3.3314479469990848, + "kl": 1.8787109375, + "learning_rate": 1.763764255779392e-06, + "loss": 0.0751, + "reward": 1.2150082999374718, + "reward_std": 0.45866580544243335, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13915836790692993, + "rewards/format_reward": 0.45, + "rewards/reasoning_steps_reward": 0.8854166736826301, + "step": 3745 + }, + { + "completion_length": 1024.0, + "epoch": 0.8282602393672092, + "grad_norm": 5.075803385145104, + "kl": 3.6768798828125, + "learning_rate": 1.7419586554574364e-06, + "loss": 0.147, + "reward": 1.2929603595286607, + "reward_std": 0.5276947294652927, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.1549563185122679, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.8854166813194752, + "step": 3750 + }, + { + "completion_length": 1024.0, + "epoch": 0.8293645863530321, + "grad_norm": 4.835328190612411, + "kl": 2.53955078125, + "learning_rate": 1.720275820036944e-06, + "loss": 0.1016, + "reward": 1.0985228657722472, + "reward_std": 0.5404113541560946, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.1389771427086089, + "rewards/format_reward": 0.40625, + "rewards/reasoning_steps_reward": 0.8125000113621355, + "step": 3755 + }, + { + "completion_length": 1024.0, + "epoch": 0.8304689333388551, + "grad_norm": 3.6128842543406647, + "kl": 1.5438568115234375, + "learning_rate": 1.6987160718572027e-06, + "loss": 0.0617, + "reward": 1.0970849219709635, + "reward_std": 0.4372379134729272, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15499841592172742, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.820833345875144, + "step": 3760 + }, + { + "completion_length": 1024.0, + "epoch": 0.831573280324678, + "grad_norm": 3.522120303062643, + "kl": 1.2506439208984375, + "learning_rate": 1.6772797314276712e-06, + "loss": 0.05, + "reward": 0.9676698416282307, + "reward_std": 0.39677214640614694, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.178163488789869, + "rewards/format_reward": 0.35625, + "rewards/reasoning_steps_reward": 0.7895833473652601, + "step": 3765 + }, + { + "completion_length": 1024.0, + "epoch": 0.832677627310501, + "grad_norm": 6.35794295461514, + "kl": 1.3462554931640625, + "learning_rate": 1.6559671174232195e-06, + "loss": 0.0539, + "reward": 1.0329537893645466, + "reward_std": 0.48206213802768616, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14412954530816932, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.7958333443850278, + "step": 3770 + }, + { + "completion_length": 1024.0, + "epoch": 0.833781974296324, + "grad_norm": 4.258986368861116, + "kl": 2.50684814453125, + "learning_rate": 1.6347785466793764e-06, + "loss": 0.1003, + "reward": 0.950670113041997, + "reward_std": 0.46344148809494073, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13057988811051474, + "rewards/format_reward": 0.28125, + "rewards/reasoning_steps_reward": 0.787500013038516, + "step": 3775 + }, + { + "completion_length": 1024.0, + "epoch": 0.8348863212821469, + "grad_norm": 2.4134853475595444, + "kl": 2.65799560546875, + "learning_rate": 1.6137143341876439e-06, + "loss": 0.1063, + "reward": 1.0280450066551565, + "reward_std": 0.46901671985397114, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13862167526385746, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.7916666807606816, + "step": 3780 + }, + { + "completion_length": 1024.0, + "epoch": 0.8359906682679697, + "grad_norm": 4.096493293080795, + "kl": 2.3369903564453125, + "learning_rate": 1.5927747930907921e-06, + "loss": 0.0935, + "reward": 1.0754234604537487, + "reward_std": 0.45691707939695336, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14124320970195187, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.8041666811332107, + "step": 3785 + }, + { + "completion_length": 1024.0, + "epoch": 0.8370950152537927, + "grad_norm": 3.2060019572278375, + "kl": 1.8758026123046876, + "learning_rate": 1.5719602346782215e-06, + "loss": 0.075, + "reward": 0.9844577558338642, + "reward_std": 0.48203894472389947, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12804225913132541, + "rewards/format_reward": 0.31875, + "rewards/reasoning_steps_reward": 0.7812500189989805, + "step": 3790 + }, + { + "completion_length": 1024.0, + "epoch": 0.8381993622396157, + "grad_norm": 1.9710529098598488, + "kl": 2.16820068359375, + "learning_rate": 1.5512709683813165e-06, + "loss": 0.0868, + "reward": 1.0551761870272458, + "reward_std": 0.42293115961001604, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.09690715366800759, + "rewards/format_reward": 0.31875, + "rewards/reasoning_steps_reward": 0.8208333468064666, + "step": 3795 + }, + { + "completion_length": 1024.0, + "epoch": 0.8393037092254386, + "grad_norm": 4.843746007302358, + "kl": 1.226190185546875, + "learning_rate": 1.5307073017688644e-06, + "loss": 0.0491, + "reward": 1.1217388808727264, + "reward_std": 0.4583800950756995, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.11367778851745243, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.8166666807606816, + "step": 3800 + }, + { + "epoch": 0.8393037092254386, + "eval_completion_length": 1024.0, + "eval_kl": 0.873955078125, + "eval_loss": 0.03470970690250397, + "eval_reward": 1.0069829231500627, + "eval_reward_std": 0.4387795051932335, + "eval_rewards/accuracy_reward": 0.0, + "eval_rewards/cosine_scaled_reward": -0.14968374449759722, + "eval_rewards/format_reward": 0.355, + "eval_rewards/reasoning_steps_reward": 0.8016666841506958, + "eval_runtime": 203.6085, + "eval_samples_per_second": 0.486, + "eval_steps_per_second": 0.123, + "step": 3800 + }, + { + "completion_length": 1024.0, + "epoch": 0.8404080562112616, + "grad_norm": 9.8277486524576, + "kl": 1.67528076171875, + "learning_rate": 1.5102695405424738e-06, + "loss": 0.0671, + "reward": 0.9232415302656591, + "reward_std": 0.48757856400334276, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1225918134470703, + "rewards/format_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.7333333453163504, + "step": 3805 + }, + { + "completion_length": 1024.0, + "epoch": 0.8415124031970845, + "grad_norm": 7.887505524007284, + "kl": 1.87099609375, + "learning_rate": 1.4899579885320237e-06, + "loss": 0.0749, + "reward": 1.1503344973316416, + "reward_std": 0.41806495695200285, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1309155066817766, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.8250000141561031, + "step": 3810 + }, + { + "completion_length": 1024.0, + "epoch": 0.8426167501829075, + "grad_norm": 3.811520777347624, + "kl": 1.80526123046875, + "learning_rate": 1.4697729476911614e-06, + "loss": 0.0722, + "reward": 1.0021475785411895, + "reward_std": 0.48045386319281536, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13743576427805237, + "rewards/format_reward": 0.35625, + "rewards/reasoning_steps_reward": 0.770833350904286, + "step": 3815 + }, + { + "completion_length": 1024.0, + "epoch": 0.8437210971687304, + "grad_norm": 2.407666114214345, + "kl": 1.65206298828125, + "learning_rate": 1.449714718092803e-06, + "loss": 0.0661, + "reward": 1.1085011329501868, + "reward_std": 0.4444706824200694, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.0914988732081838, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.7937500156462193, + "step": 3820 + }, + { + "completion_length": 1024.0, + "epoch": 0.8448254441545534, + "grad_norm": 4.045358362837525, + "kl": 1.606927490234375, + "learning_rate": 1.4297835979246777e-06, + "loss": 0.0643, + "reward": 1.0195800764486194, + "reward_std": 0.4161707017852677, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14916992889229733, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7875000160187483, + "step": 3825 + }, + { + "completion_length": 1024.0, + "epoch": 0.8459297911403763, + "grad_norm": 4.254843536541897, + "kl": 1.788482666015625, + "learning_rate": 1.4099798834848855e-06, + "loss": 0.0716, + "reward": 1.0221065394580364, + "reward_std": 0.4706483832735103, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13206013098242692, + "rewards/format_reward": 0.35, + "rewards/reasoning_steps_reward": 0.7979166816920042, + "step": 3830 + }, + { + "completion_length": 1024.0, + "epoch": 0.8470341381261992, + "grad_norm": 4.7405984768373415, + "kl": 2.34656982421875, + "learning_rate": 1.3903038691775095e-06, + "loss": 0.0938, + "reward": 1.0039814393036068, + "reward_std": 0.4533116746461019, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12101856818553643, + "rewards/format_reward": 0.35625, + "rewards/reasoning_steps_reward": 0.7625000121071934, + "step": 3835 + }, + { + "completion_length": 1024.0, + "epoch": 0.8481384851120222, + "grad_norm": 4.516232325190879, + "kl": 1.359423828125, + "learning_rate": 1.370755847508226e-06, + "loss": 0.0543, + "reward": 1.0372777149896137, + "reward_std": 0.492635853459069, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.10647228800517042, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7500000139698386, + "step": 3840 + }, + { + "completion_length": 1024.0, + "epoch": 0.8492428320978451, + "grad_norm": 3.2830116476433098, + "kl": 1.38970947265625, + "learning_rate": 1.3513361090799537e-06, + "loss": 0.0556, + "reward": 1.0849484650418162, + "reward_std": 0.4841716932147392, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14421821158321108, + "rewards/format_reward": 0.425, + "rewards/reasoning_steps_reward": 0.7979166807606817, + "step": 3845 + }, + { + "completion_length": 1024.0, + "epoch": 0.8503471790836681, + "grad_norm": 4.480857992077528, + "kl": 1.501470947265625, + "learning_rate": 1.332044942588545e-06, + "loss": 0.0601, + "reward": 1.1137627013260498, + "reward_std": 0.42765071511385033, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12582063591689802, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.8083333443850279, + "step": 3850 + }, + { + "completion_length": 1024.0, + "epoch": 0.851451526069491, + "grad_norm": 7.943484649640199, + "kl": 1.7091888427734374, + "learning_rate": 1.3128826348184886e-06, + "loss": 0.0684, + "reward": 1.0067575078748632, + "reward_std": 0.5076355822726327, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12449250066274545, + "rewards/format_reward": 0.36875, + "rewards/reasoning_steps_reward": 0.7500000152736902, + "step": 3855 + }, + { + "completion_length": 1024.0, + "epoch": 0.852555873055314, + "grad_norm": 2.5412807678609806, + "kl": 0.94459228515625, + "learning_rate": 1.2938494706386462e-06, + "loss": 0.0378, + "reward": 1.156890353001654, + "reward_std": 0.48906847709586143, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.10560965545591898, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.8187500165775419, + "step": 3860 + }, + { + "completion_length": 1024.0, + "epoch": 0.853660220041137, + "grad_norm": 3.8180837005980854, + "kl": 1.16829833984375, + "learning_rate": 1.2749457329980108e-06, + "loss": 0.0467, + "reward": 1.0843943199433852, + "reward_std": 0.4156025383039378, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1385223493918602, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.8354166794568301, + "step": 3865 + }, + { + "completion_length": 1024.0, + "epoch": 0.8547645670269599, + "grad_norm": 34.666542207088746, + "kl": 2.95928955078125, + "learning_rate": 1.256171702921516e-06, + "loss": 0.1184, + "reward": 1.1096524391323328, + "reward_std": 0.4731335958989803, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16743089526426047, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7833333468064666, + "step": 3870 + }, + { + "completion_length": 1024.0, + "epoch": 0.8558689140127829, + "grad_norm": 4.192208427980209, + "kl": 1.46943359375, + "learning_rate": 1.237527659505846e-06, + "loss": 0.0588, + "reward": 1.0440983198815956, + "reward_std": 0.40177030080858456, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14756835330772447, + "rewards/format_reward": 0.39375, + "rewards/reasoning_steps_reward": 0.7916666785255074, + "step": 3875 + }, + { + "completion_length": 1024.0, + "epoch": 0.8569732609986057, + "grad_norm": 1.4373607975014058, + "kl": 0.4958740234375, + "learning_rate": 1.2190138799152851e-06, + "loss": 0.0198, + "reward": 1.1634813494980336, + "reward_std": 0.49837744958058466, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12193531318916939, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.82291667945683, + "step": 3880 + }, + { + "completion_length": 1024.0, + "epoch": 0.8580776079844287, + "grad_norm": 10.062619229211846, + "kl": 0.8890899658203125, + "learning_rate": 1.200630639377609e-06, + "loss": 0.0356, + "reward": 0.9816565293818712, + "reward_std": 0.38651110691134816, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1329268047120422, + "rewards/format_reward": 0.35625, + "rewards/reasoning_steps_reward": 0.7583333479240537, + "step": 3885 + }, + { + "completion_length": 1024.0, + "epoch": 0.8591819549702516, + "grad_norm": 2.517396839676297, + "kl": 1.968524169921875, + "learning_rate": 1.1823782111799843e-06, + "loss": 0.0787, + "reward": 0.959625584539026, + "reward_std": 0.5189180550776655, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15079108572681435, + "rewards/format_reward": 0.35, + "rewards/reasoning_steps_reward": 0.7541666792705655, + "step": 3890 + }, + { + "completion_length": 1024.0, + "epoch": 0.8602863019560746, + "grad_norm": 2.8178537770706886, + "kl": 0.726678466796875, + "learning_rate": 1.1642568666649067e-06, + "loss": 0.0291, + "reward": 1.0694204801693559, + "reward_std": 0.44945709503699616, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1493295207095798, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.7875000167638063, + "step": 3895 + }, + { + "completion_length": 1024.0, + "epoch": 0.8613906489418975, + "grad_norm": 1.2175267033985402, + "kl": 0.955523681640625, + "learning_rate": 1.1462668752261652e-06, + "loss": 0.0382, + "reward": 1.000610039383173, + "reward_std": 0.49909483049996195, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1598066317845223, + "rewards/format_reward": 0.4, + "rewards/reasoning_steps_reward": 0.7604166818782687, + "step": 3900 + }, + { + "epoch": 0.8613906489418975, + "eval_completion_length": 1024.0, + "eval_kl": 1.683125, + "eval_loss": 0.0676305741071701, + "eval_reward": 0.9381104612350464, + "eval_reward_std": 0.4509727944433689, + "eval_rewards/accuracy_reward": 0.015, + "eval_rewards/cosine_scaled_reward": -0.1618895485624671, + "eval_rewards/format_reward": 0.28, + "eval_rewards/reasoning_steps_reward": 0.8050000178813934, + "eval_runtime": 201.783, + "eval_samples_per_second": 0.491, + "eval_steps_per_second": 0.124, + "step": 3900 + }, + { + "completion_length": 1024.0, + "epoch": 0.8624949959277205, + "grad_norm": 2.170430471378303, + "kl": 1.952362060546875, + "learning_rate": 1.1284085043048465e-06, + "loss": 0.0781, + "reward": 1.025868459790945, + "reward_std": 0.5059246151708067, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17413154328241945, + "rewards/format_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.7625000141561031, + "step": 3905 + }, + { + "completion_length": 1024.0, + "epoch": 0.8635993429135435, + "grad_norm": 5.357087985724468, + "kl": 1.887725830078125, + "learning_rate": 1.1106820193853484e-06, + "loss": 0.0755, + "reward": 1.0269752063788473, + "reward_std": 0.44671607576310635, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15219147218740545, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.7791666839271784, + "step": 3910 + }, + { + "completion_length": 1024.0, + "epoch": 0.8647036898993664, + "grad_norm": 2.5863387297050693, + "kl": 0.875408935546875, + "learning_rate": 1.0930876839914418e-06, + "loss": 0.035, + "reward": 1.1502701964229345, + "reward_std": 0.44783246733859416, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13514647865085863, + "rewards/format_reward": 0.46875, + "rewards/reasoning_steps_reward": 0.8104166803881526, + "step": 3915 + }, + { + "completion_length": 1024.0, + "epoch": 0.8658080368851894, + "grad_norm": 3.23242533022308, + "kl": 1.299554443359375, + "learning_rate": 1.0756257596823427e-06, + "loss": 0.052, + "reward": 1.112755262479186, + "reward_std": 0.49227567511698, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1414114096784033, + "rewards/format_reward": 0.43125, + "rewards/reasoning_steps_reward": 0.8229166833683849, + "step": 3920 + }, + { + "completion_length": 1024.0, + "epoch": 0.8669123838710123, + "grad_norm": 2.138140290646965, + "kl": 1.885113525390625, + "learning_rate": 1.058296506048836e-06, + "loss": 0.0754, + "reward": 1.164114381093532, + "reward_std": 0.5346207804002916, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13796895813720766, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.7958333490416407, + "step": 3925 + }, + { + "completion_length": 1024.0, + "epoch": 0.8680167308568352, + "grad_norm": 5.127451325350497, + "kl": 1.73660888671875, + "learning_rate": 1.04110018070941e-06, + "loss": 0.0695, + "reward": 1.0745520979922731, + "reward_std": 0.49730775001007715, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1400312382105767, + "rewards/format_reward": 0.46875, + "rewards/reasoning_steps_reward": 0.7458333507180214, + "step": 3930 + }, + { + "completion_length": 1024.0, + "epoch": 0.8691210778426581, + "grad_norm": 3.665613966481991, + "kl": 0.8619049072265625, + "learning_rate": 1.0240370393064235e-06, + "loss": 0.0345, + "reward": 1.1202016398310661, + "reward_std": 0.60314673992807, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12771503719559404, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7854166829958558, + "step": 3935 + }, + { + "completion_length": 1024.0, + "epoch": 0.8702254248284811, + "grad_norm": 5.457231423921015, + "kl": 1.279339599609375, + "learning_rate": 1.0071073355023097e-06, + "loss": 0.0512, + "reward": 1.1111865887418388, + "reward_std": 0.5126064650583431, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1346467504816246, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7645833479240537, + "step": 3940 + }, + { + "completion_length": 1024.0, + "epoch": 0.871329771814304, + "grad_norm": 1.4204895897110041, + "kl": 1.849835205078125, + "learning_rate": 9.903113209758098e-07, + "loss": 0.074, + "reward": 1.1405908815562724, + "reward_std": 0.5249507926579099, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.15732579003379216, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.7854166811332106, + "step": 3945 + }, + { + "completion_length": 1024.0, + "epoch": 0.872434118800127, + "grad_norm": 2.613485274176624, + "kl": 1.406982421875, + "learning_rate": 9.736492454182211e-07, + "loss": 0.0563, + "reward": 0.9954834171570838, + "reward_std": 0.5651785338108312, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12951659189420753, + "rewards/format_reward": 0.34375, + "rewards/reasoning_steps_reward": 0.7750000163912774, + "step": 3950 + }, + { + "completion_length": 1024.0, + "epoch": 0.87353846578595, + "grad_norm": 3.2220887386073467, + "kl": 1.297503662109375, + "learning_rate": 9.571213565296877e-07, + "loss": 0.0519, + "reward": 1.045906347129494, + "reward_std": 0.5811525732686278, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1290936590579804, + "rewards/format_reward": 0.40625, + "rewards/reasoning_steps_reward": 0.7562500141561032, + "step": 3955 + }, + { + "completion_length": 1024.0, + "epoch": 0.8746428127717729, + "grad_norm": 3.4199691033243105, + "kl": 0.97791748046875, + "learning_rate": 9.407279000155311e-07, + "loss": 0.0391, + "reward": 1.1156769435852767, + "reward_std": 0.5415843210706953, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14265640405064914, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.795833345875144, + "step": 3960 + }, + { + "completion_length": 1024.0, + "epoch": 0.8757471597575959, + "grad_norm": 6.611742861375454, + "kl": 1.126373291015625, + "learning_rate": 9.244691195825794e-07, + "loss": 0.0451, + "reward": 1.0655396494301386, + "reward_std": 0.4242564166415832, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14487702874903335, + "rewards/format_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.7666666831821203, + "step": 3965 + }, + { + "completion_length": 1024.0, + "epoch": 0.8768515067434188, + "grad_norm": 6.2650579753314295, + "kl": 1.105645751953125, + "learning_rate": 9.0834525693555e-07, + "loss": 0.0442, + "reward": 1.1305613292381167, + "reward_std": 0.5012515306996648, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12568867837253492, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7687500163912773, + "step": 3970 + }, + { + "completion_length": 1024.0, + "epoch": 0.8779558537292417, + "grad_norm": 6.559802348774649, + "kl": 1.6363189697265625, + "learning_rate": 8.923565517734633e-07, + "loss": 0.0654, + "reward": 1.1925322379916907, + "reward_std": 0.5651830204267754, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.11163442874731118, + "rewards/format_reward": 0.46875, + "rewards/reasoning_steps_reward": 0.8104166805744171, + "step": 3975 + }, + { + "completion_length": 1024.0, + "epoch": 0.8790602007150646, + "grad_norm": 3.897602069962089, + "kl": 1.82530517578125, + "learning_rate": 8.765032417860753e-07, + "loss": 0.073, + "reward": 0.9899700607638806, + "reward_std": 0.4757543180807261, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1391966143855825, + "rewards/format_reward": 0.38125, + "rewards/reasoning_steps_reward": 0.7416666837409138, + "step": 3980 + }, + { + "completion_length": 1024.0, + "epoch": 0.8801645477008876, + "grad_norm": 7.591241433425781, + "kl": 1.08575439453125, + "learning_rate": 8.607855626503403e-07, + "loss": 0.0434, + "reward": 1.1372865000739694, + "reward_std": 0.5247640458663227, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12938016958419213, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.8041666815057397, + "step": 3985 + }, + { + "completion_length": 1024.0, + "epoch": 0.8812688946867105, + "grad_norm": 4.303510587529375, + "kl": 1.087200927734375, + "learning_rate": 8.452037480269082e-07, + "loss": 0.0435, + "reward": 1.0119140914292075, + "reward_std": 0.4009784645517357, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12350257498037535, + "rewards/format_reward": 0.375, + "rewards/reasoning_steps_reward": 0.7479166829958558, + "step": 3990 + }, + { + "completion_length": 1024.0, + "epoch": 0.8823732416725335, + "grad_norm": 5.929217922923497, + "kl": 1.24005126953125, + "learning_rate": 8.297580295566576e-07, + "loss": 0.0496, + "reward": 1.026335727609694, + "reward_std": 0.48216943825391356, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1236642804076837, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.7250000150874257, + "step": 3995 + }, + { + "completion_length": 1024.0, + "epoch": 0.8834775886583565, + "grad_norm": 2.332684901634004, + "kl": 1.697442626953125, + "learning_rate": 8.144486368572468e-07, + "loss": 0.0679, + "reward": 1.0781446799635888, + "reward_std": 0.4809949690039502, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13852199764514808, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.7916666816920042, + "step": 4000 + }, + { + "epoch": 0.8834775886583565, + "eval_completion_length": 1024.0, + "eval_kl": 1.759248046875, + "eval_loss": 0.07073025405406952, + "eval_reward": 1.0747132487595081, + "eval_reward_std": 0.486686719302088, + "eval_rewards/accuracy_reward": 0.015, + "eval_rewards/cosine_scaled_reward": -0.12195342842489482, + "eval_rewards/format_reward": 0.435, + "eval_rewards/reasoning_steps_reward": 0.74666669100523, + "eval_runtime": 203.224, + "eval_samples_per_second": 0.487, + "eval_steps_per_second": 0.123, + "step": 4000 + }, + { + "completion_length": 1024.0, + "epoch": 0.8845819356441794, + "grad_norm": 3.153161789027804, + "kl": 1.55931396484375, + "learning_rate": 7.992757975196974e-07, + "loss": 0.0624, + "reward": 1.0648077727295457, + "reward_std": 0.45864389188354837, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1289422349364031, + "rewards/format_reward": 0.39375, + "rewards/reasoning_steps_reward": 0.8000000143423677, + "step": 4005 + }, + { + "completion_length": 1024.0, + "epoch": 0.8856862826300024, + "grad_norm": 2.2325669337373983, + "kl": 1.570654296875, + "learning_rate": 7.842397371050181e-07, + "loss": 0.0628, + "reward": 1.065108502563089, + "reward_std": 0.5289823830302339, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.16197483572468627, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.7645833469927311, + "step": 4010 + }, + { + "completion_length": 1024.0, + "epoch": 0.8867906296158253, + "grad_norm": 2.7142887474028012, + "kl": 1.02591552734375, + "learning_rate": 7.693406791408476e-07, + "loss": 0.041, + "reward": 1.1182113092392683, + "reward_std": 0.5619218776490016, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1463720285333693, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7895833475515246, + "step": 4015 + }, + { + "completion_length": 1024.0, + "epoch": 0.8878949766016483, + "grad_norm": 4.411870374708956, + "kl": 0.986627197265625, + "learning_rate": 7.545788451181313e-07, + "loss": 0.0395, + "reward": 1.1376007285900414, + "reward_std": 0.46481511711303936, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1394826118749279, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7645833507180214, + "step": 4020 + }, + { + "completion_length": 1024.0, + "epoch": 0.8889993235874711, + "grad_norm": 3.425420744219714, + "kl": 1.2092529296875, + "learning_rate": 7.399544544878268e-07, + "loss": 0.0484, + "reward": 1.093935468606651, + "reward_std": 0.5276332072971854, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15606453956934274, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.756250019557774, + "step": 4025 + }, + { + "completion_length": 1024.0, + "epoch": 0.8901036705732941, + "grad_norm": 6.2482885105172725, + "kl": 2.00596923828125, + "learning_rate": 7.25467724657647e-07, + "loss": 0.0803, + "reward": 1.112189820688218, + "reward_std": 0.5027000481175492, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.135726847530168, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7666666828095913, + "step": 4030 + }, + { + "completion_length": 1024.0, + "epoch": 0.891208017559117, + "grad_norm": 4.123267493368509, + "kl": 1.91627197265625, + "learning_rate": 7.11118870988825e-07, + "loss": 0.0767, + "reward": 1.0083888062275945, + "reward_std": 0.5308977565960958, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.17077786354784621, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.7291666796430946, + "step": 4035 + }, + { + "completion_length": 1024.0, + "epoch": 0.89231236454494, + "grad_norm": 2.5447247120995904, + "kl": 1.29932861328125, + "learning_rate": 6.969081067929129e-07, + "loss": 0.052, + "reward": 1.1347805107012392, + "reward_std": 0.37370490196408357, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14438615987601225, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.8041666796430945, + "step": 4040 + }, + { + "completion_length": 1024.0, + "epoch": 0.893416711530763, + "grad_norm": 3.3545211281045546, + "kl": 1.1327117919921874, + "learning_rate": 6.828356433286065e-07, + "loss": 0.0453, + "reward": 0.9914261367172003, + "reward_std": 0.5204619866504799, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1294072063108615, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.6958333469927311, + "step": 4045 + }, + { + "completion_length": 1024.0, + "epoch": 0.8945210585165859, + "grad_norm": 4.230761353854734, + "kl": 0.87974853515625, + "learning_rate": 6.689016897986123e-07, + "loss": 0.0352, + "reward": 1.0761857211589814, + "reward_std": 0.5829236682388, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.14256428971712012, + "rewards/format_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.7625000156462193, + "step": 4050 + }, + { + "completion_length": 1024.0, + "epoch": 0.8956254055024089, + "grad_norm": 1.8923064047781268, + "kl": 1.389752197265625, + "learning_rate": 6.551064533465335e-07, + "loss": 0.0556, + "reward": 1.1430289884097875, + "reward_std": 0.5576779363034803, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1382210265833237, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.7562500160187483, + "step": 4055 + }, + { + "completion_length": 1024.0, + "epoch": 0.8967297524882318, + "grad_norm": 6.0362623942979505, + "kl": 2.644989013671875, + "learning_rate": 6.414501390537875e-07, + "loss": 0.1057, + "reward": 1.150946792308241, + "reward_std": 0.48350526331923904, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.15530321877513414, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7750000149011612, + "step": 4060 + }, + { + "completion_length": 1024.0, + "epoch": 0.8978340994740548, + "grad_norm": 2.199768849556265, + "kl": 2.11390380859375, + "learning_rate": 6.279329499365649e-07, + "loss": 0.0846, + "reward": 1.1383668217342346, + "reward_std": 0.46480973800826175, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.10954985310381744, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.7791666816920042, + "step": 4065 + }, + { + "completion_length": 1024.0, + "epoch": 0.8989384464598776, + "grad_norm": 9.295433499575353, + "kl": 1.520904541015625, + "learning_rate": 6.14555086942804e-07, + "loss": 0.0608, + "reward": 1.1201986480504273, + "reward_std": 0.4280889055877196, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13605136539481463, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7937500178813934, + "step": 4070 + }, + { + "completion_length": 1024.0, + "epoch": 0.9000427934457006, + "grad_norm": 1.6615044294375885, + "kl": 1.034375, + "learning_rate": 6.013167489492089e-07, + "loss": 0.0414, + "reward": 1.0116388690192253, + "reward_std": 0.46661656610522184, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1404444719890307, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.7333333438262344, + "step": 4075 + }, + { + "completion_length": 1024.0, + "epoch": 0.9011471404315236, + "grad_norm": 5.062466581880976, + "kl": 1.399224853515625, + "learning_rate": 5.88218132758287e-07, + "loss": 0.056, + "reward": 1.0753349607344718, + "reward_std": 0.48987307887655335, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12258170813620381, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.7479166820645332, + "step": 4080 + }, + { + "completion_length": 1024.0, + "epoch": 0.9022514874173465, + "grad_norm": 11.209836105553178, + "kl": 1.842578125, + "learning_rate": 5.752594330954275e-07, + "loss": 0.0737, + "reward": 1.0284711010754108, + "reward_std": 0.5226584098192688, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.14652890994620976, + "rewards/format_reward": 0.4125, + "rewards/reasoning_steps_reward": 0.7437500147148967, + "step": 4085 + }, + { + "completion_length": 1024.0, + "epoch": 0.9033558344031695, + "grad_norm": 7.199991876784292, + "kl": 2.265875244140625, + "learning_rate": 5.624408426060124e-07, + "loss": 0.0907, + "reward": 1.1538773463093093, + "reward_std": 0.5723583094921196, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14403933168505317, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7916666816920042, + "step": 4090 + }, + { + "completion_length": 1024.0, + "epoch": 0.9044601813889924, + "grad_norm": 3.975211613433658, + "kl": 1.727044677734375, + "learning_rate": 5.497625518525374e-07, + "loss": 0.0691, + "reward": 1.1335043588653206, + "reward_std": 0.4624854526555282, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12899564821491366, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7937500139698386, + "step": 4095 + }, + { + "completion_length": 1024.0, + "epoch": 0.9055645283748154, + "grad_norm": 3.383012591131137, + "kl": 1.18646240234375, + "learning_rate": 5.372247493117921e-07, + "loss": 0.0474, + "reward": 1.1752978217788042, + "reward_std": 0.3756918714298081, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14761884533763805, + "rewards/format_reward": 0.525, + "rewards/reasoning_steps_reward": 0.7916666809469461, + "step": 4100 + }, + { + "epoch": 0.9055645283748154, + "eval_completion_length": 1024.0, + "eval_kl": 1.332880859375, + "eval_loss": 0.05309397354722023, + "eval_reward": 1.1452737122774124, + "eval_reward_std": 0.6122168021649123, + "eval_rewards/accuracy_reward": 0.02, + "eval_rewards/cosine_scaled_reward": -0.12472628904506564, + "eval_rewards/format_reward": 0.475, + "eval_rewards/reasoning_steps_reward": 0.7750000178813934, + "eval_runtime": 202.1824, + "eval_samples_per_second": 0.49, + "eval_steps_per_second": 0.124, + "step": 4100 + }, + { + "completion_length": 1024.0, + "epoch": 0.9066688753606383, + "grad_norm": 3.4174559729540874, + "kl": 1.212176513671875, + "learning_rate": 5.248276213720526e-07, + "loss": 0.0485, + "reward": 1.0279372279532253, + "reward_std": 0.5210070614280994, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15956277402001434, + "rewards/format_reward": 0.3875, + "rewards/reasoning_steps_reward": 0.7937500147148967, + "step": 4105 + }, + { + "completion_length": 1024.0, + "epoch": 0.9077732223464613, + "grad_norm": 4.039284591694262, + "kl": 1.042962646484375, + "learning_rate": 5.125713523303133e-07, + "loss": 0.0417, + "reward": 1.1345659455750137, + "reward_std": 0.5074929667287507, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1612673976487713, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.802083351649344, + "step": 4110 + }, + { + "completion_length": 1024.0, + "epoch": 0.9088775693322843, + "grad_norm": 2.1407975550189455, + "kl": 1.029180908203125, + "learning_rate": 5.004561243895433e-07, + "loss": 0.0412, + "reward": 1.1098061236087233, + "reward_std": 0.4296879690635251, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1506105485586886, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.7979166826233268, + "step": 4115 + }, + { + "completion_length": 1024.0, + "epoch": 0.9099819163181071, + "grad_norm": 2.787687525177025, + "kl": 1.16190185546875, + "learning_rate": 4.884821176559817e-07, + "loss": 0.0465, + "reward": 1.1772496801801027, + "reward_std": 0.45350805636044245, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14983365589287131, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.8145833475515246, + "step": 4120 + }, + { + "completion_length": 1024.0, + "epoch": 0.91108626330393, + "grad_norm": 3.4685777527475925, + "kl": 1.169305419921875, + "learning_rate": 4.7664951013645875e-07, + "loss": 0.0468, + "reward": 1.1000913422554732, + "reward_std": 0.586482156632701, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13740866679772806, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7750000130385161, + "step": 4125 + }, + { + "completion_length": 1024.0, + "epoch": 0.912190610289753, + "grad_norm": 2.5440932847767126, + "kl": 0.869580078125, + "learning_rate": 4.649584777357452e-07, + "loss": 0.0348, + "reward": 1.101002143137157, + "reward_std": 0.5315961849104497, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14691453777631977, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.785416685976088, + "step": 4130 + }, + { + "completion_length": 1024.0, + "epoch": 0.913294957275576, + "grad_norm": 4.050421877530695, + "kl": 1.568072509765625, + "learning_rate": 4.534091942539476e-07, + "loss": 0.0628, + "reward": 1.1267952339723706, + "reward_std": 0.4579050815096707, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13778810024832638, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7833333531394601, + "step": 4135 + }, + { + "completion_length": 1024.0, + "epoch": 0.9143993042613989, + "grad_norm": 9.301363016784023, + "kl": 1.900787353515625, + "learning_rate": 4.420018313839147e-07, + "loss": 0.076, + "reward": 1.1611902100965381, + "reward_std": 0.4580170904053375, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15547646980849095, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.7666666839271784, + "step": 4140 + }, + { + "completion_length": 1024.0, + "epoch": 0.9155036512472219, + "grad_norm": 3.174997058824143, + "kl": 1.6366119384765625, + "learning_rate": 4.3073655870869093e-07, + "loss": 0.0655, + "reward": 1.1532112454995513, + "reward_std": 0.6160767867557297, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14053876251273323, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7937500162050128, + "step": 4145 + }, + { + "completion_length": 1024.0, + "epoch": 0.9166079982330448, + "grad_norm": 5.719018592647839, + "kl": 1.850433349609375, + "learning_rate": 4.1961354369898675e-07, + "loss": 0.0741, + "reward": 1.1542285384610296, + "reward_std": 0.4916458193274593, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12493812668617466, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7791666813194752, + "step": 4150 + }, + { + "completion_length": 1024.0, + "epoch": 0.9177123452188678, + "grad_norm": 6.25154325723326, + "kl": 1.735711669921875, + "learning_rate": 4.086329517107046e-07, + "loss": 0.0694, + "reward": 1.0918453134596349, + "reward_std": 0.5446474470940302, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12898802184672603, + "rewards/format_reward": 0.41875, + "rewards/reasoning_steps_reward": 0.7958333440124988, + "step": 4155 + }, + { + "completion_length": 1024.0, + "epoch": 0.9188166922046908, + "grad_norm": 2.5591543960133207, + "kl": 1.272998046875, + "learning_rate": 3.9779494598246484e-07, + "loss": 0.0509, + "reward": 1.1463804263621569, + "reward_std": 0.5207044870971004, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14945291494550475, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7833333505317569, + "step": 4160 + }, + { + "completion_length": 1024.0, + "epoch": 0.9199210391905137, + "grad_norm": 7.67735841511338, + "kl": 2.06146240234375, + "learning_rate": 3.8709968763318894e-07, + "loss": 0.0824, + "reward": 1.0511977900750935, + "reward_std": 0.5209969227365946, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14046887978074665, + "rewards/format_reward": 0.425, + "rewards/reasoning_steps_reward": 0.7604166788980364, + "step": 4165 + }, + { + "completion_length": 1024.0, + "epoch": 0.9210253861763366, + "grad_norm": 1.0227618516880124, + "kl": 1.127886962890625, + "learning_rate": 3.7654733565969826e-07, + "loss": 0.0451, + "reward": 1.1872975867241622, + "reward_std": 0.46389186800297466, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14603575810724578, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.7958333490416407, + "step": 4170 + }, + { + "completion_length": 1024.0, + "epoch": 0.9221297331621595, + "grad_norm": 7.718448222947808, + "kl": 1.17996826171875, + "learning_rate": 3.661380469343556e-07, + "loss": 0.0472, + "reward": 1.190551941562444, + "reward_std": 0.565188505727565, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.12403139998496045, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.8083333481103182, + "step": 4175 + }, + { + "completion_length": 1024.0, + "epoch": 0.9232340801479825, + "grad_norm": 4.845262970075181, + "kl": 1.309710693359375, + "learning_rate": 3.558719762027307e-07, + "loss": 0.0524, + "reward": 1.1957894197665155, + "reward_std": 0.4323232921247836, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12921058931970036, + "rewards/format_reward": 0.53125, + "rewards/reasoning_steps_reward": 0.7875000156462193, + "step": 4180 + }, + { + "completion_length": 1024.0, + "epoch": 0.9243384271338054, + "grad_norm": 4.766569710401553, + "kl": 1.71771240234375, + "learning_rate": 3.457492760812975e-07, + "loss": 0.0687, + "reward": 1.2371738120913505, + "reward_std": 0.5003112346317267, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12949285372342273, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.8166666815057397, + "step": 4185 + }, + { + "completion_length": 1024.0, + "epoch": 0.9254427741196284, + "grad_norm": 8.214132928933958, + "kl": 2.177740478515625, + "learning_rate": 3.357700970551681e-07, + "loss": 0.0871, + "reward": 1.1125389066524805, + "reward_std": 0.4511790112737799, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12912776759476402, + "rewards/format_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.7979166798293591, + "step": 4190 + }, + { + "completion_length": 1024.0, + "epoch": 0.9265471211054513, + "grad_norm": 12.11864779361759, + "kl": 1.639886474609375, + "learning_rate": 3.2593458747585683e-07, + "loss": 0.0656, + "reward": 1.0972121067345142, + "reward_std": 0.49666505910572595, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1527878977904038, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7687500169500708, + "step": 4195 + }, + { + "completion_length": 1024.0, + "epoch": 0.9276514680912743, + "grad_norm": 1.7415606550831522, + "kl": 1.255419921875, + "learning_rate": 3.1624289355907334e-07, + "loss": 0.0502, + "reward": 1.1839751296676695, + "reward_std": 0.47496756471573465, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12852488255703065, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.8187500152736902, + "step": 4200 + }, + { + "epoch": 0.9276514680912743, + "eval_completion_length": 1024.0, + "eval_kl": 2.0467578125, + "eval_loss": 0.08228980004787445, + "eval_reward": 1.2043625664711, + "eval_reward_std": 0.5759655183041468, + "eval_rewards/accuracy_reward": 0.02, + "eval_rewards/cosine_scaled_reward": -0.1273041057959199, + "eval_rewards/format_reward": 0.53, + "eval_rewards/reasoning_steps_reward": 0.7816666835546493, + "eval_runtime": 201.8511, + "eval_samples_per_second": 0.49, + "eval_steps_per_second": 0.124, + "step": 4200 + }, + { + "completion_length": 1024.0, + "epoch": 0.9287558150770973, + "grad_norm": 4.138803338820017, + "kl": 1.4931396484375, + "learning_rate": 3.0669515938254404e-07, + "loss": 0.0597, + "reward": 1.0497166961431503, + "reward_std": 0.4923819173818629, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15236664820404258, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.7395833484828472, + "step": 4205 + }, + { + "completion_length": 1024.0, + "epoch": 0.9298601620629202, + "grad_norm": 4.359344826638161, + "kl": 1.354461669921875, + "learning_rate": 2.972915268838794e-07, + "loss": 0.0542, + "reward": 1.1435699885245412, + "reward_std": 0.5130232198811427, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13976335329643916, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7708333484828472, + "step": 4210 + }, + { + "completion_length": 1024.0, + "epoch": 0.9309645090487431, + "grad_norm": 2.438246106743701, + "kl": 1.33017578125, + "learning_rate": 2.8803213585846036e-07, + "loss": 0.0532, + "reward": 1.134584633493796, + "reward_std": 0.5522605210964684, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13208204362870218, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7604166816920042, + "step": 4215 + }, + { + "completion_length": 1024.0, + "epoch": 0.932068856034566, + "grad_norm": 3.668505135077039, + "kl": 1.239410400390625, + "learning_rate": 2.7891712395735513e-07, + "loss": 0.0496, + "reward": 1.1889993457123638, + "reward_std": 0.49140313523857915, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.12558400264533703, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7895833475515246, + "step": 4220 + }, + { + "completion_length": 1024.0, + "epoch": 0.933173203020389, + "grad_norm": 2.3683790983764506, + "kl": 1.01651611328125, + "learning_rate": 2.699466266852779e-07, + "loss": 0.0407, + "reward": 1.2326950676739217, + "reward_std": 0.4832893053477164, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15272159982996528, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.8229166805744171, + "step": 4225 + }, + { + "completion_length": 1024.0, + "epoch": 0.9342775500062119, + "grad_norm": 7.580296007856714, + "kl": 1.550482177734375, + "learning_rate": 2.6112077739857465e-07, + "loss": 0.062, + "reward": 1.0981525180861353, + "reward_std": 0.48764542372955477, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12476415821192859, + "rewards/format_reward": 0.45, + "rewards/reasoning_steps_reward": 0.7729166800156235, + "step": 4230 + }, + { + "completion_length": 1024.0, + "epoch": 0.9353818969920349, + "grad_norm": 2.347265033901168, + "kl": 1.591693115234375, + "learning_rate": 2.524397073032403e-07, + "loss": 0.0637, + "reward": 1.104093014076352, + "reward_std": 0.6151681105347961, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13132366243735305, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7416666794568301, + "step": 4235 + }, + { + "completion_length": 1024.0, + "epoch": 0.9364862439778578, + "grad_norm": 1.8325563935417812, + "kl": 1.41153564453125, + "learning_rate": 2.4390354545296257e-07, + "loss": 0.0565, + "reward": 1.1709908257238566, + "reward_std": 0.5391441511004814, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.135259176461841, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7875000162050128, + "step": 4240 + }, + { + "completion_length": 1024.0, + "epoch": 0.9375905909636808, + "grad_norm": 2.4516010722839416, + "kl": 1.5540771484375, + "learning_rate": 2.3551241874721353e-07, + "loss": 0.0622, + "reward": 1.093654316617176, + "reward_std": 0.5696152574062581, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14592901616124437, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7645833492279053, + "step": 4245 + }, + { + "completion_length": 1024.0, + "epoch": 0.9386949379495038, + "grad_norm": 7.947155244735769, + "kl": 1.482958984375, + "learning_rate": 2.272664519293566e-07, + "loss": 0.0593, + "reward": 1.1012504249811172, + "reward_std": 0.5641292022948619, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.167499577824492, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.7750000167638064, + "step": 4250 + }, + { + "completion_length": 1024.0, + "epoch": 0.9397992849353267, + "grad_norm": 3.319777474470088, + "kl": 1.46353759765625, + "learning_rate": 2.1916576758478913e-07, + "loss": 0.0586, + "reward": 1.1003205741755664, + "reward_std": 0.5253774059608987, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15384610984983738, + "rewards/format_reward": 0.46875, + "rewards/reasoning_steps_reward": 0.7854166815057397, + "step": 4255 + }, + { + "completion_length": 1024.0, + "epoch": 0.9409036319211497, + "grad_norm": 5.512661571326239, + "kl": 1.366729736328125, + "learning_rate": 2.1121048613912843e-07, + "loss": 0.0547, + "reward": 1.1612554124556482, + "reward_std": 0.5421569795755203, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13457791769033065, + "rewards/format_reward": 0.53125, + "rewards/reasoning_steps_reward": 0.7458333497866988, + "step": 4260 + }, + { + "completion_length": 1024.0, + "epoch": 0.9420079789069725, + "grad_norm": 2.1266648680818863, + "kl": 1.49979248046875, + "learning_rate": 2.0340072585641523e-07, + "loss": 0.06, + "reward": 1.111027823621407, + "reward_std": 0.5750096809195384, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.1348055164780817, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.7270833497866989, + "step": 4265 + }, + { + "completion_length": 1024.0, + "epoch": 0.9431123258927955, + "grad_norm": 3.406473823796893, + "kl": 1.421197509765625, + "learning_rate": 1.9573660283735974e-07, + "loss": 0.0568, + "reward": 1.0889284812612459, + "reward_std": 0.4393613349617226, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.146488197666622, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.729166685603559, + "step": 4270 + }, + { + "completion_length": 1024.0, + "epoch": 0.9442166728786184, + "grad_norm": 4.687927464271602, + "kl": 1.366455078125, + "learning_rate": 1.8821823101760949e-07, + "loss": 0.0547, + "reward": 1.1325325137935578, + "reward_std": 0.5590258315745359, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13205082423082787, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7833333486691118, + "step": 4275 + }, + { + "completion_length": 1024.0, + "epoch": 0.9453210198644414, + "grad_norm": 2.9056519481828955, + "kl": 1.257012939453125, + "learning_rate": 1.8084572216606422e-07, + "loss": 0.0503, + "reward": 1.1094948038109578, + "reward_std": 0.5337844159294036, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1259218716812029, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7416666800156235, + "step": 4280 + }, + { + "completion_length": 1024.0, + "epoch": 0.9464253668502643, + "grad_norm": 3.211780491107793, + "kl": 1.4444091796875, + "learning_rate": 1.736191858832048e-07, + "loss": 0.0578, + "reward": 1.1021898888982833, + "reward_std": 0.5546243787208368, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1436434510455001, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7833333473652602, + "step": 4285 + }, + { + "completion_length": 1024.0, + "epoch": 0.9475297138360873, + "grad_norm": 3.791982940822234, + "kl": 1.3896728515625, + "learning_rate": 1.665387295994747e-07, + "loss": 0.0556, + "reward": 1.0838238134048879, + "reward_std": 0.493443389685126, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13284285950794583, + "rewards/format_reward": 0.44375, + "rewards/reasoning_steps_reward": 0.7729166828095912, + "step": 4290 + }, + { + "completion_length": 1024.0, + "epoch": 0.9486340608219103, + "grad_norm": 3.5429912382327564, + "kl": 1.38509521484375, + "learning_rate": 1.5960445857367003e-07, + "loss": 0.0554, + "reward": 1.16405497957021, + "reward_std": 0.5455976179917343, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1505283652804792, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.7645833499729633, + "step": 4295 + }, + { + "completion_length": 1024.0, + "epoch": 0.9497384078077332, + "grad_norm": 3.754514777830821, + "kl": 1.62933349609375, + "learning_rate": 1.5281647589138527e-07, + "loss": 0.0652, + "reward": 1.1515638804994524, + "reward_std": 0.5102403333643452, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1526028042600956, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.7791666816920042, + "step": 4300 + }, + { + "epoch": 0.9497384078077332, + "eval_completion_length": 1024.0, + "eval_kl": 1.414345703125, + "eval_loss": 0.056856490671634674, + "eval_reward": 1.3021593242883682, + "eval_reward_std": 0.5029132961481809, + "eval_rewards/accuracy_reward": 0.03, + "eval_rewards/cosine_scaled_reward": -0.10784066822379827, + "eval_rewards/format_reward": 0.565, + "eval_rewards/reasoning_steps_reward": 0.8150000131130218, + "eval_runtime": 202.3543, + "eval_samples_per_second": 0.489, + "eval_steps_per_second": 0.124, + "step": 4300 + }, + { + "completion_length": 1024.0, + "epoch": 0.9508427547935562, + "grad_norm": 5.874297414860863, + "kl": 1.873388671875, + "learning_rate": 1.4617488246348012e-07, + "loss": 0.0749, + "reward": 1.091808697162196, + "reward_std": 0.4958706302659266, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15194130790732743, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7562500137835741, + "step": 4305 + }, + { + "completion_length": 1024.0, + "epoch": 0.9519471017793791, + "grad_norm": 3.022042116074618, + "kl": 1.3565673828125, + "learning_rate": 1.3967977702456946e-07, + "loss": 0.0542, + "reward": 1.2365093669854104, + "reward_std": 0.5081978341855574, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.14265730559855, + "rewards/format_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8041666803881526, + "step": 4310 + }, + { + "completion_length": 1024.0, + "epoch": 0.953051448765202, + "grad_norm": 2.89681170743707, + "kl": 1.37025146484375, + "learning_rate": 1.3333125613156695e-07, + "loss": 0.0548, + "reward": 1.167290734499693, + "reward_std": 0.5237989299959736, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14104260036001506, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.7583333503454923, + "step": 4315 + }, + { + "completion_length": 1024.0, + "epoch": 0.9541557957510249, + "grad_norm": 3.303852665082722, + "kl": 1.4704833984375, + "learning_rate": 1.271294141622459e-07, + "loss": 0.0588, + "reward": 1.1580770617350935, + "reward_std": 0.525305885047419, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13775627961731515, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7833333499729633, + "step": 4320 + }, + { + "completion_length": 1024.0, + "epoch": 0.9552601427368479, + "grad_norm": 154.75500513505, + "kl": 4.75120849609375, + "learning_rate": 1.2107434331383504e-07, + "loss": 0.1896, + "reward": 1.1435575605370105, + "reward_std": 0.5372929855715484, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14185910428859644, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.7791666820645332, + "step": 4325 + }, + { + "completion_length": 1024.0, + "epoch": 0.9563644897226709, + "grad_norm": 4.195260582471402, + "kl": 1.28109130859375, + "learning_rate": 1.1516613360164408e-07, + "loss": 0.0512, + "reward": 1.194129934720695, + "reward_std": 0.5264870421490742, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13503673870000057, + "rewards/format_reward": 0.53125, + "rewards/reasoning_steps_reward": 0.7854166809469462, + "step": 4330 + }, + { + "completion_length": 1024.0, + "epoch": 0.9574688367084938, + "grad_norm": 4.697501623828217, + "kl": 1.60849609375, + "learning_rate": 1.094048728577346e-07, + "loss": 0.0643, + "reward": 1.1921196983545088, + "reward_std": 0.46050074373306416, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14954696901404532, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.7854166820645332, + "step": 4335 + }, + { + "completion_length": 1024.0, + "epoch": 0.9585731836943168, + "grad_norm": 5.237784344319697, + "kl": 1.4283721923828125, + "learning_rate": 1.0379064672960793e-07, + "loss": 0.0571, + "reward": 1.1046239531598985, + "reward_std": 0.5052039703403353, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.13495938415580894, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.777083345875144, + "step": 4340 + }, + { + "completion_length": 1024.0, + "epoch": 0.9596775306801397, + "grad_norm": 3.8322720113450583, + "kl": 1.65595703125, + "learning_rate": 9.832353867893385e-08, + "loss": 0.0663, + "reward": 1.173404076125007, + "reward_std": 0.49049041836988183, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13284592391137268, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7875000132247806, + "step": 4345 + }, + { + "completion_length": 1024.0, + "epoch": 0.9607818776659627, + "grad_norm": 1.4873099565182297, + "kl": 1.376214599609375, + "learning_rate": 9.300362998030832e-08, + "loss": 0.0551, + "reward": 1.1576748417923226, + "reward_std": 0.43831962359836324, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13190850754035638, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.7833333479240536, + "step": 4350 + }, + { + "completion_length": 1024.0, + "epoch": 0.9618862246517856, + "grad_norm": 2.317212736464978, + "kl": 1.31639404296875, + "learning_rate": 8.783099972004882e-08, + "loss": 0.0526, + "reward": 1.2284595191478729, + "reward_std": 0.5590447532267717, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.09862381973216543, + "rewards/format_reward": 0.5, + "rewards/reasoning_steps_reward": 0.7895833496004343, + "step": 4355 + }, + { + "completion_length": 1024.0, + "epoch": 0.9629905716376085, + "grad_norm": 4.2578007685169394, + "kl": 1.872528076171875, + "learning_rate": 8.280572479501426e-08, + "loss": 0.0749, + "reward": 1.1292044205591083, + "reward_std": 0.5521556083040196, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.16662891303567448, + "rewards/format_reward": 0.525, + "rewards/reasoning_steps_reward": 0.7645833449438214, + "step": 4360 + }, + { + "completion_length": 1024.0, + "epoch": 0.9640949186234314, + "grad_norm": 2.917500800023099, + "kl": 1.26483154296875, + "learning_rate": 7.792787991146356e-08, + "loss": 0.0506, + "reward": 1.15064637940377, + "reward_std": 0.5034463145228074, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14935362476826414, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.8062500160187482, + "step": 4365 + }, + { + "completion_length": 1024.0, + "epoch": 0.9651992656092544, + "grad_norm": 4.463219264295833, + "kl": 1.64820556640625, + "learning_rate": 7.319753758394665e-08, + "loss": 0.066, + "reward": 1.1375942932441832, + "reward_std": 0.5617788247385761, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1478223788541982, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7666666842997074, + "step": 4370 + }, + { + "completion_length": 1024.0, + "epoch": 0.9663036125950774, + "grad_norm": 1.973179732100984, + "kl": 1.30853271484375, + "learning_rate": 6.861476813422419e-08, + "loss": 0.0523, + "reward": 1.1293663954362274, + "reward_std": 0.529233861564353, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12063361131085912, + "rewards/format_reward": 0.4625, + "rewards/reasoning_steps_reward": 0.7812500165775418, + "step": 4375 + }, + { + "completion_length": 1024.0, + "epoch": 0.9674079595809003, + "grad_norm": 9.289039619646017, + "kl": 1.227276611328125, + "learning_rate": 6.417963969022389e-08, + "loss": 0.0491, + "reward": 1.1480536976829172, + "reward_std": 0.553274897771189, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13527963738015386, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7645833507180214, + "step": 4380 + }, + { + "completion_length": 1024.0, + "epoch": 0.9685123065667233, + "grad_norm": 1.9727874950880977, + "kl": 1.340447998046875, + "learning_rate": 5.989221818502478e-08, + "loss": 0.0536, + "reward": 1.1438379530794918, + "reward_std": 0.5070553012817982, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.11657871988791157, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7416666815057397, + "step": 4385 + }, + { + "completion_length": 1024.0, + "epoch": 0.9696166535525462, + "grad_norm": 4.708920919089679, + "kl": 1.432037353515625, + "learning_rate": 5.5752567355883415e-08, + "loss": 0.0573, + "reward": 1.2146405932493507, + "reward_std": 0.5154525164925872, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1145260815290385, + "rewards/format_reward": 0.55, + "rewards/reasoning_steps_reward": 0.7791666824370622, + "step": 4390 + }, + { + "completion_length": 1024.0, + "epoch": 0.9707210005383692, + "grad_norm": 5.44416839852759, + "kl": 1.595782470703125, + "learning_rate": 5.176074874327919e-08, + "loss": 0.0639, + "reward": 1.1949712364934384, + "reward_std": 0.4661798856162932, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12377876958362322, + "rewards/format_reward": 0.55, + "rewards/reasoning_steps_reward": 0.7625000156462193, + "step": 4395 + }, + { + "completion_length": 1024.0, + "epoch": 0.9718253475241921, + "grad_norm": 1.4586617640539672, + "kl": 1.410101318359375, + "learning_rate": 4.791682169000056e-08, + "loss": 0.0564, + "reward": 1.189527632854879, + "reward_std": 0.5215653205494164, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13130570322609855, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7958333488553763, + "step": 4400 + }, + { + "epoch": 0.9718253475241921, + "eval_completion_length": 1024.0, + "eval_kl": 1.406845703125, + "eval_loss": 0.05658888816833496, + "eval_reward": 1.2143233251571655, + "eval_reward_std": 0.5284116192162037, + "eval_rewards/accuracy_reward": 0.005, + "eval_rewards/cosine_scaled_reward": -0.13401001332793386, + "eval_rewards/format_reward": 0.535, + "eval_rewards/reasoning_steps_reward": 0.8083333480358124, + "eval_runtime": 204.7668, + "eval_samples_per_second": 0.483, + "eval_steps_per_second": 0.122, + "step": 4400 + }, + { + "completion_length": 1024.0, + "epoch": 0.9729296945100151, + "grad_norm": 4.00824183633246, + "kl": 1.24869384765625, + "learning_rate": 4.4220843340269105e-08, + "loss": 0.0499, + "reward": 1.1324331050971523, + "reward_std": 0.48450650404556656, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1425668969808612, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.7875000163912773, + "step": 4405 + }, + { + "completion_length": 1024.0, + "epoch": 0.9740340414958379, + "grad_norm": 6.004531096378766, + "kl": 1.369793701171875, + "learning_rate": 4.067286863888131e-08, + "loss": 0.0548, + "reward": 1.1905505992472172, + "reward_std": 0.549637848045677, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14486607483704575, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.7979166818782687, + "step": 4410 + }, + { + "completion_length": 1024.0, + "epoch": 0.9751383884816609, + "grad_norm": 3.1816931594107856, + "kl": 1.212567138671875, + "learning_rate": 3.727295033040035e-08, + "loss": 0.0485, + "reward": 1.2453083097934723, + "reward_std": 0.4840336770255817, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1255250307280221, + "rewards/format_reward": 0.55, + "rewards/reasoning_steps_reward": 0.8083333456888795, + "step": 4415 + }, + { + "completion_length": 1024.0, + "epoch": 0.9762427354674839, + "grad_norm": 2.127607846360171, + "kl": 1.438409423828125, + "learning_rate": 3.402113895836445e-08, + "loss": 0.0575, + "reward": 1.1015230394899844, + "reward_std": 0.5222396170902357, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14014363521273482, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.754166679829359, + "step": 4420 + }, + { + "completion_length": 1024.0, + "epoch": 0.9773470824533068, + "grad_norm": 1.885777803810275, + "kl": 1.2944549560546874, + "learning_rate": 3.091748286453866e-08, + "loss": 0.0518, + "reward": 1.235387963615358, + "reward_std": 0.4532736351233325, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.15211204282531982, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.8375000137835741, + "step": 4425 + }, + { + "completion_length": 1024.0, + "epoch": 0.9784514294391298, + "grad_norm": 2.6855196178764076, + "kl": 1.15111083984375, + "learning_rate": 2.796202818819871e-08, + "loss": 0.0461, + "reward": 1.233874310180545, + "reward_std": 0.4524709703262488, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.1473756947714719, + "rewards/format_reward": 0.575, + "rewards/reasoning_steps_reward": 0.8062500150874257, + "step": 4430 + }, + { + "completion_length": 1024.0, + "epoch": 0.9795557764249527, + "grad_norm": 4.215344003298246, + "kl": 1.727777099609375, + "learning_rate": 2.5154818865440466e-08, + "loss": 0.0692, + "reward": 1.0581813110038638, + "reward_std": 0.47543474311573847, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14390203252357878, + "rewards/format_reward": 0.45, + "rewards/reasoning_steps_reward": 0.7520833509042859, + "step": 4435 + }, + { + "completion_length": 1024.0, + "epoch": 0.9806601234107757, + "grad_norm": 5.2448583567855565, + "kl": 1.29886474609375, + "learning_rate": 2.2495896628529355e-08, + "loss": 0.052, + "reward": 1.2382936247624456, + "reward_std": 0.5197429495237884, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13462304897766444, + "rewards/format_reward": 0.55625, + "rewards/reasoning_steps_reward": 0.8041666820645332, + "step": 4440 + }, + { + "completion_length": 1024.0, + "epoch": 0.9817644703965986, + "grad_norm": 2.0177248381920503, + "kl": 1.354461669921875, + "learning_rate": 1.9985301005280843e-08, + "loss": 0.0541, + "reward": 1.1819130264222621, + "reward_std": 0.4609925127326278, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.17017031446448527, + "rewards/format_reward": 0.54375, + "rewards/reasoning_steps_reward": 0.8083333512768149, + "step": 4445 + }, + { + "completion_length": 1024.0, + "epoch": 0.9828688173824216, + "grad_norm": 3.9907433219645108, + "kl": 1.760162353515625, + "learning_rate": 1.7623069318469797e-08, + "loss": 0.0704, + "reward": 1.1392354678362608, + "reward_std": 0.5574802142305998, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13784787233062162, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7520833496004343, + "step": 4450 + }, + { + "completion_length": 1024.0, + "epoch": 0.9839731643682446, + "grad_norm": 2.3118290114345497, + "kl": 1.65042724609375, + "learning_rate": 1.5409236685277608e-08, + "loss": 0.066, + "reward": 1.1408525642938911, + "reward_std": 0.548295050940942, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1403974376269616, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7625000149011611, + "step": 4455 + }, + { + "completion_length": 1024.0, + "epoch": 0.9850775113540674, + "grad_norm": 57.29691554401206, + "kl": 2.243560791015625, + "learning_rate": 1.3343836016772582e-08, + "loss": 0.0898, + "reward": 1.1517346784472466, + "reward_std": 0.47670648889688894, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13368199388496577, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.779166679829359, + "step": 4460 + }, + { + "completion_length": 1024.0, + "epoch": 0.9861818583398904, + "grad_norm": 5.2108404717142465, + "kl": 1.72296142578125, + "learning_rate": 1.1426898017412591e-08, + "loss": 0.0689, + "reward": 1.1585712847299874, + "reward_std": 0.5616339144078666, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1539287220512051, + "rewards/format_reward": 0.5125, + "rewards/reasoning_steps_reward": 0.7937500132247806, + "step": 4465 + }, + { + "completion_length": 1024.0, + "epoch": 0.9872862053257133, + "grad_norm": 4.200732044863656, + "kl": 2.089434814453125, + "learning_rate": 9.658451184600959e-09, + "loss": 0.0836, + "reward": 1.157062985189259, + "reward_std": 0.5377040596376901, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12210368756641402, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.797916679084301, + "step": 4470 + }, + { + "completion_length": 1024.0, + "epoch": 0.9883905523115363, + "grad_norm": 7.880275781288607, + "kl": 1.3374755859375, + "learning_rate": 8.038521808249045e-09, + "loss": 0.0535, + "reward": 1.1648781194817275, + "reward_std": 0.464659771242259, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.1309552209175308, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.7895833460614086, + "step": 4475 + }, + { + "completion_length": 1024.0, + "epoch": 0.9894948992973592, + "grad_norm": 3.1424315830543934, + "kl": 1.504827880859375, + "learning_rate": 6.567133970397654e-09, + "loss": 0.0602, + "reward": 1.1670374654233455, + "reward_std": 0.5484940701397136, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1412958735250868, + "rewards/format_reward": 0.51875, + "rewards/reasoning_steps_reward": 0.7833333479240536, + "step": 4480 + }, + { + "completion_length": 1024.0, + "epoch": 0.9905992462831822, + "grad_norm": 2.587303669883422, + "kl": 2.021490478515625, + "learning_rate": 5.2443095448506674e-09, + "loss": 0.0809, + "reward": 1.1091021137312054, + "reward_std": 0.5705063059137274, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.13881455489261044, + "rewards/format_reward": 0.49375, + "rewards/reasoning_steps_reward": 0.7479166816920042, + "step": 4485 + }, + { + "completion_length": 1024.0, + "epoch": 0.9917035932690051, + "grad_norm": 6.095524885050565, + "kl": 1.741949462890625, + "learning_rate": 4.070068196853072e-09, + "loss": 0.0697, + "reward": 1.156708344630897, + "reward_std": 0.4693000786108314, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14954165498202202, + "rewards/format_reward": 0.5375, + "rewards/reasoning_steps_reward": 0.7625000149011611, + "step": 4490 + }, + { + "completion_length": 1024.0, + "epoch": 0.9928079402548281, + "grad_norm": 2.396353477035366, + "kl": 1.70526123046875, + "learning_rate": 3.0444273828000857e-09, + "loss": 0.0682, + "reward": 1.0966801326721907, + "reward_std": 0.6420538909413154, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.14498654929120675, + "rewards/format_reward": 0.475, + "rewards/reasoning_steps_reward": 0.7604166805744171, + "step": 4495 + }, + { + "completion_length": 1024.0, + "epoch": 0.9939122872406511, + "grad_norm": 7.241557754267188, + "kl": 1.6732666015625, + "learning_rate": 2.167402349972925e-09, + "loss": 0.0669, + "reward": 1.077176136802882, + "reward_std": 0.5696488693953142, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.13740720303030685, + "rewards/format_reward": 0.45625, + "rewards/reasoning_steps_reward": 0.7458333460614085, + "step": 4500 + }, + { + "epoch": 0.9939122872406511, + "eval_completion_length": 1024.0, + "eval_kl": 1.605205078125, + "eval_loss": 0.06404020637273788, + "eval_reward": 1.1858943843841552, + "eval_reward_std": 0.5475942821498029, + "eval_rewards/accuracy_reward": 0.01, + "eval_rewards/cosine_scaled_reward": -0.13243895017309115, + "eval_rewards/format_reward": 0.515, + "eval_rewards/reasoning_steps_reward": 0.7933333450555802, + "eval_runtime": 227.8369, + "eval_samples_per_second": 0.435, + "eval_steps_per_second": 0.11, + "step": 4500 + }, + { + "completion_length": 1024.0, + "epoch": 0.9950166342264739, + "grad_norm": 2.4597613761069437, + "kl": 1.281756591796875, + "learning_rate": 1.4390061363189767e-09, + "loss": 0.0513, + "reward": 1.1734716016799212, + "reward_std": 0.4506149138222099, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12861173490746297, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7895833510905504, + "step": 4505 + }, + { + "completion_length": 1024.0, + "epoch": 0.9961209812122969, + "grad_norm": 7.245936093195075, + "kl": 1.83837890625, + "learning_rate": 8.592495702497427e-10, + "loss": 0.0735, + "reward": 1.1442578772082925, + "reward_std": 0.49708576006232763, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.14532546755945078, + "rewards/format_reward": 0.50625, + "rewards/reasoning_steps_reward": 0.7833333475515246, + "step": 4510 + }, + { + "completion_length": 1024.0, + "epoch": 0.9972253281981198, + "grad_norm": 4.855720269609806, + "kl": 1.75643310546875, + "learning_rate": 4.2814127048873553e-10, + "loss": 0.0703, + "reward": 1.1811686454340815, + "reward_std": 0.5081049962056567, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.11883136416436173, + "rewards/format_reward": 0.4875, + "rewards/reasoning_steps_reward": 0.8062500145286322, + "step": 4515 + }, + { + "completion_length": 1024.0, + "epoch": 0.9983296751839428, + "grad_norm": 9.240096677091826, + "kl": 1.80220947265625, + "learning_rate": 1.4568764593603235e-10, + "loss": 0.0721, + "reward": 1.1370168601162731, + "reward_std": 0.5179185615059396, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1338164767366834, + "rewards/format_reward": 0.48125, + "rewards/reasoning_steps_reward": 0.7833333497866988, + "step": 4520 + }, + { + "completion_length": 1024.0, + "epoch": 0.9994340221697657, + "grad_norm": 8.38153363176688, + "kl": 1.0825714111328124, + "learning_rate": 1.1892895576126606e-11, + "loss": 0.0433, + "reward": 1.2265190588310362, + "reward_std": 0.4433173163793981, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.15056427880190312, + "rewards/format_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8145833492279053, + "step": 4525 + }, + { + "completion_length": 1024.0, + "epoch": 0.9998757609640949, + "kl": 1.4818878173828125, + "reward": 1.2614081082865596, + "reward_std": 0.6296223533936427, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.12400856785438918, + "rewards/format_reward": 0.578125, + "rewards/reasoning_steps_reward": 0.8072916818782687, + "step": 4527, + "total_flos": 0.0, + "train_loss": 7996306.261529862, + "train_runtime": 280239.113, + "train_samples_per_second": 0.258, + "train_steps_per_second": 0.016 + } + ], + "logging_steps": 5, + "max_steps": 4527, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}