{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998757609640949, "eval_steps": 100, "global_step": 4527, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 407.66875, "epoch": 0.0011043469858229456, "grad_norm": 1.3796405034281456, "kl": 9.243488311767578e-05, "learning_rate": 2.2075055187637973e-07, "loss": 0.0, "reward": 0.7898407633416354, "reward_std": 0.5020190233539324, "rewards/accuracy_reward": 0.175, "rewards/cosine_scaled_reward": -0.08515923985396512, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.2250000026077032, "step": 5 }, { "completion_length": 403.64375, "epoch": 0.002208693971645891, "grad_norm": 2.5191525617733994, "kl": 0.00025566108524799347, "learning_rate": 4.4150110375275946e-07, "loss": 0.0, "reward": 0.5968398815020919, "reward_std": 0.5149940982650151, "rewards/accuracy_reward": 0.125, "rewards/cosine_scaled_reward": -0.1469101178692654, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.18750000279396772, "step": 10 }, { "completion_length": 385.18125, "epoch": 0.0033130409574688366, "grad_norm": 1.3862240766673408, "kl": 0.0002525851130485535, "learning_rate": 6.622516556291392e-07, "loss": 0.0, "reward": 0.5968430628068745, "reward_std": 0.5985216822278744, "rewards/accuracy_reward": 0.15625, "rewards/cosine_scaled_reward": -0.12399027492501773, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.17708333674818277, "step": 15 }, { "completion_length": 400.81875, "epoch": 0.004417387943291782, "grad_norm": 1.7133179531751626, "kl": 0.0004843875765800476, "learning_rate": 8.830022075055189e-07, "loss": 0.0, "reward": 0.64714968377084, "reward_std": 0.47575741023320006, "rewards/accuracy_reward": 0.11875, "rewards/cosine_scaled_reward": -0.1341003203036962, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.1687500037252903, "step": 20 }, { "completion_length": 440.61875, "epoch": 0.005521734929114728, "grad_norm": 1.3020436787261798, "kl": 0.0021125122904777525, "learning_rate": 1.1037527593818985e-06, "loss": 0.0001, "reward": 0.6775904539041221, "reward_std": 0.521566571767471, "rewards/accuracy_reward": 0.125, "rewards/cosine_scaled_reward": -0.1244928854459431, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.21458333693444728, "step": 25 }, { "completion_length": 358.81875, "epoch": 0.006626081914937673, "grad_norm": 2.383431595729729, "kl": 0.0027086704969406126, "learning_rate": 1.3245033112582784e-06, "loss": 0.0001, "reward": 0.6904986225068569, "reward_std": 0.5199786387616768, "rewards/accuracy_reward": 0.1, "rewards/cosine_scaled_reward": -0.11158471030648798, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.15833333544433117, "step": 30 }, { "completion_length": 312.475, "epoch": 0.007730428900760619, "grad_norm": 1.8980957623315589, "kl": 0.00699460506439209, "learning_rate": 1.545253863134658e-06, "loss": 0.0003, "reward": 0.8459697065874934, "reward_std": 0.49007231930154377, "rewards/accuracy_reward": 0.15625, "rewards/cosine_scaled_reward": -0.08736362864729016, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.10833333507180214, "step": 35 }, { "completion_length": 317.73125, "epoch": 0.008834775886583565, "grad_norm": 1.9533540981598545, "kl": 0.012215614318847656, "learning_rate": 1.7660044150110378e-06, "loss": 0.0005, "reward": 0.7698885165620595, "reward_std": 0.4332339205837343, "rewards/accuracy_reward": 0.09375, "rewards/cosine_scaled_reward": -0.13219481345731765, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.10833333525806665, "step": 40 }, { "completion_length": 223.4125, "epoch": 0.00993912287240651, "grad_norm": 2.2672189417696456, "kl": 0.019639110565185545, "learning_rate": 1.9867549668874175e-06, "loss": 0.0008, "reward": 0.797611591219902, "reward_std": 0.3280314706848003, "rewards/accuracy_reward": 0.06875, "rewards/cosine_scaled_reward": -0.13572174331638961, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.03958333414047956, "step": 45 }, { "completion_length": 276.6375, "epoch": 0.011043469858229456, "grad_norm": 2.365582947188283, "kl": 0.03506050109863281, "learning_rate": 2.207505518763797e-06, "loss": 0.0014, "reward": 0.9014913145452738, "reward_std": 0.4327640982926823, "rewards/accuracy_reward": 0.10625, "rewards/cosine_scaled_reward": -0.07350869019464881, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.08750000186264514, "step": 50 }, { "completion_length": 188.775, "epoch": 0.012147816844052401, "grad_norm": 3.410151636466289, "kl": 0.045893096923828126, "learning_rate": 2.4282560706401767e-06, "loss": 0.0018, "reward": 0.8858188761398196, "reward_std": 0.3367013673152542, "rewards/accuracy_reward": 0.05625, "rewards/cosine_scaled_reward": -0.09334779935888946, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.08541666828095913, "step": 55 }, { "completion_length": 156.75, "epoch": 0.013252163829875346, "grad_norm": 3.64480765148819, "kl": 0.07016792297363281, "learning_rate": 2.6490066225165567e-06, "loss": 0.0028, "reward": 0.8576473254710436, "reward_std": 0.26497103955189233, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.0631860019522719, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.058333334140479566, "step": 60 }, { "completion_length": 132.20625, "epoch": 0.014356510815698293, "grad_norm": 4.132077532204708, "kl": 0.1404022216796875, "learning_rate": 2.8697571743929364e-06, "loss": 0.0056, "reward": 0.9789868280291557, "reward_std": 0.31287062716583025, "rewards/accuracy_reward": 0.08125, "rewards/cosine_scaled_reward": -0.0022631677449680863, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.056250000931322576, "step": 65 }, { "completion_length": 111.775, "epoch": 0.015460857801521238, "grad_norm": 3.774544126186504, "kl": 0.45338897705078124, "learning_rate": 3.090507726269316e-06, "loss": 0.0182, "reward": 0.915225807391107, "reward_std": 0.27416729260148714, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.024357524031074718, "rewards/format_reward": 0.85625, "rewards/reasoning_steps_reward": 0.03958333432674408, "step": 70 }, { "completion_length": 90.04375, "epoch": 0.016565204787344183, "grad_norm": 4.161283635522782, "kl": 0.2106475830078125, "learning_rate": 3.311258278145696e-06, "loss": 0.0084, "reward": 0.9585830196738243, "reward_std": 0.292374527291031, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": 0.0002496805100236088, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.020833333767950536, "step": 75 }, { "completion_length": 153.38125, "epoch": 0.01766955177316713, "grad_norm": 3.3838750614323034, "kl": 0.21140708923339843, "learning_rate": 3.5320088300220757e-06, "loss": 0.0085, "reward": 0.8901195518672467, "reward_std": 0.3666377069861483, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": -0.016130454646190628, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.06250000093132257, "step": 80 }, { "completion_length": 108.075, "epoch": 0.018773898758990076, "grad_norm": 3.2335960830389885, "kl": 0.11846466064453125, "learning_rate": 3.752759381898455e-06, "loss": 0.0047, "reward": 1.0230381244793534, "reward_std": 0.3194487606411712, "rewards/accuracy_reward": 0.0875, "rewards/cosine_scaled_reward": 0.014704790979158134, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.04583333432674408, "step": 85 }, { "completion_length": 112.8875, "epoch": 0.01987824574481302, "grad_norm": 2.981290303972626, "kl": 0.14231414794921876, "learning_rate": 3.973509933774835e-06, "loss": 0.0057, "reward": 1.0851823196280748, "reward_std": 0.305687127640158, "rewards/accuracy_reward": 0.11875, "rewards/cosine_scaled_reward": 0.03934898309526034, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.05208333432674408, "step": 90 }, { "completion_length": 105.1125, "epoch": 0.020982592730635966, "grad_norm": 3.41324876981118, "kl": 0.1306793212890625, "learning_rate": 4.1942604856512145e-06, "loss": 0.0052, "reward": 1.0669921234250068, "reward_std": 0.31615011730809783, "rewards/accuracy_reward": 0.08125, "rewards/cosine_scaled_reward": -0.008007883114623837, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.06875000167638064, "step": 95 }, { "completion_length": 96.00625, "epoch": 0.022086939716458913, "grad_norm": 3.3620993390968845, "kl": 0.1841644287109375, "learning_rate": 4.415011037527594e-06, "loss": 0.0074, "reward": 1.043765520118177, "reward_std": 0.25984670983216346, "rewards/accuracy_reward": 0.05625, "rewards/cosine_scaled_reward": -0.01456781585002318, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.07083333544433117, "step": 100 }, { "epoch": 0.022086939716458913, "eval_completion_length": 93.125, "eval_kl": 0.1815234375, "eval_loss": 0.007249877788126469, "eval_reward": 1.1368126523494722, "eval_reward_std": 0.30647377895191313, "eval_rewards/accuracy_reward": 0.08, "eval_rewards/cosine_scaled_reward": 0.008479312220588326, "eval_rewards/format_reward": 0.945, "eval_rewards/reasoning_steps_reward": 0.10333333551883697, "eval_runtime": 45.45, "eval_samples_per_second": 2.178, "eval_steps_per_second": 0.55, "step": 100 }, { "completion_length": 119.2375, "epoch": 0.023191286702281856, "grad_norm": 2.8049680767356135, "kl": 0.1911853790283203, "learning_rate": 4.635761589403974e-06, "loss": 0.0076, "reward": 1.1845005745068193, "reward_std": 0.40934712939670137, "rewards/accuracy_reward": 0.0875, "rewards/cosine_scaled_reward": 0.005333909482578747, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.1854166705161333, "step": 105 }, { "completion_length": 82.05625, "epoch": 0.024295633688104803, "grad_norm": 3.456560596923405, "kl": 0.2972686767578125, "learning_rate": 4.856512141280353e-06, "loss": 0.0119, "reward": 1.2734145127236842, "reward_std": 0.2678998214521016, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.009918814865523018, "rewards/format_reward": 0.95625, "rewards/reasoning_steps_reward": 0.28333334121853115, "step": 110 }, { "completion_length": 54.4625, "epoch": 0.02539998067392775, "grad_norm": 3.5364458340049385, "kl": 0.471929931640625, "learning_rate": 5.077262693156734e-06, "loss": 0.0189, "reward": 1.3385284006595612, "reward_std": 0.18102660190961614, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": 0.003111723146867007, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.3229166755452752, "step": 115 }, { "completion_length": 54.4125, "epoch": 0.026504327659750693, "grad_norm": 4.342426566312151, "kl": 0.61019287109375, "learning_rate": 5.2980132450331135e-06, "loss": 0.0244, "reward": 1.3336276397109033, "reward_std": 0.20837920447120267, "rewards/accuracy_reward": 0.05625, "rewards/cosine_scaled_reward": 0.021127634699223564, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.3125000087544322, "step": 120 }, { "completion_length": 64.14375, "epoch": 0.02760867464557364, "grad_norm": 3.210503848172606, "kl": 0.5955078125, "learning_rate": 5.518763796909493e-06, "loss": 0.0238, "reward": 1.3687469862401485, "reward_std": 0.26018318940004975, "rewards/accuracy_reward": 0.0625, "rewards/cosine_scaled_reward": 0.016663649416295813, "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.3520833417773247, "step": 125 }, { "completion_length": 64.01875, "epoch": 0.028713021631396586, "grad_norm": 3.7132153189677544, "kl": 0.568804931640625, "learning_rate": 5.739514348785873e-06, "loss": 0.0228, "reward": 1.2947323210537434, "reward_std": 0.24608583817171165, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.007351013895822689, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.3708333432674408, "step": 130 }, { "completion_length": 48.625, "epoch": 0.02981736861721953, "grad_norm": 2.6304505181783546, "kl": 0.53724365234375, "learning_rate": 5.960264900662252e-06, "loss": 0.0215, "reward": 1.3914526164531709, "reward_std": 0.12061821918068745, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.006464047048939392, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.3791666755452752, "step": 135 }, { "completion_length": 67.4, "epoch": 0.030921715603042476, "grad_norm": 3.1538444400935526, "kl": 0.4556884765625, "learning_rate": 6.181015452538632e-06, "loss": 0.0182, "reward": 1.4109491214156151, "reward_std": 0.16968962437640583, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.01821754773845896, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.4291666761040688, "step": 140 }, { "completion_length": 97.26875, "epoch": 0.03202606258886542, "grad_norm": 2.5494271727394686, "kl": 0.479034423828125, "learning_rate": 6.4017660044150125e-06, "loss": 0.0192, "reward": 1.5318490117788315, "reward_std": 0.27640023065378044, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.04731766675249673, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.5916666815057396, "step": 145 }, { "completion_length": 211.43125, "epoch": 0.033130409574688366, "grad_norm": 1.5190327305356153, "kl": 0.474810791015625, "learning_rate": 6.622516556291392e-06, "loss": 0.019, "reward": 1.6260522678494453, "reward_std": 0.23128953371724492, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.1364477440190967, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.7875000171363353, "step": 150 }, { "completion_length": 414.16875, "epoch": 0.034234756560511316, "grad_norm": 1.6713240655364676, "kl": 0.221484375, "learning_rate": 6.843267108167772e-06, "loss": 0.0089, "reward": 1.6096204966306686, "reward_std": 0.5072136571725423, "rewards/accuracy_reward": 0.0875, "rewards/cosine_scaled_reward": -0.17579616815783083, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.9041666768491268, "step": 155 }, { "completion_length": 224.45, "epoch": 0.03533910354633426, "grad_norm": 1.6799839469624172, "kl": 0.284002685546875, "learning_rate": 7.064017660044151e-06, "loss": 0.0114, "reward": 1.6045206032693387, "reward_std": 0.41629061991989147, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.16422939775511622, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.8812500163912773, "step": 160 }, { "completion_length": 143.5375, "epoch": 0.0364434505321572, "grad_norm": 2.749038664091265, "kl": 0.40633544921875, "learning_rate": 7.28476821192053e-06, "loss": 0.0163, "reward": 1.6239365682005882, "reward_std": 0.30945953201444354, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14481342723593116, "rewards/format_reward": 0.85625, "rewards/reasoning_steps_reward": 0.9062500089406967, "step": 165 }, { "completion_length": 79.20625, "epoch": 0.03754779751798015, "grad_norm": 1.9950235960081237, "kl": 0.58839111328125, "learning_rate": 7.50551876379691e-06, "loss": 0.0235, "reward": 1.7936675041913985, "reward_std": 0.24646314584424545, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.047999159130267796, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.8729166803881526, "step": 170 }, { "completion_length": 72.99375, "epoch": 0.038652144503803096, "grad_norm": 3.1334250495896403, "kl": 0.7208740234375, "learning_rate": 7.726269315673288e-06, "loss": 0.0288, "reward": 1.8457185290753841, "reward_std": 0.1848987685256361, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.058448143280111255, "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.9541666734963655, "step": 175 }, { "completion_length": 68.33125, "epoch": 0.03975649148962604, "grad_norm": 1.8399927140885233, "kl": 0.9168701171875, "learning_rate": 7.94701986754967e-06, "loss": 0.0367, "reward": 1.8285610511898995, "reward_std": 0.1804211751697949, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06310561551945285, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.9854166701436042, "step": 180 }, { "completion_length": 62.3625, "epoch": 0.04086083847544899, "grad_norm": 1.7243816581634572, "kl": 0.999951171875, "learning_rate": 8.16777041942605e-06, "loss": 0.04, "reward": 1.7962127968668937, "reward_std": 0.2584745195626965, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.03503720639273524, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.9687500029802323, "step": 185 }, { "completion_length": 43.825, "epoch": 0.04196518546127193, "grad_norm": 1.9090704424835, "kl": 1.1904296875, "learning_rate": 8.388520971302429e-06, "loss": 0.0476, "reward": 1.9285470694303513, "reward_std": 0.07938597783086151, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.019369596161413937, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 0.9729166701436043, "step": 190 }, { "completion_length": 43.8, "epoch": 0.043069532447094876, "grad_norm": 0.8384814834445764, "kl": 1.274560546875, "learning_rate": 8.609271523178809e-06, "loss": 0.051, "reward": 1.9545519351959229, "reward_std": 0.07418545563377847, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.018364711850881576, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.9729166686534881, "step": 195 }, { "completion_length": 48.35625, "epoch": 0.044173879432917826, "grad_norm": 2.236940540903145, "kl": 1.300146484375, "learning_rate": 8.830022075055188e-06, "loss": 0.052, "reward": 1.9158101230859756, "reward_std": 0.07812084475554001, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03210651960689574, "rewards/format_reward": 1.0, "rewards/reasoning_steps_reward": 0.9479166716337204, "step": 200 }, { "epoch": 0.044173879432917826, "eval_completion_length": 71.615, "eval_kl": 1.23375, "eval_loss": 0.04937754571437836, "eval_reward": 1.8474071335792541, "eval_reward_std": 0.18131415246985852, "eval_rewards/accuracy_reward": 0.01, "eval_rewards/cosine_scaled_reward": -0.040926196351647376, "eval_rewards/format_reward": 0.97, "eval_rewards/reasoning_steps_reward": 0.9083333444595337, "eval_runtime": 36.3868, "eval_samples_per_second": 2.721, "eval_steps_per_second": 0.687, "step": 200 }, { "completion_length": 74.2, "epoch": 0.04527822641874077, "grad_norm": 1.1539830590686253, "kl": 1.27861328125, "learning_rate": 9.050772626931568e-06, "loss": 0.0511, "reward": 1.868411859869957, "reward_std": 0.13951143043577757, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.04408814936177805, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.9250000035390258, "step": 205 }, { "completion_length": 38.5375, "epoch": 0.04638257340456371, "grad_norm": 2.3094120876632154, "kl": 2.0333984375, "learning_rate": 9.271523178807948e-06, "loss": 0.0813, "reward": 1.9079508751630783, "reward_std": 0.14040698215430894, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.014965799322817474, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9666666693985462, "step": 210 }, { "completion_length": 36.73125, "epoch": 0.04748692039038666, "grad_norm": 1.9626719196766829, "kl": 1.74697265625, "learning_rate": 9.492273730684327e-06, "loss": 0.0699, "reward": 1.824728435277939, "reward_std": 0.20264510232354951, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.019021555897779763, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9125000078231096, "step": 215 }, { "completion_length": 46.425, "epoch": 0.048591267376209606, "grad_norm": 2.344462570648517, "kl": 1.8123046875, "learning_rate": 9.713024282560707e-06, "loss": 0.0725, "reward": 1.7400818899273873, "reward_std": 0.3166843029595839, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.02450146197807044, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9208333410322667, "step": 220 }, { "completion_length": 36.45625, "epoch": 0.04969561436203255, "grad_norm": 1.6108117585836246, "kl": 1.7595703125, "learning_rate": 9.933774834437086e-06, "loss": 0.0704, "reward": 1.7901365123689175, "reward_std": 0.22408495279232737, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.016113495687022805, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.8312500113621354, "step": 225 }, { "completion_length": 36.65, "epoch": 0.0507999613478555, "grad_norm": 3.236305451533758, "kl": 1.82919921875, "learning_rate": 1.0154525386313468e-05, "loss": 0.0732, "reward": 1.8323896206915378, "reward_std": 0.21802660732319054, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.017610373883508147, "rewards/format_reward": 0.91875, "rewards/reasoning_steps_reward": 0.9312500074505806, "step": 230 }, { "completion_length": 45.64375, "epoch": 0.05190430833367844, "grad_norm": 1.6334788003351493, "kl": 1.7169921875, "learning_rate": 1.0375275938189846e-05, "loss": 0.0687, "reward": 1.8946375951170922, "reward_std": 0.11254046880385431, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.015779063804075122, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9854166671633721, "step": 235 }, { "completion_length": 63.425, "epoch": 0.053008655319501385, "grad_norm": 1.7113105745171684, "kl": 1.695556640625, "learning_rate": 1.0596026490066227e-05, "loss": 0.0678, "reward": 1.9150223009288312, "reward_std": 0.10123766471187992, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01831102555152029, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.9895833343267441, "step": 240 }, { "completion_length": 93.68125, "epoch": 0.054113002305324336, "grad_norm": 0.4404451775440966, "kl": 1.57080078125, "learning_rate": 1.0816777041942605e-05, "loss": 0.0628, "reward": 1.8356807470321654, "reward_std": 0.2136607704902417, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03931924531934783, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.9687500029802323, "step": 245 }, { "completion_length": 56.9125, "epoch": 0.05521734929114728, "grad_norm": 4.754826808249897, "kl": 2.864794921875, "learning_rate": 1.1037527593818986e-05, "loss": 0.1146, "reward": 1.9538603499531746, "reward_std": 0.028102499651504333, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01488963805604726, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.993750000745058, "step": 250 }, { "completion_length": 85.1875, "epoch": 0.05632169627697022, "grad_norm": 1.9425038218319501, "kl": 2.82373046875, "learning_rate": 1.1258278145695364e-05, "loss": 0.1127, "reward": 1.6320011641830205, "reward_std": 0.3420275276679604, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.040915494039654735, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.8229166699573398, "step": 255 }, { "completion_length": 51.23125, "epoch": 0.05742604326279317, "grad_norm": 1.6999510907770932, "kl": 1.645703125, "learning_rate": 1.1479028697571745e-05, "loss": 0.0658, "reward": 1.874811889231205, "reward_std": 0.12289988842621824, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02310477099381387, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 0.9291666693985462, "step": 260 }, { "completion_length": 150.65, "epoch": 0.058530390248616115, "grad_norm": 1.442258073406476, "kl": 1.566796875, "learning_rate": 1.1699779249448125e-05, "loss": 0.0627, "reward": 1.7884906940162182, "reward_std": 0.22153634292044444, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.019842630508355795, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9333333425223828, "step": 265 }, { "completion_length": 115.4, "epoch": 0.05963473723443906, "grad_norm": 0.6973653373093713, "kl": 481691.4131835938, "learning_rate": 1.1920529801324505e-05, "loss": 19216.3781, "reward": 1.8304102931171655, "reward_std": 0.1441515963528218, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.021673032961552964, "rewards/format_reward": 0.9, "rewards/reasoning_steps_reward": 0.9520833369344472, "step": 270 }, { "completion_length": 78.175, "epoch": 0.06073908422026201, "grad_norm": 0.7996143804584955, "kl": 1.68466796875, "learning_rate": 1.2141280353200884e-05, "loss": 0.0674, "reward": 1.9026772230863571, "reward_std": 0.1005777153201052, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.022322767600417136, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9937500014901162, "step": 275 }, { "completion_length": 94.39375, "epoch": 0.06184343120608495, "grad_norm": 0.46301803821702214, "kl": 1.720703125, "learning_rate": 1.2362030905077264e-05, "loss": 0.0688, "reward": 1.909028697013855, "reward_std": 0.1098573448281968, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02013796616811305, "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.9916666686534882, "step": 280 }, { "completion_length": 35.9875, "epoch": 0.0629477781919079, "grad_norm": 25.392403138510254, "kl": 6.09052734375, "learning_rate": 1.2582781456953644e-05, "loss": 0.2435, "reward": 1.9458073616027831, "reward_std": 0.05808448990646866, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01877596784615889, "rewards/format_reward": 0.9875, "rewards/reasoning_steps_reward": 0.9770833350718021, "step": 285 }, { "completion_length": 44.14375, "epoch": 0.06405212517773085, "grad_norm": 5.742884239841623, "kl": 1.8162109375, "learning_rate": 1.2803532008830025e-05, "loss": 0.0727, "reward": 1.942259357869625, "reward_std": 0.051475054543698204, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02232399402419105, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.9895833343267441, "step": 290 }, { "completion_length": 70.83125, "epoch": 0.06515647216355379, "grad_norm": 1.1878621439792336, "kl": 1.75224609375, "learning_rate": 1.3024282560706403e-05, "loss": 0.0701, "reward": 1.8855205789208411, "reward_std": 0.14350247889015008, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0311461063567549, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9854166686534882, "step": 295 }, { "completion_length": 59.16875, "epoch": 0.06626081914937673, "grad_norm": 1.0737008694745849, "kl": 2.21201171875, "learning_rate": 1.3245033112582784e-05, "loss": 0.0886, "reward": 1.8838034845888614, "reward_std": 0.10267033483396518, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.032863204437308016, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.991666667163372, "step": 300 }, { "epoch": 0.06626081914937673, "eval_completion_length": 37.235, "eval_kl": 1.8078125, "eval_loss": 0.07229267060756683, "eval_reward": 1.9764094233512879, "eval_reward_std": 0.015016918147157412, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.013590602073818445, "eval_rewards/format_reward": 0.99, "eval_rewards/reasoning_steps_reward": 1.0, "eval_runtime": 19.7868, "eval_samples_per_second": 5.003, "eval_steps_per_second": 1.263, "step": 300 }, { "completion_length": 112.075, "epoch": 0.06736516613519968, "grad_norm": 38.86453958076354, "kl": 116.337109375, "learning_rate": 1.3465783664459162e-05, "loss": 4.6466, "reward": 1.7255545184016228, "reward_std": 0.3258556753062294, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05569549936335534, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.9750000014901161, "step": 305 }, { "completion_length": 91.34375, "epoch": 0.06846951312102263, "grad_norm": 0.7032767211156884, "kl": 2.12666015625, "learning_rate": 1.3686534216335543e-05, "loss": 0.0851, "reward": 1.8290522865951062, "reward_std": 0.19914081794595404, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03136437909561209, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9854166671633721, "step": 310 }, { "completion_length": 35.0125, "epoch": 0.06957386010684558, "grad_norm": 0.1219839633736872, "kl": 3.073046875, "learning_rate": 1.3907284768211921e-05, "loss": 0.1228, "reward": 1.9570835500955581, "reward_std": 0.04304394810787926, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.017916415364015847, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 1.0, "step": 315 }, { "completion_length": 32.275, "epoch": 0.07067820709266852, "grad_norm": 1.104264851050662, "kl": 3.7556640625, "learning_rate": 1.4128035320088303e-05, "loss": 0.1502, "reward": 1.968562737107277, "reward_std": 0.026956520293606446, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012687235441990196, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 1.0, "step": 320 }, { "completion_length": 132.08125, "epoch": 0.07178255407849146, "grad_norm": 68555.41271806235, "kl": 1126.325, "learning_rate": 1.434878587196468e-05, "loss": 44.9841, "reward": 1.8718097068369388, "reward_std": 0.05580442189639143, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.015690282598370686, "rewards/format_reward": 0.8875, "rewards/reasoning_steps_reward": 1.0, "step": 325 }, { "completion_length": 213.45625, "epoch": 0.0728869010643144, "grad_norm": 3.503371973098148, "kl": 44.011328125, "learning_rate": 1.456953642384106e-05, "loss": 1.757, "reward": 1.7566166341304779, "reward_std": 0.21126712449513435, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03296670269337483, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.995833334326744, "step": 330 }, { "completion_length": 68.475, "epoch": 0.07399124805013735, "grad_norm": 1.079221435802786, "kl": 9.17109375, "learning_rate": 1.479028697571744e-05, "loss": 0.368, "reward": 1.8921391651034356, "reward_std": 0.1115680442419034, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.022444166260538623, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9895833350718022, "step": 335 }, { "completion_length": 35.54375, "epoch": 0.0750955950359603, "grad_norm": 0.3296732185602028, "kl": 1.96572265625, "learning_rate": 1.501103752759382e-05, "loss": 0.0786, "reward": 1.9763855755329132, "reward_std": 0.015541732574638445, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.013197748665697872, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.995833334326744, "step": 340 }, { "completion_length": 38.39375, "epoch": 0.07619994202178325, "grad_norm": 0.804238046719128, "kl": 2.0283203125, "learning_rate": 1.52317880794702e-05, "loss": 0.0812, "reward": 1.9684513732790947, "reward_std": 0.026967421242807176, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012798593926709146, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.9875000007450581, "step": 345 }, { "completion_length": 48.86875, "epoch": 0.07730428900760619, "grad_norm": 1.9342787529877643, "kl": 2.15478515625, "learning_rate": 1.5452538631346577e-05, "loss": 0.0862, "reward": 1.915003441274166, "reward_std": 0.10254102655635507, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.024579859105870128, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.995833334326744, "step": 350 }, { "completion_length": 238.325, "epoch": 0.07840863599342913, "grad_norm": 7.756150900770026, "kl": 4.5904296875, "learning_rate": 1.567328918322296e-05, "loss": 0.1833, "reward": 1.5095329130068422, "reward_std": 0.4169476901159214, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06338373832768411, "rewards/format_reward": 0.69375, "rewards/reasoning_steps_reward": 0.8791666779667139, "step": 355 }, { "completion_length": 147.71875, "epoch": 0.07951298297925208, "grad_norm": 0.1004225405142809, "kl": 2.135546875, "learning_rate": 1.589403973509934e-05, "loss": 0.0854, "reward": 1.7320448141545057, "reward_std": 0.2472891275290749, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04920516336569562, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.9375000055879354, "step": 360 }, { "completion_length": 73.8125, "epoch": 0.08061732996507502, "grad_norm": 3.8314243673640505, "kl": 2.2732421875, "learning_rate": 1.6114790286975718e-05, "loss": 0.0909, "reward": 1.858565354347229, "reward_std": 0.1464589938717836, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03518462204374373, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9687500059604645, "step": 365 }, { "completion_length": 46.4625, "epoch": 0.08172167695089798, "grad_norm": 1.148009265152921, "kl": 2.7, "learning_rate": 1.63355408388521e-05, "loss": 0.108, "reward": 1.935795644670725, "reward_std": 0.07739904040663532, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.020454342185985297, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 0.9875000014901161, "step": 370 }, { "completion_length": 28.63125, "epoch": 0.08282602393672092, "grad_norm": 0.018613804390782206, "kl": 2.887109375, "learning_rate": 1.6556291390728477e-05, "loss": 0.1155, "reward": 1.9873796686530114, "reward_std": 0.005896346227382309, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.008453596872277558, "rewards/format_reward": 1.0, "rewards/reasoning_steps_reward": 0.995833333581686, "step": 375 }, { "completion_length": 28.8375, "epoch": 0.08393037092254386, "grad_norm": 1.6371785454339474, "kl": 2.7423828125, "learning_rate": 1.6777041942604858e-05, "loss": 0.1097, "reward": 1.9645642668008805, "reward_std": 0.020640570133218718, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.008352343272417784, "rewards/format_reward": 1.0, "rewards/reasoning_steps_reward": 0.9729166701436043, "step": 380 }, { "completion_length": 27.5, "epoch": 0.08503471790836681, "grad_norm": 2.9035413740067773, "kl": 2.6583984375, "learning_rate": 1.699779249448124e-05, "loss": 0.1063, "reward": 1.9090838402509689, "reward_std": 0.04726645071750681, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.007582775689661503, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.9229166693985462, "step": 385 }, { "completion_length": 25.5, "epoch": 0.08613906489418975, "grad_norm": 10.067463916774571, "kl": 2.821484375, "learning_rate": 1.7218543046357617e-05, "loss": 0.1128, "reward": 1.4473052226414438, "reward_std": 0.2710021964079715, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.006861429521813988, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.6416666749864817, "step": 390 }, { "completion_length": 25.33125, "epoch": 0.0872434118800127, "grad_norm": 1.1120951826184378, "kl": 2.9173828125, "learning_rate": 1.7439293598234e-05, "loss": 0.1167, "reward": 1.6058874435722827, "reward_std": 0.04730093894213496, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.006612581602530554, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.6312500178813935, "step": 395 }, { "completion_length": 260.55625, "epoch": 0.08834775886583565, "grad_norm": 18.757375828870764, "kl": 2208.165234375, "learning_rate": 1.7660044150110377e-05, "loss": 88.441, "reward": 1.2055384639650584, "reward_std": 0.23063606465893827, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09237821163842455, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.78541667945683, "step": 400 }, { "epoch": 0.08834775886583565, "eval_completion_length": 217.92, "eval_kl": 87.934375, "eval_loss": 3.5442821979522705, "eval_reward": 1.2175334417819976, "eval_reward_std": 0.365319043637719, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.07746655503287911, "eval_rewards/format_reward": 0.455, "eval_rewards/reasoning_steps_reward": 0.8400000214576722, "eval_runtime": 99.5826, "eval_samples_per_second": 0.994, "eval_steps_per_second": 0.251, "step": 400 }, { "completion_length": 625.7875, "epoch": 0.0894521058516586, "grad_norm": 19.106502016916846, "kl": 12.63349609375, "learning_rate": 1.7880794701986758e-05, "loss": 0.5055, "reward": 0.651996704749763, "reward_std": 0.4102951940265484, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21883663166663608, "rewards/format_reward": 0.08125, "rewards/reasoning_steps_reward": 0.7895833376795054, "step": 405 }, { "completion_length": 161.35, "epoch": 0.09055645283748154, "grad_norm": 0.8426631626860688, "kl": 2.6265625, "learning_rate": 1.8101545253863136e-05, "loss": 0.1051, "reward": 1.346696252003312, "reward_std": 0.32623073370778, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08663710475666449, "rewards/format_reward": 0.68125, "rewards/reasoning_steps_reward": 0.7520833492279053, "step": 410 }, { "completion_length": 27.78125, "epoch": 0.09166079982330448, "grad_norm": 2.026191681702149, "kl": 2.80498046875, "learning_rate": 1.8322295805739517e-05, "loss": 0.1122, "reward": 1.942487709224224, "reward_std": 0.02961592679930618, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0075122533366084095, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.9562500052154064, "step": 415 }, { "completion_length": 23.975, "epoch": 0.09276514680912742, "grad_norm": 0.31172284735643946, "kl": 4.0859375, "learning_rate": 1.8543046357615895e-05, "loss": 0.1634, "reward": 1.5779291547834873, "reward_std": 0.22977396088535898, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.005404202520730905, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.8833333443850279, "step": 420 }, { "completion_length": 26.6, "epoch": 0.09386949379495037, "grad_norm": 10734.08122919461, "kl": 104.6666015625, "learning_rate": 1.8763796909492276e-05, "loss": 4.1821, "reward": 1.5809004239737987, "reward_std": 0.1068729208822333, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0065996065095532686, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 1.0, "step": 425 }, { "completion_length": 83.06875, "epoch": 0.09497384078077332, "grad_norm": 123.88060508402012, "kl": 6.474365234375, "learning_rate": 1.8984547461368654e-05, "loss": 0.2591, "reward": 1.6652553364634515, "reward_std": 0.282259418636113, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07224468101048842, "rewards/format_reward": 0.7375, "rewards/reasoning_steps_reward": 1.0, "step": 430 }, { "completion_length": 106.91875, "epoch": 0.09607818776659627, "grad_norm": 0.940290808549546, "kl": 842.04267578125, "learning_rate": 1.9205298013245036e-05, "loss": 33.6922, "reward": 1.4902156308293342, "reward_std": 0.16858022491724114, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12228436931036413, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 1.0, "step": 435 }, { "completion_length": 31.5625, "epoch": 0.09718253475241921, "grad_norm": 15.254329720774644, "kl": 28.6966796875, "learning_rate": 1.9426048565121414e-05, "loss": 1.1475, "reward": 1.6264034517109394, "reward_std": 0.31542411341342813, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.013179889318416826, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.8833333440124989, "step": 440 }, { "completion_length": 36.19375, "epoch": 0.09828688173824215, "grad_norm": 1.7786785678688244, "kl": 3.790625, "learning_rate": 1.9646799116997795e-05, "loss": 0.1516, "reward": 1.6109677419066428, "reward_std": 0.3635523966368055, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01611559497541748, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.864583345502615, "step": 445 }, { "completion_length": 28.9125, "epoch": 0.0993912287240651, "grad_norm": 13.049933334291598, "kl": 4.665234375, "learning_rate": 1.9867549668874173e-05, "loss": 0.1867, "reward": 1.9556236922740937, "reward_std": 0.05575240566577122, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0068762671202421185, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 1.0, "step": 450 }, { "completion_length": 48.9, "epoch": 0.10049557570988804, "grad_norm": 12.750419334393301, "kl": 3.6001953125, "learning_rate": 1.9999988107104428e-05, "loss": 0.144, "reward": 1.8631265118718148, "reward_std": 0.1681165755485381, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.016040142456768082, "rewards/format_reward": 0.8875, "rewards/reasoning_steps_reward": 0.99166666790843, "step": 455 }, { "completion_length": 99.2625, "epoch": 0.101599922695711, "grad_norm": 3.6354989719273636, "kl": 149.4564453125, "learning_rate": 1.9999854312354064e-05, "loss": 5.9617, "reward": 1.8252798458561301, "reward_std": 0.17444125837337196, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01847012363432441, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9687500018626451, "step": 460 }, { "completion_length": 135.56875, "epoch": 0.10270426968153394, "grad_norm": 5.2304357492626625, "kl": 4.8642578125, "learning_rate": 1.999957185872951e-05, "loss": 0.1946, "reward": 1.7171886287629605, "reward_std": 0.3744886555035009, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02239466998144053, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.9395833402872086, "step": 465 }, { "completion_length": 78.5125, "epoch": 0.10380861666735688, "grad_norm": 91.64606863523989, "kl": 70.5404296875, "learning_rate": 1.999914075042975e-05, "loss": 2.8153, "reward": 1.6547522293403745, "reward_std": 0.2920411152263114, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.055664407834410665, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.9291666738688946, "step": 470 }, { "completion_length": 64.2375, "epoch": 0.10491296365317983, "grad_norm": 48.311980555436286, "kl": 4.6033203125, "learning_rate": 1.9998560993863682e-05, "loss": 0.1843, "reward": 1.6796498108655213, "reward_std": 0.2776311224031815, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04535014413995668, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.9437500044703484, "step": 475 }, { "completion_length": 36.9, "epoch": 0.10601731063900277, "grad_norm": 5.921542778461717, "kl": 4.1017578125, "learning_rate": 1.999783259765003e-05, "loss": 0.164, "reward": 1.7877669408917427, "reward_std": 0.2045195282747045, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.030983004180598073, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.9375000033527613, "step": 480 }, { "completion_length": 152.48125, "epoch": 0.10712165762482571, "grad_norm": 20.142481995600914, "kl": 5.8130859375, "learning_rate": 1.9996955572617202e-05, "loss": 0.2326, "reward": 1.4146922817453742, "reward_std": 0.396951551019356, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08530769811477512, "rewards/format_reward": 0.6375, "rewards/reasoning_steps_reward": 0.8625000085681677, "step": 485 }, { "completion_length": 239.51875, "epoch": 0.10822600461064867, "grad_norm": 139.2698059408305, "kl": 15.3798828125, "learning_rate": 1.999592993180315e-05, "loss": 0.6159, "reward": 1.1587502604350448, "reward_std": 0.6047080957883736, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12041640711249783, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.760416679084301, "step": 490 }, { "completion_length": 110.30625, "epoch": 0.10933035159647161, "grad_norm": 3.7802819605372977, "kl": 2.846484375, "learning_rate": 1.9994755690455154e-05, "loss": 0.1139, "reward": 1.3858223337680102, "reward_std": 0.4451230512106122, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08501100512221456, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.758333345130086, "step": 495 }, { "completion_length": 419.45, "epoch": 0.11043469858229456, "grad_norm": 1.4189717430862432, "kl": 1.2078369140625, "learning_rate": 1.9993432866029604e-05, "loss": 0.0483, "reward": 1.3895617920905352, "reward_std": 0.3321596068039071, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22502154029498342, "rewards/format_reward": 0.63125, "rewards/reasoning_steps_reward": 0.9833333343267441, "step": 500 }, { "epoch": 0.11043469858229456, "eval_completion_length": 66.62, "eval_kl": 2.2115625, "eval_loss": 0.08852547407150269, "eval_reward": 1.8721801257133484, "eval_reward_std": 0.14555715662660076, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.041153202150017026, "eval_rewards/format_reward": 0.915, "eval_rewards/reasoning_steps_reward": 0.9983333337306977, "eval_runtime": 32.0208, "eval_samples_per_second": 3.092, "eval_steps_per_second": 0.781, "step": 500 }, { "completion_length": 33.025, "epoch": 0.1115390455681175, "grad_norm": 1.314290920645204, "kl": 2.6916015625, "learning_rate": 1.9991961478191753e-05, "loss": 0.1077, "reward": 1.9229176357388496, "reward_std": 0.09401306777253922, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012499059329275041, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 0.9729166716337204, "step": 505 }, { "completion_length": 28.95625, "epoch": 0.11264339255394044, "grad_norm": 13.466340801725543, "kl": 4.073046875, "learning_rate": 1.99903415488154e-05, "loss": 0.163, "reward": 1.955730925500393, "reward_std": 0.05362616253169108, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.006769094028277323, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 1.0, "step": 510 }, { "completion_length": 32.275, "epoch": 0.11374773953976339, "grad_norm": 4.387812745130587, "kl": 3.749609375, "learning_rate": 1.998857310198259e-05, "loss": 0.15, "reward": 1.9408951826393603, "reward_std": 0.07298454180927365, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00910481644968968, "rewards/format_reward": 0.95625, "rewards/reasoning_steps_reward": 0.99375, "step": 515 }, { "completion_length": 208.75625, "epoch": 0.11485208652558634, "grad_norm": 15.261102221631818, "kl": 4.52294921875, "learning_rate": 1.998665616398323e-05, "loss": 0.1808, "reward": 0.8959554025903345, "reward_std": 0.28214400556153124, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08737794174230658, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.4645833414047956, "step": 520 }, { "completion_length": 946.95625, "epoch": 0.11595643351140929, "grad_norm": 4.064474873038914, "kl": 7.4697265625, "learning_rate": 1.9984590763314722e-05, "loss": 0.299, "reward": 0.4448224641382694, "reward_std": 0.1930072069000744, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1822608746260812, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.6145833436399698, "step": 525 }, { "completion_length": 867.95, "epoch": 0.11706078049723223, "grad_norm": 0.8715946488428576, "kl": 1.211572265625, "learning_rate": 1.998237693068153e-05, "loss": 0.0485, "reward": 1.1276234179735183, "reward_std": 0.33161048383626623, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1411265860995627, "rewards/format_reward": 0.26875, "rewards/reasoning_steps_reward": 1.0, "step": 530 }, { "completion_length": 1018.55625, "epoch": 0.11816512748305517, "grad_norm": 0.5739914958239132, "kl": 1.079345703125, "learning_rate": 1.9980014698994722e-05, "loss": 0.0432, "reward": 1.2353254936635494, "reward_std": 0.24865906643667585, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.45217450708150864, "rewards/format_reward": 0.6875, "rewards/reasoning_steps_reward": 1.0, "step": 535 }, { "completion_length": 853.175, "epoch": 0.11926947446887812, "grad_norm": 1.0404924783317373, "kl": 1.39716796875, "learning_rate": 1.997750410337147e-05, "loss": 0.0558, "reward": 1.4958539374172688, "reward_std": 0.1754556493193377, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.404146053083241, "rewards/format_reward": 0.9, "rewards/reasoning_steps_reward": 1.0, "step": 540 }, { "completion_length": 54.7, "epoch": 0.12037382145470106, "grad_norm": 1.3971817042757162, "kl": 4.2869140625, "learning_rate": 1.997484518113456e-05, "loss": 0.1714, "reward": 1.8733646899461747, "reward_std": 0.15371908817323857, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.024551973055349664, "rewards/format_reward": 0.91875, "rewards/reasoning_steps_reward": 0.9791666693985462, "step": 545 }, { "completion_length": 32.7, "epoch": 0.12147816844052402, "grad_norm": 3.3861272960337225, "kl": 3.658203125, "learning_rate": 1.9972037971811802e-05, "loss": 0.1464, "reward": 1.7629884868860244, "reward_std": 0.24611905133260734, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.009928178068366832, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.8979166731238365, "step": 550 }, { "completion_length": 32.2375, "epoch": 0.12258251542634696, "grad_norm": 0.9537515740920277, "kl": 3.62529296875, "learning_rate": 1.9969082517135463e-05, "loss": 0.145, "reward": 1.707100809639087, "reward_std": 0.36074760046503795, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.009565857070265337, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.8791666701436043, "step": 555 }, { "completion_length": 29.175, "epoch": 0.1236868624121699, "grad_norm": 1.8079021090047303, "kl": 3.3318359375, "learning_rate": 1.9965978861041637e-05, "loss": 0.1333, "reward": 1.8566195629537106, "reward_std": 0.16747138476275722, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.00796372244367376, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9333333380520343, "step": 560 }, { "completion_length": 28.85, "epoch": 0.12479120939799285, "grad_norm": 0.30956855309501174, "kl": 3.42373046875, "learning_rate": 1.99627270496696e-05, "loss": 0.137, "reward": 1.8421029239892959, "reward_std": 0.13506208243761647, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.007897024205885828, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.8687500126659871, "step": 565 }, { "completion_length": 26.525, "epoch": 0.1258955563838158, "grad_norm": 0.47918362299607187, "kl": 4.195703125, "learning_rate": 1.995932713136112e-05, "loss": 0.168, "reward": 1.5204951745280142, "reward_std": 0.31988839789064516, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.006588119767866374, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.7770833345130086, "step": 570 }, { "completion_length": 29.475, "epoch": 0.12699990336963873, "grad_norm": 1.7107660050085856, "kl": 3.25537109375, "learning_rate": 1.9955779156659735e-05, "loss": 0.1302, "reward": 1.9459566242992878, "reward_std": 0.06541404174095078, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.008209979979437777, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.9791666671633721, "step": 575 }, { "completion_length": 28.4, "epoch": 0.1281042503554617, "grad_norm": 3.0620288960322344, "kl": 3.51279296875, "learning_rate": 1.9952083178310002e-05, "loss": 0.1406, "reward": 1.823703521117568, "reward_std": 0.19747704482369954, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.007546431059017778, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9562500011175871, "step": 580 }, { "completion_length": 24.125, "epoch": 0.12920859734128462, "grad_norm": 1.5920134602595593, "kl": 4.5791015625, "learning_rate": 1.994823925125672e-05, "loss": 0.1832, "reward": 1.687876349347016, "reward_std": 0.304932097128949, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.005873611301694837, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.8812500063329936, "step": 585 }, { "completion_length": 38.7125, "epoch": 0.13031294432710758, "grad_norm": 1.7803721516558564, "kl": 5.0361328125, "learning_rate": 1.994424743264412e-05, "loss": 0.2014, "reward": 1.4074853049299691, "reward_std": 0.4336934615795144, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.030014701202208015, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.8500000024214387, "step": 590 }, { "completion_length": 90.8625, "epoch": 0.13141729131293053, "grad_norm": 1.3605634584420858, "kl": 4.99658203125, "learning_rate": 1.9940107781814976e-05, "loss": 0.1999, "reward": 1.2846552881412208, "reward_std": 0.5878636933018242, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08409471668419428, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 0.7562500022351741, "step": 595 }, { "completion_length": 430.75, "epoch": 0.13252163829875346, "grad_norm": 0.42608770036732224, "kl": 4.250341796875, "learning_rate": 1.993582036030978e-05, "loss": 0.17, "reward": 1.0411737323971466, "reward_std": 0.5796436283727416, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08382627054525074, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.5812500020489096, "step": 600 }, { "epoch": 0.13252163829875346, "eval_completion_length": 531.12, "eval_kl": 5.138125, "eval_loss": 0.20564807951450348, "eval_reward": 1.5188868433237075, "eval_reward_std": 0.39198793584240776, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.017779812179505826, "eval_rewards/format_reward": 0.785, "eval_rewards/reasoning_steps_reward": 0.75166668176651, "eval_runtime": 102.3024, "eval_samples_per_second": 0.968, "eval_steps_per_second": 0.244, "step": 600 }, { "completion_length": 936.55, "epoch": 0.13362598528457642, "grad_norm": 0.5683802140587957, "kl": 2.01455078125, "learning_rate": 1.993138523186578e-05, "loss": 0.0805, "reward": 0.327013082918711, "reward_std": 0.42701971883070655, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16673692046315408, "rewards/format_reward": 0.1375, "rewards/reasoning_steps_reward": 0.35625000596046447, "step": 605 }, { "completion_length": 1024.0, "epoch": 0.13473033227039935, "grad_norm": 0.4839523012096288, "kl": 0.7296875, "learning_rate": 1.9926802462416054e-05, "loss": 0.0292, "reward": 0.41157908397726717, "reward_std": 0.28430348377587505, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.23425425429013558, "rewards/format_reward": 0.00625, "rewards/reasoning_steps_reward": 0.6395833477377891, "step": 610 }, { "completion_length": 879.0, "epoch": 0.1358346792562223, "grad_norm": 0.8739388757124811, "kl": 3.1473388671875, "learning_rate": 1.9922072120088537e-05, "loss": 0.1259, "reward": 0.8592522375285625, "reward_std": 0.1283191536087543, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0803310930976295, "rewards/format_reward": 0.00625, "rewards/reasoning_steps_reward": 0.9333333414047956, "step": 615 }, { "completion_length": 956.5125, "epoch": 0.13693902624204526, "grad_norm": 0.3409618950357477, "kl": 2.539794921875, "learning_rate": 1.991719427520499e-05, "loss": 0.1016, "reward": 0.6374624267220497, "reward_std": 0.23787620406874338, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.24378757532394957, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.8687500182539225, "step": 620 }, { "completion_length": 951.675, "epoch": 0.1380433732278682, "grad_norm": 0.5311158252207424, "kl": 1.86796875, "learning_rate": 1.9912169000279952e-05, "loss": 0.0747, "reward": -0.04920477559790015, "reward_std": 0.21609443256020314, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.35337144320365044, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3041666731238365, "step": 625 }, { "completion_length": 1019.275, "epoch": 0.13914772021369115, "grad_norm": 0.4779187904211677, "kl": 1.165673828125, "learning_rate": 1.9906996370019692e-05, "loss": 0.0466, "reward": 0.08100474406965077, "reward_std": 0.22535915609259974, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1669119239784777, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.24791667144745588, "step": 630 }, { "completion_length": 742.05, "epoch": 0.14025206719951408, "grad_norm": 2.860219052376377, "kl": 3.076318359375, "learning_rate": 1.990167646132107e-05, "loss": 0.123, "reward": 0.5310021251440048, "reward_std": 0.3654556166498878, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.20649788190494292, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.7062500109896064, "step": 635 }, { "completion_length": 154.4625, "epoch": 0.14135641418533704, "grad_norm": 0.20831523609247937, "kl": 3.63935546875, "learning_rate": 1.9896209353270394e-05, "loss": 0.1455, "reward": 1.8390323543921112, "reward_std": 0.1871601128950715, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.029717639734735714, "rewards/format_reward": 0.91875, "rewards/reasoning_steps_reward": 0.9500000027939677, "step": 640 }, { "completion_length": 759.0375, "epoch": 0.14246076117115997, "grad_norm": 0.4107178938373751, "kl": 2.780615234375, "learning_rate": 1.989059512714227e-05, "loss": 0.1112, "reward": 1.2166819516569376, "reward_std": 0.3681499962562157, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.060401382704731076, "rewards/format_reward": 0.4, "rewards/reasoning_steps_reward": 0.8770833380520344, "step": 645 }, { "completion_length": 1020.01875, "epoch": 0.14356510815698292, "grad_norm": 0.5118018262308587, "kl": 1.021142578125, "learning_rate": 1.988483386639836e-05, "loss": 0.0409, "reward": 0.7364203490898944, "reward_std": 0.2831057722181868, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04066298486868618, "rewards/format_reward": 0.01875, "rewards/reasoning_steps_reward": 0.7583333430811763, "step": 650 }, { "completion_length": 1023.7625, "epoch": 0.14466945514280588, "grad_norm": 0.5971199868423984, "kl": 0.971728515625, "learning_rate": 1.9878925656686167e-05, "loss": 0.0389, "reward": 0.5211033774306998, "reward_std": 0.2769409292843193, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.27264662481029517, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.7812500074505806, "step": 655 }, { "completion_length": 1024.0, "epoch": 0.1457738021286288, "grad_norm": 0.7257973266644143, "kl": 0.5168212890625, "learning_rate": 1.9872870585837757e-05, "loss": 0.0207, "reward": 0.5945031743496656, "reward_std": 0.24122693912358956, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22424682592973114, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.8062500102445483, "step": 660 }, { "completion_length": 1024.0, "epoch": 0.14687814911445177, "grad_norm": 0.6690686208159938, "kl": 0.796337890625, "learning_rate": 1.9866668743868437e-05, "loss": 0.0318, "reward": 0.9177674036473036, "reward_std": 0.13023071596617228, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07806593386630993, "rewards/format_reward": 0.04375, "rewards/reasoning_steps_reward": 0.9520833373069764, "step": 665 }, { "completion_length": 987.1375, "epoch": 0.1479824961002747, "grad_norm": 0.6120508069402077, "kl": 1.7907470703125, "learning_rate": 1.9860320222975435e-05, "loss": 0.0716, "reward": 1.260640586912632, "reward_std": 0.40482770588496353, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09769274116843008, "rewards/format_reward": 0.39375, "rewards/reasoning_steps_reward": 0.964583334326744, "step": 670 }, { "completion_length": 640.525, "epoch": 0.14908684308609765, "grad_norm": 0.710860699058398, "kl": 3.9996826171875, "learning_rate": 1.9853825117536522e-05, "loss": 0.16, "reward": 1.6751317463815212, "reward_std": 0.3215458321869846, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14778491355245932, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.972916667163372, "step": 675 }, { "completion_length": 852.1375, "epoch": 0.1501911900719206, "grad_norm": 0.4777776180383065, "kl": 3.6158203125, "learning_rate": 1.9847183524108614e-05, "loss": 0.1446, "reward": 1.468020135909319, "reward_std": 0.44469789950890115, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12572985703882295, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 0.9812500029802322, "step": 680 }, { "completion_length": 1007.66875, "epoch": 0.15129553705774354, "grad_norm": 0.5985977128324016, "kl": 1.750341796875, "learning_rate": 1.9840395541426333e-05, "loss": 0.07, "reward": 0.7763801473192871, "reward_std": 0.4260402750223875, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12778652228880674, "rewards/format_reward": 0.11875, "rewards/reasoning_steps_reward": 0.78541667945683, "step": 685 }, { "completion_length": 1024.0, "epoch": 0.1523998840435665, "grad_norm": 0.9348256931543566, "kl": 1.45712890625, "learning_rate": 1.983346127040053e-05, "loss": 0.0583, "reward": 0.7624955659732222, "reward_std": 0.3992441566209891, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1333377712897345, "rewards/format_reward": 0.1125, "rewards/reasoning_steps_reward": 0.7833333471789956, "step": 690 }, { "completion_length": 875.2375, "epoch": 0.15350423102938943, "grad_norm": 0.6791189480472896, "kl": 2.89599609375, "learning_rate": 1.9826380814116795e-05, "loss": 0.1157, "reward": 0.8852463798597455, "reward_std": 0.6508712698566341, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.27933695799438285, "rewards/format_reward": 0.35, "rewards/reasoning_steps_reward": 0.8145833453163505, "step": 695 }, { "completion_length": 923.7, "epoch": 0.15460857801521238, "grad_norm": 0.3922171537250086, "kl": 2.5765625, "learning_rate": 1.9819154277833938e-05, "loss": 0.1031, "reward": 1.0286956165917218, "reward_std": 0.5952219057944603, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1317210498600616, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.7916666803881526, "step": 700 }, { "epoch": 0.15460857801521238, "eval_completion_length": 1024.0, "eval_kl": 3.66328125, "eval_loss": 0.14716078341007233, "eval_reward": 1.1705710649490357, "eval_reward_std": 0.47160317164845766, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.03109560369513929, "eval_rewards/format_reward": 0.4, "eval_rewards/reasoning_steps_reward": 0.8016666793823242, "eval_runtime": 196.5133, "eval_samples_per_second": 0.504, "eval_steps_per_second": 0.127, "step": 700 }, { "completion_length": 923.7, "epoch": 0.1557129250010353, "grad_norm": 2.476456769332757, "kl": 3.134033203125, "learning_rate": 1.9811781768982392e-05, "loss": 0.1254, "reward": 1.064253362873569, "reward_std": 0.5311597665102454, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03574663798935944, "rewards/format_reward": 0.33125, "rewards/reasoning_steps_reward": 0.7687500152736902, "step": 705 }, { "completion_length": 973.85, "epoch": 0.15681727198685827, "grad_norm": 0.9110518537976159, "kl": 2.82705078125, "learning_rate": 1.980426339716264e-05, "loss": 0.1131, "reward": 0.8556666751392186, "reward_std": 0.45775549400859744, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12349999214638956, "rewards/format_reward": 0.20625, "rewards/reasoning_steps_reward": 0.7729166857898235, "step": 710 }, { "completion_length": 929.9875, "epoch": 0.15792161897268123, "grad_norm": 0.689638744596231, "kl": 2.47158203125, "learning_rate": 1.9796599274143586e-05, "loss": 0.0988, "reward": 1.0453462563455105, "reward_std": 0.4044397716068488, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04632041496806778, "rewards/format_reward": 0.23125, "rewards/reasoning_steps_reward": 0.8604166802018881, "step": 715 }, { "completion_length": 757.775, "epoch": 0.15902596595850416, "grad_norm": 2.6619811581457093, "kl": 13.9373046875, "learning_rate": 1.9788789513860875e-05, "loss": 0.5578, "reward": 1.106494550034404, "reward_std": 0.5601302272752037, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21433878369862214, "rewards/format_reward": 0.39375, "rewards/reasoning_steps_reward": 0.9270833387970925, "step": 720 }, { "completion_length": 834.575, "epoch": 0.1601303129443271, "grad_norm": 1.2429717954269097, "kl": 4.67158203125, "learning_rate": 1.9780834232415214e-05, "loss": 0.1868, "reward": 1.011640521325171, "reward_std": 0.4418655791791025, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2654428183857817, "rewards/format_reward": 0.325, "rewards/reasoning_steps_reward": 0.9520833365619182, "step": 725 }, { "completion_length": 899.575, "epoch": 0.16123465993015004, "grad_norm": 0.7171746700695241, "kl": 2.8833984375, "learning_rate": 1.9772733548070647e-05, "loss": 0.1154, "reward": 0.9335492581129075, "reward_std": 0.3853685567474713, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22061740965000354, "rewards/format_reward": 0.21875, "rewards/reasoning_steps_reward": 0.9354166708886623, "step": 730 }, { "completion_length": 938.2375, "epoch": 0.162339006915973, "grad_norm": 0.530387948893096, "kl": 2.845263671875, "learning_rate": 1.9764487581252787e-05, "loss": 0.1138, "reward": 0.9676836218684912, "reward_std": 0.4040184532976127, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11981637928402052, "rewards/format_reward": 0.19375, "rewards/reasoning_steps_reward": 0.8937500083819032, "step": 735 }, { "completion_length": 923.7125, "epoch": 0.16344335390179596, "grad_norm": 0.7148455898281986, "kl": 2.9392578125, "learning_rate": 1.975609645454704e-05, "loss": 0.1176, "reward": 1.004165413416922, "reward_std": 0.4944089779272872, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05000126257946249, "rewards/format_reward": 0.2625, "rewards/reasoning_steps_reward": 0.79166667945683, "step": 740 }, { "completion_length": 869.825, "epoch": 0.1645477008876189, "grad_norm": 0.5326753926199363, "kl": 2.82470703125, "learning_rate": 1.9747560292696763e-05, "loss": 0.113, "reward": 0.832844705414027, "reward_std": 0.5525396721786819, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13590529205976054, "rewards/format_reward": 0.29375, "rewards/reasoning_steps_reward": 0.6750000108033418, "step": 745 }, { "completion_length": 951.2625, "epoch": 0.16565204787344184, "grad_norm": 0.9976724610545183, "kl": 3.01611328125, "learning_rate": 1.9738879222601425e-05, "loss": 0.1207, "reward": 0.6285201878286898, "reward_std": 0.6893505120096052, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2298131546471268, "rewards/format_reward": 0.26875, "rewards/reasoning_steps_reward": 0.5895833455026149, "step": 750 }, { "completion_length": 935.625, "epoch": 0.16675639485926477, "grad_norm": 0.4851659837433616, "kl": 2.586865234375, "learning_rate": 1.9730053373314722e-05, "loss": 0.1035, "reward": 0.4737007636576891, "reward_std": 0.6648764016776113, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2950492393341847, "rewards/format_reward": 0.25, "rewards/reasoning_steps_reward": 0.5187500091269612, "step": 755 }, { "completion_length": 898.625, "epoch": 0.16786074184508773, "grad_norm": 0.4416112762647842, "kl": 2.737841796875, "learning_rate": 1.9721082876042644e-05, "loss": 0.1095, "reward": 0.6276166431605816, "reward_std": 0.6592218722492362, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2723833565658424, "rewards/format_reward": 0.3125, "rewards/reasoning_steps_reward": 0.5875000094994902, "step": 760 }, { "completion_length": 986.3875, "epoch": 0.16896508883091066, "grad_norm": 0.5212659541587674, "kl": 1.964501953125, "learning_rate": 1.9711967864141542e-05, "loss": 0.0786, "reward": 0.5854208903387189, "reward_std": 0.4578703532402869, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07707911343604792, "rewards/format_reward": 0.14375, "rewards/reasoning_steps_reward": 0.5187500098720192, "step": 765 }, { "completion_length": 1011.4625, "epoch": 0.17006943581673362, "grad_norm": 0.4202393392619023, "kl": 1.484814453125, "learning_rate": 1.970270847311612e-05, "loss": 0.0594, "reward": 0.5946683968533761, "reward_std": 0.3890162902807788, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09074827631484368, "rewards/format_reward": 0.09375, "rewards/reasoning_steps_reward": 0.5916666772216559, "step": 770 }, { "completion_length": 995.0375, "epoch": 0.17117378280255657, "grad_norm": 0.5057925969173489, "kl": 1.631982421875, "learning_rate": 1.9693304840617456e-05, "loss": 0.0652, "reward": 0.5266901765018701, "reward_std": 0.44898259460460394, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2420598310069181, "rewards/format_reward": 0.1125, "rewards/reasoning_steps_reward": 0.6562500141561032, "step": 775 }, { "completion_length": 973.85, "epoch": 0.1722781297883795, "grad_norm": 0.4342246003094615, "kl": 2.336279296875, "learning_rate": 1.968375710644093e-05, "loss": 0.0935, "reward": 0.8064871094655246, "reward_std": 0.6480218000418972, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.19559622975066304, "rewards/format_reward": 0.24375, "rewards/reasoning_steps_reward": 0.7583333445712924, "step": 780 }, { "completion_length": 823.4, "epoch": 0.17338247677420246, "grad_norm": 0.2840592190798346, "kl": 3.2484375, "learning_rate": 1.9674065412524147e-05, "loss": 0.13, "reward": 1.2522860381752252, "reward_std": 0.5435375596192898, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.020630629758670693, "rewards/format_reward": 0.45, "rewards/reasoning_steps_reward": 0.8229166774079204, "step": 785 }, { "completion_length": 958.7125, "epoch": 0.1744868237600254, "grad_norm": 0.5066655198721257, "kl": 2.073095703125, "learning_rate": 1.9664229902944833e-05, "loss": 0.0829, "reward": 0.7657026316504926, "reward_std": 0.46858742368640377, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06346403961651959, "rewards/format_reward": 0.18125, "rewards/reasoning_steps_reward": 0.6479166788980365, "step": 790 }, { "completion_length": 1011.4625, "epoch": 0.17559117074584835, "grad_norm": 0.5090355806565158, "kl": 1.276513671875, "learning_rate": 1.9654250723918706e-05, "loss": 0.0511, "reward": 0.6036186209297739, "reward_std": 0.30241919725158367, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03596471712298808, "rewards/format_reward": 0.05625, "rewards/reasoning_steps_reward": 0.5833333427086472, "step": 795 }, { "completion_length": 1024.0, "epoch": 0.1766955177316713, "grad_norm": 0.362393508654969, "kl": 1.162890625, "learning_rate": 1.9644128023797273e-05, "loss": 0.0465, "reward": 0.5545021136291325, "reward_std": 0.3219915485853562, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.299664559494704, "rewards/format_reward": 0.0625, "rewards/reasoning_steps_reward": 0.79166667945683, "step": 800 }, { "epoch": 0.1766955177316713, "eval_completion_length": 1024.0, "eval_kl": 1.004609375, "eval_loss": 0.040289442986249924, "eval_reward": 0.5231576064229011, "eval_reward_std": 0.1656639602733776, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.45517572939395906, "eval_rewards/format_reward": 0.035, "eval_rewards/reasoning_steps_reward": 0.943333340883255, "eval_runtime": 202.7136, "eval_samples_per_second": 0.488, "eval_steps_per_second": 0.123, "step": 800 }, { "completion_length": 1018.0875, "epoch": 0.17779986471749423, "grad_norm": 0.44983968909591454, "kl": 1.289599609375, "learning_rate": 1.9633861953065648e-05, "loss": 0.0516, "reward": 0.5429016770794988, "reward_std": 0.2576578710861213, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.4425149895250797, "rewards/format_reward": 0.08125, "rewards/reasoning_steps_reward": 0.9041666716337204, "step": 805 }, { "completion_length": 979.1375, "epoch": 0.1789042117033172, "grad_norm": 0.4467769812510504, "kl": 1.9100830078125, "learning_rate": 1.9623452664340305e-05, "loss": 0.0763, "reward": 0.7711514856666326, "reward_std": 0.38121095038736713, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.3892651722300798, "rewards/format_reward": 0.1875, "rewards/reasoning_steps_reward": 0.9666666701436043, "step": 810 }, { "completion_length": 936.3375, "epoch": 0.18000855868914012, "grad_norm": 0.4199249856681596, "kl": 2.6292236328125, "learning_rate": 1.9612900312366815e-05, "loss": 0.1052, "reward": 0.7745890522375702, "reward_std": 0.5395248577739948, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.34207761539146303, "rewards/format_reward": 0.28125, "rewards/reasoning_steps_reward": 0.8354166736826301, "step": 815 }, { "completion_length": 998.925, "epoch": 0.18111290567496308, "grad_norm": 0.38807419895689016, "kl": 2.0396484375, "learning_rate": 1.9602205054017534e-05, "loss": 0.0815, "reward": 0.38755306117236615, "reward_std": 0.6135149325666134, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.30203027836978436, "rewards/format_reward": 0.1625, "rewards/reasoning_steps_reward": 0.5270833438262343, "step": 820 }, { "completion_length": 964.6375, "epoch": 0.182217252660786, "grad_norm": 0.34646502449060457, "kl": 2.594287109375, "learning_rate": 1.9591367048289297e-05, "loss": 0.1038, "reward": 0.5837174264714122, "reward_std": 0.6725190826080507, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13503257725387813, "rewards/format_reward": 0.23125, "rewards/reasoning_steps_reward": 0.4875000072643161, "step": 825 }, { "completion_length": 920.1, "epoch": 0.18332159964660896, "grad_norm": 0.30414365620585365, "kl": 2.875, "learning_rate": 1.9580386456301014e-05, "loss": 0.115, "reward": 0.8815148666501045, "reward_std": 0.6505038747184699, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04973513415825437, "rewards/format_reward": 0.3125, "rewards/reasoning_steps_reward": 0.6187500094994902, "step": 830 }, { "completion_length": 834.025, "epoch": 0.18442594663243192, "grad_norm": 0.40396097917811025, "kl": 2.9671875, "learning_rate": 1.9569263441291312e-05, "loss": 0.1188, "reward": 1.1630100145936013, "reward_std": 0.5103294208552143, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03282331913396774, "rewards/format_reward": 0.3625, "rewards/reasoning_steps_reward": 0.8333333428949118, "step": 835 }, { "completion_length": 976.95, "epoch": 0.18553029361825485, "grad_norm": 0.6142914802044704, "kl": 2.892529296875, "learning_rate": 1.9557998168616087e-05, "loss": 0.1157, "reward": 0.991350544989109, "reward_std": 0.4822408523090417, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.058649450994562355, "rewards/format_reward": 0.19375, "rewards/reasoning_steps_reward": 0.8562500078231097, "step": 840 }, { "completion_length": 911.1625, "epoch": 0.1866346406040778, "grad_norm": 0.3716875526820613, "kl": 3.598046875, "learning_rate": 1.9546590805746054e-05, "loss": 0.144, "reward": 1.1998552225530148, "reward_std": 0.4323180656544537, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03139477769100267, "rewards/format_reward": 0.31875, "rewards/reasoning_steps_reward": 0.9125000059604644, "step": 845 }, { "completion_length": 931.4, "epoch": 0.18773898758990074, "grad_norm": 0.32469474685255123, "kl": 2.931396484375, "learning_rate": 1.9535041522264256e-05, "loss": 0.1173, "reward": 1.1232962097972632, "reward_std": 0.501405765369418, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03087046154323616, "rewards/format_reward": 0.29375, "rewards/reasoning_steps_reward": 0.8604166770353914, "step": 850 }, { "completion_length": 854.675, "epoch": 0.1888433345757237, "grad_norm": 0.28376486622584113, "kl": 2.97490234375, "learning_rate": 1.9523350489863545e-05, "loss": 0.1189, "reward": 1.1701628059148788, "reward_std": 0.544858716159706, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03192052699450869, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.833333345130086, "step": 855 }, { "completion_length": 873.25, "epoch": 0.18994768156154665, "grad_norm": 0.305720363616131, "kl": 3.108154296875, "learning_rate": 1.951151788234402e-05, "loss": 0.1243, "reward": 1.1185833937488496, "reward_std": 0.5606642366210508, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03349993971351069, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7708333427086472, "step": 860 }, { "completion_length": 1011.4625, "epoch": 0.19105202854736958, "grad_norm": 0.36378063097811875, "kl": 1.690185546875, "learning_rate": 1.949954387561046e-05, "loss": 0.0676, "reward": 0.7097708626213717, "reward_std": 0.36822339960053796, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03189580502767057, "rewards/format_reward": 0.1, "rewards/reasoning_steps_reward": 0.6416666757315397, "step": 865 }, { "completion_length": 1015.04375, "epoch": 0.19215637553319254, "grad_norm": 0.3489339505523048, "kl": 1.650439453125, "learning_rate": 1.9487428647669688e-05, "loss": 0.066, "reward": 0.8181800896301865, "reward_std": 0.36886287455181443, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07140324611464166, "rewards/format_reward": 0.10625, "rewards/reasoning_steps_reward": 0.7833333414047956, "step": 870 }, { "completion_length": 937.45, "epoch": 0.19326072251901547, "grad_norm": 0.48239314974054465, "kl": 2.27412109375, "learning_rate": 1.947517237862795e-05, "loss": 0.091, "reward": 0.9640216436237097, "reward_std": 0.44529842740666936, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16931168984156103, "rewards/format_reward": 0.225, "rewards/reasoning_steps_reward": 0.908333340100944, "step": 875 }, { "completion_length": 810.875, "epoch": 0.19436506950483842, "grad_norm": 0.3776754403626219, "kl": 3.48330078125, "learning_rate": 1.9462775250688208e-05, "loss": 0.1394, "reward": 1.3459553118795156, "reward_std": 0.5377076888135889, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04571135753940325, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.9041666727513075, "step": 880 }, { "completion_length": 1009.91875, "epoch": 0.19546941649066138, "grad_norm": 0.4583496847925809, "kl": 2.01220703125, "learning_rate": 1.9450237448147463e-05, "loss": 0.0805, "reward": 0.6594697997083131, "reward_std": 0.4720977840166597, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02803020708306576, "rewards/format_reward": 0.14375, "rewards/reasoning_steps_reward": 0.5437500100582838, "step": 885 }, { "completion_length": 998.925, "epoch": 0.1965737634764843, "grad_norm": 0.37896192292415093, "kl": 2.0322265625, "learning_rate": 1.943755915739399e-05, "loss": 0.0813, "reward": 0.608962860464817, "reward_std": 0.450826744859296, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03270381114562042, "rewards/format_reward": 0.1375, "rewards/reasoning_steps_reward": 0.5041666772216559, "step": 890 }, { "completion_length": 1008.71875, "epoch": 0.19767811046230727, "grad_norm": 0.35794014020876924, "kl": 2.0822265625, "learning_rate": 1.9424740566904572e-05, "loss": 0.0832, "reward": 0.6830778570845724, "reward_std": 0.5404627276671817, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10233881335589104, "rewards/format_reward": 0.15625, "rewards/reasoning_steps_reward": 0.6291666775941849, "step": 895 }, { "completion_length": 948.8, "epoch": 0.1987824574481302, "grad_norm": 0.39271082198189455, "kl": 2.01015625, "learning_rate": 1.9411781867241718e-05, "loss": 0.0804, "reward": 0.844484331086278, "reward_std": 0.39959267702070067, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22634900148259476, "rewards/format_reward": 0.19375, "rewards/reasoning_steps_reward": 0.8770833415910602, "step": 900 }, { "epoch": 0.1987824574481302, "eval_completion_length": 1024.0, "eval_kl": 0.763046875, "eval_loss": 0.03052530251443386, "eval_reward": 0.5850898969173431, "eval_reward_std": 0.2078262207657099, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.31157676726579664, "eval_rewards/format_reward": 0.015, "eval_rewards/reasoning_steps_reward": 0.8816666746139527, "eval_runtime": 202.2443, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.124, "step": 900 }, { "completion_length": 1022.2875, "epoch": 0.19988680443395315, "grad_norm": 0.48408716939512914, "kl": 0.7494140625, "learning_rate": 1.9398683251050796e-05, "loss": 0.03, "reward": 0.5665433191694319, "reward_std": 0.19808940038928996, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.3292900139465928, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.8833333384245634, "step": 905 }, { "completion_length": 1024.0, "epoch": 0.20099115141977608, "grad_norm": 0.42990941163217694, "kl": 0.6109130859375, "learning_rate": 1.93854449130572e-05, "loss": 0.0244, "reward": 0.5857137320563197, "reward_std": 0.10294231597799808, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.385119604319334, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.9583333371207118, "step": 910 }, { "completion_length": 1024.0, "epoch": 0.20209549840559904, "grad_norm": 0.48720013231599374, "kl": 0.610107421875, "learning_rate": 1.937206705006344e-05, "loss": 0.0244, "reward": 0.594690283946693, "reward_std": 0.18519932519702706, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.369893048517406, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.9333333369344473, "step": 915 }, { "completion_length": 1024.0, "epoch": 0.203199845391422, "grad_norm": 0.5696400107146373, "kl": 0.5218994140625, "learning_rate": 1.9358549860946217e-05, "loss": 0.0209, "reward": 0.812476817984134, "reward_std": 0.24139399882988072, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1666898537427187, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.9104166731238366, "step": 920 }, { "completion_length": 1024.0, "epoch": 0.20430419237724493, "grad_norm": 0.541602324631897, "kl": 0.5464599609375, "learning_rate": 1.934489354665347e-05, "loss": 0.0219, "reward": 1.0158553715795278, "reward_std": 0.28890128862985875, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07789463058797992, "rewards/format_reward": 0.1625, "rewards/reasoning_steps_reward": 0.9312500067055225, "step": 925 }, { "completion_length": 1024.0, "epoch": 0.20540853936306788, "grad_norm": 0.5321993292267261, "kl": 0.5557861328125, "learning_rate": 1.9331098310201392e-05, "loss": 0.0222, "reward": 1.2741701494902373, "reward_std": 0.44738651754014425, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05916318525414681, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.9208333365619182, "step": 930 }, { "completion_length": 1024.0, "epoch": 0.2065128863488908, "grad_norm": 0.5959692383930701, "kl": 0.47041015625, "learning_rate": 1.9317164356671395e-05, "loss": 0.0188, "reward": 1.6727484971284867, "reward_std": 0.2997322161420016, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06266818092990434, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.9791666679084301, "step": 935 }, { "completion_length": 713.875, "epoch": 0.20761723333471377, "grad_norm": 1.1057415650471873, "kl": 2.516064453125, "learning_rate": 1.930309189320709e-05, "loss": 0.1006, "reward": 1.9278814405202866, "reward_std": 0.07678677760886785, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.040868553338805215, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 1.0, "step": 940 }, { "completion_length": 986.3875, "epoch": 0.20872158032053673, "grad_norm": 0.5228184716949067, "kl": 1.52724609375, "learning_rate": 1.9288881129011177e-05, "loss": 0.0611, "reward": 1.8507322192192077, "reward_std": 0.1548274521872372, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.053434445448510816, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.9291666721925139, "step": 945 }, { "completion_length": 1024.0, "epoch": 0.20982592730635966, "grad_norm": 0.5117019454936371, "kl": 0.4581787109375, "learning_rate": 1.9274532275342355e-05, "loss": 0.0183, "reward": 1.6492023468017578, "reward_std": 0.26452546955042633, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1466309867319069, "rewards/format_reward": 0.9, "rewards/reasoning_steps_reward": 0.8958333406597376, "step": 950 }, { "completion_length": 1024.0, "epoch": 0.2109302742921826, "grad_norm": 0.48557359205206774, "kl": 0.3712890625, "learning_rate": 1.9260045545512174e-05, "loss": 0.0149, "reward": 1.5456451624631882, "reward_std": 0.35966640640981495, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1064381811331259, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.8895833430811763, "step": 955 }, { "completion_length": 1024.0, "epoch": 0.21203462127800554, "grad_norm": 0.5356381874366991, "kl": 0.329052734375, "learning_rate": 1.9245421154881873e-05, "loss": 0.0132, "reward": 1.7083046436309814, "reward_std": 0.2587324011943565, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06461202607351879, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.9604166708886623, "step": 960 }, { "completion_length": 1024.0, "epoch": 0.2131389682638285, "grad_norm": 0.5603446707923614, "kl": 0.3440185546875, "learning_rate": 1.9230659320859157e-05, "loss": 0.0138, "reward": 1.7261480644345284, "reward_std": 0.2795285307271115, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.07176859906758182, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.9416666723787784, "step": 965 }, { "completion_length": 1024.0, "epoch": 0.21424331524965143, "grad_norm": 0.6759689326469629, "kl": 0.391845703125, "learning_rate": 1.9215760262894982e-05, "loss": 0.0157, "reward": 1.8084782645106317, "reward_std": 0.17105812441823218, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09152174164628377, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9687500022351742, "step": 970 }, { "completion_length": 1002.6375, "epoch": 0.21534766223547439, "grad_norm": 0.6017399141547436, "kl": 0.9856201171875, "learning_rate": 1.9200724202480305e-05, "loss": 0.0394, "reward": 1.5970855988562107, "reward_std": 0.35718659692502114, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1570810692050145, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.9479166727513075, "step": 975 }, { "completion_length": 1008.3375, "epoch": 0.21645200922129734, "grad_norm": 0.4897056955691348, "kl": 0.69998779296875, "learning_rate": 1.9185551363142754e-05, "loss": 0.028, "reward": 1.4942862942814827, "reward_std": 0.37678084987601324, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.10779703845037147, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.9645833373069763, "step": 980 }, { "completion_length": 1024.0, "epoch": 0.21755635620712027, "grad_norm": 0.4944414791078527, "kl": 0.30677490234375, "learning_rate": 1.9170241970443344e-05, "loss": 0.0123, "reward": 1.6588552303612232, "reward_std": 0.2802548832620232, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.08281143896747381, "rewards/format_reward": 0.74375, "rewards/reasoning_steps_reward": 0.9791666693985462, "step": 985 }, { "completion_length": 1024.0, "epoch": 0.21866070319294323, "grad_norm": 0.47055042970737043, "kl": 0.31396484375, "learning_rate": 1.9154796251973092e-05, "loss": 0.0126, "reward": 1.707213106751442, "reward_std": 0.23285328571801073, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.11153689179336652, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9375000059604645, "step": 990 }, { "completion_length": 1024.0, "epoch": 0.21976505017876616, "grad_norm": 0.4722092126957342, "kl": 0.27503662109375, "learning_rate": 1.9139214437349663e-05, "loss": 0.011, "reward": 1.6616327054798603, "reward_std": 0.2845032803234062, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13211730096081736, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.9000000070780516, "step": 995 }, { "completion_length": 1024.0, "epoch": 0.22086939716458912, "grad_norm": 0.4694447156979849, "kl": 0.2855224609375, "learning_rate": 1.9123496758213926e-05, "loss": 0.0114, "reward": 1.7685628682374954, "reward_std": 0.23412906796729657, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10018712454620982, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9375000063329935, "step": 1000 }, { "epoch": 0.22086939716458912, "eval_completion_length": 1024.0, "eval_kl": 0.29111328125, "eval_loss": 0.011653742752969265, "eval_reward": 1.8491823887825012, "eval_reward_std": 0.25383697646204384, "eval_rewards/accuracy_reward": 0.02, "eval_rewards/cosine_scaled_reward": -0.06248428151942789, "eval_rewards/format_reward": 0.95, "eval_rewards/reasoning_steps_reward": 0.9416666758060456, "eval_runtime": 203.8275, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.123, "step": 1000 }, { "completion_length": 1024.0, "epoch": 0.22197374415041207, "grad_norm": 0.44007384237619535, "kl": 0.28228759765625, "learning_rate": 1.9107643448226536e-05, "loss": 0.0113, "reward": 1.8567358702421188, "reward_std": 0.15496021461585768, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.07451412830560003, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 0.9562500029802322, "step": 1005 }, { "completion_length": 1024.0, "epoch": 0.223078091136235, "grad_norm": 0.4713086951763063, "kl": 0.28775634765625, "learning_rate": 1.909165474306445e-05, "loss": 0.0115, "reward": 1.851528912782669, "reward_std": 0.14237245887197786, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10680442866578232, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 0.9833333350718021, "step": 1010 }, { "completion_length": 1024.0, "epoch": 0.22418243812205796, "grad_norm": 0.4478755129652751, "kl": 0.30277099609375, "learning_rate": 1.9075530880417422e-05, "loss": 0.0121, "reward": 1.717442861199379, "reward_std": 0.2557602992928878, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1242238087579608, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.9479166723787784, "step": 1015 }, { "completion_length": 1024.0, "epoch": 0.2252867851078809, "grad_norm": 0.4532749764094587, "kl": 0.3177490234375, "learning_rate": 1.905927209998447e-05, "loss": 0.0127, "reward": 1.622461923956871, "reward_std": 0.363123452578111, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1129547476026346, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.9041666753590107, "step": 1020 }, { "completion_length": 1024.0, "epoch": 0.22639113209370385, "grad_norm": 0.45238116128275235, "kl": 0.34163818359375, "learning_rate": 1.9042878643470313e-05, "loss": 0.0137, "reward": 1.7687511250376702, "reward_std": 0.2220757791714277, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08958221631328342, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.8770833378657699, "step": 1025 }, { "completion_length": 1024.0, "epoch": 0.22749547907952677, "grad_norm": 0.4650153137737356, "kl": 0.34112548828125, "learning_rate": 1.9026350754581782e-05, "loss": 0.0137, "reward": 1.8675057023763657, "reward_std": 0.150637792609632, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.08666095893859165, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.9666666693985462, "step": 1030 }, { "completion_length": 1024.0, "epoch": 0.22859982606534973, "grad_norm": 0.4321232746728849, "kl": 0.312451171875, "learning_rate": 1.900968867902419e-05, "loss": 0.0125, "reward": 1.8571926668286323, "reward_std": 0.14679434172503533, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10114066474925494, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 0.9833333343267441, "step": 1035 }, { "completion_length": 1024.0, "epoch": 0.2297041730511727, "grad_norm": 0.4339868879357326, "kl": 0.30401611328125, "learning_rate": 1.8992892664497693e-05, "loss": 0.0122, "reward": 1.7687968090176582, "reward_std": 0.22350005892512853, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07911985855171225, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9854166671633721, "step": 1040 }, { "completion_length": 1024.0, "epoch": 0.23080852003699562, "grad_norm": 0.4536285695606801, "kl": 0.2967041015625, "learning_rate": 1.897596296069358e-05, "loss": 0.0119, "reward": 1.8167687579989433, "reward_std": 0.1868845313145357, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08323125535480358, "rewards/format_reward": 0.9125, "rewards/reasoning_steps_reward": 0.9875000014901161, "step": 1045 }, { "completion_length": 1024.0, "epoch": 0.23191286702281858, "grad_norm": 0.4735990601666979, "kl": 0.316552734375, "learning_rate": 1.8958899819290592e-05, "loss": 0.0127, "reward": 1.7116306245326995, "reward_std": 0.2591278723456526, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10086938191234367, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9750000022351741, "step": 1050 }, { "completion_length": 1024.0, "epoch": 0.2330172140086415, "grad_norm": 0.4603152932618283, "kl": 0.30628662109375, "learning_rate": 1.8941703493951163e-05, "loss": 0.0122, "reward": 1.8179652035236358, "reward_std": 0.21623797937058953, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.0778681310959655, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.9145833380520344, "step": 1055 }, { "completion_length": 1024.0, "epoch": 0.23412156099446446, "grad_norm": 0.4076470105342933, "kl": 0.32257080078125, "learning_rate": 1.892437424031766e-05, "loss": 0.0129, "reward": 1.8207226276397706, "reward_std": 0.19692284195543835, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06886071387561969, "rewards/format_reward": 0.99375, "rewards/reasoning_steps_reward": 0.8958333365619182, "step": 1060 }, { "completion_length": 1024.0, "epoch": 0.23522590798028742, "grad_norm": 0.40747101665458285, "kl": 0.2858154296875, "learning_rate": 1.890691231600856e-05, "loss": 0.0114, "reward": 1.8809554889798163, "reward_std": 0.14877327984722796, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.07529450277797878, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.968750000745058, "step": 1065 }, { "completion_length": 1024.0, "epoch": 0.23633025496611035, "grad_norm": 0.44323089226801765, "kl": 0.28681640625, "learning_rate": 1.8889317980614653e-05, "loss": 0.0115, "reward": 1.7944041058421134, "reward_std": 0.22159042263956508, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13892922517989065, "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.9833333350718021, "step": 1070 }, { "completion_length": 1024.0, "epoch": 0.2374346019519333, "grad_norm": 0.4913757009884322, "kl": 0.30711669921875, "learning_rate": 1.8871591495695156e-05, "loss": 0.0123, "reward": 1.6569852642714977, "reward_std": 0.30601046779338503, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1638480743567925, "rewards/format_reward": 0.8875, "rewards/reasoning_steps_reward": 0.9270833348855376, "step": 1075 }, { "completion_length": 1024.0, "epoch": 0.23853894893775623, "grad_norm": 0.4985806443874965, "kl": 0.32496337890625, "learning_rate": 1.8853733124773837e-05, "loss": 0.013, "reward": 1.7661071710288525, "reward_std": 0.2830719207166112, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12139282901771367, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.9250000029802322, "step": 1080 }, { "completion_length": 1024.0, "epoch": 0.2396432959235792, "grad_norm": 0.4792956594110829, "kl": 0.29970703125, "learning_rate": 1.8835743133335096e-05, "loss": 0.012, "reward": 1.808322674036026, "reward_std": 0.23723325279643176, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.07084399787709117, "rewards/format_reward": 0.9125, "rewards/reasoning_steps_reward": 0.9541666693985462, "step": 1085 }, { "completion_length": 1024.0, "epoch": 0.24074764290940212, "grad_norm": 0.43172903038425636, "kl": 0.2791015625, "learning_rate": 1.8817621788820017e-05, "loss": 0.0112, "reward": 1.8203510470688342, "reward_std": 0.2552833634043054, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.07131561395945027, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.9291666701436043, "step": 1090 }, { "completion_length": 1024.0, "epoch": 0.24185198989522508, "grad_norm": 0.4484582517601397, "kl": 0.31998291015625, "learning_rate": 1.8799369360622394e-05, "loss": 0.0128, "reward": 1.7547560043632984, "reward_std": 0.28648674785072215, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.0994106627011206, "rewards/format_reward": 0.91875, "rewards/reasoning_steps_reward": 0.9166666720062494, "step": 1095 }, { "completion_length": 1024.0, "epoch": 0.24295633688104804, "grad_norm": 0.41476974658385424, "kl": 0.33250732421875, "learning_rate": 1.8780986120084715e-05, "loss": 0.0133, "reward": 1.6285200668498874, "reward_std": 0.3249174021591898, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16106326215958688, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9145833356305957, "step": 1100 }, { "epoch": 0.24295633688104804, "eval_completion_length": 1024.0, "eval_kl": 0.45419921875, "eval_loss": 0.018182678148150444, "eval_reward": 1.4785543692111969, "eval_reward_std": 0.5918275532126427, "eval_rewards/accuracy_reward": 0.025, "eval_rewards/cosine_scaled_reward": -0.1331122925132513, "eval_rewards/format_reward": 0.745, "eval_rewards/reasoning_steps_reward": 0.8416666769981385, "eval_runtime": 203.0342, "eval_samples_per_second": 0.488, "eval_steps_per_second": 0.123, "step": 1100 }, { "completion_length": 1024.0, "epoch": 0.24406068386687096, "grad_norm": 0.3429126222185335, "kl": 0.373504638671875, "learning_rate": 1.876247234049416e-05, "loss": 0.0149, "reward": 1.4890433787368238, "reward_std": 0.5072989727195818, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.10262328688113484, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.891666672937572, "step": 1105 }, { "completion_length": 1024.0, "epoch": 0.24516503085269392, "grad_norm": 0.4247791062153765, "kl": 0.275592041015625, "learning_rate": 1.8743828297078485e-05, "loss": 0.011, "reward": 1.3021938862279057, "reward_std": 0.49335026524204295, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14988945015938954, "rewards/format_reward": 0.53125, "rewards/reasoning_steps_reward": 0.9145833373069763, "step": 1110 }, { "completion_length": 1020.725, "epoch": 0.24626937783851685, "grad_norm": 0.8127107356104201, "kl": 0.56708984375, "learning_rate": 1.8725054267001992e-05, "loss": 0.0227, "reward": 1.5503739204257727, "reward_std": 0.44013373394700467, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.14962608254988935, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.9625000026077032, "step": 1115 }, { "completion_length": 1024.0, "epoch": 0.2473737248243398, "grad_norm": 0.3549989094358324, "kl": 0.28006591796875, "learning_rate": 1.8706150529361355e-05, "loss": 0.0112, "reward": 1.8609877035021782, "reward_std": 0.4224522696546046, "rewards/accuracy_reward": 0.08125, "rewards/cosine_scaled_reward": -0.07651229500770569, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.962500000745058, "step": 1120 }, { "completion_length": 1020.66875, "epoch": 0.24847807181016277, "grad_norm": 0.3992396902478467, "kl": 0.3288330078125, "learning_rate": 1.8687117365181514e-05, "loss": 0.0132, "reward": 1.8732830002903937, "reward_std": 0.22004793924934346, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.07671700548453372, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.962500000745058, "step": 1125 }, { "completion_length": 1024.0, "epoch": 0.2495824187959857, "grad_norm": 0.3259551611857864, "kl": 0.3834228515625, "learning_rate": 1.8667955057411454e-05, "loss": 0.0153, "reward": 1.8463864415884017, "reward_std": 0.2847652018404915, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.08486354988708626, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9687500014901161, "step": 1130 }, { "completion_length": 1024.0, "epoch": 0.25068676578180865, "grad_norm": 0.3444343510959895, "kl": 0.302099609375, "learning_rate": 1.864866389092005e-05, "loss": 0.0121, "reward": 1.8804628476500511, "reward_std": 0.41146394111856355, "rewards/accuracy_reward": 0.08125, "rewards/cosine_scaled_reward": -0.05287047996171168, "rewards/format_reward": 0.9, "rewards/reasoning_steps_reward": 0.9520833358168602, "step": 1135 }, { "completion_length": 1024.0, "epoch": 0.2517911127676316, "grad_norm": 0.41868127372045477, "kl": 0.39306640625, "learning_rate": 1.8629244152491773e-05, "loss": 0.0157, "reward": 1.7143469981849193, "reward_std": 0.39155905476345654, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.04190299968176987, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9250000044703484, "step": 1140 }, { "completion_length": 1024.0, "epoch": 0.2528954597534545, "grad_norm": 0.3927845753079056, "kl": 0.43892822265625, "learning_rate": 1.860969613082249e-05, "loss": 0.0175, "reward": 1.4373045616783202, "reward_std": 0.5264668684656499, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.05227877427241765, "rewards/format_reward": 0.65625, "rewards/reasoning_steps_reward": 0.8083333402872086, "step": 1145 }, { "completion_length": 1024.0, "epoch": 0.25399980673927747, "grad_norm": 0.38745171247473803, "kl": 0.44339599609375, "learning_rate": 1.8590020116515116e-05, "loss": 0.0177, "reward": 1.3909062273800372, "reward_std": 0.5424190352443474, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.08617710751132109, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.8395833402872086, "step": 1150 }, { "completion_length": 1024.0, "epoch": 0.2551041537251004, "grad_norm": 0.36149961111425516, "kl": 0.393505859375, "learning_rate": 1.8570216402075326e-05, "loss": 0.0157, "reward": 1.5060852129012345, "reward_std": 0.5054815013281768, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.11474812921951524, "rewards/format_reward": 0.69375, "rewards/reasoning_steps_reward": 0.8958333402872085, "step": 1155 }, { "completion_length": 1024.0, "epoch": 0.2562085007109234, "grad_norm": 0.23940946232188295, "kl": 0.29405517578125, "learning_rate": 1.8550285281907198e-05, "loss": 0.0118, "reward": 1.5001007352024316, "reward_std": 0.4472006390285969, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1623992755456129, "rewards/format_reward": 0.74375, "rewards/reasoning_steps_reward": 0.9125000035390258, "step": 1160 }, { "completion_length": 1024.0, "epoch": 0.25731284769674634, "grad_norm": 0.23736459900896167, "kl": 0.245904541015625, "learning_rate": 1.8530227052308843e-05, "loss": 0.0098, "reward": 1.6085085548460483, "reward_std": 0.42973450673271146, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13315811949432826, "rewards/format_reward": 0.76875, "rewards/reasoning_steps_reward": 0.9604166686534882, "step": 1165 }, { "completion_length": 1024.0, "epoch": 0.25841719468256924, "grad_norm": 0.26537961711313995, "kl": 0.2716796875, "learning_rate": 1.8510042011467978e-05, "loss": 0.0109, "reward": 1.6423254296183587, "reward_std": 0.3275948438240448, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14100789964140859, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.9583333365619182, "step": 1170 }, { "completion_length": 1024.0, "epoch": 0.2595215416683922, "grad_norm": 0.2407532396886285, "kl": 0.304412841796875, "learning_rate": 1.848973045945753e-05, "loss": 0.0122, "reward": 1.5989655748941005, "reward_std": 0.35892669553431916, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13436777089991664, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.9270833365619182, "step": 1175 }, { "completion_length": 1024.0, "epoch": 0.26062588865421515, "grad_norm": 0.25225574352603947, "kl": 0.382373046875, "learning_rate": 1.8469292698231137e-05, "loss": 0.0153, "reward": 1.4464009982533752, "reward_std": 0.5317513575919293, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13693233868325477, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.8770833374932409, "step": 1180 }, { "completion_length": 1024.0, "epoch": 0.2617302356400381, "grad_norm": 0.2156684661256207, "kl": 0.29520263671875, "learning_rate": 1.8448729031618687e-05, "loss": 0.0118, "reward": 1.4996001317165792, "reward_std": 0.3996583673519126, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.133733198023765, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.8833333352580667, "step": 1185 }, { "completion_length": 1024.0, "epoch": 0.26283458262586107, "grad_norm": 0.2338480894666337, "kl": 0.25845947265625, "learning_rate": 1.8428039765321783e-05, "loss": 0.0103, "reward": 1.557172004878521, "reward_std": 0.4314892937021796, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13449467101017945, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.9041666699573397, "step": 1190 }, { "completion_length": 1024.0, "epoch": 0.26393892961168397, "grad_norm": 0.23739045779401974, "kl": 0.382818603515625, "learning_rate": 1.840722520690921e-05, "loss": 0.0153, "reward": 1.3562876941636204, "reward_std": 0.6140932853326376, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13746229895623402, "rewards/format_reward": 0.64375, "rewards/reasoning_steps_reward": 0.8500000033527613, "step": 1195 }, { "completion_length": 1024.0, "epoch": 0.2650432765975069, "grad_norm": 0.18812254182518015, "kl": 0.41695556640625, "learning_rate": 1.838628566581236e-05, "loss": 0.0167, "reward": 1.2772326513193548, "reward_std": 0.6012849385821027, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14985067906891344, "rewards/format_reward": 0.5625, "rewards/reasoning_steps_reward": 0.8583333371207118, "step": 1200 }, { "epoch": 0.2650432765975069, "eval_completion_length": 1024.0, "eval_kl": 0.2277880859375, "eval_loss": 0.009126170538365841, "eval_reward": 1.237759041786194, "eval_reward_std": 0.5342784489318728, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.14890761934220792, "eval_rewards/format_reward": 0.475, "eval_rewards/reasoning_steps_reward": 0.9116666686534881, "eval_runtime": 202.9615, "eval_samples_per_second": 0.488, "eval_steps_per_second": 0.123, "step": 1200 }, { "completion_length": 1024.0, "epoch": 0.2661476235833299, "grad_norm": 0.1396938684266728, "kl": 0.1735382080078125, "learning_rate": 1.8365221453320625e-05, "loss": 0.0069, "reward": 1.159564550407231, "reward_std": 0.5056648141493497, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13418545336462556, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.9187500011175871, "step": 1205 }, { "completion_length": 1024.0, "epoch": 0.26725197056915284, "grad_norm": 0.17009112400745297, "kl": 0.148822021484375, "learning_rate": 1.8344032882576784e-05, "loss": 0.006, "reward": 1.7244970690459014, "reward_std": 0.2821242625116156, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.056752927817615276, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.9500000007450581, "step": 1210 }, { "completion_length": 1020.8625, "epoch": 0.2683563175549758, "grad_norm": 0.08738103322845646, "kl": 0.14678955078125, "learning_rate": 1.8322720268572333e-05, "loss": 0.0059, "reward": 1.88584890589118, "reward_std": 0.15781026161916997, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.020401104938173376, "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.968750000745058, "step": 1215 }, { "completion_length": 1021.76875, "epoch": 0.2694606645407987, "grad_norm": 0.17622717031700735, "kl": 0.150677490234375, "learning_rate": 1.83012839281428e-05, "loss": 0.006, "reward": 1.6834479916840792, "reward_std": 0.3111886321689781, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.026968683502354908, "rewards/format_reward": 0.7375, "rewards/reasoning_steps_reward": 0.972916667163372, "step": 1220 }, { "completion_length": 1022.925, "epoch": 0.27056501152662166, "grad_norm": 0.11918594376170559, "kl": 0.285870361328125, "learning_rate": 1.827972417996306e-05, "loss": 0.0114, "reward": 1.5191262364387512, "reward_std": 0.3489377611604709, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04962377124284103, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 0.9562500014901161, "step": 1225 }, { "completion_length": 1024.0, "epoch": 0.2716693585124446, "grad_norm": 0.06285065134173069, "kl": 0.13602294921875, "learning_rate": 1.8258041344542567e-05, "loss": 0.0054, "reward": 0.9662943309172988, "reward_std": 0.10380094405736599, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.021205670808194556, "rewards/format_reward": 0.025, "rewards/reasoning_steps_reward": 0.9625000018626452, "step": 1230 }, { "completion_length": 1024.0, "epoch": 0.27277370549826757, "grad_norm": 0.07686648446244543, "kl": 0.116314697265625, "learning_rate": 1.823623574422061e-05, "loss": 0.0046, "reward": 0.944898908957839, "reward_std": 0.07661853031105466, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.019684416962354588, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.964583334326744, "step": 1235 }, { "completion_length": 1020.00625, "epoch": 0.27387805248409053, "grad_norm": 0.07754923602159525, "kl": 0.1475494384765625, "learning_rate": 1.821430770316151e-05, "loss": 0.0059, "reward": 0.9660956308245658, "reward_std": 0.1536989317713818, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.021404369897666697, "rewards/format_reward": 0.0375, "rewards/reasoning_steps_reward": 0.9500000022351742, "step": 1240 }, { "completion_length": 1024.0, "epoch": 0.27498239946991343, "grad_norm": 0.11285092638491938, "kl": 0.1615203857421875, "learning_rate": 1.8192257547349805e-05, "loss": 0.0065, "reward": 0.9706789815798402, "reward_std": 0.14492510877234963, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.037654361013386504, "rewards/format_reward": 0.05, "rewards/reasoning_steps_reward": 0.9583333354443312, "step": 1245 }, { "completion_length": 1022.925, "epoch": 0.2760867464557364, "grad_norm": 0.13307393716537338, "kl": 0.3531829833984375, "learning_rate": 1.817008560458541e-05, "loss": 0.0141, "reward": 0.7812707336619497, "reward_std": 0.3402592138071441, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09789594054198006, "rewards/format_reward": 0.05, "rewards/reasoning_steps_reward": 0.829166672565043, "step": 1250 }, { "completion_length": 1024.0, "epoch": 0.27719109344155934, "grad_norm": 0.25658627816669477, "kl": 0.487921142578125, "learning_rate": 1.814779220447872e-05, "loss": 0.0195, "reward": 0.6302142185159028, "reward_std": 0.5167911179111343, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14686911877834063, "rewards/format_reward": 0.05625, "rewards/reasoning_steps_reward": 0.7208333412185312, "step": 1255 }, { "completion_length": 1024.0, "epoch": 0.2782954404273823, "grad_norm": 0.5529738552955684, "kl": 0.373065185546875, "learning_rate": 1.8125377678445755e-05, "loss": 0.0149, "reward": 0.9013677610084414, "reward_std": 0.4689164909107603, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10071557244591531, "rewards/format_reward": 0.18125, "rewards/reasoning_steps_reward": 0.8208333399146795, "step": 1260 }, { "completion_length": 1023.875, "epoch": 0.2793997874132052, "grad_norm": 3.3418210634211905, "kl": 2.01922607421875, "learning_rate": 1.8102842359703177e-05, "loss": 0.0809, "reward": 1.1432768110185862, "reward_std": 0.44278795822719985, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09005652678351908, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.8583333399146795, "step": 1265 }, { "completion_length": 1024.0, "epoch": 0.28050413439902816, "grad_norm": 4.521061137528278, "kl": 1.19075927734375, "learning_rate": 1.8080186583263386e-05, "loss": 0.0476, "reward": 0.5847210302948952, "reward_std": 0.41035552892026317, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2277789770560048, "rewards/format_reward": 0.05625, "rewards/reasoning_steps_reward": 0.7562500078231096, "step": 1270 }, { "completion_length": 1024.0, "epoch": 0.2816084813848511, "grad_norm": 30.812440821129783, "kl": 250.4185028076172, "learning_rate": 1.8057410685929505e-05, "loss": 10.0457, "reward": 0.5216961699537933, "reward_std": 0.43772149360393087, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22413716676055628, "rewards/format_reward": 0.04375, "rewards/reasoning_steps_reward": 0.702083344757557, "step": 1275 }, { "completion_length": 1006.43125, "epoch": 0.2827128283706741, "grad_norm": 2.740052403563944, "kl": 2.95443115234375, "learning_rate": 1.8034515006290398e-05, "loss": 0.1182, "reward": 0.6823957259068265, "reward_std": 0.3967391650963691, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13010428047346068, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.7437500119209289, "step": 1280 }, { "completion_length": 1022.3875, "epoch": 0.28381717535649703, "grad_norm": 39.81394935188374, "kl": 2.14736328125, "learning_rate": 1.8011499884715616e-05, "loss": 0.086, "reward": 0.8438511086627841, "reward_std": 0.30578292898626386, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.07281556303551043, "rewards/format_reward": 0.05625, "rewards/reasoning_steps_reward": 0.8541666775941849, "step": 1285 }, { "completion_length": 1024.0, "epoch": 0.28492152234231993, "grad_norm": 3.137457380908233, "kl": 2.644873046875, "learning_rate": 1.7988365663350352e-05, "loss": 0.1059, "reward": 0.9859581716358662, "reward_std": 0.22302457209725618, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06195849884697964, "rewards/format_reward": 0.09375, "rewards/reasoning_steps_reward": 0.9541666701436042, "step": 1290 }, { "completion_length": 1021.1, "epoch": 0.2860258693281429, "grad_norm": 0.29805685695358275, "kl": 0.4004791259765625, "learning_rate": 1.7965112686110346e-05, "loss": 0.016, "reward": 0.9928273539990187, "reward_std": 0.24035521575530083, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04467265058710836, "rewards/format_reward": 0.1125, "rewards/reasoning_steps_reward": 0.9250000055879355, "step": 1295 }, { "completion_length": 1024.0, "epoch": 0.28713021631396585, "grad_norm": 1.5460480759475044, "kl": 0.447198486328125, "learning_rate": 1.7941741298676777e-05, "loss": 0.0179, "reward": 0.933686813339591, "reward_std": 0.21179361512779452, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03922985791132305, "rewards/format_reward": 0.05625, "rewards/reasoning_steps_reward": 0.9166666753590107, "step": 1300 }, { "epoch": 0.28713021631396585, "eval_completion_length": 1019.175, "eval_kl": 0.6944921875, "eval_loss": 0.027912691235542297, "eval_reward": 0.8045022475719452, "eval_reward_std": 0.29673283290376273, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.0638310891322908, "eval_rewards/format_reward": 0.05, "eval_rewards/reasoning_steps_reward": 0.8183333414793015, "eval_runtime": 202.212, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.124, "step": 1300 }, { "completion_length": 1024.0, "epoch": 0.2882345632997888, "grad_norm": 131.321337688551, "kl": 65.63963623046875, "learning_rate": 1.7918251848491118e-05, "loss": 2.6278, "reward": 0.7914097828324884, "reward_std": 0.35510434115416273, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0794235574917593, "rewards/format_reward": 0.0875, "rewards/reasoning_steps_reward": 0.7833333387970924, "step": 1305 }, { "completion_length": 999.3, "epoch": 0.28933891028561176, "grad_norm": 5.088107026581073, "kl": 16.488323974609376, "learning_rate": 1.7894644684749983e-05, "loss": 0.6609, "reward": 0.7671722872182727, "reward_std": 0.4012905497775364, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0869943821081847, "rewards/format_reward": 0.0875, "rewards/reasoning_steps_reward": 0.7666666701436042, "step": 1310 }, { "completion_length": 984.25, "epoch": 0.29044325727143466, "grad_norm": 5.662270180234321, "kl": 5.150537109375, "learning_rate": 1.7870920158399918e-05, "loss": 0.2062, "reward": 0.7325123744085431, "reward_std": 0.5204162761657812, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09040429510343415, "rewards/format_reward": 0.0875, "rewards/reasoning_steps_reward": 0.7354166703298688, "step": 1315 }, { "completion_length": 983.725, "epoch": 0.2915476042572576, "grad_norm": 3.8531905036661738, "kl": 2.1231201171875, "learning_rate": 1.7847078622132202e-05, "loss": 0.085, "reward": 0.6666571330279112, "reward_std": 0.4020955947952075, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08959287195330035, "rewards/format_reward": 0.025, "rewards/reasoning_steps_reward": 0.7312500027939677, "step": 1320 }, { "completion_length": 1024.0, "epoch": 0.2926519512430806, "grad_norm": 1.4713601895366548, "kl": 4.474505615234375, "learning_rate": 1.7823120430377593e-05, "loss": 0.1791, "reward": 0.8719107124954462, "reward_std": 0.20605486250725563, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04475596445258816, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.9041666679084301, "step": 1325 }, { "completion_length": 1024.0, "epoch": 0.29375629822890353, "grad_norm": 2.001254416658288, "kl": 234.38853149414064, "learning_rate": 1.7799045939301063e-05, "loss": 9.3666, "reward": 0.9464922484010458, "reward_std": 0.36131374030945834, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0597577509677194, "rewards/format_reward": 0.11875, "rewards/reasoning_steps_reward": 0.8875000007450581, "step": 1330 }, { "completion_length": 1024.0, "epoch": 0.2948606452147265, "grad_norm": 0.01825257511989942, "kl": 0.88497314453125, "learning_rate": 1.7774855506796497e-05, "loss": 0.0355, "reward": 0.9531695555895567, "reward_std": 0.0772957038158438, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025997111127071548, "rewards/format_reward": 0.025, "rewards/reasoning_steps_reward": 0.9541666679084301, "step": 1335 }, { "completion_length": 1024.0, "epoch": 0.2959649922005494, "grad_norm": 0.5453957093609844, "kl": 0.524688720703125, "learning_rate": 1.775054949248138e-05, "loss": 0.021, "reward": 0.9354104410856963, "reward_std": 0.13244989554941639, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.029172901513811668, "rewards/format_reward": 0.025, "rewards/reasoning_steps_reward": 0.9395833350718021, "step": 1340 }, { "completion_length": 1024.0, "epoch": 0.29706933918637235, "grad_norm": 2.4380567469335372, "kl": 1.2219635009765626, "learning_rate": 1.7726128257691447e-05, "loss": 0.0489, "reward": 0.9105375189334154, "reward_std": 0.16619428564858935, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04987915392703144, "rewards/format_reward": 0.0375, "rewards/reasoning_steps_reward": 0.9229166686534882, "step": 1345 }, { "completion_length": 1024.0, "epoch": 0.2981736861721953, "grad_norm": 1.4015232004301508, "kl": 0.3145111083984375, "learning_rate": 1.770159216547532e-05, "loss": 0.0126, "reward": 0.9414736803621053, "reward_std": 0.21337720166779944, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05227632584610546, "rewards/format_reward": 0.0625, "rewards/reasoning_steps_reward": 0.9312500022351742, "step": 1350 }, { "completion_length": 1024.0, "epoch": 0.29927803315801826, "grad_norm": 0.6098020893731024, "kl": 0.9691864013671875, "learning_rate": 1.7676941580589097e-05, "loss": 0.0388, "reward": 1.0848730199038983, "reward_std": 0.43680297569849247, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05471031935303472, "rewards/format_reward": 0.2375, "rewards/reasoning_steps_reward": 0.9020833350718022, "step": 1355 }, { "completion_length": 1024.0, "epoch": 0.3003823801438412, "grad_norm": 1.3473870887142767, "kl": 0.3821868896484375, "learning_rate": 1.7652176869490933e-05, "loss": 0.0153, "reward": 1.5648357531055808, "reward_std": 0.34688837417397733, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0351642573474237, "rewards/format_reward": 0.6625, "rewards/reasoning_steps_reward": 0.9375000013038516, "step": 1360 }, { "completion_length": 1024.0, "epoch": 0.3014867271296641, "grad_norm": 2.650927188181098, "kl": 34.706756591796875, "learning_rate": 1.76272984003356e-05, "loss": 1.3914, "reward": 1.7366739973425864, "reward_std": 0.3717085005620106, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0674926791811572, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.89791666790843, "step": 1365 }, { "completion_length": 1024.0, "epoch": 0.3025910741154871, "grad_norm": 1.1784705594565221, "kl": 1.138018798828125, "learning_rate": 1.7602306542969006e-05, "loss": 0.0455, "reward": 1.6846371553838253, "reward_std": 0.3902606235532744, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08202951550820217, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.8854166684672237, "step": 1370 }, { "completion_length": 1024.0, "epoch": 0.30369542110131004, "grad_norm": 1.069729400209864, "kl": 0.304193115234375, "learning_rate": 1.7577201668922702e-05, "loss": 0.0122, "reward": 1.725393744930625, "reward_std": 0.2859493938201467, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07460626843894716, "rewards/format_reward": 0.9125, "rewards/reasoning_steps_reward": 0.8875000020489097, "step": 1375 }, { "completion_length": 1024.0, "epoch": 0.304799768087133, "grad_norm": 11.760360030790453, "kl": 4.318035888671875, "learning_rate": 1.7551984151408363e-05, "loss": 0.173, "reward": 1.8561431474983692, "reward_std": 0.20155350471841302, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.035523528978228566, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9416666679084301, "step": 1380 }, { "completion_length": 1024.0, "epoch": 0.3059041150729559, "grad_norm": 0.7829414634076728, "kl": 0.50015869140625, "learning_rate": 1.7526654365312222e-05, "loss": 0.0201, "reward": 1.807436482422054, "reward_std": 0.21474389426180096, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.038396865250979316, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9208333337679505, "step": 1385 }, { "completion_length": 1024.0, "epoch": 0.30700846205877885, "grad_norm": 0.3526252407355771, "kl": 0.42960205078125, "learning_rate": 1.750121268718951e-05, "loss": 0.0172, "reward": 1.7724484391510487, "reward_std": 0.3056804820618481, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06088490542335308, "rewards/format_reward": 0.9125, "rewards/reasoning_steps_reward": 0.9208333365619182, "step": 1390 }, { "completion_length": 1024.0, "epoch": 0.3081128090446018, "grad_norm": 0.8538212226535026, "kl": 0.447174072265625, "learning_rate": 1.7475659495258864e-05, "loss": 0.0179, "reward": 1.1615405725315213, "reward_std": 0.3998776563397314, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1238761033207993, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.8229166688397527, "step": 1395 }, { "completion_length": 1024.0, "epoch": 0.30921715603042477, "grad_norm": 0.22647138061338717, "kl": 0.1839111328125, "learning_rate": 1.7449995169396693e-05, "loss": 0.0074, "reward": 0.598058795183897, "reward_std": 0.45160960222401625, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17277454470386147, "rewards/format_reward": 0.13125, "rewards/reasoning_steps_reward": 0.6395833432674408, "step": 1400 }, { "epoch": 0.30921715603042477, "eval_completion_length": 1024.0, "eval_kl": 0.114072265625, "eval_loss": 0.004564680624753237, "eval_reward": 0.152089421749115, "eval_reward_std": 0.1883784442592878, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.23457725435495377, "eval_rewards/format_reward": 0.03, "eval_rewards/reasoning_steps_reward": 0.3566666767001152, "eval_runtime": 206.7244, "eval_samples_per_second": 0.479, "eval_steps_per_second": 0.121, "step": 1400 }, { "completion_length": 1024.0, "epoch": 0.3103215030162477, "grad_norm": 0.1914184381458074, "kl": 0.1725677490234375, "learning_rate": 1.7424220091131536e-05, "loss": 0.0069, "reward": 0.0531884940341115, "reward_std": 0.23593127114654636, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.20722817881032823, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.18541666995733977, "step": 1405 }, { "completion_length": 1019.25, "epoch": 0.3114258500020706, "grad_norm": 0.18859472672382957, "kl": 0.270391845703125, "learning_rate": 1.739833464363838e-05, "loss": 0.0108, "reward": 0.07014747215434909, "reward_std": 0.2072645182282585, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.19651920032940212, "rewards/format_reward": 0.09375, "rewards/reasoning_steps_reward": 0.17291667088866233, "step": 1410 }, { "completion_length": 1024.0, "epoch": 0.3125301969878936, "grad_norm": 0.4524235162500008, "kl": 0.26759033203125, "learning_rate": 1.7372339211732988e-05, "loss": 0.0107, "reward": 0.767735379934311, "reward_std": 0.3956183263140701, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09684796158981043, "rewards/format_reward": 0.25, "rewards/reasoning_steps_reward": 0.6145833486691117, "step": 1415 }, { "completion_length": 1024.0, "epoch": 0.31363454397371654, "grad_norm": 0.29966595716364663, "kl": 0.2554931640625, "learning_rate": 1.734623418186615e-05, "loss": 0.0102, "reward": 0.9944251235574484, "reward_std": 0.3010030138277216, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13890820817177882, "rewards/format_reward": 0.1875, "rewards/reasoning_steps_reward": 0.9395833373069763, "step": 1420 }, { "completion_length": 1024.0, "epoch": 0.3147388909595395, "grad_norm": 0.34966167035721935, "kl": 0.2528564453125, "learning_rate": 1.7320019942117954e-05, "loss": 0.0101, "reward": 1.0877137396484613, "reward_std": 0.30549203366972505, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13311959272250534, "rewards/format_reward": 0.23125, "rewards/reasoning_steps_reward": 0.977083333581686, "step": 1425 }, { "completion_length": 1024.0, "epoch": 0.31584323794536245, "grad_norm": 0.32149028086929365, "kl": 0.30032958984375, "learning_rate": 1.729369688219202e-05, "loss": 0.012, "reward": 1.3974668875336647, "reward_std": 0.3835971452834201, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13378311347041744, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 1.0, "step": 1430 }, { "completion_length": 1024.0, "epoch": 0.31694758493118536, "grad_norm": 0.458686181420835, "kl": 0.269873046875, "learning_rate": 1.7267265393409684e-05, "loss": 0.0108, "reward": 1.7222345098853111, "reward_std": 0.2661111972852268, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11526548723049927, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 1.0, "step": 1435 }, { "completion_length": 1024.0, "epoch": 0.3180519319170083, "grad_norm": 0.4300493124515638, "kl": 0.28525390625, "learning_rate": 1.7240725868704218e-05, "loss": 0.0114, "reward": 1.9266040623188019, "reward_std": 0.16214110484579577, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.0754792769512278, "rewards/format_reward": 0.9875, "rewards/reasoning_steps_reward": 0.995833333581686, "step": 1440 }, { "completion_length": 1024.0, "epoch": 0.31915627890283127, "grad_norm": 0.42969892888824873, "kl": 0.2861572265625, "learning_rate": 1.7214078702614946e-05, "loss": 0.0114, "reward": 2.032954090833664, "reward_std": 0.19149340623989702, "rewards/accuracy_reward": 0.05625, "rewards/cosine_scaled_reward": -0.012879242049530148, "rewards/format_reward": 1.0, "rewards/reasoning_steps_reward": 0.989583333581686, "step": 1445 }, { "completion_length": 1024.0, "epoch": 0.3202606258886542, "grad_norm": 0.49367663303009623, "kl": 0.27318115234375, "learning_rate": 1.7187324291281423e-05, "loss": 0.0109, "reward": 1.9512878715991975, "reward_std": 0.1059367892348746, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.040378800382313784, "rewards/format_reward": 0.9875, "rewards/reasoning_steps_reward": 0.9916666686534882, "step": 1450 }, { "completion_length": 1024.0, "epoch": 0.3213649728744772, "grad_norm": 0.3778408957712114, "kl": 0.32586669921875, "learning_rate": 1.71604630324375e-05, "loss": 0.013, "reward": 1.9965640038251877, "reward_std": 0.22204069324834563, "rewards/accuracy_reward": 0.05, "rewards/cosine_scaled_reward": -0.001352671076892875, "rewards/format_reward": 0.9875, "rewards/reasoning_steps_reward": 0.9604166693985462, "step": 1455 }, { "completion_length": 1024.0, "epoch": 0.3224693198603001, "grad_norm": 0.39152943132673057, "kl": 0.3145263671875, "learning_rate": 1.7133495325405448e-05, "loss": 0.0126, "reward": 1.9360227391123772, "reward_std": 0.2746580336162879, "rewards/accuracy_reward": 0.05, "rewards/cosine_scaled_reward": -0.013977252056065481, "rewards/format_reward": 1.0, "rewards/reasoning_steps_reward": 0.9000000044703483, "step": 1460 }, { "completion_length": 1024.0, "epoch": 0.32357366684612304, "grad_norm": 0.5058134430224523, "kl": 0.33702392578125, "learning_rate": 1.7106421571090003e-05, "loss": 0.0135, "reward": 1.85144801735878, "reward_std": 0.2626168250788396, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.033968651014947684, "rewards/format_reward": 1.0, "rewards/reasoning_steps_reward": 0.8604166712611914, "step": 1465 }, { "completion_length": 1024.0, "epoch": 0.324678013831946, "grad_norm": 0.45638817493378453, "kl": 0.34488525390625, "learning_rate": 1.7079242171972417e-05, "loss": 0.0138, "reward": 1.705393605818972, "reward_std": 0.25409652892012674, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.04668973356310744, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.8895833380520344, "step": 1470 }, { "completion_length": 1024.0, "epoch": 0.32578236081776896, "grad_norm": 0.31595014246293646, "kl": 0.23759765625, "learning_rate": 1.705195753210446e-05, "loss": 0.0095, "reward": 1.4653432246297597, "reward_std": 0.33772912265012567, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.072156780314981, "rewards/format_reward": 0.56875, "rewards/reasoning_steps_reward": 0.962500001117587, "step": 1475 }, { "completion_length": 1024.0, "epoch": 0.3268867078035919, "grad_norm": 0.3387234465375613, "kl": 0.21739501953125, "learning_rate": 1.7024568057102423e-05, "loss": 0.0087, "reward": 1.2381744548678397, "reward_std": 0.3804714563237212, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.07015887794332229, "rewards/format_reward": 0.33125, "rewards/reasoning_steps_reward": 0.964583334326744, "step": 1480 }, { "completion_length": 1024.0, "epoch": 0.3279910547894148, "grad_norm": 0.3635919256435814, "kl": 0.290997314453125, "learning_rate": 1.6997074154141097e-05, "loss": 0.0116, "reward": 1.2281988142058253, "reward_std": 0.522791172178404, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.08638452111918013, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.8833333363756537, "step": 1485 }, { "completion_length": 1024.0, "epoch": 0.3290954017752378, "grad_norm": 0.31289390902077713, "kl": 0.413104248046875, "learning_rate": 1.69694762319477e-05, "loss": 0.0165, "reward": 1.2244404914788902, "reward_std": 0.5740925252861416, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.05264285028388258, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.745833340473473, "step": 1490 }, { "completion_length": 1019.89375, "epoch": 0.33019974876106073, "grad_norm": 0.2543210545589851, "kl": 0.35384521484375, "learning_rate": 1.694177470079581e-05, "loss": 0.0142, "reward": 1.278636990953237, "reward_std": 0.653353753479314, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.08802968083273299, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.7729166712611913, "step": 1495 }, { "completion_length": 1024.0, "epoch": 0.3313040957468837, "grad_norm": 0.2000090354174927, "kl": 0.285101318359375, "learning_rate": 1.6913969972499272e-05, "loss": 0.0114, "reward": 1.4027412496507168, "reward_std": 0.6077661401930528, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12017541860113852, "rewards/format_reward": 0.69375, "rewards/reasoning_steps_reward": 0.8291666712611914, "step": 1500 }, { "epoch": 0.3313040957468837, "eval_completion_length": 1024.0, "eval_kl": 3.11359375, "eval_loss": 0.1255832314491272, "eval_reward": 1.5576036548614502, "eval_reward_std": 0.4220656427741051, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.13572968773543834, "eval_rewards/format_reward": 0.79, "eval_rewards/reasoning_steps_reward": 0.9033333349227906, "eval_runtime": 202.5651, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.123, "step": 1500 }, { "completion_length": 1024.0, "epoch": 0.33240844273270664, "grad_norm": 0.18666628132889443, "kl": 0.2407806396484375, "learning_rate": 1.688606246040607e-05, "loss": 0.0096, "reward": 1.5353901420719922, "reward_std": 0.3581777959698229, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1312765258422587, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.8791666684672237, "step": 1505 }, { "completion_length": 1024.0, "epoch": 0.33351278971852955, "grad_norm": 0.15057205371474208, "kl": 0.2106201171875, "learning_rate": 1.6858052579392182e-05, "loss": 0.0084, "reward": 1.6789043765515088, "reward_std": 0.3242158696106344, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09817895484156906, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.9083333365619183, "step": 1510 }, { "completion_length": 1024.0, "epoch": 0.3346171367043525, "grad_norm": 0.15908647576995252, "kl": 0.26219482421875, "learning_rate": 1.682994074585541e-05, "loss": 0.0105, "reward": 1.6573340767994522, "reward_std": 0.33736471326956236, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.09058259856828954, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.8979166688397526, "step": 1515 }, { "completion_length": 1024.0, "epoch": 0.33572148369017546, "grad_norm": 0.15174865851665464, "kl": 0.321112060546875, "learning_rate": 1.6801727377709195e-05, "loss": 0.0128, "reward": 1.6820120507851244, "reward_std": 0.3425358484266326, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10132128554250812, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9208333363756538, "step": 1520 }, { "completion_length": 1024.0, "epoch": 0.3368258306759984, "grad_norm": 1.60136854765388, "kl": 0.7475982666015625, "learning_rate": 1.6773412894376404e-05, "loss": 0.0299, "reward": 1.664339251909405, "reward_std": 0.36049082253084636, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08149407562686975, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.9270833354443312, "step": 1525 }, { "completion_length": 1019.6375, "epoch": 0.3379301776618213, "grad_norm": 749.4573047515579, "kl": 22.88638916015625, "learning_rate": 1.674499771678309e-05, "loss": 0.9147, "reward": 1.6466619638726114, "reward_std": 0.3481345805644196, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07000470066104754, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.9416666677221656, "step": 1530 }, { "completion_length": 1024.0, "epoch": 0.3390345246476443, "grad_norm": 0.21658147441883935, "kl": 0.3058441162109375, "learning_rate": 1.6716482267352234e-05, "loss": 0.0123, "reward": 1.7539634361863137, "reward_std": 0.3016778468489065, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06686989046866074, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.9520833343267441, "step": 1535 }, { "completion_length": 1012.55, "epoch": 0.34013887163346723, "grad_norm": 461.5916103955411, "kl": 40.589141845703125, "learning_rate": 1.6687866969997483e-05, "loss": 1.6231, "reward": 1.6885167896049098, "reward_std": 0.3414273451831832, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07606654057235573, "rewards/format_reward": 0.85625, "rewards/reasoning_steps_reward": 0.9083333358168602, "step": 1540 }, { "completion_length": 1023.0, "epoch": 0.3412432186192902, "grad_norm": 5.713861291488285, "kl": 1.279107666015625, "learning_rate": 1.665915225011681e-05, "loss": 0.0512, "reward": 1.5734828183427454, "reward_std": 0.4893728211319285, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0848505121160997, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.8770833380520344, "step": 1545 }, { "completion_length": 1017.9, "epoch": 0.34234756560511315, "grad_norm": 44.99030930635304, "kl": 6.491766357421875, "learning_rate": 1.663033853458624e-05, "loss": 0.2599, "reward": 1.565668173879385, "reward_std": 0.4733093095805089, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0822484963136958, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.8229166740551591, "step": 1550 }, { "completion_length": 995.39375, "epoch": 0.34345191259093605, "grad_norm": 30.752879788425222, "kl": 1.7755828857421876, "learning_rate": 1.660142625175346e-05, "loss": 0.071, "reward": 1.2021763553842901, "reward_std": 0.407320237372187, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10199031746979018, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.4916666788980365, "step": 1555 }, { "completion_length": 1016.3625, "epoch": 0.344556259576759, "grad_norm": 4.863397224735704, "kl": 3.19599609375, "learning_rate": 1.6572415831431466e-05, "loss": 0.1281, "reward": 1.0388813458383084, "reward_std": 0.4170527165522799, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10278532494285172, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.3229166742414236, "step": 1560 }, { "completion_length": 1016.2, "epoch": 0.34566060656258196, "grad_norm": 2.693390293032911, "kl": 1.61478271484375, "learning_rate": 1.6543307704892196e-05, "loss": 0.0647, "reward": 1.0753739975392818, "reward_std": 0.3687583804799942, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.085042677965248, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.3291666736826301, "step": 1565 }, { "completion_length": 1024.0, "epoch": 0.3467649535484049, "grad_norm": 1.2274816870977052, "kl": 0.848553466796875, "learning_rate": 1.6514102304860077e-05, "loss": 0.034, "reward": 1.0539750020951033, "reward_std": 0.43142651255548115, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0876916709530633, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.3166666740551591, "step": 1570 }, { "completion_length": 1024.0, "epoch": 0.3478693005342279, "grad_norm": 0.2539185949163488, "kl": 0.2436279296875, "learning_rate": 1.6484800065505627e-05, "loss": 0.0097, "reward": 1.1815047591924668, "reward_std": 0.44734334169734213, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05182858537846187, "rewards/format_reward": 0.725, "rewards/reasoning_steps_reward": 0.5083333438262343, "step": 1575 }, { "completion_length": 1024.0, "epoch": 0.3489736475200508, "grad_norm": 0.2697024562923024, "kl": 78.62169189453125, "learning_rate": 1.6455401422438984e-05, "loss": 3.1515, "reward": 1.8654019482433797, "reward_std": 0.13044934055097884, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06168138112694806, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9770833380520344, "step": 1580 }, { "completion_length": 1024.0, "epoch": 0.35007799450587374, "grad_norm": 0.6487746875720865, "kl": 0.379437255859375, "learning_rate": 1.6425906812703435e-05, "loss": 0.0152, "reward": 1.7737618699669837, "reward_std": 0.2546435080017545, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.08457146059081425, "rewards/format_reward": 0.85625, "rewards/reasoning_steps_reward": 0.995833334326744, "step": 1585 }, { "completion_length": 1024.0, "epoch": 0.3511823414916967, "grad_norm": 0.16041873093251072, "kl": 0.50980224609375, "learning_rate": 1.6396316674768914e-05, "loss": 0.0204, "reward": 1.61807621717453, "reward_std": 0.3785809577530017, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.11317378490348347, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.9312500022351742, "step": 1590 }, { "completion_length": 1024.0, "epoch": 0.35228668847751965, "grad_norm": 0.16080044982174893, "kl": 0.220855712890625, "learning_rate": 1.6366631448525486e-05, "loss": 0.0088, "reward": 1.6727639326825738, "reward_std": 0.4138571529452747, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10848606025974732, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9125000037252903, "step": 1595 }, { "completion_length": 1024.0, "epoch": 0.3533910354633426, "grad_norm": 0.15122244016755843, "kl": 0.25135498046875, "learning_rate": 1.6336851575276814e-05, "loss": 0.0101, "reward": 1.5872814737260341, "reward_std": 0.4882094876725205, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1023018532214337, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.8583333350718021, "step": 1600 }, { "epoch": 0.3533910354633426, "eval_completion_length": 1024.0, "eval_kl": 0.257392578125, "eval_loss": 0.010239919647574425, "eval_reward": 1.5994415652751923, "eval_reward_std": 0.5397925276041496, "eval_rewards/accuracy_reward": 0.005, "eval_rewards/cosine_scaled_reward": -0.0838917788118124, "eval_rewards/format_reward": 0.815, "eval_rewards/reasoning_steps_reward": 0.8633333361148834, "eval_runtime": 204.9068, "eval_samples_per_second": 0.483, "eval_steps_per_second": 0.122, "step": 1600 }, { "completion_length": 1024.0, "epoch": 0.3544953824491655, "grad_norm": 0.15278881164591143, "kl": 0.215399169921875, "learning_rate": 1.630697749773359e-05, "loss": 0.0086, "reward": 1.6932586930692195, "reward_std": 0.4409494582350817, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.06715795926138526, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.9104166686534881, "step": 1605 }, { "completion_length": 1024.0, "epoch": 0.35559972943498847, "grad_norm": 0.12847749575672446, "kl": 0.308056640625, "learning_rate": 1.627700966000696e-05, "loss": 0.0123, "reward": 1.5046606879681348, "reward_std": 0.5595257567225417, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0911726443493535, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.8333333367481828, "step": 1610 }, { "completion_length": 1024.0, "epoch": 0.3567040764208114, "grad_norm": 0.12461377449215352, "kl": 0.287921142578125, "learning_rate": 1.6246948507601915e-05, "loss": 0.0115, "reward": 1.4486139392480255, "reward_std": 0.5760282400807227, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07846938786969986, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.8208333348855377, "step": 1615 }, { "completion_length": 1024.0, "epoch": 0.3578084234066344, "grad_norm": 0.03891215118141946, "kl": 0.10467529296875, "learning_rate": 1.621679448741067e-05, "loss": 0.0042, "reward": 1.7575967930257321, "reward_std": 0.3055588886391433, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01323653027502587, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.977083333581686, "step": 1620 }, { "completion_length": 1024.0, "epoch": 0.35891277039245734, "grad_norm": 0.0617309421173978, "kl": 0.173211669921875, "learning_rate": 1.618654804770603e-05, "loss": 0.007, "reward": 1.7544544816017151, "reward_std": 0.3281066141825704, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.033045521087115046, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.9187500014901161, "step": 1625 }, { "completion_length": 1024.0, "epoch": 0.36001711737828024, "grad_norm": 0.11475484692618709, "kl": 0.14339599609375, "learning_rate": 1.615620963813471e-05, "loss": 0.0057, "reward": 1.8531198611482977, "reward_std": 0.16819539086868646, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.026046801151460387, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9479166677221655, "step": 1630 }, { "completion_length": 1024.0, "epoch": 0.3611214643641032, "grad_norm": 1.7098540572139886, "kl": 0.311505126953125, "learning_rate": 1.6125779709710668e-05, "loss": 0.0125, "reward": 1.846768843382597, "reward_std": 0.21533012690188116, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02406449381742277, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9458333350718021, "step": 1635 }, { "completion_length": 1024.0, "epoch": 0.36222581134992615, "grad_norm": 0.10500065061318996, "kl": 0.172625732421875, "learning_rate": 1.6095258714808373e-05, "loss": 0.0069, "reward": 1.829521244764328, "reward_std": 0.2395985798266338, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03297876436563456, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.937500000745058, "step": 1640 }, { "completion_length": 1024.0, "epoch": 0.3633301583357491, "grad_norm": 0.12252448809449357, "kl": 0.18194580078125, "learning_rate": 1.606464710715612e-05, "loss": 0.0073, "reward": 1.8227525861933827, "reward_std": 0.21451298040622077, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03141408516330557, "rewards/format_reward": 0.9125, "rewards/reasoning_steps_reward": 0.9416666684672237, "step": 1645 }, { "completion_length": 1024.0, "epoch": 0.364434505321572, "grad_norm": 0.7445776320336748, "kl": 0.30133056640625, "learning_rate": 1.603394534182925e-05, "loss": 0.012, "reward": 1.6621078178286552, "reward_std": 0.27496500448987715, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05872552102773625, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.895833333954215, "step": 1650 }, { "completion_length": 1022.99375, "epoch": 0.36553885230739497, "grad_norm": 0.1955073245601333, "kl": 0.409490966796875, "learning_rate": 1.600315387524339e-05, "loss": 0.0164, "reward": 1.3730967482551932, "reward_std": 0.65049932035663, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0977365891343652, "rewards/format_reward": 0.65, "rewards/reasoning_steps_reward": 0.8208333378657698, "step": 1655 }, { "completion_length": 1024.0, "epoch": 0.3666431992932179, "grad_norm": 0.08757147668498837, "kl": 0.2857421875, "learning_rate": 1.5972273165147697e-05, "loss": 0.0115, "reward": 1.3178067412227392, "reward_std": 0.6414556607540363, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0717766029327322, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.8458333369344473, "step": 1660 }, { "completion_length": 1024.0, "epoch": 0.3677475462790409, "grad_norm": 0.10077064026422362, "kl": 0.194622802734375, "learning_rate": 1.5941303670618018e-05, "loss": 0.0078, "reward": 1.501453479193151, "reward_std": 0.47270524825935356, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.040213195209798866, "rewards/format_reward": 0.61875, "rewards/reasoning_steps_reward": 0.9166666686534881, "step": 1665 }, { "completion_length": 1024.0, "epoch": 0.36885189326486384, "grad_norm": 0.09456211756292313, "kl": 0.209967041015625, "learning_rate": 1.591024585205007e-05, "loss": 0.0084, "reward": 1.5002569787204265, "reward_std": 0.47431263369280713, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.051826363683358065, "rewards/format_reward": 0.6625, "rewards/reasoning_steps_reward": 0.8895833346992731, "step": 1670 }, { "completion_length": 1024.0, "epoch": 0.36995624025068674, "grad_norm": 0.10501416195016813, "kl": 0.20081787109375, "learning_rate": 1.587910017115262e-05, "loss": 0.008, "reward": 1.4864585481584072, "reward_std": 0.5479732289919411, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0489581265635934, "rewards/format_reward": 0.64375, "rewards/reasoning_steps_reward": 0.8916666679084301, "step": 1675 }, { "completion_length": 1024.0, "epoch": 0.3710605872365097, "grad_norm": 0.07587020227662776, "kl": 0.210357666015625, "learning_rate": 1.5847867090940602e-05, "loss": 0.0084, "reward": 1.6044495470821858, "reward_std": 0.4506511878987567, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05596711239659271, "rewards/format_reward": 0.74375, "rewards/reasoning_steps_reward": 0.9166666686534881, "step": 1680 }, { "completion_length": 1024.0, "epoch": 0.37216493422233266, "grad_norm": 0.13633268674501928, "kl": 0.245611572265625, "learning_rate": 1.5816547075728227e-05, "loss": 0.0098, "reward": 1.5000795137137175, "reward_std": 0.4664591760686562, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06450380514434073, "rewards/format_reward": 0.6625, "rewards/reasoning_steps_reward": 0.9020833361893892, "step": 1685 }, { "completion_length": 1024.0, "epoch": 0.3732692812081556, "grad_norm": 0.1289716587802781, "kl": 0.2935333251953125, "learning_rate": 1.5785140591122107e-05, "loss": 0.0117, "reward": 1.4516793651506306, "reward_std": 0.4440517393491973, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08790395214164164, "rewards/format_reward": 0.6875, "rewards/reasoning_steps_reward": 0.8520833365619183, "step": 1690 }, { "completion_length": 1024.0, "epoch": 0.37437362819397857, "grad_norm": 0.08366432741697126, "kl": 0.2215484619140625, "learning_rate": 1.57536481040143e-05, "loss": 0.0089, "reward": 1.4857848590239882, "reward_std": 0.5149718122185731, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05796512944652932, "rewards/format_reward": 0.6375, "rewards/reasoning_steps_reward": 0.9062500024214387, "step": 1695 }, { "completion_length": 1024.0, "epoch": 0.37547797517980147, "grad_norm": 0.12095767601763074, "kl": 0.2228729248046875, "learning_rate": 1.57220700825754e-05, "loss": 0.0089, "reward": 1.5507157089188695, "reward_std": 0.5246146993404182, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.051367602551181335, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.8895833346992731, "step": 1700 }, { "epoch": 0.37547797517980147, "eval_completion_length": 1024.0, "eval_kl": 0.376298828125, "eval_loss": 0.015111659653484821, "eval_reward": 1.4007956439256668, "eval_reward_std": 0.5410330631426041, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.07253768887836486, "eval_rewards/format_reward": 0.635, "eval_rewards/reasoning_steps_reward": 0.8383333373069763, "eval_runtime": 203.5513, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.123, "step": 1700 }, { "completion_length": 1024.0, "epoch": 0.37658232216562443, "grad_norm": 0.1043657866474859, "kl": 0.22974853515625, "learning_rate": 1.5690406996247557e-05, "loss": 0.0092, "reward": 1.4823718063533307, "reward_std": 0.5257471238997823, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05096150914614554, "rewards/format_reward": 0.64375, "rewards/reasoning_steps_reward": 0.8895833350718021, "step": 1705 }, { "completion_length": 1021.3, "epoch": 0.3776866691514474, "grad_norm": 0.1450103124132454, "kl": 222.7259979248047, "learning_rate": 1.5658659315737505e-05, "loss": 8.9174, "reward": 1.3179884374141693, "reward_std": 0.6270647759190069, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.08826154011912876, "rewards/format_reward": 0.58125, "rewards/reasoning_steps_reward": 0.8187500039115548, "step": 1710 }, { "completion_length": 1024.0, "epoch": 0.37879101613727034, "grad_norm": 0.4320971163586487, "kl": 2.19267578125, "learning_rate": 1.5626827513009565e-05, "loss": 0.0878, "reward": 1.3571282140910625, "reward_std": 0.6224009027850116, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10328842537855962, "rewards/format_reward": 0.6375, "rewards/reasoning_steps_reward": 0.822916672565043, "step": 1715 }, { "completion_length": 1024.0, "epoch": 0.3798953631230933, "grad_norm": 0.10636800980618685, "kl": 1.1803497314453124, "learning_rate": 1.5594912061278627e-05, "loss": 0.0472, "reward": 1.4040614984929563, "reward_std": 0.45519052888548683, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08552179490507114, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.820833338983357, "step": 1720 }, { "completion_length": 1019.55, "epoch": 0.3809997101089162, "grad_norm": 2.383636335628273, "kl": 0.4132568359375, "learning_rate": 1.5562913435003113e-05, "loss": 0.0165, "reward": 1.562475298345089, "reward_std": 0.47447351760647505, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05210799162014155, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.8583333352580667, "step": 1725 }, { "completion_length": 1024.0, "epoch": 0.38210405709473916, "grad_norm": 14.647822546025663, "kl": 0.928717041015625, "learning_rate": 1.5530832109877932e-05, "loss": 0.0372, "reward": 1.7276832605712116, "reward_std": 0.3270437012415641, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03065001876966562, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9208333371207118, "step": 1730 }, { "completion_length": 1024.0, "epoch": 0.3832084040805621, "grad_norm": 9.751639098465139, "kl": 8.048651123046875, "learning_rate": 1.5498668562827397e-05, "loss": 0.3212, "reward": 1.7738482117652894, "reward_std": 0.29785865079848006, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.026151732992730103, "rewards/format_reward": 0.85625, "rewards/reasoning_steps_reward": 0.9437500014901161, "step": 1735 }, { "completion_length": 1024.0, "epoch": 0.38431275106638507, "grad_norm": 29.60838556403236, "kl": 1.1584747314453125, "learning_rate": 1.5466423271998144e-05, "loss": 0.0463, "reward": 1.8169302485883236, "reward_std": 0.2547533855103211, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.022653027176784236, "rewards/format_reward": 0.8875, "rewards/reasoning_steps_reward": 0.9520833358168602, "step": 1740 }, { "completion_length": 1024.0, "epoch": 0.38541709805220803, "grad_norm": 1.493106786519115, "kl": 1.753643798828125, "learning_rate": 1.5434096716752023e-05, "loss": 0.0703, "reward": 1.736639281362295, "reward_std": 0.2719473097446098, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0321106720060925, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.9375000029802323, "step": 1745 }, { "completion_length": 1012.91875, "epoch": 0.38652144503803093, "grad_norm": 0.2769839530384672, "kl": 3.68170166015625, "learning_rate": 1.5401689377658962e-05, "loss": 0.1478, "reward": 1.6126683823764325, "reward_std": 0.44903155848760434, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.031081585782339972, "rewards/format_reward": 0.71875, "rewards/reasoning_steps_reward": 0.9250000022351742, "step": 1750 }, { "completion_length": 1024.0, "epoch": 0.3876257920238539, "grad_norm": 0.901459553106272, "kl": 2.446429443359375, "learning_rate": 1.536920173648984e-05, "loss": 0.0978, "reward": 1.5782335847616196, "reward_std": 0.393158163732096, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03218308122377493, "rewards/format_reward": 0.675, "rewards/reasoning_steps_reward": 0.9354166679084301, "step": 1755 }, { "completion_length": 1019.58125, "epoch": 0.38873013900967684, "grad_norm": 11.114883143058288, "kl": 2.0950164794921875, "learning_rate": 1.53366342762093e-05, "loss": 0.0839, "reward": 1.3569506576284767, "reward_std": 0.4249624714701895, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08054934682236307, "rewards/format_reward": 0.525, "rewards/reasoning_steps_reward": 0.9125000055879354, "step": 1760 }, { "completion_length": 1019.2375, "epoch": 0.3898344859954998, "grad_norm": 2.9423721339278655, "kl": 1.0611572265625, "learning_rate": 1.5303987480968607e-05, "loss": 0.0425, "reward": 1.4560162041336298, "reward_std": 0.5089367198333548, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06690047315787524, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.9354166716337204, "step": 1765 }, { "completion_length": 1020.55625, "epoch": 0.39093883298132276, "grad_norm": 1.1833839877582364, "kl": 6.795245361328125, "learning_rate": 1.5271261836098403e-05, "loss": 0.2715, "reward": 1.4157525110989808, "reward_std": 0.5201515946177097, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07174750183366996, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.9000000050291419, "step": 1770 }, { "completion_length": 1019.3375, "epoch": 0.39204317996714566, "grad_norm": 2.2476471036032795, "kl": 1.54696044921875, "learning_rate": 1.5238457828101531e-05, "loss": 0.062, "reward": 1.6591543201357126, "reward_std": 0.3638195994876696, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.040845706693653484, "rewards/format_reward": 0.76875, "rewards/reasoning_steps_reward": 0.9312500029802322, "step": 1775 }, { "completion_length": 1005.8125, "epoch": 0.3931475269529686, "grad_norm": 0.5559651125273167, "kl": 3.158740234375, "learning_rate": 1.520557594464579e-05, "loss": 0.1264, "reward": 1.5916038572788238, "reward_std": 0.39986591760807644, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04172948705709132, "rewards/format_reward": 0.68125, "rewards/reasoning_steps_reward": 0.9520833365619182, "step": 1780 }, { "completion_length": 1024.0, "epoch": 0.3942518739387916, "grad_norm": 0.7921314371486445, "kl": 0.52529296875, "learning_rate": 1.5172616674556673e-05, "loss": 0.021, "reward": 1.6518889758735895, "reward_std": 0.40095153995195004, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03561103413817364, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.9250000020489096, "step": 1785 }, { "completion_length": 1024.0, "epoch": 0.39535622092461453, "grad_norm": 198.98423683140666, "kl": 11.310357666015625, "learning_rate": 1.5139580507810118e-05, "loss": 0.4515, "reward": 1.6622562702745198, "reward_std": 0.3482717312890372, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03566040461216744, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.9479166693985462, "step": 1790 }, { "completion_length": 1024.0, "epoch": 0.39646056791043743, "grad_norm": 0.24822742846638876, "kl": 7.70560302734375, "learning_rate": 1.510646793552522e-05, "loss": 0.3082, "reward": 1.687724581360817, "reward_std": 0.36951222630941627, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.037275422150722194, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.9500000029802322, "step": 1795 }, { "completion_length": 1017.25, "epoch": 0.3975649148962604, "grad_norm": 4.99816542290855, "kl": 3.644091796875, "learning_rate": 1.5073279449956916e-05, "loss": 0.1455, "reward": 1.5804516039788723, "reward_std": 0.4355189655091465, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05288173892857344, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.9270833369344473, "step": 1800 }, { "epoch": 0.3975649148962604, "eval_completion_length": 1024.0, "eval_kl": 1.33404296875, "eval_loss": 0.05353707820177078, "eval_reward": 1.6259092497825622, "eval_reward_std": 0.44313232216529286, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.05075742355262264, "eval_rewards/format_reward": 0.735, "eval_rewards/reasoning_steps_reward": 0.9416666674613953, "eval_runtime": 200.6778, "eval_samples_per_second": 0.493, "eval_steps_per_second": 0.125, "step": 1800 }, { "completion_length": 1016.4625, "epoch": 0.39866926188208335, "grad_norm": 0.8216823589718119, "kl": 1.52921142578125, "learning_rate": 1.5040015544488689e-05, "loss": 0.0613, "reward": 1.580186554789543, "reward_std": 0.4473959618730078, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05106344955359532, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.9312500037252903, "step": 1805 }, { "completion_length": 1024.0, "epoch": 0.3997736088679063, "grad_norm": 0.27497954194074575, "kl": 1.869049072265625, "learning_rate": 1.5006676713625217e-05, "loss": 0.0748, "reward": 1.7112540045753122, "reward_std": 0.3451619381986461, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04707933118999676, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.9520833350718021, "step": 1810 }, { "completion_length": 1022.7875, "epoch": 0.40087795585372926, "grad_norm": 0.1932843975023837, "kl": 0.42769775390625, "learning_rate": 1.4973263452985023e-05, "loss": 0.0171, "reward": 1.7407132534310221, "reward_std": 0.2509838029066316, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.038453425541553087, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9416666677221656, "step": 1815 }, { "completion_length": 1024.0, "epoch": 0.40198230283955216, "grad_norm": 13.375482247028472, "kl": 5.430291748046875, "learning_rate": 1.493977625929312e-05, "loss": 0.2177, "reward": 1.8173677779734134, "reward_std": 0.25813394066904605, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.024298889722177818, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.9479166686534881, "step": 1820 }, { "completion_length": 1024.0, "epoch": 0.4030866498253751, "grad_norm": 0.09709265350974022, "kl": 0.277154541015625, "learning_rate": 1.4906215630373606e-05, "loss": 0.0111, "reward": 1.8708434641361236, "reward_std": 0.18241405415017767, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.010406546228927028, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.9750000014901161, "step": 1825 }, { "completion_length": 1024.0, "epoch": 0.4041909968111981, "grad_norm": 0.9087598758466838, "kl": 0.90994873046875, "learning_rate": 1.4872582065142285e-05, "loss": 0.0363, "reward": 1.7198083013296128, "reward_std": 0.36085402057337035, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04060836032874136, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.9354166686534882, "step": 1830 }, { "completion_length": 1024.0, "epoch": 0.40529534379702103, "grad_norm": 19.71139802721317, "kl": 1.30902099609375, "learning_rate": 1.4838876063599234e-05, "loss": 0.0524, "reward": 1.633510524313897, "reward_std": 0.41481399162443894, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.058156153634925545, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.9104166697710753, "step": 1835 }, { "completion_length": 1024.0, "epoch": 0.406399690782844, "grad_norm": 0.20736892182413091, "kl": 2.954833984375, "learning_rate": 1.480509812682138e-05, "loss": 0.118, "reward": 1.6065174978226424, "reward_std": 0.4605361075816063, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06014917756460818, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.8791666697710753, "step": 1840 }, { "completion_length": 1024.0, "epoch": 0.4075040377686669, "grad_norm": 0.12490512211432611, "kl": 1.48941650390625, "learning_rate": 1.4771248756955042e-05, "loss": 0.0597, "reward": 1.339280641824007, "reward_std": 0.6381314028223641, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08363602570825605, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.7979166705161334, "step": 1845 }, { "completion_length": 1024.0, "epoch": 0.40860838475448985, "grad_norm": 0.2179059890414028, "kl": 0.3721527099609375, "learning_rate": 1.4737328457208471e-05, "loss": 0.0149, "reward": 1.6670774094760419, "reward_std": 0.4513689050038465, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04958925420360174, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.9041666708886623, "step": 1850 }, { "completion_length": 1018.66875, "epoch": 0.4097127317403128, "grad_norm": 0.79455251719469, "kl": 0.32587890625, "learning_rate": 1.4703337731844374e-05, "loss": 0.0131, "reward": 1.7391955329105258, "reward_std": 0.3125075527260378, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03372114356534439, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.9229166684672236, "step": 1855 }, { "completion_length": 1018.4625, "epoch": 0.41081707872613576, "grad_norm": 1.1295492282165498, "kl": 0.2130218505859375, "learning_rate": 1.4669277086172406e-05, "loss": 0.0086, "reward": 1.7530269030481578, "reward_std": 0.22123022362970915, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.036556454593664967, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.9208333356305957, "step": 1860 }, { "completion_length": 1021.48125, "epoch": 0.4119214257119587, "grad_norm": 0.10472402689364621, "kl": 0.4535247802734375, "learning_rate": 1.4635147026541674e-05, "loss": 0.0182, "reward": 1.853439299762249, "reward_std": 0.2057221634101097, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.021560727976611814, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9500000007450581, "step": 1865 }, { "completion_length": 1011.59375, "epoch": 0.4130257726977816, "grad_norm": 0.060145090948972385, "kl": 1.786566162109375, "learning_rate": 1.4600948060333187e-05, "loss": 0.0715, "reward": 1.6970781801268457, "reward_std": 0.37393540782250057, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05500518054032, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9145833365619183, "step": 1870 }, { "completion_length": 1018.5125, "epoch": 0.4141301196836046, "grad_norm": 0.07160628265153375, "kl": 0.150506591796875, "learning_rate": 1.4566680695952333e-05, "loss": 0.0061, "reward": 1.849437363818288, "reward_std": 0.17379919528120807, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.021395997072977478, "rewards/format_reward": 0.91875, "rewards/reasoning_steps_reward": 0.9520833335816861, "step": 1875 }, { "completion_length": 1017.94375, "epoch": 0.41523446666942754, "grad_norm": 0.3061351568984248, "kl": 0.1790069580078125, "learning_rate": 1.4532345442821323e-05, "loss": 0.0072, "reward": 1.8633771307766438, "reward_std": 0.1917227172017192, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01787287338374881, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9562500014901161, "step": 1880 }, { "completion_length": 1011.39375, "epoch": 0.4163388136552505, "grad_norm": 0.13980206314174293, "kl": 0.840350341796875, "learning_rate": 1.4497942811371592e-05, "loss": 0.0336, "reward": 1.811388997361064, "reward_std": 0.18153300940950884, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02819433979511814, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.9583333352580666, "step": 1885 }, { "completion_length": 1014.46875, "epoch": 0.41744316064107345, "grad_norm": 37.89468511386642, "kl": 3.224212646484375, "learning_rate": 1.4463473313036241e-05, "loss": 0.129, "reward": 1.7037913450971245, "reward_std": 0.3241650226414777, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03787532943442784, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.92916666790843, "step": 1890 }, { "completion_length": 999.16875, "epoch": 0.41854750762689635, "grad_norm": 0.43770064535885594, "kl": 1.079486083984375, "learning_rate": 1.4428937460242417e-05, "loss": 0.0432, "reward": 1.4503749491646887, "reward_std": 0.5806084466651555, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08087505426301504, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.8625000054016709, "step": 1895 }, { "completion_length": 1012.8125, "epoch": 0.4196518546127193, "grad_norm": 0.031069888519680837, "kl": 0.6485260009765625, "learning_rate": 1.4394335766403703e-05, "loss": 0.0259, "reward": 1.5874255585018546, "reward_std": 0.530018257148754, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0500744489670069, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.8875000035390258, "step": 1900 }, { "epoch": 0.4196518546127193, "eval_completion_length": 1015.89, "eval_kl": 0.38810546875, "eval_loss": 0.015597357414662838, "eval_reward": 1.6550358521938324, "eval_reward_std": 0.4101494722440839, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.049964163267577535, "eval_rewards/format_reward": 0.805, "eval_rewards/reasoning_steps_reward": 0.9000000047683716, "eval_runtime": 200.6466, "eval_samples_per_second": 0.493, "eval_steps_per_second": 0.125, "step": 1900 }, { "completion_length": 1001.575, "epoch": 0.42075620159854227, "grad_norm": 2.0124246714328873, "kl": 1.036248779296875, "learning_rate": 1.4359668745912472e-05, "loss": 0.0414, "reward": 1.6629710331559182, "reward_std": 0.39515268294885003, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03286230957592693, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9083333373069763, "step": 1905 }, { "completion_length": 957.78125, "epoch": 0.4218605485843652, "grad_norm": 7.0968415445601, "kl": 161.5555908203125, "learning_rate": 1.4324936914132255e-05, "loss": 6.4648, "reward": 0.8477631491608918, "reward_std": 0.5642138687988535, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16473685780432845, "rewards/format_reward": 0.4375, "rewards/reasoning_steps_reward": 0.5750000026077032, "step": 1910 }, { "completion_length": 840.0625, "epoch": 0.4229648955701881, "grad_norm": 1.6032662050525144, "kl": 2.6, "learning_rate": 1.4290140787390083e-05, "loss": 0.104, "reward": 0.001272639585658908, "reward_std": 0.3248372384929098, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22581069722946268, "rewards/format_reward": 0.05, "rewards/reasoning_steps_reward": 0.1770833356305957, "step": 1915 }, { "completion_length": 914.06875, "epoch": 0.4240692425560111, "grad_norm": 4.201448980739016, "kl": 1.5188720703125, "learning_rate": 1.4255280882968787e-05, "loss": 0.0607, "reward": 0.18680339390411974, "reward_std": 0.412913748028177, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15694661159250245, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.2750000048428774, "step": 1920 }, { "completion_length": 951.16875, "epoch": 0.42517358954183404, "grad_norm": 1.209578741785896, "kl": 3.43743896484375, "learning_rate": 1.4220357719099338e-05, "loss": 0.1374, "reward": 0.3251433074765373, "reward_std": 0.4896274733968312, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11235669811055686, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.36875000689178705, "step": 1925 }, { "completion_length": 1000.65, "epoch": 0.426277936527657, "grad_norm": 1.7450178375722907, "kl": 0.916650390625, "learning_rate": 1.4185371814953116e-05, "loss": 0.0367, "reward": 0.3472979475278407, "reward_std": 0.4390462203696188, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21311872491462508, "rewards/format_reward": 0.09375, "rewards/reasoning_steps_reward": 0.4666666740551591, "step": 1930 }, { "completion_length": 1012.9375, "epoch": 0.42738228351347995, "grad_norm": 0.6809658317149088, "kl": 2.3167236328125, "learning_rate": 1.415032369063422e-05, "loss": 0.0926, "reward": 0.45425672866404054, "reward_std": 0.4167201625125017, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.31449327804148197, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.7000000111758709, "step": 1935 }, { "completion_length": 1019.725, "epoch": 0.42848663049930286, "grad_norm": 6.584644008036467, "kl": 1.9494140625, "learning_rate": 1.41152138671717e-05, "loss": 0.0781, "reward": 0.7717248608358205, "reward_std": 0.47345116818096356, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.23452514785294626, "rewards/format_reward": 0.1625, "rewards/reasoning_steps_reward": 0.8437500104308129, "step": 1940 }, { "completion_length": 1010.575, "epoch": 0.4295909774851258, "grad_norm": 0.9124464101651549, "kl": 0.98031005859375, "learning_rate": 1.408004286651185e-05, "loss": 0.0392, "reward": 0.9319220932200551, "reward_std": 0.3040256727119527, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10974457962370252, "rewards/format_reward": 0.15, "rewards/reasoning_steps_reward": 0.8916666749864817, "step": 1945 }, { "completion_length": 1007.06875, "epoch": 0.43069532447094877, "grad_norm": 0.8174851082205585, "kl": 1.955194091796875, "learning_rate": 1.4044811211510419e-05, "loss": 0.0784, "reward": 0.9499879771843552, "reward_std": 0.41973948137037664, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07292869019135821, "rewards/format_reward": 0.20625, "rewards/reasoning_steps_reward": 0.8166666731238366, "step": 1950 }, { "completion_length": 1011.7875, "epoch": 0.4317996714567717, "grad_norm": 9.563287846750592, "kl": 1.788214111328125, "learning_rate": 1.4009519425924858e-05, "loss": 0.0714, "reward": 0.9242004107683897, "reward_std": 0.49032472132910243, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06954959582892287, "rewards/format_reward": 0.23125, "rewards/reasoning_steps_reward": 0.7625000070780515, "step": 1955 }, { "completion_length": 1014.0375, "epoch": 0.4329040184425947, "grad_norm": 1.1172252423176203, "kl": 1.506072998046875, "learning_rate": 1.3974168034406524e-05, "loss": 0.0602, "reward": 1.0922672674059868, "reward_std": 0.6774374388254956, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05981607586077189, "rewards/format_reward": 0.3625, "rewards/reasoning_steps_reward": 0.7895833428949117, "step": 1960 }, { "completion_length": 1015.45, "epoch": 0.4340083654284176, "grad_norm": 8.662740397902532, "kl": 2.05098876953125, "learning_rate": 1.3938757562492873e-05, "loss": 0.082, "reward": 1.074470814689994, "reward_std": 0.6158715720244914, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.056779195064882514, "rewards/format_reward": 0.35625, "rewards/reasoning_steps_reward": 0.7750000122934579, "step": 1965 }, { "completion_length": 1014.45625, "epoch": 0.43511271241424054, "grad_norm": 0.3773294016373424, "kl": 3.3776611328125, "learning_rate": 1.3903288536599668e-05, "loss": 0.1351, "reward": 0.9905229835072532, "reward_std": 0.5584212095652674, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0615603672684756, "rewards/format_reward": 0.3125, "rewards/reasoning_steps_reward": 0.7395833427086472, "step": 1970 }, { "completion_length": 1020.29375, "epoch": 0.4362170594000635, "grad_norm": 0.4442185959316331, "kl": 1.775543212890625, "learning_rate": 1.3867761484013135e-05, "loss": 0.071, "reward": 1.1137715804390609, "reward_std": 0.5982852473727007, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04664510243914037, "rewards/format_reward": 0.39375, "rewards/reasoning_steps_reward": 0.7666666774079204, "step": 1975 }, { "completion_length": 1024.0, "epoch": 0.43732140638588646, "grad_norm": 21.47234145361546, "kl": 2.4731292724609375, "learning_rate": 1.3832176932882136e-05, "loss": 0.0989, "reward": 1.2862569394987076, "reward_std": 0.5667873994000729, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.051243075182765094, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.8187500070780516, "step": 1980 }, { "completion_length": 1019.7625, "epoch": 0.4384257533717094, "grad_norm": 1.527622097864932, "kl": 1.788006591796875, "learning_rate": 1.3796535412210301e-05, "loss": 0.0715, "reward": 1.2694264559075237, "reward_std": 0.5048920283968528, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04515689611821472, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.8145833436399699, "step": 1985 }, { "completion_length": 1024.0, "epoch": 0.4395301003575323, "grad_norm": 4.3138407765683855, "kl": 0.601263427734375, "learning_rate": 1.3760837451848193e-05, "loss": 0.0241, "reward": 1.3374655573628842, "reward_std": 0.6142874645546271, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.037534464124564695, "rewards/format_reward": 0.575, "rewards/reasoning_steps_reward": 0.8000000083819032, "step": 1990 }, { "completion_length": 1015.325, "epoch": 0.4406344473433553, "grad_norm": 0.22980488757849943, "kl": 0.72313232421875, "learning_rate": 1.3725083582485397e-05, "loss": 0.0289, "reward": 1.4313670295290648, "reward_std": 0.5948537336438051, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.016549649066297432, "rewards/format_reward": 0.6375, "rewards/reasoning_steps_reward": 0.8104166738688946, "step": 1995 }, { "completion_length": 1024.0, "epoch": 0.44173879432917823, "grad_norm": 10.31697817811464, "kl": 2.43072509765625, "learning_rate": 1.3689274335642653e-05, "loss": 0.0972, "reward": 1.577266044355929, "reward_std": 0.5029176145569266, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.022733943232353226, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.8500000040978193, "step": 2000 }, { "epoch": 0.44173879432917823, "eval_completion_length": 1019.28, "eval_kl": 0.4328515625, "eval_loss": 0.017405448481440544, "eval_reward": 1.6899439215660095, "eval_reward_std": 0.3815716141184316, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.003389413725956274, "eval_rewards/format_reward": 0.8, "eval_rewards/reasoning_steps_reward": 0.8933333384990693, "eval_runtime": 202.9245, "eval_samples_per_second": 0.488, "eval_steps_per_second": 0.123, "step": 2000 }, { "completion_length": 1018.48125, "epoch": 0.4428431413150012, "grad_norm": 0.37629734086009897, "kl": 0.5832275390625, "learning_rate": 1.3653410243663953e-05, "loss": 0.0233, "reward": 1.7200924716889858, "reward_std": 0.2776235666715309, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.015324184401330853, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.9229166697710752, "step": 2005 }, { "completion_length": 1020.01875, "epoch": 0.44394748830082414, "grad_norm": 0.28952601144092716, "kl": 1.030401611328125, "learning_rate": 1.3617491839708614e-05, "loss": 0.0413, "reward": 1.639422894269228, "reward_std": 0.3915938691733004, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012660458464461044, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.8895833376795054, "step": 2010 }, { "completion_length": 1024.0, "epoch": 0.44505183528664705, "grad_norm": 0.09617188273592023, "kl": 0.60234375, "learning_rate": 1.3581519657743365e-05, "loss": 0.0242, "reward": 1.7725027503445745, "reward_std": 0.24438665603962306, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012913908507604788, "rewards/format_reward": 0.85625, "rewards/reasoning_steps_reward": 0.9291666682809592, "step": 2015 }, { "completion_length": 1024.0, "epoch": 0.44615618227247, "grad_norm": 0.18225728128628407, "kl": 0.27862548828125, "learning_rate": 1.3545494232534406e-05, "loss": 0.0111, "reward": 1.7568111419677734, "reward_std": 0.29651416725586444, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.007772182382834103, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.9208333363756538, "step": 2020 }, { "completion_length": 1016.925, "epoch": 0.44726052925829296, "grad_norm": 0.2648706944585046, "kl": 0.615509033203125, "learning_rate": 1.3509416099639456e-05, "loss": 0.0246, "reward": 1.8220582745969296, "reward_std": 0.17748434675852423, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.007108392053859802, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9666666693985462, "step": 2025 }, { "completion_length": 1018.525, "epoch": 0.4483648762441159, "grad_norm": 0.6277482412805114, "kl": 0.555120849609375, "learning_rate": 1.3473285795399792e-05, "loss": 0.0222, "reward": 1.7602945683524012, "reward_std": 0.2339164051840612, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012622114044086174, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.9416666684672237, "step": 2030 }, { "completion_length": 1024.0, "epoch": 0.4494692232299388, "grad_norm": 0.3638572386307533, "kl": 0.827734375, "learning_rate": 1.3437103856932266e-05, "loss": 0.0331, "reward": 1.7036781013011932, "reward_std": 0.3036784950886727, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02757192025987365, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.9250000013038516, "step": 2035 }, { "completion_length": 1018.1125, "epoch": 0.4505735702157618, "grad_norm": 0.28945935173584225, "kl": 0.39534912109375, "learning_rate": 1.3400870822121348e-05, "loss": 0.0158, "reward": 1.7693685671314596, "reward_std": 0.29658813936011086, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.009798096649282684, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.9104166690260171, "step": 2040 }, { "completion_length": 1024.0, "epoch": 0.45167791720158473, "grad_norm": 0.30395883309793953, "kl": 0.496514892578125, "learning_rate": 1.3364587229611095e-05, "loss": 0.0198, "reward": 1.5488340200856328, "reward_std": 0.34414923763941657, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.044915972299531856, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.9250000067055225, "step": 2045 }, { "completion_length": 1023.36875, "epoch": 0.4527822641874077, "grad_norm": 0.37563339825606884, "kl": 1.257757568359375, "learning_rate": 1.332825361879717e-05, "loss": 0.0503, "reward": 1.0923855936154723, "reward_std": 0.5553531399730645, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10553107349589368, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.8229166757315397, "step": 2050 }, { "completion_length": 1020.39375, "epoch": 0.45388661117323065, "grad_norm": 0.17545325218642227, "kl": 0.552960205078125, "learning_rate": 1.3291870529818809e-05, "loss": 0.0221, "reward": 1.3102701783180237, "reward_std": 0.5236696774121044, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08556316045970505, "rewards/format_reward": 0.55, "rewards/reasoning_steps_reward": 0.8458333417773247, "step": 2055 }, { "completion_length": 1024.0, "epoch": 0.45499095815905355, "grad_norm": 2.065649559488245, "kl": 0.834759521484375, "learning_rate": 1.3255438503550796e-05, "loss": 0.0333, "reward": 1.69286774341017, "reward_std": 0.3293037013057784, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.027965584968657708, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.9208333367481828, "step": 2060 }, { "completion_length": 1024.0, "epoch": 0.4560953051448765, "grad_norm": 0.660593118562221, "kl": 0.329058837890625, "learning_rate": 1.3218958081595426e-05, "loss": 0.0131, "reward": 1.6265041932463646, "reward_std": 0.41342474384632055, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.031829144536436614, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.9020833386108279, "step": 2065 }, { "completion_length": 1023.1875, "epoch": 0.45719965213069946, "grad_norm": 0.1196002841795942, "kl": 0.259063720703125, "learning_rate": 1.3182429806274442e-05, "loss": 0.0103, "reward": 1.7037229581736029, "reward_std": 0.3075523629013503, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025443704352724694, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9416666693985463, "step": 2070 }, { "completion_length": 1024.0, "epoch": 0.4583039991165224, "grad_norm": 0.14409380240615186, "kl": 0.298175048828125, "learning_rate": 1.3145854220620981e-05, "loss": 0.0119, "reward": 1.6938567931763828, "reward_std": 0.3266446697782669, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03322654654869552, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9395833369344473, "step": 2075 }, { "completion_length": 1024.0, "epoch": 0.4594083461023454, "grad_norm": 0.12617090323900998, "kl": 0.24708251953125, "learning_rate": 1.3109231868371511e-05, "loss": 0.0099, "reward": 1.7247996438294648, "reward_std": 0.3449321190650778, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.029367040466587467, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.9479166708886624, "step": 2080 }, { "completion_length": 1024.0, "epoch": 0.4605126930881683, "grad_norm": 4.119605221827958, "kl": 0.795184326171875, "learning_rate": 1.3072563293957725e-05, "loss": 0.0318, "reward": 1.557416939828545, "reward_std": 0.37313184138067185, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.048833064705115704, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.9000000039115548, "step": 2085 }, { "completion_length": 1020.1125, "epoch": 0.46161704007399124, "grad_norm": 0.14784198259771675, "kl": 0.735406494140625, "learning_rate": 1.3035849042498462e-05, "loss": 0.0294, "reward": 1.5002798398956656, "reward_std": 0.5547551644277462, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06638682496321166, "rewards/format_reward": 0.6625, "rewards/reasoning_steps_reward": 0.904166673310101, "step": 2090 }, { "completion_length": 1020.9, "epoch": 0.4627213870598142, "grad_norm": 0.32720942030685624, "kl": 0.608282470703125, "learning_rate": 1.299908965979161e-05, "loss": 0.0244, "reward": 1.4782994251698256, "reward_std": 0.500833890357893, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05920058094643536, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.9125000052154064, "step": 2095 }, { "completion_length": 1024.0, "epoch": 0.46382573404563715, "grad_norm": 0.1310566495840646, "kl": 0.917279052734375, "learning_rate": 1.2962285692305964e-05, "loss": 0.0367, "reward": 1.6021131692454218, "reward_std": 0.4008885342831661, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.037470159548001904, "rewards/format_reward": 0.74375, "rewards/reasoning_steps_reward": 0.8958333371207118, "step": 2100 }, { "epoch": 0.46382573404563715, "eval_completion_length": 1024.0, "eval_kl": 0.28365234375, "eval_loss": 0.011327223852276802, "eval_reward": 1.6594515788555144, "eval_reward_std": 0.38401263874957065, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.035548398063092464, "eval_rewards/format_reward": 0.765, "eval_rewards/reasoning_steps_reward": 0.9300000059604645, "eval_runtime": 203.5966, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.123, "step": 2100 }, { "completion_length": 1024.0, "epoch": 0.4649300810314601, "grad_norm": 1.3191955643061455, "kl": 0.275543212890625, "learning_rate": 1.2925437687173144e-05, "loss": 0.011, "reward": 1.684731831587851, "reward_std": 0.25950010808546153, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04651815479469405, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.9375000039115549, "step": 2105 }, { "completion_length": 1024.0, "epoch": 0.466034428017283, "grad_norm": 0.09016927229701084, "kl": 0.257794189453125, "learning_rate": 1.2888546192179417e-05, "loss": 0.0103, "reward": 1.745303137972951, "reward_std": 0.2667829909493776, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02761352810288713, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9354166686534882, "step": 2110 }, { "completion_length": 1024.0, "epoch": 0.46713877500310597, "grad_norm": 0.20047174524395592, "kl": 0.311578369140625, "learning_rate": 1.2851611755757587e-05, "loss": 0.0125, "reward": 1.741257084161043, "reward_std": 0.34543239968699024, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.029576247078188088, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9333333380520343, "step": 2115 }, { "completion_length": 1019.73125, "epoch": 0.4682431219889289, "grad_norm": 0.17615770549711435, "kl": 0.356341552734375, "learning_rate": 1.2814634926978831e-05, "loss": 0.0142, "reward": 1.6373960416764022, "reward_std": 0.3339324926538552, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0480206228675911, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.9229166708886624, "step": 2120 }, { "completion_length": 1019.475, "epoch": 0.4693474689747519, "grad_norm": 13.564334122866743, "kl": 1.46416015625, "learning_rate": 1.2777616255544527e-05, "loss": 0.0586, "reward": 1.5654781736433505, "reward_std": 0.45222095985685656, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0761884901760709, "rewards/format_reward": 0.6875, "rewards/reasoning_steps_reward": 0.9541666701436042, "step": 2125 }, { "completion_length": 1020.94375, "epoch": 0.47045181596057484, "grad_norm": 29.85489728983082, "kl": 41.50612182617188, "learning_rate": 1.2740556291778096e-05, "loss": 1.6594, "reward": 1.48430804759264, "reward_std": 0.39297982692303035, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09485862025956919, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 0.9666666690260172, "step": 2130 }, { "completion_length": 1024.0, "epoch": 0.47155616294639774, "grad_norm": 0.9013678435103032, "kl": 3.813409423828125, "learning_rate": 1.2703455586616811e-05, "loss": 0.1528, "reward": 1.6619893133640289, "reward_std": 0.3401134827206249, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05259404219436874, "rewards/format_reward": 0.71875, "rewards/reasoning_steps_reward": 0.995833333581686, "step": 2135 }, { "completion_length": 1024.0, "epoch": 0.4726605099322207, "grad_norm": 0.786336979289747, "kl": 2.127301025390625, "learning_rate": 1.2666314691603615e-05, "loss": 0.085, "reward": 1.7701308561488986, "reward_std": 0.24104950832922895, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03611916397267123, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.9875, "step": 2140 }, { "completion_length": 1024.0, "epoch": 0.47376485691804365, "grad_norm": 1.0553226951559378, "kl": 1.034063720703125, "learning_rate": 1.2629134158878919e-05, "loss": 0.0413, "reward": 1.7154738694429397, "reward_std": 0.3030771045820302, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04494282233531521, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9729166686534881, "step": 2145 }, { "completion_length": 1024.0, "epoch": 0.4748692039038666, "grad_norm": 0.3393903797287573, "kl": 3.99031982421875, "learning_rate": 1.259191454117239e-05, "loss": 0.1597, "reward": 1.7812194317579269, "reward_std": 0.2781567809738135, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.041697263186324565, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.9791666686534881, "step": 2150 }, { "completion_length": 1024.0, "epoch": 0.47597355088968957, "grad_norm": 6.589061393274526, "kl": 1.5100830078125, "learning_rate": 1.255465639179473e-05, "loss": 0.0604, "reward": 1.7989778753370047, "reward_std": 0.23155034703924002, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03852214975868264, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9750000026077033, "step": 2155 }, { "completion_length": 1024.0, "epoch": 0.47707789787551247, "grad_norm": 0.8010593946661915, "kl": 3.04891357421875, "learning_rate": 1.2517360264629463e-05, "loss": 0.122, "reward": 1.7094581590034068, "reward_std": 0.34085092952595913, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.046791872511084874, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.9375000035390257, "step": 2160 }, { "completion_length": 1024.0, "epoch": 0.4781822448613354, "grad_norm": 2.217307019793859, "kl": 1.355609130859375, "learning_rate": 1.24800267141247e-05, "loss": 0.0542, "reward": 1.8146669074892998, "reward_std": 0.26201679514249465, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.022833122030738194, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9625000022351742, "step": 2165 }, { "completion_length": 1024.0, "epoch": 0.4792865918471584, "grad_norm": 0.6114018111973071, "kl": 1.562957763671875, "learning_rate": 1.2442656295284879e-05, "loss": 0.0625, "reward": 1.6614520654082299, "reward_std": 0.3890801830509247, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02813129380491546, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.9145833371207118, "step": 2170 }, { "completion_length": 1024.0, "epoch": 0.48039093883298134, "grad_norm": 5.905753794579747, "kl": 2.620941162109375, "learning_rate": 1.2405249563662539e-05, "loss": 0.1048, "reward": 1.6438979797065258, "reward_std": 0.4486811731098953, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.031102037969344565, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.9187500044703484, "step": 2175 }, { "completion_length": 1024.0, "epoch": 0.48149528581880424, "grad_norm": 5.462849225057963, "kl": 2.09735107421875, "learning_rate": 1.2367807075350036e-05, "loss": 0.0839, "reward": 1.6195215459913015, "reward_std": 0.40022268557599433, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04714512059364893, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.9041666707023979, "step": 2180 }, { "completion_length": 1024.0, "epoch": 0.4825996328046272, "grad_norm": 2.9029705830904895, "kl": 2.64107666015625, "learning_rate": 1.23303293869713e-05, "loss": 0.1057, "reward": 1.4394451253116132, "reward_std": 0.5855226124516661, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09180486189869157, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.8312500044703484, "step": 2185 }, { "completion_length": 1024.0, "epoch": 0.48370397979045016, "grad_norm": 0.5313710417788244, "kl": 2.38533935546875, "learning_rate": 1.2292817055673543e-05, "loss": 0.0955, "reward": 1.5141951335594057, "reward_std": 0.611786913189178, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0962215375395317, "rewards/format_reward": 0.7375, "rewards/reasoning_steps_reward": 0.8729166721925139, "step": 2190 }, { "completion_length": 1024.0, "epoch": 0.4848083267762731, "grad_norm": 1.5676422562430876, "kl": 2.0924560546875, "learning_rate": 1.2255270639118984e-05, "loss": 0.0837, "reward": 1.4455404764972628, "reward_std": 0.475542880991236, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11070954198064556, "rewards/format_reward": 0.73125, "rewards/reasoning_steps_reward": 0.8250000048428774, "step": 2195 }, { "completion_length": 1024.0, "epoch": 0.48591267376209607, "grad_norm": 1.1085617605053268, "kl": 1.0922119140625, "learning_rate": 1.2217690695476551e-05, "loss": 0.0436, "reward": 1.4205092269927264, "reward_std": 0.5992176422125908, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1315741269034561, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.8520833374932408, "step": 2200 }, { "epoch": 0.48591267376209607, "eval_completion_length": 1024.0, "eval_kl": 0.831904296875, "eval_loss": 0.032701995223760605, "eval_reward": 1.5383737568557263, "eval_reward_std": 0.39792034816321575, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.03329294507166196, "eval_rewards/format_reward": 0.765, "eval_rewards/reasoning_steps_reward": 0.8066666719317436, "eval_runtime": 206.2216, "eval_samples_per_second": 0.48, "eval_steps_per_second": 0.121, "step": 2200 }, { "completion_length": 1024.0, "epoch": 0.48701702074791897, "grad_norm": 0.9801168345465376, "kl": 0.500738525390625, "learning_rate": 1.2180077783413601e-05, "loss": 0.02, "reward": 1.6020004270598291, "reward_std": 0.5047344055276994, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025082948844874407, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.8520833380520344, "step": 2205 }, { "completion_length": 1024.0, "epoch": 0.48812136773374193, "grad_norm": 1.7123348809738306, "kl": 0.643328857421875, "learning_rate": 1.21424324620876e-05, "loss": 0.0257, "reward": 1.5895938023924827, "reward_std": 0.43823493779657385, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.033322910089100335, "rewards/format_reward": 0.76875, "rewards/reasoning_steps_reward": 0.8541666703298688, "step": 2210 }, { "completion_length": 1024.0, "epoch": 0.4892257147195649, "grad_norm": 0.42456634301355384, "kl": 0.2675048828125, "learning_rate": 1.2104755291137797e-05, "loss": 0.0107, "reward": 1.7852548621594906, "reward_std": 0.3036327707303144, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.008495195002069522, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.9125000014901161, "step": 2215 }, { "completion_length": 1024.0, "epoch": 0.49033006170538784, "grad_norm": 44.884356387747495, "kl": 1.102850341796875, "learning_rate": 1.2067046830676947e-05, "loss": 0.0441, "reward": 1.6169834925793112, "reward_std": 0.4143550125787215, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.030933218055542964, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.8729166686534882, "step": 2220 }, { "completion_length": 1024.0, "epoch": 0.4914344086912108, "grad_norm": 1.6254530181590954, "kl": 0.957958984375, "learning_rate": 1.2029307641282935e-05, "loss": 0.0383, "reward": 1.5575453802943229, "reward_std": 0.4768017131381782, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07787129828080311, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.8604166692122817, "step": 2225 }, { "completion_length": 1024.0, "epoch": 0.4925387556770337, "grad_norm": 3.8378763559078193, "kl": 1.298052978515625, "learning_rate": 1.1991538283990483e-05, "loss": 0.0519, "reward": 1.6807304440066217, "reward_std": 0.29701345783856825, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0380195384103331, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9312500037252903, "step": 2230 }, { "completion_length": 1024.0, "epoch": 0.49364310266285666, "grad_norm": 9.71139970165291, "kl": 4.29759521484375, "learning_rate": 1.1953739320282778e-05, "loss": 0.1719, "reward": 1.7868030063807965, "reward_std": 0.27771961602904865, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0048636200283453945, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9291666693985462, "step": 2235 }, { "completion_length": 1024.0, "epoch": 0.4947474496486796, "grad_norm": 1.4786352397643705, "kl": 1.40592041015625, "learning_rate": 1.191591131208315e-05, "loss": 0.0563, "reward": 1.6962168462574483, "reward_std": 0.3573447995016267, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.005866468011367943, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.9395833402872086, "step": 2240 }, { "completion_length": 1024.0, "epoch": 0.4958517966345026, "grad_norm": 12.15263252691778, "kl": 5.4029541015625, "learning_rate": 1.1878054821746703e-05, "loss": 0.2161, "reward": 1.6230487048625946, "reward_std": 0.38453564145783614, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.014451281244680559, "rewards/format_reward": 0.71875, "rewards/reasoning_steps_reward": 0.9187500052154064, "step": 2245 }, { "completion_length": 1024.0, "epoch": 0.49695614362032553, "grad_norm": 0.7557319167485542, "kl": 1.6591064453125, "learning_rate": 1.1840170412051957e-05, "loss": 0.0663, "reward": 1.702598787844181, "reward_std": 0.4023397401074874, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01406788526670084, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.9104166738688946, "step": 2250 }, { "completion_length": 1024.0, "epoch": 0.49806049060614843, "grad_norm": 1.569834132336579, "kl": 3.63101806640625, "learning_rate": 1.1802258646192486e-05, "loss": 0.1451, "reward": 1.7320811052806675, "reward_std": 0.29090855971943486, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01583560020637833, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.9354166699573397, "step": 2255 }, { "completion_length": 1024.0, "epoch": 0.4991648375919714, "grad_norm": 13.15568121526605, "kl": 1.659033203125, "learning_rate": 1.1764320087768546e-05, "loss": 0.0664, "reward": 1.6248051080852748, "reward_std": 0.45652774178926164, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.033528259821559915, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.8770833371207118, "step": 2260 }, { "completion_length": 1024.0, "epoch": 0.5002691845777943, "grad_norm": 0.7170030879216972, "kl": 2.472637939453125, "learning_rate": 1.1726355300778693e-05, "loss": 0.099, "reward": 1.6203329667448998, "reward_std": 0.5256475808244773, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.027583728811373477, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.8541666708886624, "step": 2265 }, { "completion_length": 1024.0, "epoch": 0.5013735315636173, "grad_norm": 3.390023769493416, "kl": 2.3892578125, "learning_rate": 1.1688364849611395e-05, "loss": 0.0957, "reward": 1.5784560879692435, "reward_std": 0.41030325355160924, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.036127270559586575, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.8520833376795054, "step": 2270 }, { "completion_length": 1024.0, "epoch": 0.5024778785494403, "grad_norm": 3.831774678363149, "kl": 1.948974609375, "learning_rate": 1.1650349299036656e-05, "loss": 0.078, "reward": 1.6070521710440517, "reward_std": 0.3782993628408235, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.015864501821948807, "rewards/format_reward": 0.775, "rewards/reasoning_steps_reward": 0.847916672565043, "step": 2275 }, { "completion_length": 1024.0, "epoch": 0.5035822255352632, "grad_norm": 1.3332468495023275, "kl": 3.39307861328125, "learning_rate": 1.1612309214197599e-05, "loss": 0.1357, "reward": 1.618955060839653, "reward_std": 0.44067184482020993, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.014378260358787998, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.8458333371207118, "step": 2280 }, { "completion_length": 1024.0, "epoch": 0.5046865725210862, "grad_norm": 5.409150362417238, "kl": 1.684649658203125, "learning_rate": 1.1574245160602085e-05, "loss": 0.0673, "reward": 1.4961256569251418, "reward_std": 0.43631368919288605, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01429101258823664, "rewards/format_reward": 0.64375, "rewards/reasoning_steps_reward": 0.8666666716337204, "step": 2285 }, { "completion_length": 1024.0, "epoch": 0.505790919506909, "grad_norm": 10.3090725611453, "kl": 2.255633544921875, "learning_rate": 1.153615770411429e-05, "loss": 0.0902, "reward": 1.5355156451463698, "reward_std": 0.4401967245209562, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01656769531377904, "rewards/format_reward": 0.6875, "rewards/reasoning_steps_reward": 0.8645833389833569, "step": 2290 }, { "completion_length": 1024.0, "epoch": 0.506895266492732, "grad_norm": 4.742828920657206, "kl": 3.08935546875, "learning_rate": 1.1498047410946307e-05, "loss": 0.1236, "reward": 1.5172470673918723, "reward_std": 0.402690345170231, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.014002943886953289, "rewards/format_reward": 0.64375, "rewards/reasoning_steps_reward": 0.8875000052154064, "step": 2295 }, { "completion_length": 1024.0, "epoch": 0.5079996134785549, "grad_norm": 3.7991396033177476, "kl": 2.6480224609375, "learning_rate": 1.1459914847649716e-05, "loss": 0.1059, "reward": 1.6665501791983843, "reward_std": 0.35762373491728566, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.010533173413750773, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.8833333371207118, "step": 2300 }, { "epoch": 0.5079996134785549, "eval_completion_length": 1024.0, "eval_kl": 3.185244140625, "eval_loss": 0.12666305899620056, "eval_reward": 1.7388547444343567, "eval_reward_std": 0.3053203289665726, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.02781194020019029, "eval_rewards/format_reward": 0.85, "eval_rewards/reasoning_steps_reward": 0.9166666674613952, "eval_runtime": 207.715, "eval_samples_per_second": 0.477, "eval_steps_per_second": 0.12, "step": 2300 }, { "completion_length": 1024.0, "epoch": 0.5091039604643779, "grad_norm": 1.49588954242709, "kl": 2.38382568359375, "learning_rate": 1.1421760581107164e-05, "loss": 0.0953, "reward": 1.6487581813707948, "reward_std": 0.2917481830695891, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04082516024231779, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.8583333348855376, "step": 2305 }, { "completion_length": 1024.0, "epoch": 0.5102083074502008, "grad_norm": 1.8780330623608932, "kl": 1.966387939453125, "learning_rate": 1.1383585178523955e-05, "loss": 0.0787, "reward": 1.60753201469779, "reward_std": 0.42931269481578627, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.046634629848381334, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.8541666697710752, "step": 2310 }, { "completion_length": 1024.0, "epoch": 0.5113126544360238, "grad_norm": 2.96432937885032, "kl": 2.16142578125, "learning_rate": 1.1345389207419588e-05, "loss": 0.0865, "reward": 1.5378418434411287, "reward_std": 0.5287294987143696, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.051741475899098076, "rewards/format_reward": 0.74375, "rewards/reasoning_steps_reward": 0.8395833408460021, "step": 2315 }, { "completion_length": 1024.0, "epoch": 0.5124170014218468, "grad_norm": 2.4681930761511444, "kl": 2.180352783203125, "learning_rate": 1.1307173235619342e-05, "loss": 0.0872, "reward": 1.3009003968909383, "reward_std": 0.543780626336303, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07618293184367815, "rewards/format_reward": 0.575, "rewards/reasoning_steps_reward": 0.8020833391696215, "step": 2320 }, { "completion_length": 1024.0, "epoch": 0.5135213484076697, "grad_norm": 3.113537344897485, "kl": 1.70172119140625, "learning_rate": 1.126893783124583e-05, "loss": 0.0681, "reward": 1.558167396299541, "reward_std": 0.44948710563394345, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.07308261062056545, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.8750000052154064, "step": 2325 }, { "completion_length": 1024.0, "epoch": 0.5146256953934927, "grad_norm": 4.243060777071984, "kl": 1.395947265625, "learning_rate": 1.1230683562710549e-05, "loss": 0.0559, "reward": 1.6064410168677568, "reward_std": 0.408382396842228, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04564233084790885, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.8708333348855376, "step": 2330 }, { "completion_length": 1024.0, "epoch": 0.5157300423793156, "grad_norm": 2.446852880740469, "kl": 2.1044677734375, "learning_rate": 1.1192410998705432e-05, "loss": 0.0841, "reward": 1.667079577036202, "reward_std": 0.3678033341785465, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03500375264862328, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.8895833365619182, "step": 2335 }, { "completion_length": 1018.425, "epoch": 0.5168343893651385, "grad_norm": 3.310047995806835, "kl": 4.138531494140625, "learning_rate": 1.1154120708194398e-05, "loss": 0.1654, "reward": 1.1680033100768923, "reward_std": 0.5004193522453079, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09033003134591695, "rewards/format_reward": 0.45, "rewards/reasoning_steps_reward": 0.8083333427086472, "step": 2340 }, { "completion_length": 1024.0, "epoch": 0.5179387363509614, "grad_norm": 0.6777064303206283, "kl": 0.7271484375, "learning_rate": 1.1115813260404889e-05, "loss": 0.0291, "reward": 1.2690949118230492, "reward_std": 0.5361405726755037, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.037155097232880505, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.7875000109896064, "step": 2345 }, { "completion_length": 1024.0, "epoch": 0.5190430833367844, "grad_norm": 1.6889816167510485, "kl": 0.479376220703125, "learning_rate": 1.1077489224819402e-05, "loss": 0.0192, "reward": 1.5625366240739822, "reward_std": 0.47994972608394165, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.014546713606068806, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.858333339355886, "step": 2350 }, { "completion_length": 1024.0, "epoch": 0.5201474303226074, "grad_norm": 1.0697187024840797, "kl": 0.394384765625, "learning_rate": 1.1039149171167046e-05, "loss": 0.0158, "reward": 1.4704681444913148, "reward_std": 0.5033119401799923, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.02328185227161157, "rewards/format_reward": 0.65, "rewards/reasoning_steps_reward": 0.8312500044703484, "step": 2355 }, { "completion_length": 1024.0, "epoch": 0.5212517773084303, "grad_norm": 0.25919087539939206, "kl": 0.17598876953125, "learning_rate": 1.1000793669415035e-05, "loss": 0.007, "reward": 1.5186623342335224, "reward_std": 0.38529679665589356, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.02092099927031086, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.995833333581686, "step": 2360 }, { "completion_length": 1024.0, "epoch": 0.5223561242942533, "grad_norm": 0.21925652598328885, "kl": 0.139227294921875, "learning_rate": 1.0962423289760254e-05, "loss": 0.0056, "reward": 1.4680943846702577, "reward_std": 0.37131147203908765, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.038155618403834524, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 1.0, "step": 2365 }, { "completion_length": 1024.0, "epoch": 0.5234604712800762, "grad_norm": 0.43901396796754966, "kl": 0.248681640625, "learning_rate": 1.0924038602620757e-05, "loss": 0.01, "reward": 1.8161877155303956, "reward_std": 0.2932082125398665, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.0838122889137594, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 1.0, "step": 2370 }, { "completion_length": 1024.0, "epoch": 0.5245648182658992, "grad_norm": 0.4594563192454749, "kl": 0.282012939453125, "learning_rate": 1.0885640178627291e-05, "loss": 0.0113, "reward": 1.9489604651927948, "reward_std": 0.15323295153175423, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.03228953526704572, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.9812500014901161, "step": 2375 }, { "completion_length": 1024.0, "epoch": 0.5256691652517221, "grad_norm": 1.4188978226513564, "kl": 0.5095947265625, "learning_rate": 1.0847228588614821e-05, "loss": 0.0204, "reward": 1.813528909534216, "reward_std": 0.1997750176507907, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.06980442710773786, "rewards/format_reward": 0.9875, "rewards/reasoning_steps_reward": 0.889583338610828, "step": 2380 }, { "completion_length": 1024.0, "epoch": 0.526773512237545, "grad_norm": 1.5094835244677218, "kl": 1.997509765625, "learning_rate": 1.0808804403614044e-05, "loss": 0.0799, "reward": 1.6416274465620517, "reward_std": 0.277405764979585, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.04378921372735931, "rewards/format_reward": 0.975, "rewards/reasoning_steps_reward": 0.7041666742414237, "step": 2385 }, { "completion_length": 1024.0, "epoch": 0.5278778592233679, "grad_norm": 1.417871425108259, "kl": 0.92401123046875, "learning_rate": 1.0770368194842886e-05, "loss": 0.037, "reward": 1.751328294724226, "reward_std": 0.21636201266960597, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025755050106090492, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 0.8145833384245634, "step": 2390 }, { "completion_length": 1024.0, "epoch": 0.5289822062091909, "grad_norm": 0.3460517954325989, "kl": 0.739208984375, "learning_rate": 1.073192053369802e-05, "loss": 0.0296, "reward": 1.7935828588902951, "reward_std": 0.310901246771391, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.010583810009211447, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.8979166697710752, "step": 2395 }, { "completion_length": 1024.0, "epoch": 0.5300865531950139, "grad_norm": 0.299553336858581, "kl": 0.269195556640625, "learning_rate": 1.0693461991746389e-05, "loss": 0.0108, "reward": 1.7330624889582396, "reward_std": 0.3259334075613879, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.03985418364172801, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.9416666716337204, "step": 2400 }, { "epoch": 0.5300865531950139, "eval_completion_length": 1024.0, "eval_kl": 0.40162109375, "eval_loss": 0.015542779117822647, "eval_reward": 1.6480663681030274, "eval_reward_std": 0.3486202434706502, "eval_rewards/accuracy_reward": 0.01, "eval_rewards/cosine_scaled_reward": -0.041933644004166125, "eval_rewards/format_reward": 0.835, "eval_rewards/reasoning_steps_reward": 0.8450000047683716, "eval_runtime": 203.8327, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.123, "step": 2400 }, { "completion_length": 1024.0, "epoch": 0.5311909001808368, "grad_norm": 1.1046026648755514, "kl": 0.2391357421875, "learning_rate": 1.0654993140716665e-05, "loss": 0.0096, "reward": 1.7627467274665833, "reward_std": 0.2496938494945425, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.060169946975656786, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.8854166742414236, "step": 2405 }, { "completion_length": 1024.0, "epoch": 0.5322952471666598, "grad_norm": 0.5282380418655942, "kl": 0.385986328125, "learning_rate": 1.0616514552490791e-05, "loss": 0.0154, "reward": 1.7804573088884355, "reward_std": 0.2669147708988021, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.0695426897440484, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9187500055879354, "step": 2410 }, { "completion_length": 1024.0, "epoch": 0.5333995941524827, "grad_norm": 2.4670630667481435, "kl": 1.252880859375, "learning_rate": 1.0578026799095464e-05, "loss": 0.0501, "reward": 1.6643784150481225, "reward_std": 0.3057770044339122, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06687159015400539, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.8500000052154064, "step": 2415 }, { "completion_length": 1024.0, "epoch": 0.5345039411383057, "grad_norm": 0.9935269786883532, "kl": 1.50946044921875, "learning_rate": 1.0539530452693625e-05, "loss": 0.0604, "reward": 1.4255976286716758, "reward_std": 0.438153853449694, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11815237666669418, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.7000000068917871, "step": 2420 }, { "completion_length": 1024.0, "epoch": 0.5356082881241286, "grad_norm": 2.1780659206470374, "kl": 0.464599609375, "learning_rate": 1.0501026085575967e-05, "loss": 0.0186, "reward": 1.3587639363482595, "reward_std": 0.4793480883206939, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10165273098900798, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.7479166768491268, "step": 2425 }, { "completion_length": 1024.0, "epoch": 0.5367126351099516, "grad_norm": 0.26711816143900485, "kl": 0.23184814453125, "learning_rate": 1.046251427015241e-05, "loss": 0.0093, "reward": 1.3165842306800186, "reward_std": 0.5014927750421976, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.08758243720367317, "rewards/format_reward": 0.6, "rewards/reasoning_steps_reward": 0.7854166770353913, "step": 2430 }, { "completion_length": 1024.0, "epoch": 0.5378169820957744, "grad_norm": 1.9867481118253592, "kl": 0.311883544921875, "learning_rate": 1.0423995578943615e-05, "loss": 0.0125, "reward": 1.5306999891996385, "reward_std": 0.48772922792995815, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.09430001441578498, "rewards/format_reward": 0.73125, "rewards/reasoning_steps_reward": 0.8812500104308129, "step": 2435 }, { "completion_length": 1024.0, "epoch": 0.5389213290815974, "grad_norm": 4.140607916865834, "kl": 1.620135498046875, "learning_rate": 1.0385470584572449e-05, "loss": 0.0648, "reward": 1.6088348772376775, "reward_std": 0.4257907736697234, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12241512268665247, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.9250000029802322, "step": 2440 }, { "completion_length": 1024.0, "epoch": 0.5400256760674204, "grad_norm": 0.8670307079372768, "kl": 0.9895751953125, "learning_rate": 1.0346939859755481e-05, "loss": 0.0396, "reward": 1.6652016706764698, "reward_std": 0.3854114227630362, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10563166871434078, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.9270833343267441, "step": 2445 }, { "completion_length": 1024.0, "epoch": 0.5411300230532433, "grad_norm": 1.1343950295235299, "kl": 0.3840087890625, "learning_rate": 1.0308403977294476e-05, "loss": 0.0154, "reward": 1.8509787783026694, "reward_std": 0.18647551744506927, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.08652122293206048, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 0.968750000745058, "step": 2450 }, { "completion_length": 1024.0, "epoch": 0.5422343700390663, "grad_norm": 1.0687106066072793, "kl": 0.62066650390625, "learning_rate": 1.0269863510067872e-05, "loss": 0.0248, "reward": 1.8417707242071628, "reward_std": 0.2083289371990759, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.06447928030538605, "rewards/format_reward": 0.9375, "rewards/reasoning_steps_reward": 0.9625000022351742, "step": 2455 }, { "completion_length": 1024.0, "epoch": 0.5433387170248892, "grad_norm": 0.7622623459030559, "kl": 0.68385009765625, "learning_rate": 1.023131903102226e-05, "loss": 0.0274, "reward": 1.772145263105631, "reward_std": 0.3072871359312558, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.07993807349630515, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.9145833348855377, "step": 2460 }, { "completion_length": 1024.0, "epoch": 0.5444430640107122, "grad_norm": 3.600110628598048, "kl": 2.00994873046875, "learning_rate": 1.0192771113163875e-05, "loss": 0.0804, "reward": 1.2840112496167422, "reward_std": 0.5591359864323749, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2159887515474111, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.7937500040978194, "step": 2465 }, { "completion_length": 1024.0, "epoch": 0.5455474109965351, "grad_norm": 2.75418080869431, "kl": 1.002532958984375, "learning_rate": 1.0154220329550076e-05, "loss": 0.0401, "reward": 1.2907778739929199, "reward_std": 0.5905611860769568, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13838879884569905, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.8729166677221656, "step": 2470 }, { "completion_length": 1020.225, "epoch": 0.5466517579823581, "grad_norm": 1.6118829429672292, "kl": 1.244329833984375, "learning_rate": 1.0115667253280817e-05, "loss": 0.0498, "reward": 1.0045916791073979, "reward_std": 0.6294532329367939, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12249164624954574, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7270833354443311, "step": 2475 }, { "completion_length": 1024.0, "epoch": 0.5477561049681811, "grad_norm": 1.6057608879936789, "kl": 1.879241943359375, "learning_rate": 1.0077112457490143e-05, "loss": 0.0752, "reward": 1.0946636897511781, "reward_std": 0.506724919876433, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13658630648133113, "rewards/format_reward": 0.425, "rewards/reasoning_steps_reward": 0.8000000026077032, "step": 2480 }, { "completion_length": 1024.0, "epoch": 0.5488604519540039, "grad_norm": 0.5646928134790875, "kl": 0.6290374755859375, "learning_rate": 1.0038556515337654e-05, "loss": 0.0252, "reward": 1.3780475069768727, "reward_std": 0.4772715052065905, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1386191618919838, "rewards/format_reward": 0.61875, "rewards/reasoning_steps_reward": 0.8916666675359011, "step": 2485 }, { "completion_length": 1024.0, "epoch": 0.5499647989398269, "grad_norm": 0.8859860742230482, "kl": 0.401104736328125, "learning_rate": 1e-05, "loss": 0.0161, "reward": 1.7356179803609848, "reward_std": 0.3069496586394962, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12479868032969535, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.941666667163372, "step": 2490 }, { "completion_length": 1024.0, "epoch": 0.5510691459256498, "grad_norm": 1.4769301041413523, "kl": 0.3043701171875, "learning_rate": 9.961443484662349e-06, "loss": 0.0122, "reward": 1.7910444140434265, "reward_std": 0.28909274661323253, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.11312224400317064, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9229166679084301, "step": 2495 }, { "completion_length": 1024.0, "epoch": 0.5521734929114728, "grad_norm": 0.5303430541917586, "kl": 0.1882080078125, "learning_rate": 9.92288754250986e-06, "loss": 0.0075, "reward": 1.9123993963003159, "reward_std": 0.24396189967519605, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.10635060318236356, "rewards/format_reward": 0.98125, "rewards/reasoning_steps_reward": 0.9937500014901162, "step": 2500 }, { "epoch": 0.5521734929114728, "eval_completion_length": 1024.0, "eval_kl": 0.21314453125, "eval_loss": 0.008531954139471054, "eval_reward": 1.8809818363189696, "eval_reward_std": 0.17097413605777548, "eval_rewards/accuracy_reward": 0.025, "eval_rewards/cosine_scaled_reward": -0.11235150025226176, "eval_rewards/format_reward": 0.99, "eval_rewards/reasoning_steps_reward": 0.9783333337306976, "eval_runtime": 203.3411, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.123, "step": 2500 }, { "completion_length": 1024.0, "epoch": 0.5532778398972957, "grad_norm": 0.3634886750349178, "kl": 0.218756103515625, "learning_rate": 9.884332746719186e-06, "loss": 0.0088, "reward": 1.8593285992741584, "reward_std": 0.1559050077528809, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.11567139578983188, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 1.0, "step": 2505 }, { "completion_length": 1024.0, "epoch": 0.5543821868831187, "grad_norm": 0.5051986020364817, "kl": 0.266455078125, "learning_rate": 9.845779670449926e-06, "loss": 0.0107, "reward": 1.8667958162724971, "reward_std": 0.22305318891121714, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.0561208424041979, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9729166675359011, "step": 2510 }, { "completion_length": 1024.0, "epoch": 0.5554865338689416, "grad_norm": 1.1588379380142202, "kl": 0.49033203125, "learning_rate": 9.807228886836128e-06, "loss": 0.0196, "reward": 1.6591941472142935, "reward_std": 0.35607906174791426, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.061639188357457894, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.877083339355886, "step": 2515 }, { "completion_length": 1024.0, "epoch": 0.5565908808547646, "grad_norm": 1.0057531712372425, "kl": 0.37701416015625, "learning_rate": 9.768680968977743e-06, "loss": 0.0151, "reward": 1.6011904481798411, "reward_std": 0.3233839514392457, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06130954009086054, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.8500000039115548, "step": 2520 }, { "completion_length": 1022.8625, "epoch": 0.5576952278405876, "grad_norm": 2.7751702000639926, "kl": 1.606396484375, "learning_rate": 9.730136489932133e-06, "loss": 0.0642, "reward": 1.186481614317745, "reward_std": 0.7302230117369618, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.14476838221307844, "rewards/format_reward": 0.63125, "rewards/reasoning_steps_reward": 0.668750005401671, "step": 2525 }, { "completion_length": 1024.0, "epoch": 0.5587995748264104, "grad_norm": 1.753321795976581, "kl": 0.9668701171875, "learning_rate": 9.691596022705527e-06, "loss": 0.0387, "reward": 1.1860073703341185, "reward_std": 0.570556367138488, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.11399263035709736, "rewards/format_reward": 0.5625, "rewards/reasoning_steps_reward": 0.7062500042840838, "step": 2530 }, { "completion_length": 1024.0, "epoch": 0.5599039218122334, "grad_norm": 2.3586733201486587, "kl": 1.7131591796875, "learning_rate": 9.653060140244524e-06, "loss": 0.0685, "reward": 0.7566529627889395, "reward_std": 0.45097853311744984, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.17043037730327343, "rewards/format_reward": 0.175, "rewards/reasoning_steps_reward": 0.7458333386108279, "step": 2535 }, { "completion_length": 1024.0, "epoch": 0.5610082687980563, "grad_norm": 2.4868110206825675, "kl": 0.817779541015625, "learning_rate": 9.614529415427556e-06, "loss": 0.0327, "reward": 0.7670935079455375, "reward_std": 0.35415494777553247, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.16623983170138673, "rewards/format_reward": 0.04375, "rewards/reasoning_steps_reward": 0.8645833404734731, "step": 2540 }, { "completion_length": 1024.0, "epoch": 0.5621126157838793, "grad_norm": 8.994673033795959, "kl": 1.3493743896484376, "learning_rate": 9.576004421056389e-06, "loss": 0.0541, "reward": 0.7187479682266712, "reward_std": 0.24309571331159532, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.18541869991458954, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.8666666701436043, "step": 2545 }, { "completion_length": 1024.0, "epoch": 0.5632169627697022, "grad_norm": 1.9064717191281668, "kl": 0.844708251953125, "learning_rate": 9.537485729847594e-06, "loss": 0.0338, "reward": 0.6888711890205741, "reward_std": 0.25518429378644214, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2194621481387003, "rewards/format_reward": 0.00625, "rewards/reasoning_steps_reward": 0.9020833350718022, "step": 2550 }, { "completion_length": 1024.0, "epoch": 0.5643213097555252, "grad_norm": 6.497313362493659, "kl": 1.571319580078125, "learning_rate": 9.498973914424035e-06, "loss": 0.0629, "reward": 0.6698607638478279, "reward_std": 0.27875330625029165, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21972257128509226, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.8583333335816861, "step": 2555 }, { "completion_length": 1024.0, "epoch": 0.5654256567413481, "grad_norm": 1.5891156033669043, "kl": 3.031695556640625, "learning_rate": 9.460469547306375e-06, "loss": 0.1212, "reward": 0.7087379619479179, "reward_std": 0.3350365572499868, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.15584536720998585, "rewards/format_reward": 0.025, "rewards/reasoning_steps_reward": 0.8208333358168602, "step": 2560 }, { "completion_length": 1024.0, "epoch": 0.5665300037271711, "grad_norm": 5.078097996420085, "kl": 1.387432861328125, "learning_rate": 9.421973200904538e-06, "loss": 0.0555, "reward": 0.6076501269359141, "reward_std": 0.3604350686266116, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.20693321338949316, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.7770833358168602, "step": 2565 }, { "completion_length": 1024.0, "epoch": 0.5676343507129941, "grad_norm": 12.377672982659016, "kl": 1.8506378173828124, "learning_rate": 9.38348544750921e-06, "loss": 0.0741, "reward": 0.6430217208398972, "reward_std": 0.3532645304381731, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.23406161246239207, "rewards/format_reward": 0.04375, "rewards/reasoning_steps_reward": 0.8333333343267441, "step": 2570 }, { "completion_length": 1024.0, "epoch": 0.568738697698817, "grad_norm": 1.8285393227548976, "kl": 1.123150634765625, "learning_rate": 9.345006859283338e-06, "loss": 0.045, "reward": 0.7630160832777619, "reward_std": 0.3213708248760668, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.19323392115911703, "rewards/format_reward": 0.05625, "rewards/reasoning_steps_reward": 0.8750000027939677, "step": 2575 }, { "completion_length": 1024.0, "epoch": 0.5698430446846399, "grad_norm": 3.9635852553643303, "kl": 0.58538818359375, "learning_rate": 9.306538008253611e-06, "loss": 0.0234, "reward": 0.7108107196167112, "reward_std": 0.24838708396055154, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21835594929289073, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.9166666705161333, "step": 2580 }, { "completion_length": 1024.0, "epoch": 0.5709473916704628, "grad_norm": 0.21806865311979415, "kl": 0.7177490234375, "learning_rate": 9.268079466301978e-06, "loss": 0.0287, "reward": 0.7812843410298228, "reward_std": 0.2617356756320078, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1874656603875337, "rewards/format_reward": 0.025, "rewards/reasoning_steps_reward": 0.9312500005587936, "step": 2585 }, { "completion_length": 1024.0, "epoch": 0.5720517386562858, "grad_norm": 6.301133014500009, "kl": 0.8725372314453125, "learning_rate": 9.229631805157116e-06, "loss": 0.0349, "reward": 0.7710512263700366, "reward_std": 0.2632641014934052, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.18936544355237855, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.9229166684672236, "step": 2590 }, { "completion_length": 1024.0, "epoch": 0.5731560856421087, "grad_norm": 8.909114886388627, "kl": 0.811669921875, "learning_rate": 9.19119559638596e-06, "loss": 0.0325, "reward": 0.8804346274584531, "reward_std": 0.34509176830179056, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16331537813821342, "rewards/format_reward": 0.10625, "rewards/reasoning_steps_reward": 0.9250000014901161, "step": 2595 }, { "completion_length": 1024.0, "epoch": 0.5742604326279317, "grad_norm": 1.244644274672843, "kl": 1.816058349609375, "learning_rate": 9.15277141138518e-06, "loss": 0.0727, "reward": 0.8285146844573319, "reward_std": 0.3990332660243439, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.10690198320662603, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.8291666699573398, "step": 2600 }, { "epoch": 0.5742604326279317, "eval_completion_length": 1024.0, "eval_kl": 1.531572265625, "eval_loss": 0.0615340955555439, "eval_reward": 0.8556969636678695, "eval_reward_std": 0.4192620050907135, "eval_rewards/accuracy_reward": 0.03, "eval_rewards/cosine_scaled_reward": -0.14596970692276956, "eval_rewards/format_reward": 0.1, "eval_rewards/reasoning_steps_reward": 0.871666669845581, "eval_runtime": 203.5802, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.123, "step": 2600 }, { "completion_length": 1024.0, "epoch": 0.5753647796137547, "grad_norm": 3.507048504579022, "kl": 1.900311279296875, "learning_rate": 9.114359821372714e-06, "loss": 0.076, "reward": 0.7871741138980723, "reward_std": 0.38461359875800555, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1753258895187173, "rewards/format_reward": 0.15, "rewards/reasoning_steps_reward": 0.8062500018626452, "step": 2605 }, { "completion_length": 1024.0, "epoch": 0.5764691265995776, "grad_norm": 2.5214326423125244, "kl": 1.04107666015625, "learning_rate": 9.075961397379247e-06, "loss": 0.0417, "reward": 0.8862119485624135, "reward_std": 0.4803070175581524, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13878805282874965, "rewards/format_reward": 0.1625, "rewards/reasoning_steps_reward": 0.8500000033527613, "step": 2610 }, { "completion_length": 1024.0, "epoch": 0.5775734735854006, "grad_norm": 3.307341729299716, "kl": 2.38963623046875, "learning_rate": 9.037576710239748e-06, "loss": 0.0955, "reward": 0.841398511081934, "reward_std": 0.4593913863740454, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.2002681548066903, "rewards/format_reward": 0.2, "rewards/reasoning_steps_reward": 0.8291666697710752, "step": 2615 }, { "completion_length": 1024.0, "epoch": 0.5786778205712235, "grad_norm": 1.1047461484931345, "kl": 1.93094482421875, "learning_rate": 8.999206330584969e-06, "loss": 0.0773, "reward": 0.8279006748460234, "reward_std": 0.4829202242734027, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.16584932910191127, "rewards/format_reward": 0.19375, "rewards/reasoning_steps_reward": 0.7625000022351742, "step": 2620 }, { "completion_length": 1024.0, "epoch": 0.5797821675570465, "grad_norm": 1.065156980266071, "kl": 583.0839782714844, "learning_rate": 8.960850828832958e-06, "loss": 23.2767, "reward": 0.8661070578498766, "reward_std": 0.4774655077977513, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.17139294502558186, "rewards/format_reward": 0.15625, "rewards/reasoning_steps_reward": 0.8437500035390257, "step": 2625 }, { "completion_length": 1024.0, "epoch": 0.5808865145428693, "grad_norm": 4.568392600149099, "kl": 4.222686767578125, "learning_rate": 8.9225107751806e-06, "loss": 0.1693, "reward": 0.8945860045030714, "reward_std": 0.3841389437075122, "rewards/accuracy_reward": 0.05625, "rewards/cosine_scaled_reward": -0.17208065872546285, "rewards/format_reward": 0.15, "rewards/reasoning_steps_reward": 0.8604166731238365, "step": 2630 }, { "completion_length": 1024.0, "epoch": 0.5819908615286923, "grad_norm": 1.981971937068967, "kl": 0.79658203125, "learning_rate": 8.884186739595114e-06, "loss": 0.0319, "reward": 0.7690370593219995, "reward_std": 0.3609249549546803, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.21221294170245528, "rewards/format_reward": 0.225, "rewards/reasoning_steps_reward": 0.7437500122934579, "step": 2635 }, { "completion_length": 1024.0, "epoch": 0.5830952085145152, "grad_norm": 2.7805804505170175, "kl": 1.688623046875, "learning_rate": 8.845879291805605e-06, "loss": 0.0675, "reward": 1.1183765586465597, "reward_std": 0.6193717653281056, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.15454010646790267, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.7541666772216559, "step": 2640 }, { "completion_length": 1024.0, "epoch": 0.5841995555003382, "grad_norm": 2.1901160477872876, "kl": 2.324169921875, "learning_rate": 8.807589001294571e-06, "loss": 0.093, "reward": 1.1031825848389416, "reward_std": 0.5278661086966168, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.24473407644545658, "rewards/format_reward": 0.73125, "rewards/reasoning_steps_reward": 0.591666679084301, "step": 2645 }, { "completion_length": 1024.0, "epoch": 0.5853039024861612, "grad_norm": 5.1317753880312935, "kl": 2.041021728515625, "learning_rate": 8.769316437289456e-06, "loss": 0.0817, "reward": 1.161124537140131, "reward_std": 0.6632860126585001, "rewards/accuracy_reward": 0.05, "rewards/cosine_scaled_reward": -0.21387546394253149, "rewards/format_reward": 0.69375, "rewards/reasoning_steps_reward": 0.631250012665987, "step": 2650 }, { "completion_length": 1024.0, "epoch": 0.5864082494719841, "grad_norm": 2.978675995841801, "kl": 1.001495361328125, "learning_rate": 8.731062168754174e-06, "loss": 0.0401, "reward": 1.357415765337646, "reward_std": 0.47671173595408617, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.1863342406024458, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.7125000091269612, "step": 2655 }, { "completion_length": 1024.0, "epoch": 0.5875125964578071, "grad_norm": 2.0018445118420565, "kl": 1.871514892578125, "learning_rate": 8.692826764380662e-06, "loss": 0.0748, "reward": 1.3236260378733278, "reward_std": 0.5231166693600244, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.20970730545814148, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.7083333449438214, "step": 2660 }, { "completion_length": 1024.0, "epoch": 0.58861694344363, "grad_norm": 2.1407022480159283, "kl": 0.8106689453125, "learning_rate": 8.654610792580415e-06, "loss": 0.0324, "reward": 1.5535705825313926, "reward_std": 0.3941577763791429, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.18809608481969917, "rewards/format_reward": 0.9, "rewards/reasoning_steps_reward": 0.816666678711772, "step": 2665 }, { "completion_length": 1024.0, "epoch": 0.589721290429453, "grad_norm": 2.3049029672167585, "kl": 1.445733642578125, "learning_rate": 8.616414821476048e-06, "loss": 0.0578, "reward": 1.6248370364308358, "reward_std": 0.4025070207238969, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.17099629567801458, "rewards/format_reward": 0.90625, "rewards/reasoning_steps_reward": 0.8770833343267441, "step": 2670 }, { "completion_length": 1024.0, "epoch": 0.5908256374152758, "grad_norm": 0.6919174013627627, "kl": 2.08385009765625, "learning_rate": 8.57823941889284e-06, "loss": 0.0834, "reward": 1.5777956765145063, "reward_std": 0.48239568906356, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16595432664034887, "rewards/format_reward": 0.8875, "rewards/reasoning_steps_reward": 0.8437500029802323, "step": 2675 }, { "completion_length": 1024.0, "epoch": 0.5919299844010988, "grad_norm": 0.997312984133755, "kl": 1.877044677734375, "learning_rate": 8.54008515235029e-06, "loss": 0.075, "reward": 1.5631065297173337, "reward_std": 0.3414640254137339, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14314347317558712, "rewards/format_reward": 0.75, "rewards/reasoning_steps_reward": 0.9437500011175871, "step": 2680 }, { "completion_length": 1024.0, "epoch": 0.5930343313869217, "grad_norm": 1.5488170229596216, "kl": 0.488629150390625, "learning_rate": 8.501952589053694e-06, "loss": 0.0196, "reward": 1.6224470026791096, "reward_std": 0.35797781147266505, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16921966964146123, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.9291666716337204, "step": 2685 }, { "completion_length": 1024.0, "epoch": 0.5941386783727447, "grad_norm": 4.133285080841036, "kl": 0.826751708984375, "learning_rate": 8.463842295885712e-06, "loss": 0.0331, "reward": 1.6491547813639045, "reward_std": 0.37279988423979377, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12792855495936237, "rewards/format_reward": 0.9, "rewards/reasoning_steps_reward": 0.8645833399146795, "step": 2690 }, { "completion_length": 1024.0, "epoch": 0.5952430253585677, "grad_norm": 1.5372371953697817, "kl": 1.359478759765625, "learning_rate": 8.425754839397917e-06, "loss": 0.0544, "reward": 1.5732461759354919, "reward_std": 0.4112935331926565, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.18092048349790274, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.8604166703298688, "step": 2695 }, { "completion_length": 1023.89375, "epoch": 0.5963473723443906, "grad_norm": 0.7879978615183016, "kl": 0.82923583984375, "learning_rate": 8.387690785802403e-06, "loss": 0.0332, "reward": 1.5549915019422769, "reward_std": 0.42069571325846483, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14709182740189136, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.9083333384245634, "step": 2700 }, { "epoch": 0.5963473723443906, "eval_completion_length": 1024.0, "eval_kl": 0.63541015625, "eval_loss": 0.025506604462862015, "eval_reward": 1.5887797927856446, "eval_reward_std": 0.43425711914896964, "eval_rewards/accuracy_reward": 0.015, "eval_rewards/cosine_scaled_reward": -0.1462202015519142, "eval_rewards/format_reward": 0.81, "eval_rewards/reasoning_steps_reward": 0.9100000047683716, "eval_runtime": 202.0281, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.124, "step": 2700 }, { "completion_length": 1024.0, "epoch": 0.5974517193302136, "grad_norm": 0.2954926442766836, "kl": 0.60306396484375, "learning_rate": 8.349650700963346e-06, "loss": 0.0241, "reward": 1.5327742783352734, "reward_std": 0.43929143614368515, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.15055906748748385, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.877083340473473, "step": 2705 }, { "completion_length": 1024.0, "epoch": 0.5985560663160365, "grad_norm": 0.40255214529511685, "kl": 0.2451416015625, "learning_rate": 8.311635150388607e-06, "loss": 0.0098, "reward": 1.6107220947742462, "reward_std": 0.3211441752166138, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16219456993276254, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.904166679829359, "step": 2710 }, { "completion_length": 1024.0, "epoch": 0.5996604133018595, "grad_norm": 1.0582522044938396, "kl": 0.6796630859375, "learning_rate": 8.273644699221309e-06, "loss": 0.0272, "reward": 1.708713711425662, "reward_std": 0.3196752316202037, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.12670296079013496, "rewards/format_reward": 0.925, "rewards/reasoning_steps_reward": 0.8854166727513075, "step": 2715 }, { "completion_length": 1024.0, "epoch": 0.6007647602876824, "grad_norm": 2.277187831822091, "kl": 1.67298583984375, "learning_rate": 8.235679912231456e-06, "loss": 0.0669, "reward": 1.528065111860633, "reward_std": 0.4964781049713565, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11776822947686014, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.827083338983357, "step": 2720 }, { "completion_length": 1023.20625, "epoch": 0.6018691072735053, "grad_norm": 18.823418701689295, "kl": 3.2572021484375, "learning_rate": 8.197741353807515e-06, "loss": 0.1303, "reward": 1.1230425384826959, "reward_std": 0.7203874601105781, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.26654079704312605, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.6895833391696214, "step": 2725 }, { "completion_length": 1011.4875, "epoch": 0.6029734542593282, "grad_norm": 4.2381751634354945, "kl": 4.9869384765625, "learning_rate": 8.159829587948048e-06, "loss": 0.1993, "reward": -0.06850880788988434, "reward_std": 0.2341864599104156, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.20809214230976067, "rewards/format_reward": 0.06875, "rewards/reasoning_steps_reward": 0.06458333525806666, "step": 2730 }, { "completion_length": 1024.0, "epoch": 0.6040778012451512, "grad_norm": 96482763.82745863, "kl": 180908570442.25247, "learning_rate": 8.1219451782533e-06, "loss": 7239816806.4, "reward": -0.38960654605180023, "reward_std": 0.10572403705027682, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.41252321302890776, "rewards/format_reward": 0.0125, "rewards/reasoning_steps_reward": 0.010416666977107525, "step": 2735 }, { "completion_length": 1024.0, "epoch": 0.6051821482309742, "grad_norm": 3267730.5942430696, "kl": 421346.476171875, "learning_rate": 8.084088687916853e-06, "loss": 16876.4047, "reward": -0.27517261541215704, "reward_std": 0.13037438962201123, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.30642261592438447, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.031250000931322575, "step": 2740 }, { "completion_length": 1024.0, "epoch": 0.6062864952167971, "grad_norm": 723724.1963519381, "kl": 60747.18715820312, "learning_rate": 8.046260679717225e-06, "loss": 2430.2525, "reward": -0.1426667131279828, "reward_std": 0.13045339315722232, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17391671401273925, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.031250000931322575, "step": 2745 }, { "completion_length": 1024.0, "epoch": 0.6073908422026201, "grad_norm": 1441.8019454907167, "kl": 521.3070068359375, "learning_rate": 8.00846171600952e-06, "loss": 20.84, "reward": -0.2268078915774822, "reward_std": 0.1437462084002618, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2684745579957962, "rewards/format_reward": 0.03125, "rewards/reasoning_steps_reward": 0.010416666977107525, "step": 2750 }, { "completion_length": 1024.0, "epoch": 0.608495189188443, "grad_norm": 9.380650885562765, "kl": 684.062841796875, "learning_rate": 7.970692358717067e-06, "loss": 27.3489, "reward": -0.055707710242131725, "reward_std": 0.30809029219599326, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.259874378569657, "rewards/format_reward": 0.1875, "rewards/reasoning_steps_reward": 0.01666666716337204, "step": 2755 }, { "completion_length": 1024.0, "epoch": 0.609599536174266, "grad_norm": 22.947054770935207, "kl": 3.025634765625, "learning_rate": 7.932953169323057e-06, "loss": 0.1211, "reward": 0.1340429156436585, "reward_std": 0.403066113893874, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.18887375577469356, "rewards/format_reward": 0.275, "rewards/reasoning_steps_reward": 0.022916667349636554, "step": 2760 }, { "completion_length": 1024.0, "epoch": 0.610703883160089, "grad_norm": 4.1216872479716695, "kl": 1.19234619140625, "learning_rate": 7.895244708862204e-06, "loss": 0.0477, "reward": 0.19553506562951953, "reward_std": 0.39290520365666454, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21071493784547785, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.031250000931322575, "step": 2765 }, { "completion_length": 1024.0, "epoch": 0.6118082301459118, "grad_norm": 1.6959155942935658, "kl": 1.01761474609375, "learning_rate": 7.857567537912404e-06, "loss": 0.0407, "reward": 0.5248618606012314, "reward_std": 0.4952483652741648, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.21680481360817794, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.21666667181998492, "step": 2770 }, { "completion_length": 1024.0, "epoch": 0.6129125771317347, "grad_norm": 1.5482569009814988, "kl": 0.517236328125, "learning_rate": 7.8199222165864e-06, "loss": 0.0207, "reward": 0.8010378686711193, "reward_std": 0.49899706967407836, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.20729547403170728, "rewards/format_reward": 0.6375, "rewards/reasoning_steps_reward": 0.37083333916962147, "step": 2775 }, { "completion_length": 1024.0, "epoch": 0.6140169241175577, "grad_norm": 2.4108682922884355, "kl": 0.65213623046875, "learning_rate": 7.78230930452345e-06, "loss": 0.0261, "reward": 1.0279582727060188, "reward_std": 0.6527682813815773, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13454173824720783, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.43125001043081285, "step": 2780 }, { "completion_length": 1024.0, "epoch": 0.6151212711033807, "grad_norm": 3.0357670258767997, "kl": 0.825726318359375, "learning_rate": 7.744729360881023e-06, "loss": 0.033, "reward": 0.9146947997360257, "reward_std": 0.5863862118949328, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13530521424108882, "rewards/format_reward": 0.61875, "rewards/reasoning_steps_reward": 0.4250000087544322, "step": 2785 }, { "completion_length": 1024.0, "epoch": 0.6162256180892036, "grad_norm": 5.402255250510556, "kl": 1.20025634765625, "learning_rate": 7.70718294432646e-06, "loss": 0.048, "reward": 0.8170664728269912, "reward_std": 0.5222204860349848, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.20168353992048652, "rewards/format_reward": 0.525, "rewards/reasoning_steps_reward": 0.4937500128522515, "step": 2790 }, { "completion_length": 1024.0, "epoch": 0.6173299650750266, "grad_norm": 2.6490412611159906, "kl": 0.7834259033203125, "learning_rate": 7.669670613028705e-06, "loss": 0.0313, "reward": 0.9308588748739567, "reward_std": 0.5479372062931361, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21289113731472753, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.5562500149011612, "step": 2795 }, { "completion_length": 1024.0, "epoch": 0.6184343120608495, "grad_norm": 0.9296004266230328, "kl": 0.679315185546875, "learning_rate": 7.632192924649969e-06, "loss": 0.0272, "reward": 1.3005210721777984, "reward_std": 0.42411895469613226, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.20572894245560747, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.668750011920929, "step": 2800 }, { "epoch": 0.6184343120608495, "eval_completion_length": 1024.0, "eval_kl": 0.41427734375, "eval_loss": 0.016660606488585472, "eval_reward": 1.3896788090467453, "eval_reward_std": 0.3565726025402546, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.2536545431613922, "eval_rewards/format_reward": 0.89, "eval_rewards/reasoning_steps_reward": 0.7533333519101143, "eval_runtime": 203.0994, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.123, "step": 2800 }, { "completion_length": 1024.0, "epoch": 0.6195386590466725, "grad_norm": 0.819894636043627, "kl": 0.622088623046875, "learning_rate": 7.594750436337467e-06, "loss": 0.0249, "reward": 1.3817081528744892, "reward_std": 0.39700459074229, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2516251891473075, "rewards/format_reward": 0.88125, "rewards/reasoning_steps_reward": 0.7520833408460021, "step": 2805 }, { "completion_length": 1024.0, "epoch": 0.6206430060324954, "grad_norm": 24.5216205783706, "kl": 0.28448486328125, "learning_rate": 7.557343704715121e-06, "loss": 0.0114, "reward": 1.6452918566763401, "reward_std": 0.2875926383348997, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2067914859391749, "rewards/format_reward": 0.95625, "rewards/reasoning_steps_reward": 0.8958333423361182, "step": 2810 }, { "completion_length": 1024.0, "epoch": 0.6217473530183184, "grad_norm": 0.4780364776400206, "kl": 2.242742919921875, "learning_rate": 7.519973285875303e-06, "loss": 0.0896, "reward": 1.669452325697057, "reward_std": 0.25911546872521285, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.18888101823395118, "rewards/format_reward": 0.93125, "rewards/reasoning_steps_reward": 0.9270833373069763, "step": 2815 }, { "completion_length": 1024.0, "epoch": 0.6228517000041413, "grad_norm": 0.30599849603614737, "kl": 0.239642333984375, "learning_rate": 7.482639735370536e-06, "loss": 0.0096, "reward": 1.7530787236988545, "reward_std": 0.22120716450335748, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1531712787807919, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9500000029802322, "step": 2820 }, { "completion_length": 1024.0, "epoch": 0.6239560469899642, "grad_norm": 0.3026466856046581, "kl": 0.4430419921875, "learning_rate": 7.445343608205273e-06, "loss": 0.0177, "reward": 1.7619009755551815, "reward_std": 0.2164007437779219, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15059902559150942, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9625000029802322, "step": 2825 }, { "completion_length": 1024.0, "epoch": 0.6250603939757872, "grad_norm": 7.081063593030829, "kl": 1.41685791015625, "learning_rate": 7.408085458827612e-06, "loss": 0.0566, "reward": 1.679977324604988, "reward_std": 0.3088035559238051, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13877267238276544, "rewards/format_reward": 0.875, "rewards/reasoning_steps_reward": 0.9437500014901161, "step": 2830 }, { "completion_length": 1024.0, "epoch": 0.6261647409616101, "grad_norm": 2.1946555791360374, "kl": 0.98446044921875, "learning_rate": 7.37086584112108e-06, "loss": 0.0394, "reward": 1.635563358478248, "reward_std": 0.3597205114591816, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11443664449179777, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.9187500059604645, "step": 2835 }, { "completion_length": 1024.0, "epoch": 0.6272690879474331, "grad_norm": 5.092191379211699, "kl": 1.26165771484375, "learning_rate": 7.333685308396383e-06, "loss": 0.0505, "reward": 1.437623752374202, "reward_std": 0.564753658437985, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11445958482654532, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.8333333428949118, "step": 2840 }, { "completion_length": 1024.0, "epoch": 0.628373434933256, "grad_norm": 0.3518558091842223, "kl": 0.6064208984375, "learning_rate": 7.2965444133831905e-06, "loss": 0.0243, "reward": 1.6313017681241035, "reward_std": 0.39246049159555696, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12494824056047946, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.900000006891787, "step": 2845 }, { "completion_length": 1024.0, "epoch": 0.629477781919079, "grad_norm": 0.4555829404371441, "kl": 0.33995361328125, "learning_rate": 7.2594437082219074e-06, "loss": 0.0136, "reward": 1.7459608260542154, "reward_std": 0.23804061831906437, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12487251611310057, "rewards/format_reward": 0.95, "rewards/reasoning_steps_reward": 0.9145833410322666, "step": 2850 }, { "completion_length": 1024.0, "epoch": 0.630582128904902, "grad_norm": 1.0048391384575739, "kl": 0.7876708984375, "learning_rate": 7.222383744455477e-06, "loss": 0.0315, "reward": 1.6474058616906404, "reward_std": 0.2795259444072144, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12759414308093256, "rewards/format_reward": 0.91875, "rewards/reasoning_steps_reward": 0.8562500108033418, "step": 2855 }, { "completion_length": 1024.0, "epoch": 0.6316864758907249, "grad_norm": 13.683680496200315, "kl": 1.6222900390625, "learning_rate": 7.185365073021171e-06, "loss": 0.0649, "reward": 1.6242793073877693, "reward_std": 0.4118321215661126, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11113736048791907, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.8666666738688946, "step": 2860 }, { "completion_length": 1024.0, "epoch": 0.6327908228765479, "grad_norm": 15.701500203307981, "kl": 5.002978515625, "learning_rate": 7.148388244242414e-06, "loss": 0.2001, "reward": 0.950605523493141, "reward_std": 0.5147591066779569, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.18481114405440166, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.6604166783392429, "step": 2865 }, { "completion_length": 1024.0, "epoch": 0.6338951698623707, "grad_norm": 7.517215117513362, "kl": 4.4798828125, "learning_rate": 7.111453807820587e-06, "loss": 0.1791, "reward": 0.7040133336733561, "reward_std": 0.5188803709228523, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.17307000582586624, "rewards/format_reward": 0.2625, "rewards/reasoning_steps_reward": 0.6020833441987634, "step": 2870 }, { "completion_length": 1024.0, "epoch": 0.6349995168481937, "grad_norm": 2.7433463665535838, "kl": 1.028369140625, "learning_rate": 7.0745623128268605e-06, "loss": 0.0411, "reward": 0.9007511441479437, "reward_std": 0.5913131707464345, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17008219271956476, "rewards/format_reward": 0.35625, "rewards/reasoning_steps_reward": 0.7145833415910602, "step": 2875 }, { "completion_length": 1024.0, "epoch": 0.6361038638340166, "grad_norm": 4.233552397302543, "kl": 1.39521484375, "learning_rate": 7.037714307694038e-06, "loss": 0.0558, "reward": 0.8574047698173672, "reward_std": 0.5745913892163514, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1780118998591206, "rewards/format_reward": 0.33125, "rewards/reasoning_steps_reward": 0.7041666755452752, "step": 2880 }, { "completion_length": 1024.0, "epoch": 0.6372082108198396, "grad_norm": 7.045129815332537, "kl": 2.99322509765625, "learning_rate": 7.000910340208393e-06, "loss": 0.1197, "reward": 0.6241100358776748, "reward_std": 0.6179765696193499, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2154732992494246, "rewards/format_reward": 0.225, "rewards/reasoning_steps_reward": 0.6145833460614085, "step": 2885 }, { "completion_length": 1024.0, "epoch": 0.6383125578056625, "grad_norm": 2.9137501903252745, "kl": 2.52530517578125, "learning_rate": 6.964150957501538e-06, "loss": 0.101, "reward": 0.6827161773107946, "reward_std": 0.5930506098521618, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2006171574297241, "rewards/format_reward": 0.2625, "rewards/reasoning_steps_reward": 0.6208333447575569, "step": 2890 }, { "completion_length": 1024.0, "epoch": 0.6394169047914855, "grad_norm": 3.1443553762227796, "kl": 1.66383056640625, "learning_rate": 6.927436706042276e-06, "loss": 0.0666, "reward": 0.8434384000953287, "reward_std": 0.697918272089737, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21072826540221284, "rewards/format_reward": 0.3625, "rewards/reasoning_steps_reward": 0.6916666748002172, "step": 2895 }, { "completion_length": 1024.0, "epoch": 0.6405212517773085, "grad_norm": 2.1234073498427977, "kl": 1.063555908203125, "learning_rate": 6.890768131628492e-06, "loss": 0.0425, "reward": 1.0372354218969122, "reward_std": 0.5802448318753705, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16901458023348823, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7375000087544322, "step": 2900 }, { "epoch": 0.6405212517773085, "eval_completion_length": 1024.0, "eval_kl": 1.18333984375, "eval_loss": 0.04755154624581337, "eval_reward": 1.1212347888946532, "eval_reward_std": 0.60481853954494, "eval_rewards/accuracy_reward": 0.025, "eval_rewards/cosine_scaled_reward": -0.1587652049958706, "eval_rewards/format_reward": 0.475, "eval_rewards/reasoning_steps_reward": 0.7800000095367432, "eval_runtime": 203.4898, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.123, "step": 2900 }, { "completion_length": 1024.0, "epoch": 0.6416255987631314, "grad_norm": 8.761149447970329, "kl": 1.7990478515625, "learning_rate": 6.8541457793790204e-06, "loss": 0.0719, "reward": 1.0065348925068975, "reward_std": 0.7884352072956972, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.19346511634066701, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7187500128522515, "step": 2905 }, { "completion_length": 1022.025, "epoch": 0.6427299457489544, "grad_norm": 37.871727195710804, "kl": 3.3865478515625, "learning_rate": 6.8175701937255645e-06, "loss": 0.1355, "reward": 0.8355722818523645, "reward_std": 0.5863746992239612, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.2581777243176475, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7000000117346644, "step": 2910 }, { "completion_length": 1022.73125, "epoch": 0.6438342927347772, "grad_norm": 7.935357600252908, "kl": 2.79166259765625, "learning_rate": 6.781041918404578e-06, "loss": 0.1117, "reward": 0.8582875849679112, "reward_std": 0.7126139059808339, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.20629576151259243, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.677083345130086, "step": 2915 }, { "completion_length": 1024.0, "epoch": 0.6449386397206002, "grad_norm": 1.322793615061763, "kl": 1.676708984375, "learning_rate": 6.744561496449208e-06, "loss": 0.0671, "reward": 1.1432567204814403, "reward_std": 0.6100759642431512, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17132661846117117, "rewards/format_reward": 0.55, "rewards/reasoning_steps_reward": 0.7645833445712924, "step": 2920 }, { "completion_length": 1024.0, "epoch": 0.6460429867064231, "grad_norm": 3.0878247518849595, "kl": 1.036236572265625, "learning_rate": 6.708129470181197e-06, "loss": 0.0414, "reward": 1.1472035638988018, "reward_std": 0.6603698913229892, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14029642865643838, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7812500115483999, "step": 2925 }, { "completion_length": 1024.0, "epoch": 0.6471473336922461, "grad_norm": 7.62569345930808, "kl": 0.920257568359375, "learning_rate": 6.671746381202835e-06, "loss": 0.0368, "reward": 1.327859591320157, "reward_std": 0.5649135158251738, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14089040720136836, "rewards/format_reward": 0.5625, "rewards/reasoning_steps_reward": 0.8937500085681677, "step": 2930 }, { "completion_length": 1024.0, "epoch": 0.648251680678069, "grad_norm": 3.927819408054046, "kl": 1.06234130859375, "learning_rate": 6.635412770388911e-06, "loss": 0.0425, "reward": 1.2363583998754621, "reward_std": 0.5483630039729178, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.17614159435615875, "rewards/format_reward": 0.56875, "rewards/reasoning_steps_reward": 0.8375000089406968, "step": 2935 }, { "completion_length": 1024.0, "epoch": 0.649356027663892, "grad_norm": 10.957938169885486, "kl": 1.73878173828125, "learning_rate": 6.5991291778786556e-06, "loss": 0.0696, "reward": 1.2223307210952044, "reward_std": 0.5801997775997734, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.18808595033478923, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.8416666766628623, "step": 2940 }, { "completion_length": 1024.0, "epoch": 0.650460374649715, "grad_norm": 7.657170788292764, "kl": 0.892596435546875, "learning_rate": 6.562896143067734e-06, "loss": 0.0357, "reward": 1.2322623513638973, "reward_std": 0.6118116105441004, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1469043156481348, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.8291666766628623, "step": 2945 }, { "completion_length": 1024.0, "epoch": 0.6515647216355379, "grad_norm": 8.99018306454909, "kl": 1.005328369140625, "learning_rate": 6.526714204600212e-06, "loss": 0.0402, "reward": 1.110228473204188, "reward_std": 0.5596809437090997, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14393819727119989, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7791666759178042, "step": 2950 }, { "completion_length": 1024.0, "epoch": 0.6526690686213609, "grad_norm": 7.671710767110563, "kl": 0.838983154296875, "learning_rate": 6.490583900360543e-06, "loss": 0.0336, "reward": 1.3401226574555039, "reward_std": 0.553086530593282, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13904401452746243, "rewards/format_reward": 0.60625, "rewards/reasoning_steps_reward": 0.8604166749864817, "step": 2955 }, { "completion_length": 1024.0, "epoch": 0.6537734156071838, "grad_norm": 3.292747371908634, "kl": 0.92437744140625, "learning_rate": 6.4545057674655954e-06, "loss": 0.037, "reward": 1.328625155496411, "reward_std": 0.4291026462393347, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12137485419007135, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.8125000057742, "step": 2960 }, { "completion_length": 1024.0, "epoch": 0.6548777625930067, "grad_norm": 4.362719186181906, "kl": 0.94898681640625, "learning_rate": 6.418480342256635e-06, "loss": 0.038, "reward": 1.357650207653205, "reward_std": 0.538313817546441, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13609979636312347, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.862500006146729, "step": 2965 }, { "completion_length": 1024.0, "epoch": 0.6559821095788296, "grad_norm": 6.047466207524628, "kl": 1.5346435546875, "learning_rate": 6.38250816029139e-06, "loss": 0.0614, "reward": 1.3210783490445466, "reward_std": 0.5543214490637183, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13100498942367267, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 0.8270833432674408, "step": 2970 }, { "completion_length": 1024.0, "epoch": 0.6570864565646526, "grad_norm": 11.380391296560157, "kl": 0.86226806640625, "learning_rate": 6.34658975633605e-06, "loss": 0.0345, "reward": 1.3079326836625114, "reward_std": 0.427550901808354, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12540064963977784, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.8083333384245635, "step": 2975 }, { "completion_length": 1024.0, "epoch": 0.6581908035504755, "grad_norm": 3.430002203421272, "kl": 1.301171875, "learning_rate": 6.310725664357349e-06, "loss": 0.0521, "reward": 1.3198414511978627, "reward_std": 0.5648883628775365, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.08849188148742541, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.8270833415910601, "step": 2980 }, { "completion_length": 1024.0, "epoch": 0.6592951505362985, "grad_norm": 10.740243313709202, "kl": 1.353033447265625, "learning_rate": 6.274916417514605e-06, "loss": 0.0542, "reward": 1.3674539031460882, "reward_std": 0.3986359235073905, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12837943203630858, "rewards/format_reward": 0.6625, "rewards/reasoning_steps_reward": 0.8270833410322667, "step": 2985 }, { "completion_length": 1024.0, "epoch": 0.6603994975221215, "grad_norm": 6.295784777875361, "kl": 1.66905517578125, "learning_rate": 6.239162548151809e-06, "loss": 0.0667, "reward": 1.2881278064567596, "reward_std": 0.6237002839145134, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.1264555389279849, "rewards/format_reward": 0.58125, "rewards/reasoning_steps_reward": 0.8083333417773246, "step": 2990 }, { "completion_length": 1024.0, "epoch": 0.6615038445079444, "grad_norm": 4.214532608040704, "kl": 1.88565673828125, "learning_rate": 6.2034645877897e-06, "loss": 0.0754, "reward": 1.2514354882296175, "reward_std": 0.5845151668967447, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.11939785096910782, "rewards/format_reward": 0.55, "rewards/reasoning_steps_reward": 0.7958333421498537, "step": 2995 }, { "completion_length": 1024.0, "epoch": 0.6626081914937674, "grad_norm": 9.193960699184903, "kl": 2.572412109375, "learning_rate": 6.167823067117868e-06, "loss": 0.1029, "reward": 1.358169614057988, "reward_std": 0.5987432187257582, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11891372348181903, "rewards/format_reward": 0.63125, "rewards/reasoning_steps_reward": 0.8270833395421505, "step": 3000 }, { "epoch": 0.6626081914937674, "eval_completion_length": 1024.0, "eval_kl": 0.82634765625, "eval_loss": 0.03317331522703171, "eval_reward": 1.4329055428504944, "eval_reward_std": 0.47894756741821765, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.15376112081110477, "eval_rewards/format_reward": 0.69, "eval_rewards/reasoning_steps_reward": 0.896666671037674, "eval_runtime": 202.2678, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.124, "step": 3000 }, { "completion_length": 1024.0, "epoch": 0.6637125384795903, "grad_norm": 4.332199272935333, "kl": 1.16727294921875, "learning_rate": 6.132238515986868e-06, "loss": 0.0467, "reward": 1.2984854570306197, "reward_std": 0.4428606638291967, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12859787809356932, "rewards/format_reward": 0.61875, "rewards/reasoning_steps_reward": 0.8083333378657699, "step": 3005 }, { "completion_length": 1024.0, "epoch": 0.6648168854654133, "grad_norm": 6.405145163446423, "kl": 1.905499267578125, "learning_rate": 6.096711463400333e-06, "loss": 0.0762, "reward": 1.4704199727624654, "reward_std": 0.47390592549927535, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15249669990153053, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.8979166725650429, "step": 3010 }, { "completion_length": 1024.0, "epoch": 0.6659212324512361, "grad_norm": 2.7983690454882835, "kl": 1.94473876953125, "learning_rate": 6.061242437507131e-06, "loss": 0.0778, "reward": 1.250732819433324, "reward_std": 0.6140466192155145, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.15968385117303113, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.8354166734963655, "step": 3015 }, { "completion_length": 1024.0, "epoch": 0.6670255794370591, "grad_norm": 2.84797048221281, "kl": 2.10213623046875, "learning_rate": 6.025831965593479e-06, "loss": 0.0841, "reward": 1.107512214500457, "reward_std": 0.5237852192483843, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.19457112488453276, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.8270833421498537, "step": 3020 }, { "completion_length": 1024.0, "epoch": 0.668129926422882, "grad_norm": 1.7479375003128341, "kl": 2.409619140625, "learning_rate": 5.990480574075143e-06, "loss": 0.0963, "reward": 1.113942611636594, "reward_std": 0.6643439802435751, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1798073928861413, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.8125000087544322, "step": 3025 }, { "completion_length": 1024.0, "epoch": 0.669234273408705, "grad_norm": 4.910115773642404, "kl": 2.628387451171875, "learning_rate": 5.955188788489583e-06, "loss": 0.1052, "reward": 1.0177073845639826, "reward_std": 0.6183391271624714, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.18854261809028686, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.8250000083819031, "step": 3030 }, { "completion_length": 1024.0, "epoch": 0.670338620394528, "grad_norm": 2.5567080757004326, "kl": 1.95263671875, "learning_rate": 5.919957133488155e-06, "loss": 0.078, "reward": 0.9483737903181464, "reward_std": 0.5370062646285078, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1974595475825481, "rewards/format_reward": 0.35, "rewards/reasoning_steps_reward": 0.7958333401009441, "step": 3035 }, { "completion_length": 1024.0, "epoch": 0.6714429673803509, "grad_norm": 53.531605237061235, "kl": 1.660748291015625, "learning_rate": 5.884786132828304e-06, "loss": 0.0664, "reward": 1.0435184644535185, "reward_std": 0.6034316588047659, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.18773154099471867, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.8437500104308129, "step": 3040 }, { "completion_length": 1024.0, "epoch": 0.6725473143661739, "grad_norm": 22.074165945119677, "kl": 4.23909912109375, "learning_rate": 5.849676309365786e-06, "loss": 0.1697, "reward": 1.0926960329525173, "reward_std": 0.6052615458262153, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.18022063072421587, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.7979166788980365, "step": 3045 }, { "completion_length": 1024.0, "epoch": 0.6736516613519968, "grad_norm": 95.44466344567884, "kl": 3.39718017578125, "learning_rate": 5.814628185046884e-06, "loss": 0.1359, "reward": 1.2230933974977234, "reward_std": 0.6664319176386926, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1852399377487018, "rewards/format_reward": 0.58125, "rewards/reasoning_steps_reward": 0.8145833408460021, "step": 3050 }, { "completion_length": 1024.0, "epoch": 0.6747560083378198, "grad_norm": 6.30138216782726, "kl": 4.382000732421875, "learning_rate": 5.779642280900668e-06, "loss": 0.1753, "reward": 1.2127747944556178, "reward_std": 0.521913470455911, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.18930854120990262, "rewards/format_reward": 0.58125, "rewards/reasoning_steps_reward": 0.8208333421498537, "step": 3055 }, { "completion_length": 1024.0, "epoch": 0.6758603553236426, "grad_norm": 3.3382749208758082, "kl": 1.1747314453125, "learning_rate": 5.744719117031217e-06, "loss": 0.047, "reward": 1.2256219832226634, "reward_std": 0.6094286283361725, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.19521136274852324, "rewards/format_reward": 0.6125, "rewards/reasoning_steps_reward": 0.7958333427086473, "step": 3060 }, { "completion_length": 1024.0, "epoch": 0.6769647023094656, "grad_norm": 1.0005265918987625, "kl": 1.160723876953125, "learning_rate": 5.709859212609919e-06, "loss": 0.0464, "reward": 1.1743648422183468, "reward_std": 0.4749204738356639, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.20063516062800774, "rewards/format_reward": 0.58125, "rewards/reasoning_steps_reward": 0.7875000093132257, "step": 3065 }, { "completion_length": 1022.075, "epoch": 0.6780690492952886, "grad_norm": 1.821401805667734, "kl": 1.6499267578125, "learning_rate": 5.675063085867747e-06, "loss": 0.066, "reward": 1.3006652948854025, "reward_std": 0.5155112744258077, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.21600137041823472, "rewards/format_reward": 0.675, "rewards/reasoning_steps_reward": 0.8416666749864816, "step": 3070 }, { "completion_length": 1024.0, "epoch": 0.6791733962811115, "grad_norm": 5.981269217881959, "kl": 1.138519287109375, "learning_rate": 5.6403312540875325e-06, "loss": 0.0456, "reward": 1.1754640196362742, "reward_std": 0.5700981944799424, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.17036931174516212, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.7708333428949118, "step": 3075 }, { "completion_length": 1024.0, "epoch": 0.6802777432669345, "grad_norm": 0.41069015629939837, "kl": 0.76771240234375, "learning_rate": 5.6056642335963e-06, "loss": 0.0307, "reward": 1.1382618664763868, "reward_std": 0.5532342360922484, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12007147440890549, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7770833456888795, "step": 3080 }, { "completion_length": 1024.0, "epoch": 0.6813820902527574, "grad_norm": 0.45858301017395814, "kl": 0.736260986328125, "learning_rate": 5.571062539757582e-06, "loss": 0.0295, "reward": 1.1341605888563209, "reward_std": 0.5165931562354672, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12417274994077161, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7770833443850279, "step": 3085 }, { "completion_length": 1024.0, "epoch": 0.6824864372385804, "grad_norm": 0.6870617586646819, "kl": 0.554400634765625, "learning_rate": 5.536526686963762e-06, "loss": 0.0222, "reward": 1.3453298162668943, "reward_std": 0.41773030079348245, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14425351886893623, "rewards/format_reward": 0.64375, "rewards/reasoning_steps_reward": 0.845833346620202, "step": 3090 }, { "completion_length": 1024.0, "epoch": 0.6835907842244033, "grad_norm": 0.2919537380153754, "kl": 0.2393585205078125, "learning_rate": 5.50205718862841e-06, "loss": 0.0096, "reward": 1.4464579613879323, "reward_std": 0.3893053664593026, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12229204796021804, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.8437500229105354, "step": 3095 }, { "completion_length": 1024.0, "epoch": 0.6846951312102263, "grad_norm": 0.8647207902345468, "kl": 0.148114013671875, "learning_rate": 5.467654557178679e-06, "loss": 0.0059, "reward": 1.5913012862205504, "reward_std": 0.28262074058166037, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.131615390918887, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.8729166831821203, "step": 3100 }, { "epoch": 0.6846951312102263, "eval_completion_length": 1024.0, "eval_kl": 0.34259765625, "eval_loss": 0.013789056800305843, "eval_reward": 1.687197803258896, "eval_reward_std": 0.24495098181068897, "eval_rewards/accuracy_reward": 0.005, "eval_rewards/cosine_scaled_reward": -0.12946887340396643, "eval_rewards/format_reward": 0.93, "eval_rewards/reasoning_steps_reward": 0.881666682958603, "eval_runtime": 201.4218, "eval_samples_per_second": 0.492, "eval_steps_per_second": 0.124, "step": 3100 }, { "completion_length": 1024.0, "epoch": 0.6857994781960493, "grad_norm": 0.20802814162316793, "kl": 0.171466064453125, "learning_rate": 5.433319304047666e-06, "loss": 0.0069, "reward": 1.7369946524500848, "reward_std": 0.22078395002754406, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15050535116752145, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 0.9250000149011612, "step": 3105 }, { "completion_length": 1024.0, "epoch": 0.6869038251818721, "grad_norm": 0.22469975874112494, "kl": 0.1765625, "learning_rate": 5.399051939666817e-06, "loss": 0.0071, "reward": 1.8477778874337674, "reward_std": 0.16415738863433943, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.10013877979945392, "rewards/format_reward": 0.9625, "rewards/reasoning_steps_reward": 0.9729166686534881, "step": 3110 }, { "completion_length": 1024.0, "epoch": 0.688008172167695, "grad_norm": 0.29801419479847224, "kl": 0.175927734375, "learning_rate": 5.36485297345833e-06, "loss": 0.007, "reward": 1.815246258676052, "reward_std": 0.18331017740074457, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.11392041069957486, "rewards/format_reward": 0.94375, "rewards/reasoning_steps_reward": 0.9791666708886624, "step": 3115 }, { "completion_length": 1024.0, "epoch": 0.689112519153518, "grad_norm": 0.29642392200106615, "kl": 0.22320556640625, "learning_rate": 5.330722913827594e-06, "loss": 0.0089, "reward": 1.831030797213316, "reward_std": 0.12136474607978016, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09813587362295947, "rewards/format_reward": 0.95625, "rewards/reasoning_steps_reward": 0.9729166693985463, "step": 3120 }, { "completion_length": 1024.0, "epoch": 0.690216866139341, "grad_norm": 0.7212414204228248, "kl": 0.284033203125, "learning_rate": 5.29666226815563e-06, "loss": 0.0114, "reward": 1.931699000298977, "reward_std": 0.184273699127516, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.05580101099476451, "rewards/format_reward": 0.96875, "rewards/reasoning_steps_reward": 0.993750000745058, "step": 3125 }, { "completion_length": 1024.0, "epoch": 0.6913212131251639, "grad_norm": 2.361609329654343, "kl": 0.662255859375, "learning_rate": 5.262671542791531e-06, "loss": 0.0265, "reward": 1.7818097308278085, "reward_std": 0.2128821130763754, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.09319027925921546, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.9812500014901161, "step": 3130 }, { "completion_length": 1024.0, "epoch": 0.6924255601109869, "grad_norm": 12.86003879073767, "kl": 2.10830078125, "learning_rate": 5.228751243044961e-06, "loss": 0.0843, "reward": 1.560298126633279, "reward_std": 0.4203119643294485, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10845187624436221, "rewards/format_reward": 0.7375, "rewards/reasoning_steps_reward": 0.9250000037252903, "step": 3135 }, { "completion_length": 1024.0, "epoch": 0.6935299070968098, "grad_norm": 0.7414360033034371, "kl": 2.04703369140625, "learning_rate": 5.194901873178622e-06, "loss": 0.0819, "reward": 1.4238286472856998, "reward_std": 0.524612655222063, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13658802091204053, "rewards/format_reward": 0.6375, "rewards/reasoning_steps_reward": 0.9166666744276881, "step": 3140 }, { "completion_length": 1024.0, "epoch": 0.6946342540826328, "grad_norm": 1.948404998094558, "kl": 0.603228759765625, "learning_rate": 5.1611239364007694e-06, "loss": 0.0241, "reward": 1.5344082202762366, "reward_std": 0.4847637562903401, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.08642512287769932, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.9333333365619183, "step": 3145 }, { "completion_length": 1024.0, "epoch": 0.6957386010684558, "grad_norm": 1.966165060762746, "kl": 0.9482421875, "learning_rate": 5.127417934857718e-06, "loss": 0.0379, "reward": 1.6038374023512005, "reward_std": 0.4338164870128821, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11282926524436335, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.9104166701436043, "step": 3150 }, { "completion_length": 1024.0, "epoch": 0.6968429480542786, "grad_norm": 1.209511319929151, "kl": 1.12919921875, "learning_rate": 5.093784369626397e-06, "loss": 0.0452, "reward": 1.657803256250918, "reward_std": 0.4734325369603539, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.11303008127142675, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.9145833380520344, "step": 3155 }, { "completion_length": 1024.0, "epoch": 0.6979472950401016, "grad_norm": 2.5216009135121897, "kl": 1.0867431640625, "learning_rate": 5.060223740706883e-06, "loss": 0.0435, "reward": 1.681024150364101, "reward_std": 0.35479468195644587, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13772585423866984, "rewards/format_reward": 0.89375, "rewards/reasoning_steps_reward": 0.9125000040978193, "step": 3160 }, { "completion_length": 1024.0, "epoch": 0.6990516420259245, "grad_norm": 0.8886495088503168, "kl": 1.002899169921875, "learning_rate": 5.026736547014981e-06, "loss": 0.0401, "reward": 1.5900187201797962, "reward_std": 0.3701528381861863, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14123128662467935, "rewards/format_reward": 0.85, "rewards/reasoning_steps_reward": 0.8750000044703483, "step": 3165 }, { "completion_length": 1024.0, "epoch": 0.7001559890117475, "grad_norm": 27.57217136094774, "kl": 0.6616455078125, "learning_rate": 4.993323286374787e-06, "loss": 0.0265, "reward": 1.66028910233872, "reward_std": 0.4426726102217799, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11054423401947133, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.8895833365619182, "step": 3170 }, { "completion_length": 1024.0, "epoch": 0.7012603359975704, "grad_norm": 5.439057868497935, "kl": 1.2489013671875, "learning_rate": 4.959984455511313e-06, "loss": 0.05, "reward": 1.7117673270404339, "reward_std": 0.3744364564627176, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.0944826710416237, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.9125000016763807, "step": 3175 }, { "completion_length": 1024.0, "epoch": 0.7023646829833934, "grad_norm": 1.2836582871647162, "kl": 1.3407470703125, "learning_rate": 4.926720550043089e-06, "loss": 0.0536, "reward": 1.5685864306986332, "reward_std": 0.4689555343808024, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11266357623535442, "rewards/format_reward": 0.81875, "rewards/reasoning_steps_reward": 0.8437500027939677, "step": 3180 }, { "completion_length": 1024.0, "epoch": 0.7034690299692163, "grad_norm": 2.055876499575165, "kl": 0.6889404296875, "learning_rate": 4.893532064474787e-06, "loss": 0.0276, "reward": 1.6821095246821642, "reward_std": 0.35570907073124547, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.09914047343772836, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9000000033527613, "step": 3185 }, { "completion_length": 1024.0, "epoch": 0.7045733769550393, "grad_norm": 0.849845748423523, "kl": 0.6326416015625, "learning_rate": 4.860419492189886e-06, "loss": 0.0253, "reward": 1.7199891421943903, "reward_std": 0.4098593617709412, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.09042752947134432, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.941666667163372, "step": 3190 }, { "completion_length": 1024.0, "epoch": 0.7056777239408623, "grad_norm": 1.5168426575194247, "kl": 0.715838623046875, "learning_rate": 4.827383325443331e-06, "loss": 0.0286, "reward": 1.6638664927333593, "reward_std": 0.4076169016363565, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1132168541662395, "rewards/format_reward": 0.84375, "rewards/reasoning_steps_reward": 0.9208333348855376, "step": 3195 }, { "completion_length": 1024.0, "epoch": 0.7067820709266852, "grad_norm": 1.096524838167028, "kl": 0.705718994140625, "learning_rate": 4.794424055354213e-06, "loss": 0.0283, "reward": 1.6534100268036127, "reward_std": 0.34983972859299683, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1278399708433426, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.9125000040978193, "step": 3200 }, { "epoch": 0.7067820709266852, "eval_completion_length": 1024.0, "eval_kl": 1.029453125, "eval_loss": 0.041368499398231506, "eval_reward": 1.6281694555282593, "eval_reward_std": 0.3215235733985901, "eval_rewards/accuracy_reward": 0.005, "eval_rewards/cosine_scaled_reward": -0.12349721165373921, "eval_rewards/format_reward": 0.85, "eval_rewards/reasoning_steps_reward": 0.896666671037674, "eval_runtime": 202.4017, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.124, "step": 3200 }, { "completion_length": 1024.0, "epoch": 0.7078864179125081, "grad_norm": 0.43758151226762365, "kl": 1.76549072265625, "learning_rate": 4.761542171898469e-06, "loss": 0.0706, "reward": 1.5854908142238855, "reward_std": 0.4213532349691377, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1353425225330284, "rewards/format_reward": 0.83125, "rewards/reasoning_steps_reward": 0.8833333387970924, "step": 3205 }, { "completion_length": 1024.0, "epoch": 0.708990764898331, "grad_norm": 4.873529816062781, "kl": 1.70841064453125, "learning_rate": 4.728738163901597e-06, "loss": 0.0684, "reward": 1.571311548165977, "reward_std": 0.5427065275493078, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.10577178624807856, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.8333333378657699, "step": 3210 }, { "completion_length": 1024.0, "epoch": 0.710095111884154, "grad_norm": 2.164610457649961, "kl": 2.211083984375, "learning_rate": 4.696012519031397e-06, "loss": 0.0885, "reward": 1.3987850124016403, "reward_std": 0.6345736057992326, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14913165720063262, "rewards/format_reward": 0.7375, "rewards/reasoning_steps_reward": 0.7979166723787785, "step": 3215 }, { "completion_length": 1024.0, "epoch": 0.7111994588699769, "grad_norm": 2.5538641739468426, "kl": 1.83671875, "learning_rate": 4.663365723790698e-06, "loss": 0.0735, "reward": 1.4091109903994947, "reward_std": 0.6236417117870587, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15755568039567153, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.7979166768491268, "step": 3220 }, { "completion_length": 1024.0, "epoch": 0.7123038058557999, "grad_norm": 1.350379876708449, "kl": 0.92711181640625, "learning_rate": 4.630798263510162e-06, "loss": 0.0371, "reward": 1.528088748920709, "reward_std": 0.629235400253674, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.13024459112311887, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.8083333395421505, "step": 3225 }, { "completion_length": 1024.0, "epoch": 0.7134081528416228, "grad_norm": 0.7771904145990121, "kl": 0.715679931640625, "learning_rate": 4.598310622341037e-06, "loss": 0.0286, "reward": 1.4236940758302807, "reward_std": 0.3954155091239954, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16797260310559067, "rewards/format_reward": 0.78125, "rewards/reasoning_steps_reward": 0.7979166796430945, "step": 3230 }, { "completion_length": 1024.0, "epoch": 0.7145124998274458, "grad_norm": 0.8225437506678818, "kl": 0.7590087890625, "learning_rate": 4.565903283247981e-06, "loss": 0.0304, "reward": 1.4585623749531806, "reward_std": 0.5462490013138449, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15185430651181378, "rewards/format_reward": 0.8, "rewards/reasoning_steps_reward": 0.7979166800156235, "step": 3235 }, { "completion_length": 1024.0, "epoch": 0.7156168468132688, "grad_norm": 6.890597605713054, "kl": 1.92825927734375, "learning_rate": 4.533576728001858e-06, "loss": 0.0772, "reward": 1.270110378577374, "reward_std": 0.48464135241520123, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.12988962087547407, "rewards/format_reward": 0.6625, "rewards/reasoning_steps_reward": 0.7062500072643161, "step": 3240 }, { "completion_length": 1024.0, "epoch": 0.7167211937990917, "grad_norm": 7.0282697170862685, "kl": 1.8246063232421874, "learning_rate": 4.501331437172606e-06, "loss": 0.073, "reward": 1.0211014951419202, "reward_std": 0.5092876911558051, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13723184492700966, "rewards/format_reward": 0.53125, "rewards/reasoning_steps_reward": 0.6208333423361182, "step": 3245 }, { "completion_length": 1024.0, "epoch": 0.7178255407849147, "grad_norm": 18.009744755911356, "kl": 1.937615966796875, "learning_rate": 4.469167890122073e-06, "loss": 0.0775, "reward": 0.9181118378648534, "reward_std": 0.48384187065748846, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15897149957600049, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.6020833395421505, "step": 3250 }, { "completion_length": 1024.0, "epoch": 0.7189298877707375, "grad_norm": 11.741470309490381, "kl": 1.74998779296875, "learning_rate": 4.437086564996891e-06, "loss": 0.07, "reward": 0.7014416160909605, "reward_std": 0.4629500539504988, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.19439171630911006, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.5208333391696215, "step": 3255 }, { "completion_length": 1024.0, "epoch": 0.7200342347565605, "grad_norm": 3.4613430082652017, "kl": 0.99676513671875, "learning_rate": 4.405087938721376e-06, "loss": 0.0399, "reward": 0.7130648781159834, "reward_std": 0.4935263180799666, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1994351311448554, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.5375000078231096, "step": 3260 }, { "completion_length": 1024.0, "epoch": 0.7211385817423834, "grad_norm": 6.852820579545043, "kl": 1.585101318359375, "learning_rate": 4.373172486990436e-06, "loss": 0.0634, "reward": 0.5891364488064028, "reward_std": 0.5846462953391892, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2025302262120931, "rewards/format_reward": 0.28125, "rewards/reasoning_steps_reward": 0.5104166753590107, "step": 3265 }, { "completion_length": 1024.0, "epoch": 0.7222429287282064, "grad_norm": 7.2550760579621345, "kl": 1.471478271484375, "learning_rate": 4.341340684262498e-06, "loss": 0.0589, "reward": 0.9112151099252515, "reward_std": 0.5459779361841356, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15128489717026242, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.5812500080093741, "step": 3270 }, { "completion_length": 1024.0, "epoch": 0.7233472757140293, "grad_norm": 1.943678687735993, "kl": 1.039892578125, "learning_rate": 4.309593003752446e-06, "loss": 0.0416, "reward": 1.2046927298419177, "reward_std": 0.5959788782405667, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16405727104865947, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.7312500104308128, "step": 3275 }, { "completion_length": 1024.0, "epoch": 0.7244516226998523, "grad_norm": 3.426760417270529, "kl": 1.6177978515625, "learning_rate": 4.277929917424602e-06, "loss": 0.0647, "reward": 1.2421426644548774, "reward_std": 0.5169150336907478, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17869066282291896, "rewards/format_reward": 0.6875, "rewards/reasoning_steps_reward": 0.7333333460614085, "step": 3280 }, { "completion_length": 1024.0, "epoch": 0.7255559696856753, "grad_norm": 3.12832092985529, "kl": 1.526617431640625, "learning_rate": 4.246351895985702e-06, "loss": 0.0611, "reward": 1.3539482331834733, "reward_std": 0.5149914252076997, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16480177526245826, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.8062500070780516, "step": 3285 }, { "completion_length": 1024.0, "epoch": 0.7266603166714982, "grad_norm": 2.271368859704475, "kl": 1.709259033203125, "learning_rate": 4.214859408877899e-06, "loss": 0.0683, "reward": 1.2573978632688523, "reward_std": 0.6140790973193362, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16968547967262565, "rewards/format_reward": 0.66875, "rewards/reasoning_steps_reward": 0.745833345875144, "step": 3290 }, { "completion_length": 1024.0, "epoch": 0.7277646636573212, "grad_norm": 1.525812788718264, "kl": 1.299114990234375, "learning_rate": 4.183452924271776e-06, "loss": 0.052, "reward": 1.341607284604106, "reward_std": 0.5649268690554891, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15422605765979824, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.7770833449438215, "step": 3295 }, { "completion_length": 1024.0, "epoch": 0.728869010643144, "grad_norm": 9.383515179336834, "kl": 2.305963134765625, "learning_rate": 4.152132909059402e-06, "loss": 0.0923, "reward": 1.2610476991161703, "reward_std": 0.5714556918683229, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.18270230072084814, "rewards/format_reward": 0.7, "rewards/reasoning_steps_reward": 0.7375000089406967, "step": 3300 }, { "epoch": 0.728869010643144, "eval_completion_length": 1024.0, "eval_kl": 1.10443359375, "eval_loss": 0.044205810874700546, "eval_reward": 1.5034566915035248, "eval_reward_std": 0.48886050406843423, "eval_rewards/accuracy_reward": 0.01, "eval_rewards/cosine_scaled_reward": -0.15654332179576158, "eval_rewards/format_reward": 0.805, "eval_rewards/reasoning_steps_reward": 0.8450000095367431, "eval_runtime": 203.426, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.123, "step": 3300 }, { "completion_length": 1024.0, "epoch": 0.729973357628967, "grad_norm": 2.618356452427954, "kl": 1.448486328125, "learning_rate": 4.120899828847385e-06, "loss": 0.058, "reward": 1.4807198433205486, "reward_std": 0.4224258393329364, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1880301550409058, "rewards/format_reward": 0.80625, "rewards/reasoning_steps_reward": 0.8625000074505806, "step": 3305 }, { "completion_length": 1024.0, "epoch": 0.7310777046147899, "grad_norm": 16.51623283552961, "kl": 2.0116455078125, "learning_rate": 4.089754147949935e-06, "loss": 0.0806, "reward": 1.6333741225302219, "reward_std": 0.3553940106343362, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1457925467126188, "rewards/format_reward": 0.86875, "rewards/reasoning_steps_reward": 0.8979166746139526, "step": 3310 }, { "completion_length": 1024.0, "epoch": 0.7321820516006129, "grad_norm": 0.5974081751959402, "kl": 0.870172119140625, "learning_rate": 4.058696329381987e-06, "loss": 0.0348, "reward": 1.6305226560682058, "reward_std": 0.34538418338706833, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.15281066768729942, "rewards/format_reward": 0.8625, "rewards/reasoning_steps_reward": 0.902083340473473, "step": 3315 }, { "completion_length": 1024.0, "epoch": 0.7332863985864359, "grad_norm": 2.29446866118963, "kl": 1.221923828125, "learning_rate": 4.027726834852303e-06, "loss": 0.0489, "reward": 1.5875793149694801, "reward_std": 0.477928023977438, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14158735829405486, "rewards/format_reward": 0.8125, "rewards/reasoning_steps_reward": 0.9041666734963656, "step": 3320 }, { "completion_length": 1024.0, "epoch": 0.7343907455722588, "grad_norm": 2.8765044679308747, "kl": 1.35091552734375, "learning_rate": 3.996846124756609e-06, "loss": 0.0541, "reward": 1.4872914606705308, "reward_std": 0.411738165695715, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.16687520117702662, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.8604166748002171, "step": 3325 }, { "completion_length": 1024.0, "epoch": 0.7354950925580818, "grad_norm": 1.4728439317575581, "kl": 2.201837158203125, "learning_rate": 3.966054658170754e-06, "loss": 0.0881, "reward": 1.3737801656650845, "reward_std": 0.5207441252474382, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17830317884217947, "rewards/format_reward": 0.7125, "rewards/reasoning_steps_reward": 0.8395833428949118, "step": 3330 }, { "completion_length": 1024.0, "epoch": 0.7365994395439047, "grad_norm": 0.6119173500048468, "kl": 0.925311279296875, "learning_rate": 3.93535289284388e-06, "loss": 0.037, "reward": 1.5270693870261312, "reward_std": 0.49105042346992606, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1458472884342882, "rewards/format_reward": 0.75625, "rewards/reasoning_steps_reward": 0.9041666742414236, "step": 3335 }, { "completion_length": 1024.0, "epoch": 0.7377037865297277, "grad_norm": 0.5654163389180057, "kl": 0.505694580078125, "learning_rate": 3.904741285191629e-06, "loss": 0.0202, "reward": 1.5745669988915325, "reward_std": 0.4111982766771689, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14834966310299932, "rewards/format_reward": 0.825, "rewards/reasoning_steps_reward": 0.8979166766628623, "step": 3340 }, { "completion_length": 1024.0, "epoch": 0.7388081335155506, "grad_norm": 2.6402255771621412, "kl": 1.411602783203125, "learning_rate": 3.874220290289337e-06, "loss": 0.0565, "reward": 1.5241700040176511, "reward_std": 0.5233304193599906, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13833000464364886, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.8562500102445483, "step": 3345 }, { "completion_length": 1024.0, "epoch": 0.7399124805013735, "grad_norm": 0.6883781397562108, "kl": 0.793212890625, "learning_rate": 3.8437903618652895e-06, "loss": 0.0318, "reward": 1.5530781971290708, "reward_std": 0.3981657780946989, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10733847725496162, "rewards/format_reward": 0.79375, "rewards/reasoning_steps_reward": 0.860416678711772, "step": 3350 }, { "completion_length": 1024.0, "epoch": 0.7410168274871964, "grad_norm": 27.700580105645894, "kl": 0.977423095703125, "learning_rate": 3.8134519522939693e-06, "loss": 0.0391, "reward": 1.5121993293985725, "reward_std": 0.4588557916787977, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14196734559372998, "rewards/format_reward": 0.7875, "rewards/reasoning_steps_reward": 0.8541666759178043, "step": 3355 }, { "completion_length": 1024.0, "epoch": 0.7421211744730194, "grad_norm": 2.2978020557582335, "kl": 1.358447265625, "learning_rate": 3.7832055125893318e-06, "loss": 0.0544, "reward": 1.4460453005507587, "reward_std": 0.6020016885802761, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1414546983760374, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.8187500098720193, "step": 3360 }, { "completion_length": 1024.0, "epoch": 0.7432255214588424, "grad_norm": 0.4062793871357252, "kl": 0.942327880859375, "learning_rate": 3.753051492398089e-06, "loss": 0.0377, "reward": 1.6512468622997403, "reward_std": 0.397758130193688, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.10083647546125576, "rewards/format_reward": 0.8375, "rewards/reasoning_steps_reward": 0.889583339355886, "step": 3365 }, { "completion_length": 1024.0, "epoch": 0.7443298684446653, "grad_norm": 1.0151769195066895, "kl": 1.735552978515625, "learning_rate": 3.7229903399930423e-06, "loss": 0.0694, "reward": 1.4343198793707415, "reward_std": 0.4978488245993503, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14901345187099652, "rewards/format_reward": 0.7375, "rewards/reasoning_steps_reward": 0.8333333402872085, "step": 3370 }, { "completion_length": 1021.4625, "epoch": 0.7454342154304883, "grad_norm": 2.398157240549169, "kl": 1.697705078125, "learning_rate": 3.6930225022664136e-06, "loss": 0.0679, "reward": 1.379991829302162, "reward_std": 0.5659466957542463, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16167483936878851, "rewards/format_reward": 0.70625, "rewards/reasoning_steps_reward": 0.8229166742414236, "step": 3375 }, { "completion_length": 1024.0, "epoch": 0.7465385624163112, "grad_norm": 2.8096952907954975, "kl": 1.368798828125, "learning_rate": 3.6631484247231896e-06, "loss": 0.0547, "reward": 1.3315429392270743, "reward_std": 0.6065680258263455, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.16220706112799235, "rewards/format_reward": 0.69375, "rewards/reasoning_steps_reward": 0.7937500094994903, "step": 3380 }, { "completion_length": 1024.0, "epoch": 0.7476429094021342, "grad_norm": 10.578165514570484, "kl": 1.167584228515625, "learning_rate": 3.6333685514745165e-06, "loss": 0.0467, "reward": 1.3127060623912257, "reward_std": 0.5424171181814017, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1456272709775476, "rewards/format_reward": 0.65625, "rewards/reasoning_steps_reward": 0.8020833404734731, "step": 3385 }, { "completion_length": 1024.0, "epoch": 0.7487472563879571, "grad_norm": 0.6269448458862071, "kl": 1.054925537109375, "learning_rate": 3.6036833252310887e-06, "loss": 0.0422, "reward": 1.5182505875825882, "reward_std": 0.43510723081126346, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12758274595544208, "rewards/format_reward": 0.7625, "rewards/reasoning_steps_reward": 0.8833333406597376, "step": 3390 }, { "completion_length": 1024.0, "epoch": 0.7498516033737801, "grad_norm": 3.0610939003327315, "kl": 1.247955322265625, "learning_rate": 3.574093187296568e-06, "loss": 0.0499, "reward": 1.3740362918004394, "reward_std": 0.5503279601005489, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12804705323942472, "rewards/format_reward": 0.625, "rewards/reasoning_steps_reward": 0.8583333391696215, "step": 3395 }, { "completion_length": 1024.0, "epoch": 0.7509559503596029, "grad_norm": 1.4397719595687892, "kl": 0.642218017578125, "learning_rate": 3.544598577561016e-06, "loss": 0.0257, "reward": 1.5150041854009033, "reward_std": 0.5328479968578904, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.09332914527039975, "rewards/format_reward": 0.675, "rewards/reasoning_steps_reward": 0.9145833414047957, "step": 3400 }, { "epoch": 0.7509559503596029, "eval_completion_length": 1024.0, "eval_kl": 1.03607421875, "eval_loss": 0.041635580360889435, "eval_reward": 1.522298811674118, "eval_reward_std": 0.45481792830396445, "eval_rewards/accuracy_reward": 0.025, "eval_rewards/cosine_scaled_reward": -0.0977012000605464, "eval_rewards/format_reward": 0.73, "eval_rewards/reasoning_steps_reward": 0.8650000083446503, "eval_runtime": 205.7972, "eval_samples_per_second": 0.481, "eval_steps_per_second": 0.121, "step": 3400 }, { "completion_length": 1020.38125, "epoch": 0.7520602973454259, "grad_norm": 1.4135755886673778, "kl": 0.83087158203125, "learning_rate": 3.515199934494373e-06, "loss": 0.0332, "reward": 1.5240996377862757, "reward_std": 0.5872578708353103, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.07590036567416973, "rewards/format_reward": 0.71875, "rewards/reasoning_steps_reward": 0.8500000078231096, "step": 3405 }, { "completion_length": 1024.0, "epoch": 0.7531646443312489, "grad_norm": 1.440038114167272, "kl": 1.372210693359375, "learning_rate": 3.4858976951399237e-06, "loss": 0.0549, "reward": 1.447450523695443, "reward_std": 0.5360916848971101, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.11921614635066362, "rewards/format_reward": 0.725, "rewards/reasoning_steps_reward": 0.8291666748002171, "step": 3410 }, { "completion_length": 1023.81875, "epoch": 0.7542689913170718, "grad_norm": 6.136632317031085, "kl": 1.036328125, "learning_rate": 3.4566922951078086e-06, "loss": 0.0415, "reward": 1.3465895116969477, "reward_std": 0.6256579857220459, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13049382427416276, "rewards/format_reward": 0.65625, "rewards/reasoning_steps_reward": 0.8083333387970925, "step": 3415 }, { "completion_length": 1024.0, "epoch": 0.7553733383028948, "grad_norm": 1.6173327075684585, "kl": 1.124951171875, "learning_rate": 3.427584168568535e-06, "loss": 0.045, "reward": 1.369578105956316, "reward_std": 0.5765885033455561, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12625523412593792, "rewards/format_reward": 0.65, "rewards/reasoning_steps_reward": 0.8270833423361182, "step": 3420 }, { "completion_length": 1024.0, "epoch": 0.7564776852887177, "grad_norm": 3.4475234051324466, "kl": 1.0495849609375, "learning_rate": 3.398573748246544e-06, "loss": 0.042, "reward": 1.4252166913822293, "reward_std": 0.4993854798289249, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.11644997615949251, "rewards/format_reward": 0.65625, "rewards/reasoning_steps_reward": 0.8791666707023978, "step": 3425 }, { "completion_length": 1024.0, "epoch": 0.7575820322745407, "grad_norm": 3.1336201942373756, "kl": 1.468182373046875, "learning_rate": 3.3696614654137637e-06, "loss": 0.0587, "reward": 1.265179492533207, "reward_std": 0.6175256297712621, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12440384250803618, "rewards/format_reward": 0.525, "rewards/reasoning_steps_reward": 0.8583333404734731, "step": 3430 }, { "completion_length": 1024.0, "epoch": 0.7586863792603636, "grad_norm": 31.463751192253017, "kl": 1.829046630859375, "learning_rate": 3.3408477498831917e-06, "loss": 0.0732, "reward": 1.123897957149893, "reward_std": 0.4925203584745759, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1281853732885793, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.8020833419635892, "step": 3435 }, { "completion_length": 1024.0, "epoch": 0.7597907262461866, "grad_norm": 2.3970625271113373, "kl": 1.119512939453125, "learning_rate": 3.3121330300025222e-06, "loss": 0.0448, "reward": 1.135027837054804, "reward_std": 0.5514261644682847, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1274721680150833, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.8312500065192581, "step": 3440 }, { "completion_length": 1023.64375, "epoch": 0.7608950732320094, "grad_norm": 1.6382197678473562, "kl": 1.080792236328125, "learning_rate": 3.2835177326477675e-06, "loss": 0.0432, "reward": 1.072135358909145, "reward_std": 0.5708221859607875, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.10911463984884903, "rewards/format_reward": 0.40625, "rewards/reasoning_steps_reward": 0.7687500072643161, "step": 3445 }, { "completion_length": 1024.0, "epoch": 0.7619994202178324, "grad_norm": 7.773948160603541, "kl": 1.0737548828125, "learning_rate": 3.2550022832169125e-06, "loss": 0.043, "reward": 1.1452186492097098, "reward_std": 0.531599986756919, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.10269802667899057, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.8041666718199849, "step": 3450 }, { "completion_length": 1024.0, "epoch": 0.7631037672036554, "grad_norm": 7.451659626325706, "kl": 1.38333740234375, "learning_rate": 3.2265871056235974e-06, "loss": 0.0553, "reward": 1.0934795890934765, "reward_std": 0.5327975626219995, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.11485373933683149, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.7895833387970924, "step": 3455 }, { "completion_length": 1024.0, "epoch": 0.7642081141894783, "grad_norm": 4.642606409507648, "kl": 1.634930419921875, "learning_rate": 3.1982726222908046e-06, "loss": 0.0655, "reward": 1.1026644762256184, "reward_std": 0.4953272847373228, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.09733552844481892, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.812500007264316, "step": 3460 }, { "completion_length": 1024.0, "epoch": 0.7653124611753013, "grad_norm": 3.6705444957957836, "kl": 0.82242431640625, "learning_rate": 3.170059254144593e-06, "loss": 0.0329, "reward": 1.031765272654593, "reward_std": 0.5869754700732301, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.11823472948744893, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7562500076368451, "step": 3465 }, { "completion_length": 1024.0, "epoch": 0.7664168081611242, "grad_norm": 10.804721272461153, "kl": 1.247711181640625, "learning_rate": 3.1419474206078203e-06, "loss": 0.0499, "reward": 0.9572328898822888, "reward_std": 0.5374889099814026, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14068377380608582, "rewards/format_reward": 0.35, "rewards/reasoning_steps_reward": 0.7416666751727462, "step": 3470 }, { "completion_length": 1024.0, "epoch": 0.7675211551469472, "grad_norm": 12.231601833459766, "kl": 1.7123046875, "learning_rate": 3.113937539593931e-06, "loss": 0.0685, "reward": 0.9976887149776303, "reward_std": 0.4858238591014924, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1293946221430815, "rewards/format_reward": 0.40625, "rewards/reasoning_steps_reward": 0.7208333428949117, "step": 3475 }, { "completion_length": 1024.0, "epoch": 0.7686255021327701, "grad_norm": 40.059144629731655, "kl": 2.3020477294921875, "learning_rate": 3.086030027500728e-06, "loss": 0.0921, "reward": 0.886852508764423, "reward_std": 0.5351808949055339, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14648083351930835, "rewards/format_reward": 0.33125, "rewards/reasoning_steps_reward": 0.6958333432674408, "step": 3480 }, { "completion_length": 1024.0, "epoch": 0.7697298491185931, "grad_norm": 6.123342241192849, "kl": 1.230517578125, "learning_rate": 3.058225299204195e-06, "loss": 0.0492, "reward": 0.7911023863998707, "reward_std": 0.46425032978731906, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13181428615062032, "rewards/format_reward": 0.275, "rewards/reasoning_steps_reward": 0.6479166723787785, "step": 3485 }, { "completion_length": 1024.0, "epoch": 0.7708341961044161, "grad_norm": 7.471941635569114, "kl": 2.369635009765625, "learning_rate": 3.0305237680523046e-06, "loss": 0.0947, "reward": 0.9569121636566706, "reward_std": 0.5213623499432287, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14517117467548815, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.7083333432674408, "step": 3490 }, { "completion_length": 1024.0, "epoch": 0.7719385430902389, "grad_norm": 5.404185333488702, "kl": 1.592156982421875, "learning_rate": 3.002925845858905e-06, "loss": 0.0637, "reward": 0.9760491037741303, "reward_std": 0.5136939262922169, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14686756571754814, "rewards/format_reward": 0.39375, "rewards/reasoning_steps_reward": 0.7229166744276881, "step": 3495 }, { "completion_length": 1019.0, "epoch": 0.7730428900760619, "grad_norm": 50.5244563754164, "kl": 2.205517578125, "learning_rate": 2.9754319428975796e-06, "loss": 0.0883, "reward": 0.7373021919673193, "reward_std": 0.4538879245365024, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14186448440450478, "rewards/format_reward": 0.2875, "rewards/reasoning_steps_reward": 0.5854166746139526, "step": 3500 }, { "epoch": 0.7730428900760619, "eval_completion_length": 1023.54, "eval_kl": 1.16181640625, "eval_loss": 0.04671285301446915, "eval_reward": 0.856374105066061, "eval_reward_std": 0.5129354545962996, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.15029256213456393, "eval_rewards/format_reward": 0.3, "eval_rewards/reasoning_steps_reward": 0.7066666767001152, "eval_runtime": 204.9165, "eval_samples_per_second": 0.483, "eval_steps_per_second": 0.122, "step": 3500 }, { "completion_length": 1024.0, "epoch": 0.7741472370618848, "grad_norm": 7.70175417588847, "kl": 1.191650390625, "learning_rate": 2.948042467895544e-06, "loss": 0.0477, "reward": 0.7259443550603464, "reward_std": 0.4864570820616791, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1490556534990901, "rewards/format_reward": 0.25625, "rewards/reasoning_steps_reward": 0.6187500074505806, "step": 3505 }, { "completion_length": 1024.0, "epoch": 0.7752515840477078, "grad_norm": 9.742286641987212, "kl": 1.1121826171875, "learning_rate": 2.920757828027586e-06, "loss": 0.0445, "reward": 0.8020869728876278, "reward_std": 0.49714295062603925, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1333296972559765, "rewards/format_reward": 0.25, "rewards/reasoning_steps_reward": 0.6729166770353914, "step": 3510 }, { "completion_length": 1024.0, "epoch": 0.7763559310335307, "grad_norm": 9.596508493581142, "kl": 1.29034423828125, "learning_rate": 2.893578428909998e-06, "loss": 0.0516, "reward": 0.7311014153528959, "reward_std": 0.48636515507059813, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1293152589641977, "rewards/format_reward": 0.26875, "rewards/reasoning_steps_reward": 0.5854166768491268, "step": 3515 }, { "completion_length": 1024.0, "epoch": 0.7774602780193537, "grad_norm": 14.081251027714208, "kl": 1.638250732421875, "learning_rate": 2.8665046745945555e-06, "loss": 0.0655, "reward": 0.8628839520883048, "reward_std": 0.5482322660624049, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14336604780110065, "rewards/format_reward": 0.34375, "rewards/reasoning_steps_reward": 0.6625000078231096, "step": 3520 }, { "completion_length": 1024.0, "epoch": 0.7785646250051766, "grad_norm": 45.591594718848036, "kl": 1.7972900390625, "learning_rate": 2.839536967562504e-06, "loss": 0.0718, "reward": 0.7295638629118912, "reward_std": 0.45584265950456027, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15585280851228162, "rewards/format_reward": 0.25, "rewards/reasoning_steps_reward": 0.6291666755452752, "step": 3525 }, { "completion_length": 1024.0, "epoch": 0.7796689719909996, "grad_norm": 2.437142557750893, "kl": 1.276025390625, "learning_rate": 2.8126757087185797e-06, "loss": 0.051, "reward": 0.8280719975009561, "reward_std": 0.5058557034655677, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14276133371167815, "rewards/format_reward": 0.275, "rewards/reasoning_steps_reward": 0.6895833410322666, "step": 3530 }, { "completion_length": 1024.0, "epoch": 0.7807733189768226, "grad_norm": 12.311576928951354, "kl": 2.1121337890625, "learning_rate": 2.7859212973850535e-06, "loss": 0.0846, "reward": 0.8593616144207772, "reward_std": 0.42143602522992296, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15522172061318998, "rewards/format_reward": 0.34375, "rewards/reasoning_steps_reward": 0.670833339355886, "step": 3535 }, { "completion_length": 1024.0, "epoch": 0.7818776659626455, "grad_norm": 11.646125532907798, "kl": 1.4277587890625, "learning_rate": 2.759274131295787e-06, "loss": 0.0571, "reward": 0.8780025206928258, "reward_std": 0.42374305160010406, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12408081038593081, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.620833339355886, "step": 3540 }, { "completion_length": 1024.0, "epoch": 0.7829820129484684, "grad_norm": 39.017400560487744, "kl": 1.5975341796875, "learning_rate": 2.732734606590318e-06, "loss": 0.0639, "reward": 0.5735942371771671, "reward_std": 0.37694009622591695, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14932243004150222, "rewards/format_reward": 0.19375, "rewards/reasoning_steps_reward": 0.5229166757315398, "step": 3545 }, { "completion_length": 1024.0, "epoch": 0.7840863599342913, "grad_norm": 2.6124122163287264, "kl": 1.3592529296875, "learning_rate": 2.7063031178079847e-06, "loss": 0.0544, "reward": 0.6471456294239033, "reward_std": 0.39274544618529034, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15493770116772793, "rewards/format_reward": 0.225, "rewards/reasoning_steps_reward": 0.5645833417773247, "step": 3550 }, { "completion_length": 1024.0, "epoch": 0.7851907069201143, "grad_norm": 6.2513767416820345, "kl": 1.19891357421875, "learning_rate": 2.679980057882049e-06, "loss": 0.048, "reward": 0.7008785362413619, "reward_std": 0.45641955096180026, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11995480215264251, "rewards/format_reward": 0.25625, "rewards/reasoning_steps_reward": 0.5458333419635892, "step": 3555 }, { "completion_length": 1024.0, "epoch": 0.7862950539059372, "grad_norm": 2.9178799008139684, "kl": 0.7404937744140625, "learning_rate": 2.6537658181338534e-06, "loss": 0.0296, "reward": 0.7164174870704301, "reward_std": 0.44081706466859033, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.11691585160442627, "rewards/format_reward": 0.25, "rewards/reasoning_steps_reward": 0.570833340100944, "step": 3560 }, { "completion_length": 1024.0, "epoch": 0.7873994008917602, "grad_norm": 4.960918513864824, "kl": 0.7436981201171875, "learning_rate": 2.6276607882670135e-06, "loss": 0.0297, "reward": 0.7412137555482332, "reward_std": 0.46463715256686555, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14003624991455582, "rewards/format_reward": 0.28125, "rewards/reasoning_steps_reward": 0.6000000102445483, "step": 3565 }, { "completion_length": 1024.0, "epoch": 0.7885037478775831, "grad_norm": 10.639247963083111, "kl": 0.886761474609375, "learning_rate": 2.60166535636162e-06, "loss": 0.0355, "reward": 0.8172663133533206, "reward_std": 0.45502894536784877, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14523369194939734, "rewards/format_reward": 0.275, "rewards/reasoning_steps_reward": 0.6750000080093741, "step": 3570 }, { "completion_length": 1024.0, "epoch": 0.7896080948634061, "grad_norm": 28.08455167991598, "kl": 1.941729736328125, "learning_rate": 2.5757799088684654e-06, "loss": 0.0777, "reward": 0.843649728321634, "reward_std": 0.5577885125540888, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.16468361105535223, "rewards/format_reward": 0.3, "rewards/reasoning_steps_reward": 0.7020833423361182, "step": 3575 }, { "completion_length": 1024.0, "epoch": 0.7907124418492291, "grad_norm": 3.970425059446987, "kl": 1.81851806640625, "learning_rate": 2.5500048306033065e-06, "loss": 0.0727, "reward": 0.9449512905091979, "reward_std": 0.5703302607609657, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15088205109277625, "rewards/format_reward": 0.3375, "rewards/reasoning_steps_reward": 0.7458333436399698, "step": 3580 }, { "completion_length": 1024.0, "epoch": 0.791816788835052, "grad_norm": 2.682816536500783, "kl": 1.6467498779296874, "learning_rate": 2.5243405047411353e-06, "loss": 0.0659, "reward": 0.9549707896774635, "reward_std": 0.5791340425539602, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.17836254732101225, "rewards/format_reward": 0.34375, "rewards/reasoning_steps_reward": 0.7833333445712924, "step": 3585 }, { "completion_length": 1024.0, "epoch": 0.7929211358208749, "grad_norm": 1.9396200301781, "kl": 1.76396484375, "learning_rate": 2.498787312810492e-06, "loss": 0.0706, "reward": 1.0381604361347854, "reward_std": 0.7050543383204058, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1305895757221151, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.7875000135973096, "step": 3590 }, { "completion_length": 1024.0, "epoch": 0.7940254828066978, "grad_norm": 1.2512973833054954, "kl": 1.017486572265625, "learning_rate": 2.4733456346877817e-06, "loss": 0.0407, "reward": 0.9714192368090153, "reward_std": 0.5311943096166942, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.16399744372538408, "rewards/format_reward": 0.33125, "rewards/reasoning_steps_reward": 0.7979166781529784, "step": 3595 }, { "completion_length": 1024.0, "epoch": 0.7951298297925208, "grad_norm": 4.205148920710968, "kl": 2.4321533203125, "learning_rate": 2.448015848591638e-06, "loss": 0.0973, "reward": 0.9355880039744079, "reward_std": 0.5558298278599978, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1706620103039313, "rewards/format_reward": 0.35, "rewards/reasoning_steps_reward": 0.7562500150874257, "step": 3600 }, { "epoch": 0.7951298297925208, "eval_completion_length": 1024.0, "eval_kl": 3.12923828125, "eval_loss": 0.1257171779870987, "eval_reward": 1.0359005191922188, "eval_reward_std": 0.5016864457726479, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.18409949489636346, "eval_rewards/format_reward": 0.425, "eval_rewards/reasoning_steps_reward": 0.7950000166893005, "eval_runtime": 201.9601, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.124, "step": 3600 }, { "completion_length": 1024.0, "epoch": 0.7962341767783437, "grad_norm": 2.9772870979661126, "kl": 2.625927734375, "learning_rate": 2.4227983310772963e-06, "loss": 0.1052, "reward": 0.941523305606097, "reward_std": 0.5259468010030105, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.19806003746198259, "rewards/format_reward": 0.3625, "rewards/reasoning_steps_reward": 0.7770833464339375, "step": 3605 }, { "completion_length": 1024.0, "epoch": 0.7973385237641667, "grad_norm": 2.731744844543052, "kl": 2.591387939453125, "learning_rate": 2.3976934570309974e-06, "loss": 0.1037, "reward": 0.896416311757639, "reward_std": 0.5306536434363807, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.2119170312071219, "rewards/format_reward": 0.3625, "rewards/reasoning_steps_reward": 0.7395833466202021, "step": 3610 }, { "completion_length": 1024.0, "epoch": 0.7984428707499897, "grad_norm": 7.5385515356990185, "kl": 1.553973388671875, "learning_rate": 2.3727015996644043e-06, "loss": 0.0622, "reward": 0.8575988472090103, "reward_std": 0.5232200874595037, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16323449706978863, "rewards/format_reward": 0.34375, "rewards/reasoning_steps_reward": 0.6770833473652601, "step": 3615 }, { "completion_length": 1024.0, "epoch": 0.7995472177358126, "grad_norm": 9.021182502422391, "kl": 1.311651611328125, "learning_rate": 2.3478231305090694e-06, "loss": 0.0524, "reward": 0.9543035954702646, "reward_std": 0.5737970527330617, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15819641145644708, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.7250000117346644, "step": 3620 }, { "completion_length": 1024.0, "epoch": 0.8006515647216356, "grad_norm": 4.472787988182606, "kl": 1.0048095703125, "learning_rate": 2.3230584194109074e-06, "loss": 0.0402, "reward": 0.9397510398179293, "reward_std": 0.5379055161934957, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.17691563499101903, "rewards/format_reward": 0.3625, "rewards/reasoning_steps_reward": 0.7479166820645332, "step": 3625 }, { "completion_length": 1024.0, "epoch": 0.8017559117074585, "grad_norm": 4.113018900515695, "kl": 1.157135009765625, "learning_rate": 2.298407834524682e-06, "loss": 0.0463, "reward": 0.9861817553406581, "reward_std": 0.4714244429342216, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.16590158142935252, "rewards/format_reward": 0.4, "rewards/reasoning_steps_reward": 0.7395833438262344, "step": 3630 }, { "completion_length": 1024.0, "epoch": 0.8028602586932815, "grad_norm": 5.611788601802546, "kl": 1.5865966796875, "learning_rate": 2.2738717423085543e-06, "loss": 0.0635, "reward": 1.0154616593383252, "reward_std": 0.5189789967440447, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13870502139761812, "rewards/format_reward": 0.425, "rewards/reasoning_steps_reward": 0.729166678339243, "step": 3635 }, { "completion_length": 1024.0, "epoch": 0.8039646056791043, "grad_norm": 4.863709299234087, "kl": 1.793304443359375, "learning_rate": 2.2494505075186234e-06, "loss": 0.0718, "reward": 1.0299670369713567, "reward_std": 0.6288951909449679, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1679496320008184, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.7541666803881526, "step": 3640 }, { "completion_length": 1024.0, "epoch": 0.8050689526649273, "grad_norm": 3.0389529591387983, "kl": 2.23280029296875, "learning_rate": 2.2251444932035094e-06, "loss": 0.0893, "reward": 0.9538824698538519, "reward_std": 0.5754079432575964, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.18570087201660498, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.7645833479240537, "step": 3645 }, { "completion_length": 1024.0, "epoch": 0.8061732996507502, "grad_norm": 4.261038530209944, "kl": 2.365380859375, "learning_rate": 2.200954060698941e-06, "loss": 0.0947, "reward": 1.0910434238612652, "reward_std": 0.5977108788116311, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.18395657959727033, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7937500169500709, "step": 3650 }, { "completion_length": 1024.0, "epoch": 0.8072776466365732, "grad_norm": 11.856530569543205, "kl": 1.533966064453125, "learning_rate": 2.176879569622409e-06, "loss": 0.0613, "reward": 0.9856115130707621, "reward_std": 0.5456289411027683, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22063849223195575, "rewards/format_reward": 0.4375, "rewards/reasoning_steps_reward": 0.7687500143423677, "step": 3655 }, { "completion_length": 1024.0, "epoch": 0.8083819936223962, "grad_norm": 3.2452205346158083, "kl": 1.236077880859375, "learning_rate": 2.1529213778677993e-06, "loss": 0.0494, "reward": 0.8988889244385063, "reward_std": 0.6373270594252972, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.194861083329306, "rewards/format_reward": 0.33125, "rewards/reasoning_steps_reward": 0.7312500124797225, "step": 3660 }, { "completion_length": 1024.0, "epoch": 0.8094863406082191, "grad_norm": 12.439028568659024, "kl": 0.76240234375, "learning_rate": 2.1290798416000857e-06, "loss": 0.0305, "reward": 1.0351363457739353, "reward_std": 0.5226037830223504, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.18361366387798625, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.793750013038516, "step": 3665 }, { "completion_length": 1024.0, "epoch": 0.8105906875940421, "grad_norm": 0.6611045827209621, "kl": 0.63751220703125, "learning_rate": 2.1053553152500204e-06, "loss": 0.0255, "reward": 1.0747706493362785, "reward_std": 0.5134325242426712, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.16481269130890724, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.839583345875144, "step": 3670 }, { "completion_length": 1024.0, "epoch": 0.811695034579865, "grad_norm": 0.943299396325815, "kl": 0.6408935546875, "learning_rate": 2.081748151508883e-06, "loss": 0.0256, "reward": 0.9957553435117006, "reward_std": 0.49763372070156037, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.18341132483328693, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.8041666820645332, "step": 3675 }, { "completion_length": 1024.0, "epoch": 0.812799381565688, "grad_norm": 0.6683933277746549, "kl": 0.687762451171875, "learning_rate": 2.0582587013232268e-06, "loss": 0.0275, "reward": 1.0955076265148818, "reward_std": 0.5081618877709844, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.18157571223564445, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.8458333477377892, "step": 3680 }, { "completion_length": 1024.0, "epoch": 0.8139037285515108, "grad_norm": 1.0847188860596124, "kl": 0.6183135986328125, "learning_rate": 2.0348873138896563e-06, "loss": 0.0247, "reward": 1.1622585462406279, "reward_std": 0.5471403273332044, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1877414623158984, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.8312500134110451, "step": 3685 }, { "completion_length": 1024.0, "epoch": 0.8150080755373338, "grad_norm": 1.5112078670527096, "kl": 0.8150054931640625, "learning_rate": 2.0116343366496493e-06, "loss": 0.0326, "reward": 1.1093786764889955, "reward_std": 0.48941911092260854, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.19687132386607117, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.8562500154599547, "step": 3690 }, { "completion_length": 1024.0, "epoch": 0.8161124225231567, "grad_norm": 11.427469945106585, "kl": 1.845751953125, "learning_rate": 1.988500115284385e-06, "loss": 0.0738, "reward": 1.3140614761039615, "reward_std": 0.5058772598677024, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.17135519032599406, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.9166666734963655, "step": 3695 }, { "completion_length": 1024.0, "epoch": 0.8172167695089797, "grad_norm": 1.7741207108559547, "kl": 0.9895751953125, "learning_rate": 1.9654849937096033e-06, "loss": 0.0396, "reward": 1.33723512776196, "reward_std": 0.42841523091628914, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.17526487102441024, "rewards/format_reward": 0.58125, "rewards/reasoning_steps_reward": 0.9125000096857547, "step": 3700 }, { "epoch": 0.8172167695089797, "eval_completion_length": 1024.0, "eval_kl": 1.0012890625, "eval_loss": 0.040022626519203186, "eval_reward": 1.4169398856163025, "eval_reward_std": 0.4867195551842451, "eval_rewards/accuracy_reward": 0.01, "eval_rewards/cosine_scaled_reward": -0.15139344856142997, "eval_rewards/format_reward": 0.61, "eval_rewards/reasoning_steps_reward": 0.9483333396911621, "eval_runtime": 203.2325, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.123, "step": 3700 }, { "completion_length": 1024.0, "epoch": 0.8183211164948027, "grad_norm": 12.054030744773742, "kl": 1.897283935546875, "learning_rate": 1.942589314070494e-06, "loss": 0.0759, "reward": 1.3313264921307564, "reward_std": 0.46922859043552306, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14575683891016525, "rewards/format_reward": 0.525, "rewards/reasoning_steps_reward": 0.9395833417773247, "step": 3705 }, { "completion_length": 1024.0, "epoch": 0.8194254634806256, "grad_norm": 3.7069945069091936, "kl": 1.22840576171875, "learning_rate": 1.9198134167366156e-06, "loss": 0.0492, "reward": 1.4052705839276314, "reward_std": 0.403043683465512, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1197294215176953, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.9562500081956387, "step": 3710 }, { "completion_length": 1024.0, "epoch": 0.8205298104664486, "grad_norm": 2.768907455782949, "kl": 2.87457275390625, "learning_rate": 1.897157640296825e-06, "loss": 0.1152, "reward": 1.4017615104094148, "reward_std": 0.3942436770506902, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11490515425248304, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.9291666714474559, "step": 3715 }, { "completion_length": 1024.0, "epoch": 0.8216341574522715, "grad_norm": 6.158506141748816, "kl": 1.50045166015625, "learning_rate": 1.8746223215542482e-06, "loss": 0.06, "reward": 1.4124196864664555, "reward_std": 0.43862366709727213, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13133030838798732, "rewards/format_reward": 0.5875, "rewards/reasoning_steps_reward": 0.9437500080093741, "step": 3720 }, { "completion_length": 1024.0, "epoch": 0.8227385044380945, "grad_norm": 4.954257940316965, "kl": 2.506939697265625, "learning_rate": 1.8522077955212791e-06, "loss": 0.1003, "reward": 1.2474359845742584, "reward_std": 0.409401986663579, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1900640235922765, "rewards/format_reward": 0.53125, "rewards/reasoning_steps_reward": 0.9062500108033419, "step": 3725 }, { "completion_length": 1024.0, "epoch": 0.8238428514239174, "grad_norm": 3.384994347227523, "kl": 2.5660552978515625, "learning_rate": 1.8299143954145926e-06, "loss": 0.1026, "reward": 1.2905212937039323, "reward_std": 0.5092491055722348, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.19489537274930627, "rewards/format_reward": 0.56875, "rewards/reasoning_steps_reward": 0.8979166783392429, "step": 3730 }, { "completion_length": 1024.0, "epoch": 0.8249471984097403, "grad_norm": 16.989052594983022, "kl": 2.394305419921875, "learning_rate": 1.8077424526501964e-06, "loss": 0.0958, "reward": 1.2323162292130292, "reward_std": 0.43714046496825176, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17601710449671373, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.8708333456888795, "step": 3735 }, { "completion_length": 1024.0, "epoch": 0.8260515453955632, "grad_norm": 3.3697087191377157, "kl": 2.461981201171875, "learning_rate": 1.7856922968384926e-06, "loss": 0.0985, "reward": 1.3591851346194743, "reward_std": 0.4494941798264335, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.16789820450503612, "rewards/format_reward": 0.59375, "rewards/reasoning_steps_reward": 0.9270833421498537, "step": 3740 }, { "completion_length": 1024.0, "epoch": 0.8271558923813862, "grad_norm": 3.3314479469990848, "kl": 1.8787109375, "learning_rate": 1.763764255779392e-06, "loss": 0.0751, "reward": 1.2150082999374718, "reward_std": 0.45866580544243335, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13915836790692993, "rewards/format_reward": 0.45, "rewards/reasoning_steps_reward": 0.8854166736826301, "step": 3745 }, { "completion_length": 1024.0, "epoch": 0.8282602393672092, "grad_norm": 5.075803385145104, "kl": 3.6768798828125, "learning_rate": 1.7419586554574364e-06, "loss": 0.147, "reward": 1.2929603595286607, "reward_std": 0.5276947294652927, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.1549563185122679, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.8854166813194752, "step": 3750 }, { "completion_length": 1024.0, "epoch": 0.8293645863530321, "grad_norm": 4.835328190612411, "kl": 2.53955078125, "learning_rate": 1.720275820036944e-06, "loss": 0.1016, "reward": 1.0985228657722472, "reward_std": 0.5404113541560946, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.1389771427086089, "rewards/format_reward": 0.40625, "rewards/reasoning_steps_reward": 0.8125000113621355, "step": 3755 }, { "completion_length": 1024.0, "epoch": 0.8304689333388551, "grad_norm": 3.6128842543406647, "kl": 1.5438568115234375, "learning_rate": 1.6987160718572027e-06, "loss": 0.0617, "reward": 1.0970849219709635, "reward_std": 0.4372379134729272, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15499841592172742, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.820833345875144, "step": 3760 }, { "completion_length": 1024.0, "epoch": 0.831573280324678, "grad_norm": 3.522120303062643, "kl": 1.2506439208984375, "learning_rate": 1.6772797314276712e-06, "loss": 0.05, "reward": 0.9676698416282307, "reward_std": 0.39677214640614694, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.178163488789869, "rewards/format_reward": 0.35625, "rewards/reasoning_steps_reward": 0.7895833473652601, "step": 3765 }, { "completion_length": 1024.0, "epoch": 0.832677627310501, "grad_norm": 6.35794295461514, "kl": 1.3462554931640625, "learning_rate": 1.6559671174232195e-06, "loss": 0.0539, "reward": 1.0329537893645466, "reward_std": 0.48206213802768616, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14412954530816932, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.7958333443850278, "step": 3770 }, { "completion_length": 1024.0, "epoch": 0.833781974296324, "grad_norm": 4.258986368861116, "kl": 2.50684814453125, "learning_rate": 1.6347785466793764e-06, "loss": 0.1003, "reward": 0.950670113041997, "reward_std": 0.46344148809494073, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13057988811051474, "rewards/format_reward": 0.28125, "rewards/reasoning_steps_reward": 0.787500013038516, "step": 3775 }, { "completion_length": 1024.0, "epoch": 0.8348863212821469, "grad_norm": 2.4134853475595444, "kl": 2.65799560546875, "learning_rate": 1.6137143341876439e-06, "loss": 0.1063, "reward": 1.0280450066551565, "reward_std": 0.46901671985397114, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13862167526385746, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.7916666807606816, "step": 3780 }, { "completion_length": 1024.0, "epoch": 0.8359906682679697, "grad_norm": 4.096493293080795, "kl": 2.3369903564453125, "learning_rate": 1.5927747930907921e-06, "loss": 0.0935, "reward": 1.0754234604537487, "reward_std": 0.45691707939695336, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14124320970195187, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.8041666811332107, "step": 3785 }, { "completion_length": 1024.0, "epoch": 0.8370950152537927, "grad_norm": 3.2060019572278375, "kl": 1.8758026123046876, "learning_rate": 1.5719602346782215e-06, "loss": 0.075, "reward": 0.9844577558338642, "reward_std": 0.48203894472389947, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12804225913132541, "rewards/format_reward": 0.31875, "rewards/reasoning_steps_reward": 0.7812500189989805, "step": 3790 }, { "completion_length": 1024.0, "epoch": 0.8381993622396157, "grad_norm": 1.9710529098598488, "kl": 2.16820068359375, "learning_rate": 1.5512709683813165e-06, "loss": 0.0868, "reward": 1.0551761870272458, "reward_std": 0.42293115961001604, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.09690715366800759, "rewards/format_reward": 0.31875, "rewards/reasoning_steps_reward": 0.8208333468064666, "step": 3795 }, { "completion_length": 1024.0, "epoch": 0.8393037092254386, "grad_norm": 4.843746007302358, "kl": 1.226190185546875, "learning_rate": 1.5307073017688644e-06, "loss": 0.0491, "reward": 1.1217388808727264, "reward_std": 0.4583800950756995, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.11367778851745243, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.8166666807606816, "step": 3800 }, { "epoch": 0.8393037092254386, "eval_completion_length": 1024.0, "eval_kl": 0.873955078125, "eval_loss": 0.03470970690250397, "eval_reward": 1.0069829231500627, "eval_reward_std": 0.4387795051932335, "eval_rewards/accuracy_reward": 0.0, "eval_rewards/cosine_scaled_reward": -0.14968374449759722, "eval_rewards/format_reward": 0.355, "eval_rewards/reasoning_steps_reward": 0.8016666841506958, "eval_runtime": 203.6085, "eval_samples_per_second": 0.486, "eval_steps_per_second": 0.123, "step": 3800 }, { "completion_length": 1024.0, "epoch": 0.8404080562112616, "grad_norm": 9.8277486524576, "kl": 1.67528076171875, "learning_rate": 1.5102695405424738e-06, "loss": 0.0671, "reward": 0.9232415302656591, "reward_std": 0.48757856400334276, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1225918134470703, "rewards/format_reward": 0.3125, "rewards/reasoning_steps_reward": 0.7333333453163504, "step": 3805 }, { "completion_length": 1024.0, "epoch": 0.8415124031970845, "grad_norm": 7.887505524007284, "kl": 1.87099609375, "learning_rate": 1.4899579885320237e-06, "loss": 0.0749, "reward": 1.1503344973316416, "reward_std": 0.41806495695200285, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1309155066817766, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.8250000141561031, "step": 3810 }, { "completion_length": 1024.0, "epoch": 0.8426167501829075, "grad_norm": 3.811520777347624, "kl": 1.80526123046875, "learning_rate": 1.4697729476911614e-06, "loss": 0.0722, "reward": 1.0021475785411895, "reward_std": 0.48045386319281536, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13743576427805237, "rewards/format_reward": 0.35625, "rewards/reasoning_steps_reward": 0.770833350904286, "step": 3815 }, { "completion_length": 1024.0, "epoch": 0.8437210971687304, "grad_norm": 2.407666114214345, "kl": 1.65206298828125, "learning_rate": 1.449714718092803e-06, "loss": 0.0661, "reward": 1.1085011329501868, "reward_std": 0.4444706824200694, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.0914988732081838, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.7937500156462193, "step": 3820 }, { "completion_length": 1024.0, "epoch": 0.8448254441545534, "grad_norm": 4.045358362837525, "kl": 1.606927490234375, "learning_rate": 1.4297835979246777e-06, "loss": 0.0643, "reward": 1.0195800764486194, "reward_std": 0.4161707017852677, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14916992889229733, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7875000160187483, "step": 3825 }, { "completion_length": 1024.0, "epoch": 0.8459297911403763, "grad_norm": 4.254843536541897, "kl": 1.788482666015625, "learning_rate": 1.4099798834848855e-06, "loss": 0.0716, "reward": 1.0221065394580364, "reward_std": 0.4706483832735103, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13206013098242692, "rewards/format_reward": 0.35, "rewards/reasoning_steps_reward": 0.7979166816920042, "step": 3830 }, { "completion_length": 1024.0, "epoch": 0.8470341381261992, "grad_norm": 4.7405984768373415, "kl": 2.34656982421875, "learning_rate": 1.3903038691775095e-06, "loss": 0.0938, "reward": 1.0039814393036068, "reward_std": 0.4533116746461019, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12101856818553643, "rewards/format_reward": 0.35625, "rewards/reasoning_steps_reward": 0.7625000121071934, "step": 3835 }, { "completion_length": 1024.0, "epoch": 0.8481384851120222, "grad_norm": 4.516232325190879, "kl": 1.359423828125, "learning_rate": 1.370755847508226e-06, "loss": 0.0543, "reward": 1.0372777149896137, "reward_std": 0.492635853459069, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.10647228800517042, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7500000139698386, "step": 3840 }, { "completion_length": 1024.0, "epoch": 0.8492428320978451, "grad_norm": 3.2830116476433098, "kl": 1.38970947265625, "learning_rate": 1.3513361090799537e-06, "loss": 0.0556, "reward": 1.0849484650418162, "reward_std": 0.4841716932147392, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14421821158321108, "rewards/format_reward": 0.425, "rewards/reasoning_steps_reward": 0.7979166807606817, "step": 3845 }, { "completion_length": 1024.0, "epoch": 0.8503471790836681, "grad_norm": 4.480857992077528, "kl": 1.501470947265625, "learning_rate": 1.332044942588545e-06, "loss": 0.0601, "reward": 1.1137627013260498, "reward_std": 0.42765071511385033, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12582063591689802, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.8083333443850279, "step": 3850 }, { "completion_length": 1024.0, "epoch": 0.851451526069491, "grad_norm": 7.943484649640199, "kl": 1.7091888427734374, "learning_rate": 1.3128826348184886e-06, "loss": 0.0684, "reward": 1.0067575078748632, "reward_std": 0.5076355822726327, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12449250066274545, "rewards/format_reward": 0.36875, "rewards/reasoning_steps_reward": 0.7500000152736902, "step": 3855 }, { "completion_length": 1024.0, "epoch": 0.852555873055314, "grad_norm": 2.5412807678609806, "kl": 0.94459228515625, "learning_rate": 1.2938494706386462e-06, "loss": 0.0378, "reward": 1.156890353001654, "reward_std": 0.48906847709586143, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.10560965545591898, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.8187500165775419, "step": 3860 }, { "completion_length": 1024.0, "epoch": 0.853660220041137, "grad_norm": 3.8180837005980854, "kl": 1.16829833984375, "learning_rate": 1.2749457329980108e-06, "loss": 0.0467, "reward": 1.0843943199433852, "reward_std": 0.4156025383039378, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1385223493918602, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.8354166794568301, "step": 3865 }, { "completion_length": 1024.0, "epoch": 0.8547645670269599, "grad_norm": 34.666542207088746, "kl": 2.95928955078125, "learning_rate": 1.256171702921516e-06, "loss": 0.1184, "reward": 1.1096524391323328, "reward_std": 0.4731335958989803, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16743089526426047, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7833333468064666, "step": 3870 }, { "completion_length": 1024.0, "epoch": 0.8558689140127829, "grad_norm": 4.192208427980209, "kl": 1.46943359375, "learning_rate": 1.237527659505846e-06, "loss": 0.0588, "reward": 1.0440983198815956, "reward_std": 0.40177030080858456, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14756835330772447, "rewards/format_reward": 0.39375, "rewards/reasoning_steps_reward": 0.7916666785255074, "step": 3875 }, { "completion_length": 1024.0, "epoch": 0.8569732609986057, "grad_norm": 1.4373607975014058, "kl": 0.4958740234375, "learning_rate": 1.2190138799152851e-06, "loss": 0.0198, "reward": 1.1634813494980336, "reward_std": 0.49837744958058466, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12193531318916939, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.82291667945683, "step": 3880 }, { "completion_length": 1024.0, "epoch": 0.8580776079844287, "grad_norm": 10.062619229211846, "kl": 0.8890899658203125, "learning_rate": 1.200630639377609e-06, "loss": 0.0356, "reward": 0.9816565293818712, "reward_std": 0.38651110691134816, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1329268047120422, "rewards/format_reward": 0.35625, "rewards/reasoning_steps_reward": 0.7583333479240537, "step": 3885 }, { "completion_length": 1024.0, "epoch": 0.8591819549702516, "grad_norm": 2.517396839676297, "kl": 1.968524169921875, "learning_rate": 1.1823782111799843e-06, "loss": 0.0787, "reward": 0.959625584539026, "reward_std": 0.5189180550776655, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15079108572681435, "rewards/format_reward": 0.35, "rewards/reasoning_steps_reward": 0.7541666792705655, "step": 3890 }, { "completion_length": 1024.0, "epoch": 0.8602863019560746, "grad_norm": 2.8178537770706886, "kl": 0.726678466796875, "learning_rate": 1.1642568666649067e-06, "loss": 0.0291, "reward": 1.0694204801693559, "reward_std": 0.44945709503699616, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1493295207095798, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.7875000167638063, "step": 3895 }, { "completion_length": 1024.0, "epoch": 0.8613906489418975, "grad_norm": 1.2175267033985402, "kl": 0.955523681640625, "learning_rate": 1.1462668752261652e-06, "loss": 0.0382, "reward": 1.000610039383173, "reward_std": 0.49909483049996195, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1598066317845223, "rewards/format_reward": 0.4, "rewards/reasoning_steps_reward": 0.7604166818782687, "step": 3900 }, { "epoch": 0.8613906489418975, "eval_completion_length": 1024.0, "eval_kl": 1.683125, "eval_loss": 0.0676305741071701, "eval_reward": 0.9381104612350464, "eval_reward_std": 0.4509727944433689, "eval_rewards/accuracy_reward": 0.015, "eval_rewards/cosine_scaled_reward": -0.1618895485624671, "eval_rewards/format_reward": 0.28, "eval_rewards/reasoning_steps_reward": 0.8050000178813934, "eval_runtime": 201.783, "eval_samples_per_second": 0.491, "eval_steps_per_second": 0.124, "step": 3900 }, { "completion_length": 1024.0, "epoch": 0.8624949959277205, "grad_norm": 2.170430471378303, "kl": 1.952362060546875, "learning_rate": 1.1284085043048465e-06, "loss": 0.0781, "reward": 1.025868459790945, "reward_std": 0.5059246151708067, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17413154328241945, "rewards/format_reward": 0.4375, "rewards/reasoning_steps_reward": 0.7625000141561031, "step": 3905 }, { "completion_length": 1024.0, "epoch": 0.8635993429135435, "grad_norm": 5.357087985724468, "kl": 1.887725830078125, "learning_rate": 1.1106820193853484e-06, "loss": 0.0755, "reward": 1.0269752063788473, "reward_std": 0.44671607576310635, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15219147218740545, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.7791666839271784, "step": 3910 }, { "completion_length": 1024.0, "epoch": 0.8647036898993664, "grad_norm": 2.5863387297050693, "kl": 0.875408935546875, "learning_rate": 1.0930876839914418e-06, "loss": 0.035, "reward": 1.1502701964229345, "reward_std": 0.44783246733859416, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13514647865085863, "rewards/format_reward": 0.46875, "rewards/reasoning_steps_reward": 0.8104166803881526, "step": 3915 }, { "completion_length": 1024.0, "epoch": 0.8658080368851894, "grad_norm": 3.23242533022308, "kl": 1.299554443359375, "learning_rate": 1.0756257596823427e-06, "loss": 0.052, "reward": 1.112755262479186, "reward_std": 0.49227567511698, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1414114096784033, "rewards/format_reward": 0.43125, "rewards/reasoning_steps_reward": 0.8229166833683849, "step": 3920 }, { "completion_length": 1024.0, "epoch": 0.8669123838710123, "grad_norm": 2.138140290646965, "kl": 1.885113525390625, "learning_rate": 1.058296506048836e-06, "loss": 0.0754, "reward": 1.164114381093532, "reward_std": 0.5346207804002916, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13796895813720766, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.7958333490416407, "step": 3925 }, { "completion_length": 1024.0, "epoch": 0.8680167308568352, "grad_norm": 5.127451325350497, "kl": 1.73660888671875, "learning_rate": 1.04110018070941e-06, "loss": 0.0695, "reward": 1.0745520979922731, "reward_std": 0.49730775001007715, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1400312382105767, "rewards/format_reward": 0.46875, "rewards/reasoning_steps_reward": 0.7458333507180214, "step": 3930 }, { "completion_length": 1024.0, "epoch": 0.8691210778426581, "grad_norm": 3.665613966481991, "kl": 0.8619049072265625, "learning_rate": 1.0240370393064235e-06, "loss": 0.0345, "reward": 1.1202016398310661, "reward_std": 0.60314673992807, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12771503719559404, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7854166829958558, "step": 3935 }, { "completion_length": 1024.0, "epoch": 0.8702254248284811, "grad_norm": 5.457231423921015, "kl": 1.279339599609375, "learning_rate": 1.0071073355023097e-06, "loss": 0.0512, "reward": 1.1111865887418388, "reward_std": 0.5126064650583431, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1346467504816246, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7645833479240537, "step": 3940 }, { "completion_length": 1024.0, "epoch": 0.871329771814304, "grad_norm": 1.4204895897110041, "kl": 1.849835205078125, "learning_rate": 9.903113209758098e-07, "loss": 0.074, "reward": 1.1405908815562724, "reward_std": 0.5249507926579099, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.15732579003379216, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.7854166811332106, "step": 3945 }, { "completion_length": 1024.0, "epoch": 0.872434118800127, "grad_norm": 2.613485274176624, "kl": 1.406982421875, "learning_rate": 9.736492454182211e-07, "loss": 0.0563, "reward": 0.9954834171570838, "reward_std": 0.5651785338108312, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12951659189420753, "rewards/format_reward": 0.34375, "rewards/reasoning_steps_reward": 0.7750000163912774, "step": 3950 }, { "completion_length": 1024.0, "epoch": 0.87353846578595, "grad_norm": 3.2220887386073467, "kl": 1.297503662109375, "learning_rate": 9.571213565296877e-07, "loss": 0.0519, "reward": 1.045906347129494, "reward_std": 0.5811525732686278, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1290936590579804, "rewards/format_reward": 0.40625, "rewards/reasoning_steps_reward": 0.7562500141561032, "step": 3955 }, { "completion_length": 1024.0, "epoch": 0.8746428127717729, "grad_norm": 3.4199691033243105, "kl": 0.97791748046875, "learning_rate": 9.407279000155311e-07, "loss": 0.0391, "reward": 1.1156769435852767, "reward_std": 0.5415843210706953, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14265640405064914, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.795833345875144, "step": 3960 }, { "completion_length": 1024.0, "epoch": 0.8757471597575959, "grad_norm": 6.611742861375454, "kl": 1.126373291015625, "learning_rate": 9.244691195825794e-07, "loss": 0.0451, "reward": 1.0655396494301386, "reward_std": 0.4242564166415832, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14487702874903335, "rewards/format_reward": 0.4375, "rewards/reasoning_steps_reward": 0.7666666831821203, "step": 3965 }, { "completion_length": 1024.0, "epoch": 0.8768515067434188, "grad_norm": 6.2650579753314295, "kl": 1.105645751953125, "learning_rate": 9.0834525693555e-07, "loss": 0.0442, "reward": 1.1305613292381167, "reward_std": 0.5012515306996648, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12568867837253492, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7687500163912773, "step": 3970 }, { "completion_length": 1024.0, "epoch": 0.8779558537292417, "grad_norm": 6.559802348774649, "kl": 1.6363189697265625, "learning_rate": 8.923565517734633e-07, "loss": 0.0654, "reward": 1.1925322379916907, "reward_std": 0.5651830204267754, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.11163442874731118, "rewards/format_reward": 0.46875, "rewards/reasoning_steps_reward": 0.8104166805744171, "step": 3975 }, { "completion_length": 1024.0, "epoch": 0.8790602007150646, "grad_norm": 3.897602069962089, "kl": 1.82530517578125, "learning_rate": 8.765032417860753e-07, "loss": 0.073, "reward": 0.9899700607638806, "reward_std": 0.4757543180807261, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1391966143855825, "rewards/format_reward": 0.38125, "rewards/reasoning_steps_reward": 0.7416666837409138, "step": 3980 }, { "completion_length": 1024.0, "epoch": 0.8801645477008876, "grad_norm": 7.591241433425781, "kl": 1.08575439453125, "learning_rate": 8.607855626503403e-07, "loss": 0.0434, "reward": 1.1372865000739694, "reward_std": 0.5247640458663227, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12938016958419213, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.8041666815057397, "step": 3985 }, { "completion_length": 1024.0, "epoch": 0.8812688946867105, "grad_norm": 4.303510587529375, "kl": 1.087200927734375, "learning_rate": 8.452037480269082e-07, "loss": 0.0435, "reward": 1.0119140914292075, "reward_std": 0.4009784645517357, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12350257498037535, "rewards/format_reward": 0.375, "rewards/reasoning_steps_reward": 0.7479166829958558, "step": 3990 }, { "completion_length": 1024.0, "epoch": 0.8823732416725335, "grad_norm": 5.929217922923497, "kl": 1.24005126953125, "learning_rate": 8.297580295566576e-07, "loss": 0.0496, "reward": 1.026335727609694, "reward_std": 0.48216943825391356, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1236642804076837, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.7250000150874257, "step": 3995 }, { "completion_length": 1024.0, "epoch": 0.8834775886583565, "grad_norm": 2.332684901634004, "kl": 1.697442626953125, "learning_rate": 8.144486368572468e-07, "loss": 0.0679, "reward": 1.0781446799635888, "reward_std": 0.4809949690039502, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13852199764514808, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.7916666816920042, "step": 4000 }, { "epoch": 0.8834775886583565, "eval_completion_length": 1024.0, "eval_kl": 1.759248046875, "eval_loss": 0.07073025405406952, "eval_reward": 1.0747132487595081, "eval_reward_std": 0.486686719302088, "eval_rewards/accuracy_reward": 0.015, "eval_rewards/cosine_scaled_reward": -0.12195342842489482, "eval_rewards/format_reward": 0.435, "eval_rewards/reasoning_steps_reward": 0.74666669100523, "eval_runtime": 203.224, "eval_samples_per_second": 0.487, "eval_steps_per_second": 0.123, "step": 4000 }, { "completion_length": 1024.0, "epoch": 0.8845819356441794, "grad_norm": 3.153161789027804, "kl": 1.55931396484375, "learning_rate": 7.992757975196974e-07, "loss": 0.0624, "reward": 1.0648077727295457, "reward_std": 0.45864389188354837, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1289422349364031, "rewards/format_reward": 0.39375, "rewards/reasoning_steps_reward": 0.8000000143423677, "step": 4005 }, { "completion_length": 1024.0, "epoch": 0.8856862826300024, "grad_norm": 2.2325669337373983, "kl": 1.570654296875, "learning_rate": 7.842397371050181e-07, "loss": 0.0628, "reward": 1.065108502563089, "reward_std": 0.5289823830302339, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.16197483572468627, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.7645833469927311, "step": 4010 }, { "completion_length": 1024.0, "epoch": 0.8867906296158253, "grad_norm": 2.7142887474028012, "kl": 1.02591552734375, "learning_rate": 7.693406791408476e-07, "loss": 0.041, "reward": 1.1182113092392683, "reward_std": 0.5619218776490016, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1463720285333693, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7895833475515246, "step": 4015 }, { "completion_length": 1024.0, "epoch": 0.8878949766016483, "grad_norm": 4.411870374708956, "kl": 0.986627197265625, "learning_rate": 7.545788451181313e-07, "loss": 0.0395, "reward": 1.1376007285900414, "reward_std": 0.46481511711303936, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1394826118749279, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7645833507180214, "step": 4020 }, { "completion_length": 1024.0, "epoch": 0.8889993235874711, "grad_norm": 3.425420744219714, "kl": 1.2092529296875, "learning_rate": 7.399544544878268e-07, "loss": 0.0484, "reward": 1.093935468606651, "reward_std": 0.5276332072971854, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15606453956934274, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.756250019557774, "step": 4025 }, { "completion_length": 1024.0, "epoch": 0.8901036705732941, "grad_norm": 6.2482885105172725, "kl": 2.00596923828125, "learning_rate": 7.25467724657647e-07, "loss": 0.0803, "reward": 1.112189820688218, "reward_std": 0.5027000481175492, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.135726847530168, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7666666828095913, "step": 4030 }, { "completion_length": 1024.0, "epoch": 0.891208017559117, "grad_norm": 4.123267493368509, "kl": 1.91627197265625, "learning_rate": 7.11118870988825e-07, "loss": 0.0767, "reward": 1.0083888062275945, "reward_std": 0.5308977565960958, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.17077786354784621, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.7291666796430946, "step": 4035 }, { "completion_length": 1024.0, "epoch": 0.89231236454494, "grad_norm": 2.5447247120995904, "kl": 1.29932861328125, "learning_rate": 6.969081067929129e-07, "loss": 0.052, "reward": 1.1347805107012392, "reward_std": 0.37370490196408357, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14438615987601225, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.8041666796430945, "step": 4040 }, { "completion_length": 1024.0, "epoch": 0.893416711530763, "grad_norm": 3.3545211281045546, "kl": 1.1327117919921874, "learning_rate": 6.828356433286065e-07, "loss": 0.0453, "reward": 0.9914261367172003, "reward_std": 0.5204619866504799, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1294072063108615, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.6958333469927311, "step": 4045 }, { "completion_length": 1024.0, "epoch": 0.8945210585165859, "grad_norm": 4.230761353854734, "kl": 0.87974853515625, "learning_rate": 6.689016897986123e-07, "loss": 0.0352, "reward": 1.0761857211589814, "reward_std": 0.5829236682388, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.14256428971712012, "rewards/format_reward": 0.4375, "rewards/reasoning_steps_reward": 0.7625000156462193, "step": 4050 }, { "completion_length": 1024.0, "epoch": 0.8956254055024089, "grad_norm": 1.8923064047781268, "kl": 1.389752197265625, "learning_rate": 6.551064533465335e-07, "loss": 0.0556, "reward": 1.1430289884097875, "reward_std": 0.5576779363034803, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1382210265833237, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.7562500160187483, "step": 4055 }, { "completion_length": 1024.0, "epoch": 0.8967297524882318, "grad_norm": 6.0362623942979505, "kl": 2.644989013671875, "learning_rate": 6.414501390537875e-07, "loss": 0.1057, "reward": 1.150946792308241, "reward_std": 0.48350526331923904, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.15530321877513414, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7750000149011612, "step": 4060 }, { "completion_length": 1024.0, "epoch": 0.8978340994740548, "grad_norm": 2.199768849556265, "kl": 2.11390380859375, "learning_rate": 6.279329499365649e-07, "loss": 0.0846, "reward": 1.1383668217342346, "reward_std": 0.46480973800826175, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.10954985310381744, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.7791666816920042, "step": 4065 }, { "completion_length": 1024.0, "epoch": 0.8989384464598776, "grad_norm": 9.295433499575353, "kl": 1.520904541015625, "learning_rate": 6.14555086942804e-07, "loss": 0.0608, "reward": 1.1201986480504273, "reward_std": 0.4280889055877196, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13605136539481463, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7937500178813934, "step": 4070 }, { "completion_length": 1024.0, "epoch": 0.9000427934457006, "grad_norm": 1.6615044294375885, "kl": 1.034375, "learning_rate": 6.013167489492089e-07, "loss": 0.0414, "reward": 1.0116388690192253, "reward_std": 0.46661656610522184, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1404444719890307, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.7333333438262344, "step": 4075 }, { "completion_length": 1024.0, "epoch": 0.9011471404315236, "grad_norm": 5.062466581880976, "kl": 1.399224853515625, "learning_rate": 5.88218132758287e-07, "loss": 0.056, "reward": 1.0753349607344718, "reward_std": 0.48987307887655335, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12258170813620381, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.7479166820645332, "step": 4080 }, { "completion_length": 1024.0, "epoch": 0.9022514874173465, "grad_norm": 11.209836105553178, "kl": 1.842578125, "learning_rate": 5.752594330954275e-07, "loss": 0.0737, "reward": 1.0284711010754108, "reward_std": 0.5226584098192688, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.14652890994620976, "rewards/format_reward": 0.4125, "rewards/reasoning_steps_reward": 0.7437500147148967, "step": 4085 }, { "completion_length": 1024.0, "epoch": 0.9033558344031695, "grad_norm": 7.199991876784292, "kl": 2.265875244140625, "learning_rate": 5.624408426060124e-07, "loss": 0.0907, "reward": 1.1538773463093093, "reward_std": 0.5723583094921196, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14403933168505317, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7916666816920042, "step": 4090 }, { "completion_length": 1024.0, "epoch": 0.9044601813889924, "grad_norm": 3.975211613433658, "kl": 1.727044677734375, "learning_rate": 5.497625518525374e-07, "loss": 0.0691, "reward": 1.1335043588653206, "reward_std": 0.4624854526555282, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12899564821491366, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7937500139698386, "step": 4095 }, { "completion_length": 1024.0, "epoch": 0.9055645283748154, "grad_norm": 3.383012591131137, "kl": 1.18646240234375, "learning_rate": 5.372247493117921e-07, "loss": 0.0474, "reward": 1.1752978217788042, "reward_std": 0.3756918714298081, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14761884533763805, "rewards/format_reward": 0.525, "rewards/reasoning_steps_reward": 0.7916666809469461, "step": 4100 }, { "epoch": 0.9055645283748154, "eval_completion_length": 1024.0, "eval_kl": 1.332880859375, "eval_loss": 0.05309397354722023, "eval_reward": 1.1452737122774124, "eval_reward_std": 0.6122168021649123, "eval_rewards/accuracy_reward": 0.02, "eval_rewards/cosine_scaled_reward": -0.12472628904506564, "eval_rewards/format_reward": 0.475, "eval_rewards/reasoning_steps_reward": 0.7750000178813934, "eval_runtime": 202.1824, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.124, "step": 4100 }, { "completion_length": 1024.0, "epoch": 0.9066688753606383, "grad_norm": 3.4174559729540874, "kl": 1.212176513671875, "learning_rate": 5.248276213720526e-07, "loss": 0.0485, "reward": 1.0279372279532253, "reward_std": 0.5210070614280994, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15956277402001434, "rewards/format_reward": 0.3875, "rewards/reasoning_steps_reward": 0.7937500147148967, "step": 4105 }, { "completion_length": 1024.0, "epoch": 0.9077732223464613, "grad_norm": 4.039284591694262, "kl": 1.042962646484375, "learning_rate": 5.125713523303133e-07, "loss": 0.0417, "reward": 1.1345659455750137, "reward_std": 0.5074929667287507, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1612673976487713, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.802083351649344, "step": 4110 }, { "completion_length": 1024.0, "epoch": 0.9088775693322843, "grad_norm": 2.1407975550189455, "kl": 1.029180908203125, "learning_rate": 5.004561243895433e-07, "loss": 0.0412, "reward": 1.1098061236087233, "reward_std": 0.4296879690635251, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1506105485586886, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.7979166826233268, "step": 4115 }, { "completion_length": 1024.0, "epoch": 0.9099819163181071, "grad_norm": 2.787687525177025, "kl": 1.16190185546875, "learning_rate": 4.884821176559817e-07, "loss": 0.0465, "reward": 1.1772496801801027, "reward_std": 0.45350805636044245, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14983365589287131, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.8145833475515246, "step": 4120 }, { "completion_length": 1024.0, "epoch": 0.91108626330393, "grad_norm": 3.4685777527475925, "kl": 1.169305419921875, "learning_rate": 4.7664951013645875e-07, "loss": 0.0468, "reward": 1.1000913422554732, "reward_std": 0.586482156632701, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13740866679772806, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7750000130385161, "step": 4125 }, { "completion_length": 1024.0, "epoch": 0.912190610289753, "grad_norm": 2.5440932847767126, "kl": 0.869580078125, "learning_rate": 4.649584777357452e-07, "loss": 0.0348, "reward": 1.101002143137157, "reward_std": 0.5315961849104497, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14691453777631977, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.785416685976088, "step": 4130 }, { "completion_length": 1024.0, "epoch": 0.913294957275576, "grad_norm": 4.050421877530695, "kl": 1.568072509765625, "learning_rate": 4.534091942539476e-07, "loss": 0.0628, "reward": 1.1267952339723706, "reward_std": 0.4579050815096707, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13778810024832638, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7833333531394601, "step": 4135 }, { "completion_length": 1024.0, "epoch": 0.9143993042613989, "grad_norm": 9.301363016784023, "kl": 1.900787353515625, "learning_rate": 4.420018313839147e-07, "loss": 0.076, "reward": 1.1611902100965381, "reward_std": 0.4580170904053375, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15547646980849095, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.7666666839271784, "step": 4140 }, { "completion_length": 1024.0, "epoch": 0.9155036512472219, "grad_norm": 3.174997058824143, "kl": 1.6366119384765625, "learning_rate": 4.3073655870869093e-07, "loss": 0.0655, "reward": 1.1532112454995513, "reward_std": 0.6160767867557297, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14053876251273323, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7937500162050128, "step": 4145 }, { "completion_length": 1024.0, "epoch": 0.9166079982330448, "grad_norm": 5.719018592647839, "kl": 1.850433349609375, "learning_rate": 4.1961354369898675e-07, "loss": 0.0741, "reward": 1.1542285384610296, "reward_std": 0.4916458193274593, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12493812668617466, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7791666813194752, "step": 4150 }, { "completion_length": 1024.0, "epoch": 0.9177123452188678, "grad_norm": 6.25154325723326, "kl": 1.735711669921875, "learning_rate": 4.086329517107046e-07, "loss": 0.0694, "reward": 1.0918453134596349, "reward_std": 0.5446474470940302, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12898802184672603, "rewards/format_reward": 0.41875, "rewards/reasoning_steps_reward": 0.7958333440124988, "step": 4155 }, { "completion_length": 1024.0, "epoch": 0.9188166922046908, "grad_norm": 2.5591543960133207, "kl": 1.272998046875, "learning_rate": 3.9779494598246484e-07, "loss": 0.0509, "reward": 1.1463804263621569, "reward_std": 0.5207044870971004, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14945291494550475, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7833333505317569, "step": 4160 }, { "completion_length": 1024.0, "epoch": 0.9199210391905137, "grad_norm": 7.67735841511338, "kl": 2.06146240234375, "learning_rate": 3.8709968763318894e-07, "loss": 0.0824, "reward": 1.0511977900750935, "reward_std": 0.5209969227365946, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14046887978074665, "rewards/format_reward": 0.425, "rewards/reasoning_steps_reward": 0.7604166788980364, "step": 4165 }, { "completion_length": 1024.0, "epoch": 0.9210253861763366, "grad_norm": 1.0227618516880124, "kl": 1.127886962890625, "learning_rate": 3.7654733565969826e-07, "loss": 0.0451, "reward": 1.1872975867241622, "reward_std": 0.46389186800297466, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14603575810724578, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.7958333490416407, "step": 4170 }, { "completion_length": 1024.0, "epoch": 0.9221297331621595, "grad_norm": 7.718448222947808, "kl": 1.17996826171875, "learning_rate": 3.661380469343556e-07, "loss": 0.0472, "reward": 1.190551941562444, "reward_std": 0.565188505727565, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.12403139998496045, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.8083333481103182, "step": 4175 }, { "completion_length": 1024.0, "epoch": 0.9232340801479825, "grad_norm": 4.845262970075181, "kl": 1.309710693359375, "learning_rate": 3.558719762027307e-07, "loss": 0.0524, "reward": 1.1957894197665155, "reward_std": 0.4323232921247836, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12921058931970036, "rewards/format_reward": 0.53125, "rewards/reasoning_steps_reward": 0.7875000156462193, "step": 4180 }, { "completion_length": 1024.0, "epoch": 0.9243384271338054, "grad_norm": 4.766569710401553, "kl": 1.71771240234375, "learning_rate": 3.457492760812975e-07, "loss": 0.0687, "reward": 1.2371738120913505, "reward_std": 0.5003112346317267, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12949285372342273, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.8166666815057397, "step": 4185 }, { "completion_length": 1024.0, "epoch": 0.9254427741196284, "grad_norm": 8.214132928933958, "kl": 2.177740478515625, "learning_rate": 3.357700970551681e-07, "loss": 0.0871, "reward": 1.1125389066524805, "reward_std": 0.4511790112737799, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12912776759476402, "rewards/format_reward": 0.4375, "rewards/reasoning_steps_reward": 0.7979166798293591, "step": 4190 }, { "completion_length": 1024.0, "epoch": 0.9265471211054513, "grad_norm": 12.11864779361759, "kl": 1.639886474609375, "learning_rate": 3.2593458747585683e-07, "loss": 0.0656, "reward": 1.0972121067345142, "reward_std": 0.49666505910572595, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1527878977904038, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7687500169500708, "step": 4195 }, { "completion_length": 1024.0, "epoch": 0.9276514680912743, "grad_norm": 1.7415606550831522, "kl": 1.255419921875, "learning_rate": 3.1624289355907334e-07, "loss": 0.0502, "reward": 1.1839751296676695, "reward_std": 0.47496756471573465, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12852488255703065, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.8187500152736902, "step": 4200 }, { "epoch": 0.9276514680912743, "eval_completion_length": 1024.0, "eval_kl": 2.0467578125, "eval_loss": 0.08228980004787445, "eval_reward": 1.2043625664711, "eval_reward_std": 0.5759655183041468, "eval_rewards/accuracy_reward": 0.02, "eval_rewards/cosine_scaled_reward": -0.1273041057959199, "eval_rewards/format_reward": 0.53, "eval_rewards/reasoning_steps_reward": 0.7816666835546493, "eval_runtime": 201.8511, "eval_samples_per_second": 0.49, "eval_steps_per_second": 0.124, "step": 4200 }, { "completion_length": 1024.0, "epoch": 0.9287558150770973, "grad_norm": 4.138803338820017, "kl": 1.4931396484375, "learning_rate": 3.0669515938254404e-07, "loss": 0.0597, "reward": 1.0497166961431503, "reward_std": 0.4923819173818629, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15236664820404258, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.7395833484828472, "step": 4205 }, { "completion_length": 1024.0, "epoch": 0.9298601620629202, "grad_norm": 4.359344826638161, "kl": 1.354461669921875, "learning_rate": 2.972915268838794e-07, "loss": 0.0542, "reward": 1.1435699885245412, "reward_std": 0.5130232198811427, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13976335329643916, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7708333484828472, "step": 4210 }, { "completion_length": 1024.0, "epoch": 0.9309645090487431, "grad_norm": 2.438246106743701, "kl": 1.33017578125, "learning_rate": 2.8803213585846036e-07, "loss": 0.0532, "reward": 1.134584633493796, "reward_std": 0.5522605210964684, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13208204362870218, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7604166816920042, "step": 4215 }, { "completion_length": 1024.0, "epoch": 0.932068856034566, "grad_norm": 3.668505135077039, "kl": 1.239410400390625, "learning_rate": 2.7891712395735513e-07, "loss": 0.0496, "reward": 1.1889993457123638, "reward_std": 0.49140313523857915, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.12558400264533703, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7895833475515246, "step": 4220 }, { "completion_length": 1024.0, "epoch": 0.933173203020389, "grad_norm": 2.3683790983764506, "kl": 1.01651611328125, "learning_rate": 2.699466266852779e-07, "loss": 0.0407, "reward": 1.2326950676739217, "reward_std": 0.4832893053477164, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15272159982996528, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.8229166805744171, "step": 4225 }, { "completion_length": 1024.0, "epoch": 0.9342775500062119, "grad_norm": 7.580296007856714, "kl": 1.550482177734375, "learning_rate": 2.6112077739857465e-07, "loss": 0.062, "reward": 1.0981525180861353, "reward_std": 0.48764542372955477, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12476415821192859, "rewards/format_reward": 0.45, "rewards/reasoning_steps_reward": 0.7729166800156235, "step": 4230 }, { "completion_length": 1024.0, "epoch": 0.9353818969920349, "grad_norm": 2.347265033901168, "kl": 1.591693115234375, "learning_rate": 2.524397073032403e-07, "loss": 0.0637, "reward": 1.104093014076352, "reward_std": 0.6151681105347961, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13132366243735305, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7416666794568301, "step": 4235 }, { "completion_length": 1024.0, "epoch": 0.9364862439778578, "grad_norm": 1.8325563935417812, "kl": 1.41153564453125, "learning_rate": 2.4390354545296257e-07, "loss": 0.0565, "reward": 1.1709908257238566, "reward_std": 0.5391441511004814, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.135259176461841, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7875000162050128, "step": 4240 }, { "completion_length": 1024.0, "epoch": 0.9375905909636808, "grad_norm": 2.4516010722839416, "kl": 1.5540771484375, "learning_rate": 2.3551241874721353e-07, "loss": 0.0622, "reward": 1.093654316617176, "reward_std": 0.5696152574062581, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14592901616124437, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7645833492279053, "step": 4245 }, { "completion_length": 1024.0, "epoch": 0.9386949379495038, "grad_norm": 7.947155244735769, "kl": 1.482958984375, "learning_rate": 2.272664519293566e-07, "loss": 0.0593, "reward": 1.1012504249811172, "reward_std": 0.5641292022948619, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.167499577824492, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.7750000167638064, "step": 4250 }, { "completion_length": 1024.0, "epoch": 0.9397992849353267, "grad_norm": 3.319777474470088, "kl": 1.46353759765625, "learning_rate": 2.1916576758478913e-07, "loss": 0.0586, "reward": 1.1003205741755664, "reward_std": 0.5253774059608987, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15384610984983738, "rewards/format_reward": 0.46875, "rewards/reasoning_steps_reward": 0.7854166815057397, "step": 4255 }, { "completion_length": 1024.0, "epoch": 0.9409036319211497, "grad_norm": 5.512661571326239, "kl": 1.366729736328125, "learning_rate": 2.1121048613912843e-07, "loss": 0.0547, "reward": 1.1612554124556482, "reward_std": 0.5421569795755203, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13457791769033065, "rewards/format_reward": 0.53125, "rewards/reasoning_steps_reward": 0.7458333497866988, "step": 4260 }, { "completion_length": 1024.0, "epoch": 0.9420079789069725, "grad_norm": 2.1266648680818863, "kl": 1.49979248046875, "learning_rate": 2.0340072585641523e-07, "loss": 0.06, "reward": 1.111027823621407, "reward_std": 0.5750096809195384, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.1348055164780817, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.7270833497866989, "step": 4265 }, { "completion_length": 1024.0, "epoch": 0.9431123258927955, "grad_norm": 3.406473823796893, "kl": 1.421197509765625, "learning_rate": 1.9573660283735974e-07, "loss": 0.0568, "reward": 1.0889284812612459, "reward_std": 0.4393613349617226, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.146488197666622, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.729166685603559, "step": 4270 }, { "completion_length": 1024.0, "epoch": 0.9442166728786184, "grad_norm": 4.687927464271602, "kl": 1.366455078125, "learning_rate": 1.8821823101760949e-07, "loss": 0.0547, "reward": 1.1325325137935578, "reward_std": 0.5590258315745359, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13205082423082787, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7833333486691118, "step": 4275 }, { "completion_length": 1024.0, "epoch": 0.9453210198644414, "grad_norm": 2.9056519481828955, "kl": 1.257012939453125, "learning_rate": 1.8084572216606422e-07, "loss": 0.0503, "reward": 1.1094948038109578, "reward_std": 0.5337844159294036, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1259218716812029, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7416666800156235, "step": 4280 }, { "completion_length": 1024.0, "epoch": 0.9464253668502643, "grad_norm": 3.211780491107793, "kl": 1.4444091796875, "learning_rate": 1.736191858832048e-07, "loss": 0.0578, "reward": 1.1021898888982833, "reward_std": 0.5546243787208368, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1436434510455001, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7833333473652602, "step": 4285 }, { "completion_length": 1024.0, "epoch": 0.9475297138360873, "grad_norm": 3.791982940822234, "kl": 1.3896728515625, "learning_rate": 1.665387295994747e-07, "loss": 0.0556, "reward": 1.0838238134048879, "reward_std": 0.493443389685126, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13284285950794583, "rewards/format_reward": 0.44375, "rewards/reasoning_steps_reward": 0.7729166828095912, "step": 4290 }, { "completion_length": 1024.0, "epoch": 0.9486340608219103, "grad_norm": 3.5429912382327564, "kl": 1.38509521484375, "learning_rate": 1.5960445857367003e-07, "loss": 0.0554, "reward": 1.16405497957021, "reward_std": 0.5455976179917343, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1505283652804792, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.7645833499729633, "step": 4295 }, { "completion_length": 1024.0, "epoch": 0.9497384078077332, "grad_norm": 3.754514777830821, "kl": 1.62933349609375, "learning_rate": 1.5281647589138527e-07, "loss": 0.0652, "reward": 1.1515638804994524, "reward_std": 0.5102403333643452, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1526028042600956, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.7791666816920042, "step": 4300 }, { "epoch": 0.9497384078077332, "eval_completion_length": 1024.0, "eval_kl": 1.414345703125, "eval_loss": 0.056856490671634674, "eval_reward": 1.3021593242883682, "eval_reward_std": 0.5029132961481809, "eval_rewards/accuracy_reward": 0.03, "eval_rewards/cosine_scaled_reward": -0.10784066822379827, "eval_rewards/format_reward": 0.565, "eval_rewards/reasoning_steps_reward": 0.8150000131130218, "eval_runtime": 202.3543, "eval_samples_per_second": 0.489, "eval_steps_per_second": 0.124, "step": 4300 }, { "completion_length": 1024.0, "epoch": 0.9508427547935562, "grad_norm": 5.874297414860863, "kl": 1.873388671875, "learning_rate": 1.4617488246348012e-07, "loss": 0.0749, "reward": 1.091808697162196, "reward_std": 0.4958706302659266, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15194130790732743, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7562500137835741, "step": 4305 }, { "completion_length": 1024.0, "epoch": 0.9519471017793791, "grad_norm": 3.022042116074618, "kl": 1.3565673828125, "learning_rate": 1.3967977702456946e-07, "loss": 0.0542, "reward": 1.2365093669854104, "reward_std": 0.5081978341855574, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.14265730559855, "rewards/format_reward": 0.5625, "rewards/reasoning_steps_reward": 0.8041666803881526, "step": 4310 }, { "completion_length": 1024.0, "epoch": 0.953051448765202, "grad_norm": 2.89681170743707, "kl": 1.37025146484375, "learning_rate": 1.3333125613156695e-07, "loss": 0.0548, "reward": 1.167290734499693, "reward_std": 0.5237989299959736, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14104260036001506, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.7583333503454923, "step": 4315 }, { "completion_length": 1024.0, "epoch": 0.9541557957510249, "grad_norm": 3.303852665082722, "kl": 1.4704833984375, "learning_rate": 1.271294141622459e-07, "loss": 0.0588, "reward": 1.1580770617350935, "reward_std": 0.525305885047419, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13775627961731515, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7833333499729633, "step": 4320 }, { "completion_length": 1024.0, "epoch": 0.9552601427368479, "grad_norm": 154.75500513505, "kl": 4.75120849609375, "learning_rate": 1.2107434331383504e-07, "loss": 0.1896, "reward": 1.1435575605370105, "reward_std": 0.5372929855715484, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14185910428859644, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.7791666820645332, "step": 4325 }, { "completion_length": 1024.0, "epoch": 0.9563644897226709, "grad_norm": 4.195260582471402, "kl": 1.28109130859375, "learning_rate": 1.1516613360164408e-07, "loss": 0.0512, "reward": 1.194129934720695, "reward_std": 0.5264870421490742, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13503673870000057, "rewards/format_reward": 0.53125, "rewards/reasoning_steps_reward": 0.7854166809469462, "step": 4330 }, { "completion_length": 1024.0, "epoch": 0.9574688367084938, "grad_norm": 4.697501623828217, "kl": 1.60849609375, "learning_rate": 1.094048728577346e-07, "loss": 0.0643, "reward": 1.1921196983545088, "reward_std": 0.46050074373306416, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14954696901404532, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.7854166820645332, "step": 4335 }, { "completion_length": 1024.0, "epoch": 0.9585731836943168, "grad_norm": 5.237784344319697, "kl": 1.4283721923828125, "learning_rate": 1.0379064672960793e-07, "loss": 0.0571, "reward": 1.1046239531598985, "reward_std": 0.5052039703403353, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.13495938415580894, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.777083345875144, "step": 4340 }, { "completion_length": 1024.0, "epoch": 0.9596775306801397, "grad_norm": 3.8322720113450583, "kl": 1.65595703125, "learning_rate": 9.832353867893385e-08, "loss": 0.0663, "reward": 1.173404076125007, "reward_std": 0.49049041836988183, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13284592391137268, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7875000132247806, "step": 4345 }, { "completion_length": 1024.0, "epoch": 0.9607818776659627, "grad_norm": 1.4873099565182297, "kl": 1.376214599609375, "learning_rate": 9.300362998030832e-08, "loss": 0.0551, "reward": 1.1576748417923226, "reward_std": 0.43831962359836324, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13190850754035638, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.7833333479240536, "step": 4350 }, { "completion_length": 1024.0, "epoch": 0.9618862246517856, "grad_norm": 2.317212736464978, "kl": 1.31639404296875, "learning_rate": 8.783099972004882e-08, "loss": 0.0526, "reward": 1.2284595191478729, "reward_std": 0.5590447532267717, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.09862381973216543, "rewards/format_reward": 0.5, "rewards/reasoning_steps_reward": 0.7895833496004343, "step": 4355 }, { "completion_length": 1024.0, "epoch": 0.9629905716376085, "grad_norm": 4.2578007685169394, "kl": 1.872528076171875, "learning_rate": 8.280572479501426e-08, "loss": 0.0749, "reward": 1.1292044205591083, "reward_std": 0.5521556083040196, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.16662891303567448, "rewards/format_reward": 0.525, "rewards/reasoning_steps_reward": 0.7645833449438214, "step": 4360 }, { "completion_length": 1024.0, "epoch": 0.9640949186234314, "grad_norm": 2.917500800023099, "kl": 1.26483154296875, "learning_rate": 7.792787991146356e-08, "loss": 0.0506, "reward": 1.15064637940377, "reward_std": 0.5034463145228074, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14935362476826414, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.8062500160187482, "step": 4365 }, { "completion_length": 1024.0, "epoch": 0.9651992656092544, "grad_norm": 4.463219264295833, "kl": 1.64820556640625, "learning_rate": 7.319753758394665e-08, "loss": 0.066, "reward": 1.1375942932441832, "reward_std": 0.5617788247385761, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1478223788541982, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7666666842997074, "step": 4370 }, { "completion_length": 1024.0, "epoch": 0.9663036125950774, "grad_norm": 1.973179732100984, "kl": 1.30853271484375, "learning_rate": 6.861476813422419e-08, "loss": 0.0523, "reward": 1.1293663954362274, "reward_std": 0.529233861564353, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12063361131085912, "rewards/format_reward": 0.4625, "rewards/reasoning_steps_reward": 0.7812500165775418, "step": 4375 }, { "completion_length": 1024.0, "epoch": 0.9674079595809003, "grad_norm": 9.289039619646017, "kl": 1.227276611328125, "learning_rate": 6.417963969022389e-08, "loss": 0.0491, "reward": 1.1480536976829172, "reward_std": 0.553274897771189, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13527963738015386, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7645833507180214, "step": 4380 }, { "completion_length": 1024.0, "epoch": 0.9685123065667233, "grad_norm": 1.9727874950880977, "kl": 1.340447998046875, "learning_rate": 5.989221818502478e-08, "loss": 0.0536, "reward": 1.1438379530794918, "reward_std": 0.5070553012817982, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.11657871988791157, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7416666815057397, "step": 4385 }, { "completion_length": 1024.0, "epoch": 0.9696166535525462, "grad_norm": 4.708920919089679, "kl": 1.432037353515625, "learning_rate": 5.5752567355883415e-08, "loss": 0.0573, "reward": 1.2146405932493507, "reward_std": 0.5154525164925872, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1145260815290385, "rewards/format_reward": 0.55, "rewards/reasoning_steps_reward": 0.7791666824370622, "step": 4390 }, { "completion_length": 1024.0, "epoch": 0.9707210005383692, "grad_norm": 5.44416839852759, "kl": 1.595782470703125, "learning_rate": 5.176074874327919e-08, "loss": 0.0639, "reward": 1.1949712364934384, "reward_std": 0.4661798856162932, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12377876958362322, "rewards/format_reward": 0.55, "rewards/reasoning_steps_reward": 0.7625000156462193, "step": 4395 }, { "completion_length": 1024.0, "epoch": 0.9718253475241921, "grad_norm": 1.4586617640539672, "kl": 1.410101318359375, "learning_rate": 4.791682169000056e-08, "loss": 0.0564, "reward": 1.189527632854879, "reward_std": 0.5215653205494164, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13130570322609855, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7958333488553763, "step": 4400 }, { "epoch": 0.9718253475241921, "eval_completion_length": 1024.0, "eval_kl": 1.406845703125, "eval_loss": 0.05658888816833496, "eval_reward": 1.2143233251571655, "eval_reward_std": 0.5284116192162037, "eval_rewards/accuracy_reward": 0.005, "eval_rewards/cosine_scaled_reward": -0.13401001332793386, "eval_rewards/format_reward": 0.535, "eval_rewards/reasoning_steps_reward": 0.8083333480358124, "eval_runtime": 204.7668, "eval_samples_per_second": 0.483, "eval_steps_per_second": 0.122, "step": 4400 }, { "completion_length": 1024.0, "epoch": 0.9729296945100151, "grad_norm": 4.00824183633246, "kl": 1.24869384765625, "learning_rate": 4.4220843340269105e-08, "loss": 0.0499, "reward": 1.1324331050971523, "reward_std": 0.48450650404556656, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1425668969808612, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.7875000163912773, "step": 4405 }, { "completion_length": 1024.0, "epoch": 0.9740340414958379, "grad_norm": 6.004531096378766, "kl": 1.369793701171875, "learning_rate": 4.067286863888131e-08, "loss": 0.0548, "reward": 1.1905505992472172, "reward_std": 0.549637848045677, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14486607483704575, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.7979166818782687, "step": 4410 }, { "completion_length": 1024.0, "epoch": 0.9751383884816609, "grad_norm": 3.1816931594107856, "kl": 1.212567138671875, "learning_rate": 3.727295033040035e-08, "loss": 0.0485, "reward": 1.2453083097934723, "reward_std": 0.4840336770255817, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1255250307280221, "rewards/format_reward": 0.55, "rewards/reasoning_steps_reward": 0.8083333456888795, "step": 4415 }, { "completion_length": 1024.0, "epoch": 0.9762427354674839, "grad_norm": 2.127607846360171, "kl": 1.438409423828125, "learning_rate": 3.402113895836445e-08, "loss": 0.0575, "reward": 1.1015230394899844, "reward_std": 0.5222396170902357, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14014363521273482, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.754166679829359, "step": 4420 }, { "completion_length": 1024.0, "epoch": 0.9773470824533068, "grad_norm": 1.885777803810275, "kl": 1.2944549560546874, "learning_rate": 3.091748286453866e-08, "loss": 0.0518, "reward": 1.235387963615358, "reward_std": 0.4532736351233325, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.15211204282531982, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.8375000137835741, "step": 4425 }, { "completion_length": 1024.0, "epoch": 0.9784514294391298, "grad_norm": 2.6855196178764076, "kl": 1.15111083984375, "learning_rate": 2.796202818819871e-08, "loss": 0.0461, "reward": 1.233874310180545, "reward_std": 0.4524709703262488, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.1473756947714719, "rewards/format_reward": 0.575, "rewards/reasoning_steps_reward": 0.8062500150874257, "step": 4430 }, { "completion_length": 1024.0, "epoch": 0.9795557764249527, "grad_norm": 4.215344003298246, "kl": 1.727777099609375, "learning_rate": 2.5154818865440466e-08, "loss": 0.0692, "reward": 1.0581813110038638, "reward_std": 0.47543474311573847, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14390203252357878, "rewards/format_reward": 0.45, "rewards/reasoning_steps_reward": 0.7520833509042859, "step": 4435 }, { "completion_length": 1024.0, "epoch": 0.9806601234107757, "grad_norm": 5.2448583567855565, "kl": 1.29886474609375, "learning_rate": 2.2495896628529355e-08, "loss": 0.052, "reward": 1.2382936247624456, "reward_std": 0.5197429495237884, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13462304897766444, "rewards/format_reward": 0.55625, "rewards/reasoning_steps_reward": 0.8041666820645332, "step": 4440 }, { "completion_length": 1024.0, "epoch": 0.9817644703965986, "grad_norm": 2.0177248381920503, "kl": 1.354461669921875, "learning_rate": 1.9985301005280843e-08, "loss": 0.0541, "reward": 1.1819130264222621, "reward_std": 0.4609925127326278, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.17017031446448527, "rewards/format_reward": 0.54375, "rewards/reasoning_steps_reward": 0.8083333512768149, "step": 4445 }, { "completion_length": 1024.0, "epoch": 0.9828688173824216, "grad_norm": 3.9907433219645108, "kl": 1.760162353515625, "learning_rate": 1.7623069318469797e-08, "loss": 0.0704, "reward": 1.1392354678362608, "reward_std": 0.5574802142305998, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13784787233062162, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7520833496004343, "step": 4450 }, { "completion_length": 1024.0, "epoch": 0.9839731643682446, "grad_norm": 2.3118290114345497, "kl": 1.65042724609375, "learning_rate": 1.5409236685277608e-08, "loss": 0.066, "reward": 1.1408525642938911, "reward_std": 0.548295050940942, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1403974376269616, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7625000149011611, "step": 4455 }, { "completion_length": 1024.0, "epoch": 0.9850775113540674, "grad_norm": 57.29691554401206, "kl": 2.243560791015625, "learning_rate": 1.3343836016772582e-08, "loss": 0.0898, "reward": 1.1517346784472466, "reward_std": 0.47670648889688894, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13368199388496577, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.779166679829359, "step": 4460 }, { "completion_length": 1024.0, "epoch": 0.9861818583398904, "grad_norm": 5.2108404717142465, "kl": 1.72296142578125, "learning_rate": 1.1426898017412591e-08, "loss": 0.0689, "reward": 1.1585712847299874, "reward_std": 0.5616339144078666, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1539287220512051, "rewards/format_reward": 0.5125, "rewards/reasoning_steps_reward": 0.7937500132247806, "step": 4465 }, { "completion_length": 1024.0, "epoch": 0.9872862053257133, "grad_norm": 4.200732044863656, "kl": 2.089434814453125, "learning_rate": 9.658451184600959e-09, "loss": 0.0836, "reward": 1.157062985189259, "reward_std": 0.5377040596376901, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12210368756641402, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.797916679084301, "step": 4470 }, { "completion_length": 1024.0, "epoch": 0.9883905523115363, "grad_norm": 7.880275781288607, "kl": 1.3374755859375, "learning_rate": 8.038521808249045e-09, "loss": 0.0535, "reward": 1.1648781194817275, "reward_std": 0.464659771242259, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.1309552209175308, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.7895833460614086, "step": 4475 }, { "completion_length": 1024.0, "epoch": 0.9894948992973592, "grad_norm": 3.1424315830543934, "kl": 1.504827880859375, "learning_rate": 6.567133970397654e-09, "loss": 0.0602, "reward": 1.1670374654233455, "reward_std": 0.5484940701397136, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1412958735250868, "rewards/format_reward": 0.51875, "rewards/reasoning_steps_reward": 0.7833333479240536, "step": 4480 }, { "completion_length": 1024.0, "epoch": 0.9905992462831822, "grad_norm": 2.587303669883422, "kl": 2.021490478515625, "learning_rate": 5.2443095448506674e-09, "loss": 0.0809, "reward": 1.1091021137312054, "reward_std": 0.5705063059137274, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.13881455489261044, "rewards/format_reward": 0.49375, "rewards/reasoning_steps_reward": 0.7479166816920042, "step": 4485 }, { "completion_length": 1024.0, "epoch": 0.9917035932690051, "grad_norm": 6.095524885050565, "kl": 1.741949462890625, "learning_rate": 4.070068196853072e-09, "loss": 0.0697, "reward": 1.156708344630897, "reward_std": 0.4693000786108314, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14954165498202202, "rewards/format_reward": 0.5375, "rewards/reasoning_steps_reward": 0.7625000149011611, "step": 4490 }, { "completion_length": 1024.0, "epoch": 0.9928079402548281, "grad_norm": 2.396353477035366, "kl": 1.70526123046875, "learning_rate": 3.0444273828000857e-09, "loss": 0.0682, "reward": 1.0966801326721907, "reward_std": 0.6420538909413154, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.14498654929120675, "rewards/format_reward": 0.475, "rewards/reasoning_steps_reward": 0.7604166805744171, "step": 4495 }, { "completion_length": 1024.0, "epoch": 0.9939122872406511, "grad_norm": 7.241557754267188, "kl": 1.6732666015625, "learning_rate": 2.167402349972925e-09, "loss": 0.0669, "reward": 1.077176136802882, "reward_std": 0.5696488693953142, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.13740720303030685, "rewards/format_reward": 0.45625, "rewards/reasoning_steps_reward": 0.7458333460614085, "step": 4500 }, { "epoch": 0.9939122872406511, "eval_completion_length": 1024.0, "eval_kl": 1.605205078125, "eval_loss": 0.06404020637273788, "eval_reward": 1.1858943843841552, "eval_reward_std": 0.5475942821498029, "eval_rewards/accuracy_reward": 0.01, "eval_rewards/cosine_scaled_reward": -0.13243895017309115, "eval_rewards/format_reward": 0.515, "eval_rewards/reasoning_steps_reward": 0.7933333450555802, "eval_runtime": 227.8369, "eval_samples_per_second": 0.435, "eval_steps_per_second": 0.11, "step": 4500 }, { "completion_length": 1024.0, "epoch": 0.9950166342264739, "grad_norm": 2.4597613761069437, "kl": 1.281756591796875, "learning_rate": 1.4390061363189767e-09, "loss": 0.0513, "reward": 1.1734716016799212, "reward_std": 0.4506149138222099, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12861173490746297, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7895833510905504, "step": 4505 }, { "completion_length": 1024.0, "epoch": 0.9961209812122969, "grad_norm": 7.245936093195075, "kl": 1.83837890625, "learning_rate": 8.592495702497427e-10, "loss": 0.0735, "reward": 1.1442578772082925, "reward_std": 0.49708576006232763, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.14532546755945078, "rewards/format_reward": 0.50625, "rewards/reasoning_steps_reward": 0.7833333475515246, "step": 4510 }, { "completion_length": 1024.0, "epoch": 0.9972253281981198, "grad_norm": 4.855720269609806, "kl": 1.75643310546875, "learning_rate": 4.2814127048873553e-10, "loss": 0.0703, "reward": 1.1811686454340815, "reward_std": 0.5081049962056567, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.11883136416436173, "rewards/format_reward": 0.4875, "rewards/reasoning_steps_reward": 0.8062500145286322, "step": 4515 }, { "completion_length": 1024.0, "epoch": 0.9983296751839428, "grad_norm": 9.240096677091826, "kl": 1.80220947265625, "learning_rate": 1.4568764593603235e-10, "loss": 0.0721, "reward": 1.1370168601162731, "reward_std": 0.5179185615059396, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1338164767366834, "rewards/format_reward": 0.48125, "rewards/reasoning_steps_reward": 0.7833333497866988, "step": 4520 }, { "completion_length": 1024.0, "epoch": 0.9994340221697657, "grad_norm": 8.38153363176688, "kl": 1.0825714111328124, "learning_rate": 1.1892895576126606e-11, "loss": 0.0433, "reward": 1.2265190588310362, "reward_std": 0.4433173163793981, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.15056427880190312, "rewards/format_reward": 0.5625, "rewards/reasoning_steps_reward": 0.8145833492279053, "step": 4525 }, { "completion_length": 1024.0, "epoch": 0.9998757609640949, "kl": 1.4818878173828125, "reward": 1.2614081082865596, "reward_std": 0.6296223533936427, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.12400856785438918, "rewards/format_reward": 0.578125, "rewards/reasoning_steps_reward": 0.8072916818782687, "step": 4527, "total_flos": 0.0, "train_loss": 7996306.261529862, "train_runtime": 280239.113, "train_samples_per_second": 0.258, "train_steps_per_second": 0.016 } ], "logging_steps": 5, "max_steps": 4527, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }