{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 2, "global_step": 312, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 553.6277812957763, "epoch": 0.016, "grad_norm": 0.21352672576904297, "kl": 0.0001860499382019043, "learning_rate": 4.6875e-07, "loss": 0.0, "reward": 0.5361111229285598, "reward_std": 0.2633297026157379, "rewards/accuracy_reward": 0.3111111169680953, "rewards/format_reward": 0.22500000484287738, "step": 5 }, { "completion_length": 509.1916721343994, "epoch": 0.032, "grad_norm": 0.08460841327905655, "kl": 0.0006537675857543945, "learning_rate": 9.375e-07, "loss": 0.0, "reward": 0.6250000141561032, "reward_std": 0.22612885609269143, "rewards/accuracy_reward": 0.32500000633299353, "rewards/format_reward": 0.30000000819563866, "step": 10 }, { "completion_length": 414.31111640930175, "epoch": 0.048, "grad_norm": 0.09808748215436935, "kl": 0.009083938598632813, "learning_rate": 1.40625e-06, "loss": 0.0004, "reward": 0.7472222343087196, "reward_std": 0.25241802744567393, "rewards/accuracy_reward": 0.12500000279396772, "rewards/format_reward": 0.6222222335636616, "step": 15 }, { "completion_length": 250.90277996063233, "epoch": 0.064, "grad_norm": 0.05115974321961403, "kl": 0.03193817138671875, "learning_rate": 1.875e-06, "loss": 0.0013, "reward": 0.9416666746139526, "reward_std": 0.15361464098095895, "rewards/accuracy_reward": 0.03611111156642437, "rewards/format_reward": 0.9055555611848831, "step": 20 }, { "completion_length": 211.14444694519042, "epoch": 0.08, "grad_norm": 0.06653334200382233, "kl": 0.035968017578125, "learning_rate": 2.3437500000000002e-06, "loss": 0.0014, "reward": 0.986111119389534, "reward_std": 0.10103629790246486, "rewards/accuracy_reward": 0.0305555559694767, "rewards/format_reward": 0.9555555582046509, "step": 25 }, { "completion_length": 246.1000036239624, "epoch": 0.096, "grad_norm": 0.06397304683923721, "kl": 0.0366363525390625, "learning_rate": 2.8125e-06, "loss": 0.0015, "reward": 1.0000000089406966, "reward_std": 0.07216878421604633, "rewards/accuracy_reward": 0.038888889364898205, "rewards/format_reward": 0.9611111134290695, "step": 30 }, { "completion_length": 255.99722633361816, "epoch": 0.112, "grad_norm": 0.12618552148342133, "kl": 0.03426666259765625, "learning_rate": 2.9991503375003e-06, "loss": 0.0014, "reward": 1.025000013411045, "reward_std": 0.11547005474567414, "rewards/accuracy_reward": 0.06388888955116272, "rewards/format_reward": 0.9611111134290695, "step": 35 }, { "completion_length": 258.1111114501953, "epoch": 0.128, "grad_norm": 0.09856373071670532, "kl": 0.0427825927734375, "learning_rate": 2.993961440992859e-06, "loss": 0.0017, "reward": 1.086111131310463, "reward_std": 0.1732050821185112, "rewards/accuracy_reward": 0.12500000167638065, "rewards/format_reward": 0.9611111134290695, "step": 40 }, { "completion_length": 296.8416700363159, "epoch": 0.144, "grad_norm": 0.091950424015522, "kl": 0.0412506103515625, "learning_rate": 2.984071989079555e-06, "loss": 0.0017, "reward": 1.0666666805744172, "reward_std": 0.1658154871314764, "rewards/accuracy_reward": 0.10000000093132258, "rewards/format_reward": 0.9666666686534882, "step": 45 }, { "completion_length": 317.56111373901365, "epoch": 0.16, "grad_norm": 0.09281204640865326, "kl": 0.0566925048828125, "learning_rate": 2.9695130976348534e-06, "loss": 0.0023, "reward": 1.0972222343087197, "reward_std": 0.21873926185071468, "rewards/accuracy_reward": 0.15555555745959282, "rewards/format_reward": 0.9416666701436043, "step": 50 }, { "completion_length": 281.8416681289673, "epoch": 0.176, "grad_norm": 0.10378360003232956, "kl": 0.07989501953125, "learning_rate": 2.9503305743175096e-06, "loss": 0.0032, "reward": 1.1777777940034866, "reward_std": 0.25370719768106936, "rewards/accuracy_reward": 0.20833333637565374, "rewards/format_reward": 0.9694444462656975, "step": 55 }, { "completion_length": 254.96389064788818, "epoch": 0.192, "grad_norm": 0.14641663432121277, "kl": 0.092510986328125, "learning_rate": 2.9265847744427307e-06, "loss": 0.0037, "reward": 1.1916666761040688, "reward_std": 0.2572292793542147, "rewards/accuracy_reward": 0.23333333786576987, "rewards/format_reward": 0.9583333358168602, "step": 60 }, { "completion_length": 270.2777801513672, "epoch": 0.208, "grad_norm": 0.09665928781032562, "kl": 0.097540283203125, "learning_rate": 2.8983504110820214e-06, "loss": 0.0039, "reward": 1.172222228348255, "reward_std": 0.25980762131512164, "rewards/accuracy_reward": 0.2305555608123541, "rewards/format_reward": 0.9416666701436043, "step": 65 }, { "completion_length": 372.66111488342284, "epoch": 0.224, "grad_norm": 0.13982528448104858, "kl": 0.0855712890625, "learning_rate": 2.865716319988224e-06, "loss": 0.0034, "reward": 1.2000000104308128, "reward_std": 0.32587598264217377, "rewards/accuracy_reward": 0.2916666727513075, "rewards/format_reward": 0.9083333387970924, "step": 70 }, { "completion_length": 372.80278129577636, "epoch": 0.24, "grad_norm": 0.07452689111232758, "kl": 0.07293701171875, "learning_rate": 2.82878518008537e-06, "loss": 0.0029, "reward": 1.3944444462656975, "reward_std": 0.30276345908641816, "rewards/accuracy_reward": 0.4333333451300859, "rewards/format_reward": 0.9611111134290695, "step": 75 }, { "completion_length": 376.4416694641113, "epoch": 0.256, "grad_norm": 0.08069849759340286, "kl": 0.08565673828125, "learning_rate": 2.7876731904027993e-06, "loss": 0.0034, "reward": 1.3472222313284874, "reward_std": 0.36049848720431327, "rewards/accuracy_reward": 0.40000000949949027, "rewards/format_reward": 0.9472222253680229, "step": 80 }, { "completion_length": 354.7555561065674, "epoch": 0.272, "grad_norm": 0.10501620918512344, "kl": 0.0765869140625, "learning_rate": 2.7425097044700246e-06, "loss": 0.0031, "reward": 1.3944444552063942, "reward_std": 0.27776345871388913, "rewards/accuracy_reward": 0.45000001005828383, "rewards/format_reward": 0.9444444477558136, "step": 85 }, { "completion_length": 403.3444492340088, "epoch": 0.288, "grad_norm": 0.12747347354888916, "kl": 0.073785400390625, "learning_rate": 2.6934368233226715e-06, "loss": 0.003, "reward": 1.2694444522261619, "reward_std": 0.34864307269454003, "rewards/accuracy_reward": 0.38055556602776053, "rewards/format_reward": 0.8888888947665692, "step": 90 }, { "completion_length": 413.0500057220459, "epoch": 0.304, "grad_norm": 0.08874136209487915, "kl": 0.070477294921875, "learning_rate": 2.6406089484000465e-06, "loss": 0.0028, "reward": 1.297222228348255, "reward_std": 0.3640205666422844, "rewards/accuracy_reward": 0.4333333447575569, "rewards/format_reward": 0.8638888970017433, "step": 95 }, { "completion_length": 381.81389274597166, "epoch": 0.32, "grad_norm": 0.05798633396625519, "kl": 0.07501220703125, "learning_rate": 2.584192295741087e-06, "loss": 0.003, "reward": 1.3750000134110452, "reward_std": 0.3136751361191273, "rewards/accuracy_reward": 0.4416666753590107, "rewards/format_reward": 0.9333333373069763, "step": 100 }, { "completion_length": 369.6000038146973, "epoch": 0.336, "grad_norm": 0.07807120680809021, "kl": 0.077154541015625, "learning_rate": 2.5243643730072105e-06, "loss": 0.0031, "reward": 1.494444453716278, "reward_std": 0.31015305407345295, "rewards/accuracy_reward": 0.5555555684491992, "rewards/format_reward": 0.938888892531395, "step": 105 }, { "completion_length": 390.78889389038085, "epoch": 0.352, "grad_norm": 0.08092948794364929, "kl": 0.08060302734375, "learning_rate": 2.461313420977536e-06, "loss": 0.0032, "reward": 1.380555558204651, "reward_std": 0.3232976388186216, "rewards/accuracy_reward": 0.4527777874842286, "rewards/format_reward": 0.9277777820825577, "step": 110 }, { "completion_length": 413.20278282165526, "epoch": 0.368, "grad_norm": 0.059824734926223755, "kl": 0.0724517822265625, "learning_rate": 2.3952378212737554e-06, "loss": 0.0029, "reward": 1.3944444507360458, "reward_std": 0.337731396406889, "rewards/accuracy_reward": 0.4611111244186759, "rewards/format_reward": 0.9333333373069763, "step": 115 }, { "completion_length": 403.9111152648926, "epoch": 0.384, "grad_norm": 0.07250163704156876, "kl": 0.078021240234375, "learning_rate": 2.3263454721781537e-06, "loss": 0.0031, "reward": 1.4305555507540704, "reward_std": 0.299241379275918, "rewards/accuracy_reward": 0.530555566214025, "rewards/format_reward": 0.9000000059604645, "step": 120 }, { "completion_length": 378.2222255706787, "epoch": 0.4, "grad_norm": 0.12146278470754623, "kl": 0.0875885009765625, "learning_rate": 2.2548531345087003e-06, "loss": 0.0035, "reward": 1.3388888984918594, "reward_std": 0.28480762206017973, "rewards/accuracy_reward": 0.386111119389534, "rewards/format_reward": 0.9527777805924416, "step": 125 }, { "completion_length": 324.2388910293579, "epoch": 0.416, "grad_norm": 0.09016001224517822, "kl": 0.102557373046875, "learning_rate": 2.18098574960932e-06, "loss": 0.0041, "reward": 1.4750000044703484, "reward_std": 0.28257471285760405, "rewards/accuracy_reward": 0.5138889042660594, "rewards/format_reward": 0.9611111134290695, "step": 130 }, { "completion_length": 425.48055801391604, "epoch": 0.432, "grad_norm": 0.06732641160488129, "kl": 0.089459228515625, "learning_rate": 2.104975731601208e-06, "loss": 0.0036, "reward": 1.3027777835726737, "reward_std": 0.32329764030873775, "rewards/accuracy_reward": 0.38611112013459203, "rewards/format_reward": 0.9166666716337204, "step": 135 }, { "completion_length": 475.63889541625974, "epoch": 0.448, "grad_norm": 0.06925955414772034, "kl": 0.06644287109375, "learning_rate": 2.027062236122014e-06, "loss": 0.0027, "reward": 1.3805555671453476, "reward_std": 0.35310889072716234, "rewards/accuracy_reward": 0.4888889010995626, "rewards/format_reward": 0.8916666731238365, "step": 140 }, { "completion_length": 469.69444847106934, "epoch": 0.464, "grad_norm": 0.04355955123901367, "kl": 0.073883056640625, "learning_rate": 1.9474904078537343e-06, "loss": 0.003, "reward": 1.4388888955116272, "reward_std": 0.2873859636485577, "rewards/accuracy_reward": 0.5583333482965827, "rewards/format_reward": 0.8805555619299412, "step": 145 }, { "completion_length": 500.4750095367432, "epoch": 0.48, "grad_norm": 0.06400442123413086, "kl": 0.06243896484375, "learning_rate": 1.866510609206841e-06, "loss": 0.0025, "reward": 1.477777788043022, "reward_std": 0.274241379275918, "rewards/accuracy_reward": 0.5611111238598824, "rewards/format_reward": 0.9166666716337204, "step": 150 }, { "completion_length": 482.2611141204834, "epoch": 0.496, "grad_norm": 0.07321769744157791, "kl": 0.063238525390625, "learning_rate": 1.784377632587518e-06, "loss": 0.0025, "reward": 1.4861111238598823, "reward_std": 0.30147428885102273, "rewards/accuracy_reward": 0.541666678711772, "rewards/format_reward": 0.9444444477558136, "step": 155 }, { "completion_length": 458.41389694213865, "epoch": 0.512, "grad_norm": 0.05824045091867447, "kl": 0.06778564453125, "learning_rate": 1.7013498987264833e-06, "loss": 0.0027, "reward": 1.475000013411045, "reward_std": 0.23446219004690647, "rewards/accuracy_reward": 0.5250000132247805, "rewards/format_reward": 0.9500000029802322, "step": 160 }, { "completion_length": 437.94444770812987, "epoch": 0.528, "grad_norm": 0.07412311434745789, "kl": 0.070013427734375, "learning_rate": 1.6176886435917677e-06, "loss": 0.0028, "reward": 1.5305555671453477, "reward_std": 0.2790526311844587, "rewards/accuracy_reward": 0.5833333466202021, "rewards/format_reward": 0.9472222253680229, "step": 165 }, { "completion_length": 425.4944492340088, "epoch": 0.544, "grad_norm": 0.06518115103244781, "kl": 0.070721435546875, "learning_rate": 1.5336570964437077e-06, "loss": 0.0028, "reward": 1.563888892531395, "reward_std": 0.21392801143229007, "rewards/accuracy_reward": 0.5972222350537777, "rewards/format_reward": 0.9666666686534882, "step": 170 }, { "completion_length": 447.97500228881836, "epoch": 0.56, "grad_norm": 0.05302765220403671, "kl": 0.06865234375, "learning_rate": 1.4495196516183096e-06, "loss": 0.0027, "reward": 1.4805555552244187, "reward_std": 0.2787071973085403, "rewards/accuracy_reward": 0.5361111257225275, "rewards/format_reward": 0.9444444477558136, "step": 175 }, { "completion_length": 441.51111526489257, "epoch": 0.576, "grad_norm": 0.0858602300286293, "kl": 0.0727752685546875, "learning_rate": 1.3655410366448499e-06, "loss": 0.0029, "reward": 1.5250000014901162, "reward_std": 0.2572292808443308, "rewards/accuracy_reward": 0.5666666816920042, "rewards/format_reward": 0.9583333358168602, "step": 180 }, { "completion_length": 470.7583396911621, "epoch": 0.592, "grad_norm": 0.07894453406333923, "kl": 0.065618896484375, "learning_rate": 1.2819854793151313e-06, "loss": 0.0026, "reward": 1.4527777865529061, "reward_std": 0.2738959465175867, "rewards/accuracy_reward": 0.516666678711772, "rewards/format_reward": 0.9361111149191856, "step": 185 }, { "completion_length": 449.9416721343994, "epoch": 0.608, "grad_norm": 0.08513162285089493, "kl": 0.070989990234375, "learning_rate": 1.199115876325091e-06, "loss": 0.0028, "reward": 1.4611111134290695, "reward_std": 0.3258759815245867, "rewards/accuracy_reward": 0.5277777882292867, "rewards/format_reward": 0.9333333373069763, "step": 190 }, { "completion_length": 412.6527816772461, "epoch": 0.624, "grad_norm": 0.0992361381649971, "kl": 0.0746826171875, "learning_rate": 1.1171929661045361e-06, "loss": 0.003, "reward": 1.4388888955116272, "reward_std": 0.3390205677598715, "rewards/accuracy_reward": 0.5277777882292867, "rewards/format_reward": 0.9111111164093018, "step": 195 }, { "completion_length": 434.86667137146, "epoch": 0.64, "grad_norm": 0.08208976686000824, "kl": 0.06728515625, "learning_rate": 1.036474508437579e-06, "loss": 0.0027, "reward": 1.547222228348255, "reward_std": 0.25499636940658094, "rewards/accuracy_reward": 0.5944444581866264, "rewards/format_reward": 0.9527777805924416, "step": 200 }, { "completion_length": 457.2805625915527, "epoch": 0.656, "grad_norm": 0.04328469559550285, "kl": 0.058355712890625, "learning_rate": 9.57214473454992e-07, "loss": 0.0023, "reward": 1.5222222253680229, "reward_std": 0.22002843283116819, "rewards/accuracy_reward": 0.58888890016824, "rewards/format_reward": 0.9333333373069763, "step": 205 }, { "completion_length": 453.0138931274414, "epoch": 0.672, "grad_norm": 0.06149205565452576, "kl": 0.0665008544921875, "learning_rate": 8.796622425502193e-07, "loss": 0.0027, "reward": 1.5250000014901162, "reward_std": 0.30310888960957527, "rewards/accuracy_reward": 0.6111111264675856, "rewards/format_reward": 0.9138888940215111, "step": 210 }, { "completion_length": 513.1500045776368, "epoch": 0.688, "grad_norm": 0.06691473722457886, "kl": 0.0645904541015625, "learning_rate": 8.040618237332491e-07, "loss": 0.0026, "reward": 1.436111108958721, "reward_std": 0.3447755578905344, "rewards/accuracy_reward": 0.5305555699393153, "rewards/format_reward": 0.9055555611848831, "step": 215 }, { "completion_length": 480.86389236450196, "epoch": 0.704, "grad_norm": 0.06451098620891571, "kl": 0.0659393310546875, "learning_rate": 7.30651083891141e-07, "loss": 0.0026, "reward": 1.5166666820645331, "reward_std": 0.34606473073363303, "rewards/accuracy_reward": 0.5916666787117719, "rewards/format_reward": 0.9250000044703484, "step": 220 }, { "completion_length": 507.2722282409668, "epoch": 0.72, "grad_norm": 0.05560953915119171, "kl": 0.06065673828125, "learning_rate": 6.596610003707959e-07, "loss": 0.0024, "reward": 1.5138888970017432, "reward_std": 0.28480762280523775, "rewards/accuracy_reward": 0.6055555680766702, "rewards/format_reward": 0.9083333387970924, "step": 225 }, { "completion_length": 514.2500030517579, "epoch": 0.736, "grad_norm": 0.06804082542657852, "kl": 0.0566162109375, "learning_rate": 5.913149342387704e-07, "loss": 0.0023, "reward": 1.5194444492459298, "reward_std": 0.2549963690340519, "rewards/accuracy_reward": 0.5833333430811762, "rewards/format_reward": 0.9361111149191856, "step": 230 }, { "completion_length": 468.55556259155276, "epoch": 0.752, "grad_norm": 0.06439075618982315, "kl": 0.0638336181640625, "learning_rate": 5.258279275047247e-07, "loss": 0.0026, "reward": 1.5250000029802322, "reward_std": 0.28257471323013306, "rewards/accuracy_reward": 0.5750000124797225, "rewards/format_reward": 0.9500000029802322, "step": 235 }, { "completion_length": 444.02500381469724, "epoch": 0.768, "grad_norm": 0.055538810789585114, "kl": 0.063116455078125, "learning_rate": 4.63406026519703e-07, "loss": 0.0025, "reward": 1.575, "reward_std": 0.2524180270731449, "rewards/accuracy_reward": 0.633333345502615, "rewards/format_reward": 0.9416666701436043, "step": 240 }, { "completion_length": 477.8666702270508, "epoch": 0.784, "grad_norm": 0.055306848138570786, "kl": 0.0677734375, "learning_rate": 4.042456336780838e-07, "loss": 0.0027, "reward": 1.4888888821005821, "reward_std": 0.2729522068053484, "rewards/accuracy_reward": 0.5527777902781963, "rewards/format_reward": 0.9361111149191856, "step": 245 }, { "completion_length": 463.327783203125, "epoch": 0.8, "grad_norm": 0.0591103695333004, "kl": 0.0603515625, "learning_rate": 3.4853288946298335e-07, "loss": 0.0024, "reward": 1.475000011920929, "reward_std": 0.29443012587726114, "rewards/accuracy_reward": 0.5361111234873533, "rewards/format_reward": 0.938888892531395, "step": 250 }, { "completion_length": 450.82778282165526, "epoch": 0.816, "grad_norm": 0.06738787144422531, "kl": 0.0596343994140625, "learning_rate": 2.9644308677943315e-07, "loss": 0.0024, "reward": 1.5000000134110452, "reward_std": 0.28128554075956347, "rewards/accuracy_reward": 0.5722222346812487, "rewards/format_reward": 0.9277777820825577, "step": 255 }, { "completion_length": 455.39445037841796, "epoch": 0.832, "grad_norm": 0.06288747489452362, "kl": 0.068408203125, "learning_rate": 2.48140119418046e-07, "loss": 0.0027, "reward": 1.4805555552244187, "reward_std": 0.2825747117400169, "rewards/accuracy_reward": 0.5305555703118443, "rewards/format_reward": 0.9500000029802322, "step": 260 }, { "completion_length": 470.1777805328369, "epoch": 0.848, "grad_norm": 0.07291523367166519, "kl": 0.062567138671875, "learning_rate": 2.0377596638451812e-07, "loss": 0.0025, "reward": 1.4444444566965102, "reward_std": 0.3079201437532902, "rewards/accuracy_reward": 0.5027777899056673, "rewards/format_reward": 0.9416666701436043, "step": 265 }, { "completion_length": 468.23333740234375, "epoch": 0.864, "grad_norm": 0.07961365580558777, "kl": 0.07003173828125, "learning_rate": 1.634902137174483e-07, "loss": 0.0028, "reward": 1.4638888970017434, "reward_std": 0.29924137964844705, "rewards/accuracy_reward": 0.541666678339243, "rewards/format_reward": 0.922222226858139, "step": 270 }, { "completion_length": 431.6388931274414, "epoch": 0.88, "grad_norm": 0.061992090195417404, "kl": 0.06956787109375, "learning_rate": 1.274096152990203e-07, "loss": 0.0028, "reward": 1.4666666746139527, "reward_std": 0.28609679341316224, "rewards/accuracy_reward": 0.5222222328186035, "rewards/format_reward": 0.9444444477558136, "step": 275 }, { "completion_length": 435.9138957977295, "epoch": 0.896, "grad_norm": 0.08779603242874146, "kl": 0.0734649658203125, "learning_rate": 9.564769404039419e-08, "loss": 0.0029, "reward": 1.425000011920929, "reward_std": 0.2979522071778774, "rewards/accuracy_reward": 0.48611112125217915, "rewards/format_reward": 0.938888892531395, "step": 280 }, { "completion_length": 432.5888919830322, "epoch": 0.912, "grad_norm": 0.0677870661020279, "kl": 0.074609375, "learning_rate": 6.830438469662892e-08, "loss": 0.003, "reward": 1.5027777940034865, "reward_std": 0.2764742888510227, "rewards/accuracy_reward": 0.5444444557651877, "rewards/format_reward": 0.9583333358168602, "step": 285 }, { "completion_length": 415.408337020874, "epoch": 0.928, "grad_norm": 0.08877945691347122, "kl": 0.079901123046875, "learning_rate": 4.546571943496969e-08, "loss": 0.0032, "reward": 1.5222222372889518, "reward_std": 0.29666303619742396, "rewards/accuracy_reward": 0.5972222350537777, "rewards/format_reward": 0.9250000044703484, "step": 290 }, { "completion_length": 470.35555839538574, "epoch": 0.944, "grad_norm": 0.06486877799034119, "kl": 0.062762451171875, "learning_rate": 2.72035571458224e-08, "loss": 0.0025, "reward": 1.4722222238779068, "reward_std": 0.26685178354382516, "rewards/accuracy_reward": 0.5361111244186759, "rewards/format_reward": 0.9361111149191856, "step": 295 }, { "completion_length": 442.53889656066895, "epoch": 0.96, "grad_norm": 0.06168290227651596, "kl": 0.06766357421875, "learning_rate": 1.357535734809795e-08, "loss": 0.0027, "reward": 1.4666666641831398, "reward_std": 0.31109679453074934, "rewards/accuracy_reward": 0.5361111233010888, "rewards/format_reward": 0.930555559694767, "step": 300 }, { "completion_length": 447.8833366394043, "epoch": 0.976, "grad_norm": 0.06104287877678871, "kl": 0.0698974609375, "learning_rate": 4.623999400308054e-09, "loss": 0.0028, "reward": 1.5027777954936028, "reward_std": 0.2777634594589472, "rewards/accuracy_reward": 0.5472222346812486, "rewards/format_reward": 0.9555555582046509, "step": 305 }, { "completion_length": 406.5805576324463, "epoch": 0.992, "grad_norm": 0.06284002214670181, "kl": 0.0779296875, "learning_rate": 3.77647586240204e-10, "loss": 0.0031, "reward": 1.4805555641651154, "reward_std": 0.2690846938639879, "rewards/accuracy_reward": 0.5388889022171497, "rewards/format_reward": 0.9416666693985463, "step": 310 }, { "completion_length": 461.6458377838135, "epoch": 0.9984, "kl": 0.06114959716796875, "reward": 1.4236111268401146, "reward_std": 0.29425740987062454, "rewards/accuracy_reward": 0.5000000135041773, "rewards/format_reward": 0.9236111156642437, "step": 312, "total_flos": 0.0, "train_loss": 0.002609439611100802, "train_runtime": 17087.0679, "train_samples_per_second": 0.439, "train_steps_per_second": 0.018 } ], "logging_steps": 5, "max_steps": 312, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }