{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9967914438502674, "eval_steps": 500, "global_step": 233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 228.734375, "epoch": 0.017112299465240642, "grad_norm": 0.25726935267448425, "kl": 0.0005143880844116211, "learning_rate": 9.82832618025751e-07, "loss": 0.0, "reward": 0.08203125, "reward_std": 0.12251314427703619, "rewards/validate_answer_with_correct_format": 0.060546875, "rewards/validate_format": 0.021484375, "step": 4 }, { "completion_length": 214.3046875, "epoch": 0.034224598930481284, "grad_norm": 0.2438514530658722, "kl": 0.004357337951660156, "learning_rate": 9.656652360515022e-07, "loss": 0.0002, "reward": 0.166015625, "reward_std": 0.22593521419912577, "rewards/validate_answer_with_correct_format": 0.134765625, "rewards/validate_format": 0.03125, "step": 8 }, { "completion_length": 201.08984375, "epoch": 0.051336898395721926, "grad_norm": 0.3771411180496216, "kl": 0.016904830932617188, "learning_rate": 9.484978540772532e-07, "loss": 0.0007, "reward": 0.44921875, "reward_std": 0.4575687777251005, "rewards/validate_answer_with_correct_format": 0.2578125, "rewards/validate_format": 0.19140625, "step": 12 }, { "completion_length": 193.501953125, "epoch": 0.06844919786096257, "grad_norm": 0.33645591139793396, "kl": 0.023540496826171875, "learning_rate": 9.313304721030042e-07, "loss": 0.0009, "reward": 0.96484375, "reward_std": 0.6036693137139082, "rewards/validate_answer_with_correct_format": 0.37109375, "rewards/validate_format": 0.59375, "step": 16 }, { "completion_length": 175.50390625, "epoch": 0.0855614973262032, "grad_norm": 0.3233410716056824, "kl": 0.03643798828125, "learning_rate": 9.141630901287554e-07, "loss": 0.0015, "reward": 1.302734375, "reward_std": 0.4221517601981759, "rewards/validate_answer_with_correct_format": 0.48828125, "rewards/validate_format": 0.814453125, "step": 20 }, { "completion_length": 176.00390625, "epoch": 0.10267379679144385, "grad_norm": 0.2788090109825134, "kl": 0.029022216796875, "learning_rate": 8.969957081545064e-07, "loss": 0.0012, "reward": 1.359375, "reward_std": 0.3464417774230242, "rewards/validate_answer_with_correct_format": 0.521484375, "rewards/validate_format": 0.837890625, "step": 24 }, { "completion_length": 168.138671875, "epoch": 0.11978609625668449, "grad_norm": 0.2867412269115448, "kl": 0.03003692626953125, "learning_rate": 8.798283261802575e-07, "loss": 0.0012, "reward": 1.5, "reward_std": 0.35856608115136623, "rewards/validate_answer_with_correct_format": 0.62109375, "rewards/validate_format": 0.87890625, "step": 28 }, { "completion_length": 167.71484375, "epoch": 0.13689839572192514, "grad_norm": 0.2035263180732727, "kl": 0.0883941650390625, "learning_rate": 8.626609442060086e-07, "loss": 0.0035, "reward": 1.466796875, "reward_std": 0.2889786111190915, "rewards/validate_answer_with_correct_format": 0.576171875, "rewards/validate_format": 0.890625, "step": 32 }, { "completion_length": 159.65234375, "epoch": 0.15401069518716579, "grad_norm": 0.22620172798633575, "kl": 0.0332489013671875, "learning_rate": 8.454935622317596e-07, "loss": 0.0013, "reward": 1.51171875, "reward_std": 0.22670799400657415, "rewards/validate_answer_with_correct_format": 0.58203125, "rewards/validate_format": 0.9296875, "step": 36 }, { "completion_length": 165.24609375, "epoch": 0.1711229946524064, "grad_norm": 0.270819753408432, "kl": 0.0323638916015625, "learning_rate": 8.283261802575107e-07, "loss": 0.0013, "reward": 1.49609375, "reward_std": 0.2671235203742981, "rewards/validate_answer_with_correct_format": 0.60546875, "rewards/validate_format": 0.890625, "step": 40 }, { "completion_length": 155.900390625, "epoch": 0.18823529411764706, "grad_norm": 0.249853253364563, "kl": 0.0360260009765625, "learning_rate": 8.111587982832617e-07, "loss": 0.0014, "reward": 1.607421875, "reward_std": 0.2183889476582408, "rewards/validate_answer_with_correct_format": 0.6796875, "rewards/validate_format": 0.927734375, "step": 44 }, { "completion_length": 162.89453125, "epoch": 0.2053475935828877, "grad_norm": 0.3030960261821747, "kl": 0.04419708251953125, "learning_rate": 7.939914163090128e-07, "loss": 0.0018, "reward": 1.6171875, "reward_std": 0.28787759225815535, "rewards/validate_answer_with_correct_format": 0.6875, "rewards/validate_format": 0.9296875, "step": 48 }, { "completion_length": 164.138671875, "epoch": 0.22245989304812835, "grad_norm": 0.2891474962234497, "kl": 0.03287506103515625, "learning_rate": 7.76824034334764e-07, "loss": 0.0013, "reward": 1.56640625, "reward_std": 0.3518991004675627, "rewards/validate_answer_with_correct_format": 0.689453125, "rewards/validate_format": 0.876953125, "step": 52 }, { "completion_length": 164.32421875, "epoch": 0.23957219251336898, "grad_norm": 0.26431283354759216, "kl": 0.04036712646484375, "learning_rate": 7.59656652360515e-07, "loss": 0.0016, "reward": 1.546875, "reward_std": 0.4236298883333802, "rewards/validate_answer_with_correct_format": 0.708984375, "rewards/validate_format": 0.837890625, "step": 56 }, { "completion_length": 169.640625, "epoch": 0.25668449197860965, "grad_norm": 0.2717176675796509, "kl": 0.03812408447265625, "learning_rate": 7.424892703862661e-07, "loss": 0.0015, "reward": 1.646484375, "reward_std": 0.35125905089080334, "rewards/validate_answer_with_correct_format": 0.783203125, "rewards/validate_format": 0.86328125, "step": 60 }, { "completion_length": 169.064453125, "epoch": 0.2737967914438503, "grad_norm": 0.2818020284175873, "kl": 0.04370880126953125, "learning_rate": 7.253218884120171e-07, "loss": 0.0017, "reward": 1.6171875, "reward_std": 0.3997631352394819, "rewards/validate_answer_with_correct_format": 0.744140625, "rewards/validate_format": 0.873046875, "step": 64 }, { "completion_length": 172.17578125, "epoch": 0.2909090909090909, "grad_norm": 0.35611966252326965, "kl": 0.0448455810546875, "learning_rate": 7.081545064377682e-07, "loss": 0.0018, "reward": 1.611328125, "reward_std": 0.3241206342354417, "rewards/validate_answer_with_correct_format": 0.75, "rewards/validate_format": 0.861328125, "step": 68 }, { "completion_length": 163.498046875, "epoch": 0.30802139037433157, "grad_norm": 0.2753399908542633, "kl": 0.04538726806640625, "learning_rate": 6.909871244635192e-07, "loss": 0.0018, "reward": 1.701171875, "reward_std": 0.3228708282113075, "rewards/validate_answer_with_correct_format": 0.802734375, "rewards/validate_format": 0.8984375, "step": 72 }, { "completion_length": 166.0234375, "epoch": 0.3251336898395722, "grad_norm": 0.401327520608902, "kl": 0.053253173828125, "learning_rate": 6.738197424892703e-07, "loss": 0.0021, "reward": 1.728515625, "reward_std": 0.3109824899584055, "rewards/validate_answer_with_correct_format": 0.826171875, "rewards/validate_format": 0.90234375, "step": 76 }, { "completion_length": 168.322265625, "epoch": 0.3422459893048128, "grad_norm": 0.4816475510597229, "kl": 0.0675201416015625, "learning_rate": 6.566523605150214e-07, "loss": 0.0027, "reward": 1.568359375, "reward_std": 0.36935339495539665, "rewards/validate_answer_with_correct_format": 0.748046875, "rewards/validate_format": 0.8203125, "step": 80 }, { "completion_length": 173.77734375, "epoch": 0.3593582887700535, "grad_norm": 1.3737239837646484, "kl": 0.1086883544921875, "learning_rate": 6.394849785407725e-07, "loss": 0.0043, "reward": 1.595703125, "reward_std": 0.3840404311195016, "rewards/validate_answer_with_correct_format": 0.771484375, "rewards/validate_format": 0.82421875, "step": 84 }, { "completion_length": 174.169921875, "epoch": 0.3764705882352941, "grad_norm": 1.1611127853393555, "kl": 0.1685333251953125, "learning_rate": 6.223175965665236e-07, "loss": 0.0067, "reward": 1.494140625, "reward_std": 0.4635760700330138, "rewards/validate_answer_with_correct_format": 0.724609375, "rewards/validate_format": 0.76953125, "step": 88 }, { "completion_length": 188.8828125, "epoch": 0.39358288770053473, "grad_norm": 1.5201658010482788, "kl": 0.60052490234375, "learning_rate": 6.051502145922746e-07, "loss": 0.024, "reward": 1.169921875, "reward_std": 0.6621273942291737, "rewards/validate_answer_with_correct_format": 0.587890625, "rewards/validate_format": 0.58203125, "step": 92 }, { "completion_length": 193.64453125, "epoch": 0.4106951871657754, "grad_norm": 4.456446170806885, "kl": 1.027099609375, "learning_rate": 5.879828326180257e-07, "loss": 0.041, "reward": 1.052734375, "reward_std": 0.6838541068136692, "rewards/validate_answer_with_correct_format": 0.51953125, "rewards/validate_format": 0.533203125, "step": 96 }, { "completion_length": 179.8125, "epoch": 0.42780748663101603, "grad_norm": 0.6613827347755432, "kl": 0.86444091796875, "learning_rate": 5.708154506437767e-07, "loss": 0.0346, "reward": 1.2578125, "reward_std": 0.540657652541995, "rewards/validate_answer_with_correct_format": 0.609375, "rewards/validate_format": 0.6484375, "step": 100 }, { "completion_length": 169.509765625, "epoch": 0.4449197860962567, "grad_norm": 5.908604621887207, "kl": 0.630767822265625, "learning_rate": 5.536480686695278e-07, "loss": 0.0252, "reward": 1.537109375, "reward_std": 0.4994529504328966, "rewards/validate_answer_with_correct_format": 0.736328125, "rewards/validate_format": 0.80078125, "step": 104 }, { "completion_length": 160.07421875, "epoch": 0.46203208556149733, "grad_norm": 1.3123823404312134, "kl": 0.44525146484375, "learning_rate": 5.364806866952789e-07, "loss": 0.0178, "reward": 1.591796875, "reward_std": 0.42215616535395384, "rewards/validate_answer_with_correct_format": 0.744140625, "rewards/validate_format": 0.84765625, "step": 108 }, { "completion_length": 162.609375, "epoch": 0.47914438502673795, "grad_norm": 3.7777585983276367, "kl": 0.751312255859375, "learning_rate": 5.193133047210299e-07, "loss": 0.03, "reward": 1.533203125, "reward_std": 0.5010853223502636, "rewards/validate_answer_with_correct_format": 0.72265625, "rewards/validate_format": 0.810546875, "step": 112 }, { "completion_length": 176.544921875, "epoch": 0.49625668449197863, "grad_norm": 1.9199061393737793, "kl": 1.5675048828125, "learning_rate": 5.021459227467812e-07, "loss": 0.0628, "reward": 1.33984375, "reward_std": 0.5347601640969515, "rewards/validate_answer_with_correct_format": 0.619140625, "rewards/validate_format": 0.720703125, "step": 116 }, { "completion_length": 174.685546875, "epoch": 0.5133689839572193, "grad_norm": 1.4069411754608154, "kl": 1.360107421875, "learning_rate": 4.849785407725322e-07, "loss": 0.0544, "reward": 1.25, "reward_std": 0.5469169113785028, "rewards/validate_answer_with_correct_format": 0.537109375, "rewards/validate_format": 0.712890625, "step": 120 }, { "completion_length": 168.380859375, "epoch": 0.5304812834224599, "grad_norm": 1.0460318326950073, "kl": 0.49078369140625, "learning_rate": 4.6781115879828326e-07, "loss": 0.0196, "reward": 1.337890625, "reward_std": 0.4842473194003105, "rewards/validate_answer_with_correct_format": 0.564453125, "rewards/validate_format": 0.7734375, "step": 124 }, { "completion_length": 167.39453125, "epoch": 0.5475935828877005, "grad_norm": 0.717617392539978, "kl": 0.53839111328125, "learning_rate": 4.506437768240343e-07, "loss": 0.0216, "reward": 1.43359375, "reward_std": 0.47893994580954313, "rewards/validate_answer_with_correct_format": 0.623046875, "rewards/validate_format": 0.810546875, "step": 128 }, { "completion_length": 179.7890625, "epoch": 0.5647058823529412, "grad_norm": 1.0693079233169556, "kl": 1.01556396484375, "learning_rate": 4.3347639484978536e-07, "loss": 0.0407, "reward": 1.298828125, "reward_std": 0.5207763016223907, "rewards/validate_answer_with_correct_format": 0.58203125, "rewards/validate_format": 0.716796875, "step": 132 }, { "completion_length": 176.11328125, "epoch": 0.5818181818181818, "grad_norm": 1.5559654235839844, "kl": 1.3896484375, "learning_rate": 4.163090128755364e-07, "loss": 0.0556, "reward": 1.31640625, "reward_std": 0.6244864724576473, "rewards/validate_answer_with_correct_format": 0.580078125, "rewards/validate_format": 0.736328125, "step": 136 }, { "completion_length": 171.017578125, "epoch": 0.5989304812834224, "grad_norm": 1.3196905851364136, "kl": 1.96533203125, "learning_rate": 3.991416309012876e-07, "loss": 0.0786, "reward": 1.373046875, "reward_std": 0.5217751991003752, "rewards/validate_answer_with_correct_format": 0.63671875, "rewards/validate_format": 0.736328125, "step": 140 }, { "completion_length": 181.798828125, "epoch": 0.6160427807486631, "grad_norm": 0.8346861004829407, "kl": 2.1689453125, "learning_rate": 3.819742489270386e-07, "loss": 0.0867, "reward": 1.2734375, "reward_std": 0.5743660591542721, "rewards/validate_answer_with_correct_format": 0.58984375, "rewards/validate_format": 0.68359375, "step": 144 }, { "completion_length": 175.63671875, "epoch": 0.6331550802139038, "grad_norm": 1.6434240341186523, "kl": 1.395263671875, "learning_rate": 3.648068669527897e-07, "loss": 0.0559, "reward": 1.3671875, "reward_std": 0.4787444490939379, "rewards/validate_answer_with_correct_format": 0.6015625, "rewards/validate_format": 0.765625, "step": 148 }, { "completion_length": 166.970703125, "epoch": 0.6502673796791444, "grad_norm": 2.3217926025390625, "kl": 0.79571533203125, "learning_rate": 3.4763948497854073e-07, "loss": 0.0318, "reward": 1.486328125, "reward_std": 0.4878078643232584, "rewards/validate_answer_with_correct_format": 0.6640625, "rewards/validate_format": 0.822265625, "step": 152 }, { "completion_length": 172.341796875, "epoch": 0.667379679144385, "grad_norm": 2.0409505367279053, "kl": 1.2677001953125, "learning_rate": 3.3047210300429184e-07, "loss": 0.0507, "reward": 1.443359375, "reward_std": 0.5103737181052566, "rewards/validate_answer_with_correct_format": 0.65234375, "rewards/validate_format": 0.791015625, "step": 156 }, { "completion_length": 173.0703125, "epoch": 0.6844919786096256, "grad_norm": 0.673163115978241, "kl": 1.423583984375, "learning_rate": 3.133047210300429e-07, "loss": 0.057, "reward": 1.40625, "reward_std": 0.4554907586425543, "rewards/validate_answer_with_correct_format": 0.638671875, "rewards/validate_format": 0.767578125, "step": 160 }, { "completion_length": 172.994140625, "epoch": 0.7016042780748664, "grad_norm": 0.7499634623527527, "kl": 1.368408203125, "learning_rate": 2.96137339055794e-07, "loss": 0.0548, "reward": 1.427734375, "reward_std": 0.48304858803749084, "rewards/validate_answer_with_correct_format": 0.671875, "rewards/validate_format": 0.755859375, "step": 164 }, { "completion_length": 174.158203125, "epoch": 0.718716577540107, "grad_norm": 0.622071385383606, "kl": 1.55810546875, "learning_rate": 2.7896995708154505e-07, "loss": 0.0623, "reward": 1.375, "reward_std": 0.46824304293841124, "rewards/validate_answer_with_correct_format": 0.646484375, "rewards/validate_format": 0.728515625, "step": 168 }, { "completion_length": 172.513671875, "epoch": 0.7358288770053476, "grad_norm": 1.0885876417160034, "kl": 1.2890625, "learning_rate": 2.6180257510729615e-07, "loss": 0.0516, "reward": 1.34375, "reward_std": 0.5326037332415581, "rewards/validate_answer_with_correct_format": 0.609375, "rewards/validate_format": 0.734375, "step": 172 }, { "completion_length": 177.36328125, "epoch": 0.7529411764705882, "grad_norm": 1.3465913534164429, "kl": 1.707763671875, "learning_rate": 2.446351931330472e-07, "loss": 0.0683, "reward": 1.29296875, "reward_std": 0.5761713199317455, "rewards/validate_answer_with_correct_format": 0.595703125, "rewards/validate_format": 0.697265625, "step": 176 }, { "completion_length": 175.986328125, "epoch": 0.7700534759358288, "grad_norm": 1.9746425151824951, "kl": 1.796875, "learning_rate": 2.2746781115879825e-07, "loss": 0.0718, "reward": 1.291015625, "reward_std": 0.5592877455055714, "rewards/validate_answer_with_correct_format": 0.609375, "rewards/validate_format": 0.681640625, "step": 180 }, { "completion_length": 181.4921875, "epoch": 0.7871657754010695, "grad_norm": 1.2594670057296753, "kl": 2.2677001953125, "learning_rate": 2.1030042918454936e-07, "loss": 0.0906, "reward": 1.29296875, "reward_std": 0.5261205593124032, "rewards/validate_answer_with_correct_format": 0.59765625, "rewards/validate_format": 0.6953125, "step": 184 }, { "completion_length": 182.068359375, "epoch": 0.8042780748663102, "grad_norm": 0.9596216678619385, "kl": 1.4912109375, "learning_rate": 1.931330472103004e-07, "loss": 0.0597, "reward": 1.2734375, "reward_std": 0.5831566601991653, "rewards/validate_answer_with_correct_format": 0.580078125, "rewards/validate_format": 0.693359375, "step": 188 }, { "completion_length": 176.224609375, "epoch": 0.8213903743315508, "grad_norm": 1.0331825017929077, "kl": 1.3037109375, "learning_rate": 1.759656652360515e-07, "loss": 0.0522, "reward": 1.384765625, "reward_std": 0.48507228679955006, "rewards/validate_answer_with_correct_format": 0.640625, "rewards/validate_format": 0.744140625, "step": 192 }, { "completion_length": 170.205078125, "epoch": 0.8385026737967914, "grad_norm": 1.1908960342407227, "kl": 1.3134765625, "learning_rate": 1.5879828326180257e-07, "loss": 0.0526, "reward": 1.44140625, "reward_std": 0.5417510252445936, "rewards/validate_answer_with_correct_format": 0.658203125, "rewards/validate_format": 0.783203125, "step": 196 }, { "completion_length": 175.8125, "epoch": 0.8556149732620321, "grad_norm": 2.4777722358703613, "kl": 1.801025390625, "learning_rate": 1.4163090128755365e-07, "loss": 0.072, "reward": 1.306640625, "reward_std": 0.5542392712086439, "rewards/validate_answer_with_correct_format": 0.595703125, "rewards/validate_format": 0.7109375, "step": 200 }, { "completion_length": 177.724609375, "epoch": 0.8727272727272727, "grad_norm": 0.8101657032966614, "kl": 2.00537109375, "learning_rate": 1.2446351931330473e-07, "loss": 0.0802, "reward": 1.28125, "reward_std": 0.5756953954696655, "rewards/validate_answer_with_correct_format": 0.595703125, "rewards/validate_format": 0.685546875, "step": 204 }, { "completion_length": 178.109375, "epoch": 0.8898395721925134, "grad_norm": 4.290409564971924, "kl": 2.436279296875, "learning_rate": 1.0729613733905579e-07, "loss": 0.0974, "reward": 1.2265625, "reward_std": 0.5841891095042229, "rewards/validate_answer_with_correct_format": 0.54296875, "rewards/validate_format": 0.68359375, "step": 208 }, { "completion_length": 182.623046875, "epoch": 0.906951871657754, "grad_norm": 1.0480223894119263, "kl": 2.49560546875, "learning_rate": 9.012875536480687e-08, "loss": 0.0999, "reward": 1.173828125, "reward_std": 0.6358627937734127, "rewards/validate_answer_with_correct_format": 0.54296875, "rewards/validate_format": 0.630859375, "step": 212 }, { "completion_length": 180.333984375, "epoch": 0.9240641711229947, "grad_norm": 1.4673086404800415, "kl": 2.2333984375, "learning_rate": 7.296137339055794e-08, "loss": 0.0895, "reward": 1.158203125, "reward_std": 0.6267144195735455, "rewards/validate_answer_with_correct_format": 0.50390625, "rewards/validate_format": 0.654296875, "step": 216 }, { "completion_length": 177.82421875, "epoch": 0.9411764705882353, "grad_norm": 1.3507096767425537, "kl": 2.12353515625, "learning_rate": 5.5793991416309014e-08, "loss": 0.0849, "reward": 1.212890625, "reward_std": 0.6212888453155756, "rewards/validate_answer_with_correct_format": 0.5546875, "rewards/validate_format": 0.658203125, "step": 220 }, { "completion_length": 176.25390625, "epoch": 0.9582887700534759, "grad_norm": 1.1437978744506836, "kl": 1.736328125, "learning_rate": 3.8626609442060086e-08, "loss": 0.0695, "reward": 1.244140625, "reward_std": 0.5541076026856899, "rewards/validate_answer_with_correct_format": 0.568359375, "rewards/validate_format": 0.67578125, "step": 224 }, { "completion_length": 172.615234375, "epoch": 0.9754010695187165, "grad_norm": 2.013885974884033, "kl": 1.874267578125, "learning_rate": 2.1459227467811158e-08, "loss": 0.075, "reward": 1.31640625, "reward_std": 0.5559380035847425, "rewards/validate_answer_with_correct_format": 0.6015625, "rewards/validate_format": 0.71484375, "step": 228 }, { "completion_length": 173.4765625, "epoch": 0.9925133689839573, "grad_norm": 0.9704756140708923, "kl": 1.89306640625, "learning_rate": 4.291845493562231e-09, "loss": 0.0757, "reward": 1.271484375, "reward_std": 0.5627450533211231, "rewards/validate_answer_with_correct_format": 0.59375, "rewards/validate_format": 0.677734375, "step": 232 } ], "logging_steps": 4, "max_steps": 233, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }