Qwen-2.5-3B-Simple-RL / trainer_state.json
Typiiing's picture
Model save
dcc8c2d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984,
"eval_steps": 2,
"global_step": 312,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 553.6277812957763,
"epoch": 0.016,
"grad_norm": 0.21352672576904297,
"kl": 0.0001860499382019043,
"learning_rate": 4.6875e-07,
"loss": 0.0,
"reward": 0.5361111229285598,
"reward_std": 0.2633297026157379,
"rewards/accuracy_reward": 0.3111111169680953,
"rewards/format_reward": 0.22500000484287738,
"step": 5
},
{
"completion_length": 509.1916721343994,
"epoch": 0.032,
"grad_norm": 0.08460841327905655,
"kl": 0.0006537675857543945,
"learning_rate": 9.375e-07,
"loss": 0.0,
"reward": 0.6250000141561032,
"reward_std": 0.22612885609269143,
"rewards/accuracy_reward": 0.32500000633299353,
"rewards/format_reward": 0.30000000819563866,
"step": 10
},
{
"completion_length": 414.31111640930175,
"epoch": 0.048,
"grad_norm": 0.09808748215436935,
"kl": 0.009083938598632813,
"learning_rate": 1.40625e-06,
"loss": 0.0004,
"reward": 0.7472222343087196,
"reward_std": 0.25241802744567393,
"rewards/accuracy_reward": 0.12500000279396772,
"rewards/format_reward": 0.6222222335636616,
"step": 15
},
{
"completion_length": 250.90277996063233,
"epoch": 0.064,
"grad_norm": 0.05115974321961403,
"kl": 0.03193817138671875,
"learning_rate": 1.875e-06,
"loss": 0.0013,
"reward": 0.9416666746139526,
"reward_std": 0.15361464098095895,
"rewards/accuracy_reward": 0.03611111156642437,
"rewards/format_reward": 0.9055555611848831,
"step": 20
},
{
"completion_length": 211.14444694519042,
"epoch": 0.08,
"grad_norm": 0.06653334200382233,
"kl": 0.035968017578125,
"learning_rate": 2.3437500000000002e-06,
"loss": 0.0014,
"reward": 0.986111119389534,
"reward_std": 0.10103629790246486,
"rewards/accuracy_reward": 0.0305555559694767,
"rewards/format_reward": 0.9555555582046509,
"step": 25
},
{
"completion_length": 246.1000036239624,
"epoch": 0.096,
"grad_norm": 0.06397304683923721,
"kl": 0.0366363525390625,
"learning_rate": 2.8125e-06,
"loss": 0.0015,
"reward": 1.0000000089406966,
"reward_std": 0.07216878421604633,
"rewards/accuracy_reward": 0.038888889364898205,
"rewards/format_reward": 0.9611111134290695,
"step": 30
},
{
"completion_length": 255.99722633361816,
"epoch": 0.112,
"grad_norm": 0.12618552148342133,
"kl": 0.03426666259765625,
"learning_rate": 2.9991503375003e-06,
"loss": 0.0014,
"reward": 1.025000013411045,
"reward_std": 0.11547005474567414,
"rewards/accuracy_reward": 0.06388888955116272,
"rewards/format_reward": 0.9611111134290695,
"step": 35
},
{
"completion_length": 258.1111114501953,
"epoch": 0.128,
"grad_norm": 0.09856373071670532,
"kl": 0.0427825927734375,
"learning_rate": 2.993961440992859e-06,
"loss": 0.0017,
"reward": 1.086111131310463,
"reward_std": 0.1732050821185112,
"rewards/accuracy_reward": 0.12500000167638065,
"rewards/format_reward": 0.9611111134290695,
"step": 40
},
{
"completion_length": 296.8416700363159,
"epoch": 0.144,
"grad_norm": 0.091950424015522,
"kl": 0.0412506103515625,
"learning_rate": 2.984071989079555e-06,
"loss": 0.0017,
"reward": 1.0666666805744172,
"reward_std": 0.1658154871314764,
"rewards/accuracy_reward": 0.10000000093132258,
"rewards/format_reward": 0.9666666686534882,
"step": 45
},
{
"completion_length": 317.56111373901365,
"epoch": 0.16,
"grad_norm": 0.09281204640865326,
"kl": 0.0566925048828125,
"learning_rate": 2.9695130976348534e-06,
"loss": 0.0023,
"reward": 1.0972222343087197,
"reward_std": 0.21873926185071468,
"rewards/accuracy_reward": 0.15555555745959282,
"rewards/format_reward": 0.9416666701436043,
"step": 50
},
{
"completion_length": 281.8416681289673,
"epoch": 0.176,
"grad_norm": 0.10378360003232956,
"kl": 0.07989501953125,
"learning_rate": 2.9503305743175096e-06,
"loss": 0.0032,
"reward": 1.1777777940034866,
"reward_std": 0.25370719768106936,
"rewards/accuracy_reward": 0.20833333637565374,
"rewards/format_reward": 0.9694444462656975,
"step": 55
},
{
"completion_length": 254.96389064788818,
"epoch": 0.192,
"grad_norm": 0.14641663432121277,
"kl": 0.092510986328125,
"learning_rate": 2.9265847744427307e-06,
"loss": 0.0037,
"reward": 1.1916666761040688,
"reward_std": 0.2572292793542147,
"rewards/accuracy_reward": 0.23333333786576987,
"rewards/format_reward": 0.9583333358168602,
"step": 60
},
{
"completion_length": 270.2777801513672,
"epoch": 0.208,
"grad_norm": 0.09665928781032562,
"kl": 0.097540283203125,
"learning_rate": 2.8983504110820214e-06,
"loss": 0.0039,
"reward": 1.172222228348255,
"reward_std": 0.25980762131512164,
"rewards/accuracy_reward": 0.2305555608123541,
"rewards/format_reward": 0.9416666701436043,
"step": 65
},
{
"completion_length": 372.66111488342284,
"epoch": 0.224,
"grad_norm": 0.13982528448104858,
"kl": 0.0855712890625,
"learning_rate": 2.865716319988224e-06,
"loss": 0.0034,
"reward": 1.2000000104308128,
"reward_std": 0.32587598264217377,
"rewards/accuracy_reward": 0.2916666727513075,
"rewards/format_reward": 0.9083333387970924,
"step": 70
},
{
"completion_length": 372.80278129577636,
"epoch": 0.24,
"grad_norm": 0.07452689111232758,
"kl": 0.07293701171875,
"learning_rate": 2.82878518008537e-06,
"loss": 0.0029,
"reward": 1.3944444462656975,
"reward_std": 0.30276345908641816,
"rewards/accuracy_reward": 0.4333333451300859,
"rewards/format_reward": 0.9611111134290695,
"step": 75
},
{
"completion_length": 376.4416694641113,
"epoch": 0.256,
"grad_norm": 0.08069849759340286,
"kl": 0.08565673828125,
"learning_rate": 2.7876731904027993e-06,
"loss": 0.0034,
"reward": 1.3472222313284874,
"reward_std": 0.36049848720431327,
"rewards/accuracy_reward": 0.40000000949949027,
"rewards/format_reward": 0.9472222253680229,
"step": 80
},
{
"completion_length": 354.7555561065674,
"epoch": 0.272,
"grad_norm": 0.10501620918512344,
"kl": 0.0765869140625,
"learning_rate": 2.7425097044700246e-06,
"loss": 0.0031,
"reward": 1.3944444552063942,
"reward_std": 0.27776345871388913,
"rewards/accuracy_reward": 0.45000001005828383,
"rewards/format_reward": 0.9444444477558136,
"step": 85
},
{
"completion_length": 403.3444492340088,
"epoch": 0.288,
"grad_norm": 0.12747347354888916,
"kl": 0.073785400390625,
"learning_rate": 2.6934368233226715e-06,
"loss": 0.003,
"reward": 1.2694444522261619,
"reward_std": 0.34864307269454003,
"rewards/accuracy_reward": 0.38055556602776053,
"rewards/format_reward": 0.8888888947665692,
"step": 90
},
{
"completion_length": 413.0500057220459,
"epoch": 0.304,
"grad_norm": 0.08874136209487915,
"kl": 0.070477294921875,
"learning_rate": 2.6406089484000465e-06,
"loss": 0.0028,
"reward": 1.297222228348255,
"reward_std": 0.3640205666422844,
"rewards/accuracy_reward": 0.4333333447575569,
"rewards/format_reward": 0.8638888970017433,
"step": 95
},
{
"completion_length": 381.81389274597166,
"epoch": 0.32,
"grad_norm": 0.05798633396625519,
"kl": 0.07501220703125,
"learning_rate": 2.584192295741087e-06,
"loss": 0.003,
"reward": 1.3750000134110452,
"reward_std": 0.3136751361191273,
"rewards/accuracy_reward": 0.4416666753590107,
"rewards/format_reward": 0.9333333373069763,
"step": 100
},
{
"completion_length": 369.6000038146973,
"epoch": 0.336,
"grad_norm": 0.07807120680809021,
"kl": 0.077154541015625,
"learning_rate": 2.5243643730072105e-06,
"loss": 0.0031,
"reward": 1.494444453716278,
"reward_std": 0.31015305407345295,
"rewards/accuracy_reward": 0.5555555684491992,
"rewards/format_reward": 0.938888892531395,
"step": 105
},
{
"completion_length": 390.78889389038085,
"epoch": 0.352,
"grad_norm": 0.08092948794364929,
"kl": 0.08060302734375,
"learning_rate": 2.461313420977536e-06,
"loss": 0.0032,
"reward": 1.380555558204651,
"reward_std": 0.3232976388186216,
"rewards/accuracy_reward": 0.4527777874842286,
"rewards/format_reward": 0.9277777820825577,
"step": 110
},
{
"completion_length": 413.20278282165526,
"epoch": 0.368,
"grad_norm": 0.059824734926223755,
"kl": 0.0724517822265625,
"learning_rate": 2.3952378212737554e-06,
"loss": 0.0029,
"reward": 1.3944444507360458,
"reward_std": 0.337731396406889,
"rewards/accuracy_reward": 0.4611111244186759,
"rewards/format_reward": 0.9333333373069763,
"step": 115
},
{
"completion_length": 403.9111152648926,
"epoch": 0.384,
"grad_norm": 0.07250163704156876,
"kl": 0.078021240234375,
"learning_rate": 2.3263454721781537e-06,
"loss": 0.0031,
"reward": 1.4305555507540704,
"reward_std": 0.299241379275918,
"rewards/accuracy_reward": 0.530555566214025,
"rewards/format_reward": 0.9000000059604645,
"step": 120
},
{
"completion_length": 378.2222255706787,
"epoch": 0.4,
"grad_norm": 0.12146278470754623,
"kl": 0.0875885009765625,
"learning_rate": 2.2548531345087003e-06,
"loss": 0.0035,
"reward": 1.3388888984918594,
"reward_std": 0.28480762206017973,
"rewards/accuracy_reward": 0.386111119389534,
"rewards/format_reward": 0.9527777805924416,
"step": 125
},
{
"completion_length": 324.2388910293579,
"epoch": 0.416,
"grad_norm": 0.09016001224517822,
"kl": 0.102557373046875,
"learning_rate": 2.18098574960932e-06,
"loss": 0.0041,
"reward": 1.4750000044703484,
"reward_std": 0.28257471285760405,
"rewards/accuracy_reward": 0.5138889042660594,
"rewards/format_reward": 0.9611111134290695,
"step": 130
},
{
"completion_length": 425.48055801391604,
"epoch": 0.432,
"grad_norm": 0.06732641160488129,
"kl": 0.089459228515625,
"learning_rate": 2.104975731601208e-06,
"loss": 0.0036,
"reward": 1.3027777835726737,
"reward_std": 0.32329764030873775,
"rewards/accuracy_reward": 0.38611112013459203,
"rewards/format_reward": 0.9166666716337204,
"step": 135
},
{
"completion_length": 475.63889541625974,
"epoch": 0.448,
"grad_norm": 0.06925955414772034,
"kl": 0.06644287109375,
"learning_rate": 2.027062236122014e-06,
"loss": 0.0027,
"reward": 1.3805555671453476,
"reward_std": 0.35310889072716234,
"rewards/accuracy_reward": 0.4888889010995626,
"rewards/format_reward": 0.8916666731238365,
"step": 140
},
{
"completion_length": 469.69444847106934,
"epoch": 0.464,
"grad_norm": 0.04355955123901367,
"kl": 0.073883056640625,
"learning_rate": 1.9474904078537343e-06,
"loss": 0.003,
"reward": 1.4388888955116272,
"reward_std": 0.2873859636485577,
"rewards/accuracy_reward": 0.5583333482965827,
"rewards/format_reward": 0.8805555619299412,
"step": 145
},
{
"completion_length": 500.4750095367432,
"epoch": 0.48,
"grad_norm": 0.06400442123413086,
"kl": 0.06243896484375,
"learning_rate": 1.866510609206841e-06,
"loss": 0.0025,
"reward": 1.477777788043022,
"reward_std": 0.274241379275918,
"rewards/accuracy_reward": 0.5611111238598824,
"rewards/format_reward": 0.9166666716337204,
"step": 150
},
{
"completion_length": 482.2611141204834,
"epoch": 0.496,
"grad_norm": 0.07321769744157791,
"kl": 0.063238525390625,
"learning_rate": 1.784377632587518e-06,
"loss": 0.0025,
"reward": 1.4861111238598823,
"reward_std": 0.30147428885102273,
"rewards/accuracy_reward": 0.541666678711772,
"rewards/format_reward": 0.9444444477558136,
"step": 155
},
{
"completion_length": 458.41389694213865,
"epoch": 0.512,
"grad_norm": 0.05824045091867447,
"kl": 0.06778564453125,
"learning_rate": 1.7013498987264833e-06,
"loss": 0.0027,
"reward": 1.475000013411045,
"reward_std": 0.23446219004690647,
"rewards/accuracy_reward": 0.5250000132247805,
"rewards/format_reward": 0.9500000029802322,
"step": 160
},
{
"completion_length": 437.94444770812987,
"epoch": 0.528,
"grad_norm": 0.07412311434745789,
"kl": 0.070013427734375,
"learning_rate": 1.6176886435917677e-06,
"loss": 0.0028,
"reward": 1.5305555671453477,
"reward_std": 0.2790526311844587,
"rewards/accuracy_reward": 0.5833333466202021,
"rewards/format_reward": 0.9472222253680229,
"step": 165
},
{
"completion_length": 425.4944492340088,
"epoch": 0.544,
"grad_norm": 0.06518115103244781,
"kl": 0.070721435546875,
"learning_rate": 1.5336570964437077e-06,
"loss": 0.0028,
"reward": 1.563888892531395,
"reward_std": 0.21392801143229007,
"rewards/accuracy_reward": 0.5972222350537777,
"rewards/format_reward": 0.9666666686534882,
"step": 170
},
{
"completion_length": 447.97500228881836,
"epoch": 0.56,
"grad_norm": 0.05302765220403671,
"kl": 0.06865234375,
"learning_rate": 1.4495196516183096e-06,
"loss": 0.0027,
"reward": 1.4805555552244187,
"reward_std": 0.2787071973085403,
"rewards/accuracy_reward": 0.5361111257225275,
"rewards/format_reward": 0.9444444477558136,
"step": 175
},
{
"completion_length": 441.51111526489257,
"epoch": 0.576,
"grad_norm": 0.0858602300286293,
"kl": 0.0727752685546875,
"learning_rate": 1.3655410366448499e-06,
"loss": 0.0029,
"reward": 1.5250000014901162,
"reward_std": 0.2572292808443308,
"rewards/accuracy_reward": 0.5666666816920042,
"rewards/format_reward": 0.9583333358168602,
"step": 180
},
{
"completion_length": 470.7583396911621,
"epoch": 0.592,
"grad_norm": 0.07894453406333923,
"kl": 0.065618896484375,
"learning_rate": 1.2819854793151313e-06,
"loss": 0.0026,
"reward": 1.4527777865529061,
"reward_std": 0.2738959465175867,
"rewards/accuracy_reward": 0.516666678711772,
"rewards/format_reward": 0.9361111149191856,
"step": 185
},
{
"completion_length": 449.9416721343994,
"epoch": 0.608,
"grad_norm": 0.08513162285089493,
"kl": 0.070989990234375,
"learning_rate": 1.199115876325091e-06,
"loss": 0.0028,
"reward": 1.4611111134290695,
"reward_std": 0.3258759815245867,
"rewards/accuracy_reward": 0.5277777882292867,
"rewards/format_reward": 0.9333333373069763,
"step": 190
},
{
"completion_length": 412.6527816772461,
"epoch": 0.624,
"grad_norm": 0.0992361381649971,
"kl": 0.0746826171875,
"learning_rate": 1.1171929661045361e-06,
"loss": 0.003,
"reward": 1.4388888955116272,
"reward_std": 0.3390205677598715,
"rewards/accuracy_reward": 0.5277777882292867,
"rewards/format_reward": 0.9111111164093018,
"step": 195
},
{
"completion_length": 434.86667137146,
"epoch": 0.64,
"grad_norm": 0.08208976686000824,
"kl": 0.06728515625,
"learning_rate": 1.036474508437579e-06,
"loss": 0.0027,
"reward": 1.547222228348255,
"reward_std": 0.25499636940658094,
"rewards/accuracy_reward": 0.5944444581866264,
"rewards/format_reward": 0.9527777805924416,
"step": 200
},
{
"completion_length": 457.2805625915527,
"epoch": 0.656,
"grad_norm": 0.04328469559550285,
"kl": 0.058355712890625,
"learning_rate": 9.57214473454992e-07,
"loss": 0.0023,
"reward": 1.5222222253680229,
"reward_std": 0.22002843283116819,
"rewards/accuracy_reward": 0.58888890016824,
"rewards/format_reward": 0.9333333373069763,
"step": 205
},
{
"completion_length": 453.0138931274414,
"epoch": 0.672,
"grad_norm": 0.06149205565452576,
"kl": 0.0665008544921875,
"learning_rate": 8.796622425502193e-07,
"loss": 0.0027,
"reward": 1.5250000014901162,
"reward_std": 0.30310888960957527,
"rewards/accuracy_reward": 0.6111111264675856,
"rewards/format_reward": 0.9138888940215111,
"step": 210
},
{
"completion_length": 513.1500045776368,
"epoch": 0.688,
"grad_norm": 0.06691473722457886,
"kl": 0.0645904541015625,
"learning_rate": 8.040618237332491e-07,
"loss": 0.0026,
"reward": 1.436111108958721,
"reward_std": 0.3447755578905344,
"rewards/accuracy_reward": 0.5305555699393153,
"rewards/format_reward": 0.9055555611848831,
"step": 215
},
{
"completion_length": 480.86389236450196,
"epoch": 0.704,
"grad_norm": 0.06451098620891571,
"kl": 0.0659393310546875,
"learning_rate": 7.30651083891141e-07,
"loss": 0.0026,
"reward": 1.5166666820645331,
"reward_std": 0.34606473073363303,
"rewards/accuracy_reward": 0.5916666787117719,
"rewards/format_reward": 0.9250000044703484,
"step": 220
},
{
"completion_length": 507.2722282409668,
"epoch": 0.72,
"grad_norm": 0.05560953915119171,
"kl": 0.06065673828125,
"learning_rate": 6.596610003707959e-07,
"loss": 0.0024,
"reward": 1.5138888970017432,
"reward_std": 0.28480762280523775,
"rewards/accuracy_reward": 0.6055555680766702,
"rewards/format_reward": 0.9083333387970924,
"step": 225
},
{
"completion_length": 514.2500030517579,
"epoch": 0.736,
"grad_norm": 0.06804082542657852,
"kl": 0.0566162109375,
"learning_rate": 5.913149342387704e-07,
"loss": 0.0023,
"reward": 1.5194444492459298,
"reward_std": 0.2549963690340519,
"rewards/accuracy_reward": 0.5833333430811762,
"rewards/format_reward": 0.9361111149191856,
"step": 230
},
{
"completion_length": 468.55556259155276,
"epoch": 0.752,
"grad_norm": 0.06439075618982315,
"kl": 0.0638336181640625,
"learning_rate": 5.258279275047247e-07,
"loss": 0.0026,
"reward": 1.5250000029802322,
"reward_std": 0.28257471323013306,
"rewards/accuracy_reward": 0.5750000124797225,
"rewards/format_reward": 0.9500000029802322,
"step": 235
},
{
"completion_length": 444.02500381469724,
"epoch": 0.768,
"grad_norm": 0.055538810789585114,
"kl": 0.063116455078125,
"learning_rate": 4.63406026519703e-07,
"loss": 0.0025,
"reward": 1.575,
"reward_std": 0.2524180270731449,
"rewards/accuracy_reward": 0.633333345502615,
"rewards/format_reward": 0.9416666701436043,
"step": 240
},
{
"completion_length": 477.8666702270508,
"epoch": 0.784,
"grad_norm": 0.055306848138570786,
"kl": 0.0677734375,
"learning_rate": 4.042456336780838e-07,
"loss": 0.0027,
"reward": 1.4888888821005821,
"reward_std": 0.2729522068053484,
"rewards/accuracy_reward": 0.5527777902781963,
"rewards/format_reward": 0.9361111149191856,
"step": 245
},
{
"completion_length": 463.327783203125,
"epoch": 0.8,
"grad_norm": 0.0591103695333004,
"kl": 0.0603515625,
"learning_rate": 3.4853288946298335e-07,
"loss": 0.0024,
"reward": 1.475000011920929,
"reward_std": 0.29443012587726114,
"rewards/accuracy_reward": 0.5361111234873533,
"rewards/format_reward": 0.938888892531395,
"step": 250
},
{
"completion_length": 450.82778282165526,
"epoch": 0.816,
"grad_norm": 0.06738787144422531,
"kl": 0.0596343994140625,
"learning_rate": 2.9644308677943315e-07,
"loss": 0.0024,
"reward": 1.5000000134110452,
"reward_std": 0.28128554075956347,
"rewards/accuracy_reward": 0.5722222346812487,
"rewards/format_reward": 0.9277777820825577,
"step": 255
},
{
"completion_length": 455.39445037841796,
"epoch": 0.832,
"grad_norm": 0.06288747489452362,
"kl": 0.068408203125,
"learning_rate": 2.48140119418046e-07,
"loss": 0.0027,
"reward": 1.4805555552244187,
"reward_std": 0.2825747117400169,
"rewards/accuracy_reward": 0.5305555703118443,
"rewards/format_reward": 0.9500000029802322,
"step": 260
},
{
"completion_length": 470.1777805328369,
"epoch": 0.848,
"grad_norm": 0.07291523367166519,
"kl": 0.062567138671875,
"learning_rate": 2.0377596638451812e-07,
"loss": 0.0025,
"reward": 1.4444444566965102,
"reward_std": 0.3079201437532902,
"rewards/accuracy_reward": 0.5027777899056673,
"rewards/format_reward": 0.9416666701436043,
"step": 265
},
{
"completion_length": 468.23333740234375,
"epoch": 0.864,
"grad_norm": 0.07961365580558777,
"kl": 0.07003173828125,
"learning_rate": 1.634902137174483e-07,
"loss": 0.0028,
"reward": 1.4638888970017434,
"reward_std": 0.29924137964844705,
"rewards/accuracy_reward": 0.541666678339243,
"rewards/format_reward": 0.922222226858139,
"step": 270
},
{
"completion_length": 431.6388931274414,
"epoch": 0.88,
"grad_norm": 0.061992090195417404,
"kl": 0.06956787109375,
"learning_rate": 1.274096152990203e-07,
"loss": 0.0028,
"reward": 1.4666666746139527,
"reward_std": 0.28609679341316224,
"rewards/accuracy_reward": 0.5222222328186035,
"rewards/format_reward": 0.9444444477558136,
"step": 275
},
{
"completion_length": 435.9138957977295,
"epoch": 0.896,
"grad_norm": 0.08779603242874146,
"kl": 0.0734649658203125,
"learning_rate": 9.564769404039419e-08,
"loss": 0.0029,
"reward": 1.425000011920929,
"reward_std": 0.2979522071778774,
"rewards/accuracy_reward": 0.48611112125217915,
"rewards/format_reward": 0.938888892531395,
"step": 280
},
{
"completion_length": 432.5888919830322,
"epoch": 0.912,
"grad_norm": 0.0677870661020279,
"kl": 0.074609375,
"learning_rate": 6.830438469662892e-08,
"loss": 0.003,
"reward": 1.5027777940034865,
"reward_std": 0.2764742888510227,
"rewards/accuracy_reward": 0.5444444557651877,
"rewards/format_reward": 0.9583333358168602,
"step": 285
},
{
"completion_length": 415.408337020874,
"epoch": 0.928,
"grad_norm": 0.08877945691347122,
"kl": 0.079901123046875,
"learning_rate": 4.546571943496969e-08,
"loss": 0.0032,
"reward": 1.5222222372889518,
"reward_std": 0.29666303619742396,
"rewards/accuracy_reward": 0.5972222350537777,
"rewards/format_reward": 0.9250000044703484,
"step": 290
},
{
"completion_length": 470.35555839538574,
"epoch": 0.944,
"grad_norm": 0.06486877799034119,
"kl": 0.062762451171875,
"learning_rate": 2.72035571458224e-08,
"loss": 0.0025,
"reward": 1.4722222238779068,
"reward_std": 0.26685178354382516,
"rewards/accuracy_reward": 0.5361111244186759,
"rewards/format_reward": 0.9361111149191856,
"step": 295
},
{
"completion_length": 442.53889656066895,
"epoch": 0.96,
"grad_norm": 0.06168290227651596,
"kl": 0.06766357421875,
"learning_rate": 1.357535734809795e-08,
"loss": 0.0027,
"reward": 1.4666666641831398,
"reward_std": 0.31109679453074934,
"rewards/accuracy_reward": 0.5361111233010888,
"rewards/format_reward": 0.930555559694767,
"step": 300
},
{
"completion_length": 447.8833366394043,
"epoch": 0.976,
"grad_norm": 0.06104287877678871,
"kl": 0.0698974609375,
"learning_rate": 4.623999400308054e-09,
"loss": 0.0028,
"reward": 1.5027777954936028,
"reward_std": 0.2777634594589472,
"rewards/accuracy_reward": 0.5472222346812486,
"rewards/format_reward": 0.9555555582046509,
"step": 305
},
{
"completion_length": 406.5805576324463,
"epoch": 0.992,
"grad_norm": 0.06284002214670181,
"kl": 0.0779296875,
"learning_rate": 3.77647586240204e-10,
"loss": 0.0031,
"reward": 1.4805555641651154,
"reward_std": 0.2690846938639879,
"rewards/accuracy_reward": 0.5388889022171497,
"rewards/format_reward": 0.9416666693985463,
"step": 310
},
{
"completion_length": 461.6458377838135,
"epoch": 0.9984,
"kl": 0.06114959716796875,
"reward": 1.4236111268401146,
"reward_std": 0.29425740987062454,
"rewards/accuracy_reward": 0.5000000135041773,
"rewards/format_reward": 0.9236111156642437,
"step": 312,
"total_flos": 0.0,
"train_loss": 0.002609439611100802,
"train_runtime": 17087.0679,
"train_samples_per_second": 0.439,
"train_steps_per_second": 0.018
}
],
"logging_steps": 5,
"max_steps": 312,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}