{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9984,
  "eval_steps": 2,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 553.6277812957763,
      "epoch": 0.016,
      "grad_norm": 0.21352672576904297,
      "kl": 0.0001860499382019043,
      "learning_rate": 4.6875e-07,
      "loss": 0.0,
      "reward": 0.5361111229285598,
      "reward_std": 0.2633297026157379,
      "rewards/accuracy_reward": 0.3111111169680953,
      "rewards/format_reward": 0.22500000484287738,
      "step": 5
    },
    {
      "completion_length": 509.1916721343994,
      "epoch": 0.032,
      "grad_norm": 0.08460841327905655,
      "kl": 0.0006537675857543945,
      "learning_rate": 9.375e-07,
      "loss": 0.0,
      "reward": 0.6250000141561032,
      "reward_std": 0.22612885609269143,
      "rewards/accuracy_reward": 0.32500000633299353,
      "rewards/format_reward": 0.30000000819563866,
      "step": 10
    },
    {
      "completion_length": 414.31111640930175,
      "epoch": 0.048,
      "grad_norm": 0.09808748215436935,
      "kl": 0.009083938598632813,
      "learning_rate": 1.40625e-06,
      "loss": 0.0004,
      "reward": 0.7472222343087196,
      "reward_std": 0.25241802744567393,
      "rewards/accuracy_reward": 0.12500000279396772,
      "rewards/format_reward": 0.6222222335636616,
      "step": 15
    },
    {
      "completion_length": 250.90277996063233,
      "epoch": 0.064,
      "grad_norm": 0.05115974321961403,
      "kl": 0.03193817138671875,
      "learning_rate": 1.875e-06,
      "loss": 0.0013,
      "reward": 0.9416666746139526,
      "reward_std": 0.15361464098095895,
      "rewards/accuracy_reward": 0.03611111156642437,
      "rewards/format_reward": 0.9055555611848831,
      "step": 20
    },
    {
      "completion_length": 211.14444694519042,
      "epoch": 0.08,
      "grad_norm": 0.06653334200382233,
      "kl": 0.035968017578125,
      "learning_rate": 2.3437500000000002e-06,
      "loss": 0.0014,
      "reward": 0.986111119389534,
      "reward_std": 0.10103629790246486,
      "rewards/accuracy_reward": 0.0305555559694767,
      "rewards/format_reward": 0.9555555582046509,
      "step": 25
    },
    {
      "completion_length": 246.1000036239624,
      "epoch": 0.096,
      "grad_norm": 0.06397304683923721,
      "kl": 0.0366363525390625,
      "learning_rate": 2.8125e-06,
      "loss": 0.0015,
      "reward": 1.0000000089406966,
      "reward_std": 0.07216878421604633,
      "rewards/accuracy_reward": 0.038888889364898205,
      "rewards/format_reward": 0.9611111134290695,
      "step": 30
    },
    {
      "completion_length": 255.99722633361816,
      "epoch": 0.112,
      "grad_norm": 0.12618552148342133,
      "kl": 0.03426666259765625,
      "learning_rate": 2.9991503375003e-06,
      "loss": 0.0014,
      "reward": 1.025000013411045,
      "reward_std": 0.11547005474567414,
      "rewards/accuracy_reward": 0.06388888955116272,
      "rewards/format_reward": 0.9611111134290695,
      "step": 35
    },
    {
      "completion_length": 258.1111114501953,
      "epoch": 0.128,
      "grad_norm": 0.09856373071670532,
      "kl": 0.0427825927734375,
      "learning_rate": 2.993961440992859e-06,
      "loss": 0.0017,
      "reward": 1.086111131310463,
      "reward_std": 0.1732050821185112,
      "rewards/accuracy_reward": 0.12500000167638065,
      "rewards/format_reward": 0.9611111134290695,
      "step": 40
    },
    {
      "completion_length": 296.8416700363159,
      "epoch": 0.144,
      "grad_norm": 0.091950424015522,
      "kl": 0.0412506103515625,
      "learning_rate": 2.984071989079555e-06,
      "loss": 0.0017,
      "reward": 1.0666666805744172,
      "reward_std": 0.1658154871314764,
      "rewards/accuracy_reward": 0.10000000093132258,
      "rewards/format_reward": 0.9666666686534882,
      "step": 45
    },
    {
      "completion_length": 317.56111373901365,
      "epoch": 0.16,
      "grad_norm": 0.09281204640865326,
      "kl": 0.0566925048828125,
      "learning_rate": 2.9695130976348534e-06,
      "loss": 0.0023,
      "reward": 1.0972222343087197,
      "reward_std": 0.21873926185071468,
      "rewards/accuracy_reward": 0.15555555745959282,
      "rewards/format_reward": 0.9416666701436043,
      "step": 50
    },
    {
      "completion_length": 281.8416681289673,
      "epoch": 0.176,
      "grad_norm": 0.10378360003232956,
      "kl": 0.07989501953125,
      "learning_rate": 2.9503305743175096e-06,
      "loss": 0.0032,
      "reward": 1.1777777940034866,
      "reward_std": 0.25370719768106936,
      "rewards/accuracy_reward": 0.20833333637565374,
      "rewards/format_reward": 0.9694444462656975,
      "step": 55
    },
    {
      "completion_length": 254.96389064788818,
      "epoch": 0.192,
      "grad_norm": 0.14641663432121277,
      "kl": 0.092510986328125,
      "learning_rate": 2.9265847744427307e-06,
      "loss": 0.0037,
      "reward": 1.1916666761040688,
      "reward_std": 0.2572292793542147,
      "rewards/accuracy_reward": 0.23333333786576987,
      "rewards/format_reward": 0.9583333358168602,
      "step": 60
    },
    {
      "completion_length": 270.2777801513672,
      "epoch": 0.208,
      "grad_norm": 0.09665928781032562,
      "kl": 0.097540283203125,
      "learning_rate": 2.8983504110820214e-06,
      "loss": 0.0039,
      "reward": 1.172222228348255,
      "reward_std": 0.25980762131512164,
      "rewards/accuracy_reward": 0.2305555608123541,
      "rewards/format_reward": 0.9416666701436043,
      "step": 65
    },
    {
      "completion_length": 372.66111488342284,
      "epoch": 0.224,
      "grad_norm": 0.13982528448104858,
      "kl": 0.0855712890625,
      "learning_rate": 2.865716319988224e-06,
      "loss": 0.0034,
      "reward": 1.2000000104308128,
      "reward_std": 0.32587598264217377,
      "rewards/accuracy_reward": 0.2916666727513075,
      "rewards/format_reward": 0.9083333387970924,
      "step": 70
    },
    {
      "completion_length": 372.80278129577636,
      "epoch": 0.24,
      "grad_norm": 0.07452689111232758,
      "kl": 0.07293701171875,
      "learning_rate": 2.82878518008537e-06,
      "loss": 0.0029,
      "reward": 1.3944444462656975,
      "reward_std": 0.30276345908641816,
      "rewards/accuracy_reward": 0.4333333451300859,
      "rewards/format_reward": 0.9611111134290695,
      "step": 75
    },
    {
      "completion_length": 376.4416694641113,
      "epoch": 0.256,
      "grad_norm": 0.08069849759340286,
      "kl": 0.08565673828125,
      "learning_rate": 2.7876731904027993e-06,
      "loss": 0.0034,
      "reward": 1.3472222313284874,
      "reward_std": 0.36049848720431327,
      "rewards/accuracy_reward": 0.40000000949949027,
      "rewards/format_reward": 0.9472222253680229,
      "step": 80
    },
    {
      "completion_length": 354.7555561065674,
      "epoch": 0.272,
      "grad_norm": 0.10501620918512344,
      "kl": 0.0765869140625,
      "learning_rate": 2.7425097044700246e-06,
      "loss": 0.0031,
      "reward": 1.3944444552063942,
      "reward_std": 0.27776345871388913,
      "rewards/accuracy_reward": 0.45000001005828383,
      "rewards/format_reward": 0.9444444477558136,
      "step": 85
    },
    {
      "completion_length": 403.3444492340088,
      "epoch": 0.288,
      "grad_norm": 0.12747347354888916,
      "kl": 0.073785400390625,
      "learning_rate": 2.6934368233226715e-06,
      "loss": 0.003,
      "reward": 1.2694444522261619,
      "reward_std": 0.34864307269454003,
      "rewards/accuracy_reward": 0.38055556602776053,
      "rewards/format_reward": 0.8888888947665692,
      "step": 90
    },
    {
      "completion_length": 413.0500057220459,
      "epoch": 0.304,
      "grad_norm": 0.08874136209487915,
      "kl": 0.070477294921875,
      "learning_rate": 2.6406089484000465e-06,
      "loss": 0.0028,
      "reward": 1.297222228348255,
      "reward_std": 0.3640205666422844,
      "rewards/accuracy_reward": 0.4333333447575569,
      "rewards/format_reward": 0.8638888970017433,
      "step": 95
    },
    {
      "completion_length": 381.81389274597166,
      "epoch": 0.32,
      "grad_norm": 0.05798633396625519,
      "kl": 0.07501220703125,
      "learning_rate": 2.584192295741087e-06,
      "loss": 0.003,
      "reward": 1.3750000134110452,
      "reward_std": 0.3136751361191273,
      "rewards/accuracy_reward": 0.4416666753590107,
      "rewards/format_reward": 0.9333333373069763,
      "step": 100
    },
    {
      "completion_length": 369.6000038146973,
      "epoch": 0.336,
      "grad_norm": 0.07807120680809021,
      "kl": 0.077154541015625,
      "learning_rate": 2.5243643730072105e-06,
      "loss": 0.0031,
      "reward": 1.494444453716278,
      "reward_std": 0.31015305407345295,
      "rewards/accuracy_reward": 0.5555555684491992,
      "rewards/format_reward": 0.938888892531395,
      "step": 105
    },
    {
      "completion_length": 390.78889389038085,
      "epoch": 0.352,
      "grad_norm": 0.08092948794364929,
      "kl": 0.08060302734375,
      "learning_rate": 2.461313420977536e-06,
      "loss": 0.0032,
      "reward": 1.380555558204651,
      "reward_std": 0.3232976388186216,
      "rewards/accuracy_reward": 0.4527777874842286,
      "rewards/format_reward": 0.9277777820825577,
      "step": 110
    },
    {
      "completion_length": 413.20278282165526,
      "epoch": 0.368,
      "grad_norm": 0.059824734926223755,
      "kl": 0.0724517822265625,
      "learning_rate": 2.3952378212737554e-06,
      "loss": 0.0029,
      "reward": 1.3944444507360458,
      "reward_std": 0.337731396406889,
      "rewards/accuracy_reward": 0.4611111244186759,
      "rewards/format_reward": 0.9333333373069763,
      "step": 115
    },
    {
      "completion_length": 403.9111152648926,
      "epoch": 0.384,
      "grad_norm": 0.07250163704156876,
      "kl": 0.078021240234375,
      "learning_rate": 2.3263454721781537e-06,
      "loss": 0.0031,
      "reward": 1.4305555507540704,
      "reward_std": 0.299241379275918,
      "rewards/accuracy_reward": 0.530555566214025,
      "rewards/format_reward": 0.9000000059604645,
      "step": 120
    },
    {
      "completion_length": 378.2222255706787,
      "epoch": 0.4,
      "grad_norm": 0.12146278470754623,
      "kl": 0.0875885009765625,
      "learning_rate": 2.2548531345087003e-06,
      "loss": 0.0035,
      "reward": 1.3388888984918594,
      "reward_std": 0.28480762206017973,
      "rewards/accuracy_reward": 0.386111119389534,
      "rewards/format_reward": 0.9527777805924416,
      "step": 125
    },
    {
      "completion_length": 324.2388910293579,
      "epoch": 0.416,
      "grad_norm": 0.09016001224517822,
      "kl": 0.102557373046875,
      "learning_rate": 2.18098574960932e-06,
      "loss": 0.0041,
      "reward": 1.4750000044703484,
      "reward_std": 0.28257471285760405,
      "rewards/accuracy_reward": 0.5138889042660594,
      "rewards/format_reward": 0.9611111134290695,
      "step": 130
    },
    {
      "completion_length": 425.48055801391604,
      "epoch": 0.432,
      "grad_norm": 0.06732641160488129,
      "kl": 0.089459228515625,
      "learning_rate": 2.104975731601208e-06,
      "loss": 0.0036,
      "reward": 1.3027777835726737,
      "reward_std": 0.32329764030873775,
      "rewards/accuracy_reward": 0.38611112013459203,
      "rewards/format_reward": 0.9166666716337204,
      "step": 135
    },
    {
      "completion_length": 475.63889541625974,
      "epoch": 0.448,
      "grad_norm": 0.06925955414772034,
      "kl": 0.06644287109375,
      "learning_rate": 2.027062236122014e-06,
      "loss": 0.0027,
      "reward": 1.3805555671453476,
      "reward_std": 0.35310889072716234,
      "rewards/accuracy_reward": 0.4888889010995626,
      "rewards/format_reward": 0.8916666731238365,
      "step": 140
    },
    {
      "completion_length": 469.69444847106934,
      "epoch": 0.464,
      "grad_norm": 0.04355955123901367,
      "kl": 0.073883056640625,
      "learning_rate": 1.9474904078537343e-06,
      "loss": 0.003,
      "reward": 1.4388888955116272,
      "reward_std": 0.2873859636485577,
      "rewards/accuracy_reward": 0.5583333482965827,
      "rewards/format_reward": 0.8805555619299412,
      "step": 145
    },
    {
      "completion_length": 500.4750095367432,
      "epoch": 0.48,
      "grad_norm": 0.06400442123413086,
      "kl": 0.06243896484375,
      "learning_rate": 1.866510609206841e-06,
      "loss": 0.0025,
      "reward": 1.477777788043022,
      "reward_std": 0.274241379275918,
      "rewards/accuracy_reward": 0.5611111238598824,
      "rewards/format_reward": 0.9166666716337204,
      "step": 150
    },
    {
      "completion_length": 482.2611141204834,
      "epoch": 0.496,
      "grad_norm": 0.07321769744157791,
      "kl": 0.063238525390625,
      "learning_rate": 1.784377632587518e-06,
      "loss": 0.0025,
      "reward": 1.4861111238598823,
      "reward_std": 0.30147428885102273,
      "rewards/accuracy_reward": 0.541666678711772,
      "rewards/format_reward": 0.9444444477558136,
      "step": 155
    },
    {
      "completion_length": 458.41389694213865,
      "epoch": 0.512,
      "grad_norm": 0.05824045091867447,
      "kl": 0.06778564453125,
      "learning_rate": 1.7013498987264833e-06,
      "loss": 0.0027,
      "reward": 1.475000013411045,
      "reward_std": 0.23446219004690647,
      "rewards/accuracy_reward": 0.5250000132247805,
      "rewards/format_reward": 0.9500000029802322,
      "step": 160
    },
    {
      "completion_length": 437.94444770812987,
      "epoch": 0.528,
      "grad_norm": 0.07412311434745789,
      "kl": 0.070013427734375,
      "learning_rate": 1.6176886435917677e-06,
      "loss": 0.0028,
      "reward": 1.5305555671453477,
      "reward_std": 0.2790526311844587,
      "rewards/accuracy_reward": 0.5833333466202021,
      "rewards/format_reward": 0.9472222253680229,
      "step": 165
    },
    {
      "completion_length": 425.4944492340088,
      "epoch": 0.544,
      "grad_norm": 0.06518115103244781,
      "kl": 0.070721435546875,
      "learning_rate": 1.5336570964437077e-06,
      "loss": 0.0028,
      "reward": 1.563888892531395,
      "reward_std": 0.21392801143229007,
      "rewards/accuracy_reward": 0.5972222350537777,
      "rewards/format_reward": 0.9666666686534882,
      "step": 170
    },
    {
      "completion_length": 447.97500228881836,
      "epoch": 0.56,
      "grad_norm": 0.05302765220403671,
      "kl": 0.06865234375,
      "learning_rate": 1.4495196516183096e-06,
      "loss": 0.0027,
      "reward": 1.4805555552244187,
      "reward_std": 0.2787071973085403,
      "rewards/accuracy_reward": 0.5361111257225275,
      "rewards/format_reward": 0.9444444477558136,
      "step": 175
    },
    {
      "completion_length": 441.51111526489257,
      "epoch": 0.576,
      "grad_norm": 0.0858602300286293,
      "kl": 0.0727752685546875,
      "learning_rate": 1.3655410366448499e-06,
      "loss": 0.0029,
      "reward": 1.5250000014901162,
      "reward_std": 0.2572292808443308,
      "rewards/accuracy_reward": 0.5666666816920042,
      "rewards/format_reward": 0.9583333358168602,
      "step": 180
    },
    {
      "completion_length": 470.7583396911621,
      "epoch": 0.592,
      "grad_norm": 0.07894453406333923,
      "kl": 0.065618896484375,
      "learning_rate": 1.2819854793151313e-06,
      "loss": 0.0026,
      "reward": 1.4527777865529061,
      "reward_std": 0.2738959465175867,
      "rewards/accuracy_reward": 0.516666678711772,
      "rewards/format_reward": 0.9361111149191856,
      "step": 185
    },
    {
      "completion_length": 449.9416721343994,
      "epoch": 0.608,
      "grad_norm": 0.08513162285089493,
      "kl": 0.070989990234375,
      "learning_rate": 1.199115876325091e-06,
      "loss": 0.0028,
      "reward": 1.4611111134290695,
      "reward_std": 0.3258759815245867,
      "rewards/accuracy_reward": 0.5277777882292867,
      "rewards/format_reward": 0.9333333373069763,
      "step": 190
    },
    {
      "completion_length": 412.6527816772461,
      "epoch": 0.624,
      "grad_norm": 0.0992361381649971,
      "kl": 0.0746826171875,
      "learning_rate": 1.1171929661045361e-06,
      "loss": 0.003,
      "reward": 1.4388888955116272,
      "reward_std": 0.3390205677598715,
      "rewards/accuracy_reward": 0.5277777882292867,
      "rewards/format_reward": 0.9111111164093018,
      "step": 195
    },
    {
      "completion_length": 434.86667137146,
      "epoch": 0.64,
      "grad_norm": 0.08208976686000824,
      "kl": 0.06728515625,
      "learning_rate": 1.036474508437579e-06,
      "loss": 0.0027,
      "reward": 1.547222228348255,
      "reward_std": 0.25499636940658094,
      "rewards/accuracy_reward": 0.5944444581866264,
      "rewards/format_reward": 0.9527777805924416,
      "step": 200
    },
    {
      "completion_length": 457.2805625915527,
      "epoch": 0.656,
      "grad_norm": 0.04328469559550285,
      "kl": 0.058355712890625,
      "learning_rate": 9.57214473454992e-07,
      "loss": 0.0023,
      "reward": 1.5222222253680229,
      "reward_std": 0.22002843283116819,
      "rewards/accuracy_reward": 0.58888890016824,
      "rewards/format_reward": 0.9333333373069763,
      "step": 205
    },
    {
      "completion_length": 453.0138931274414,
      "epoch": 0.672,
      "grad_norm": 0.06149205565452576,
      "kl": 0.0665008544921875,
      "learning_rate": 8.796622425502193e-07,
      "loss": 0.0027,
      "reward": 1.5250000014901162,
      "reward_std": 0.30310888960957527,
      "rewards/accuracy_reward": 0.6111111264675856,
      "rewards/format_reward": 0.9138888940215111,
      "step": 210
    },
    {
      "completion_length": 513.1500045776368,
      "epoch": 0.688,
      "grad_norm": 0.06691473722457886,
      "kl": 0.0645904541015625,
      "learning_rate": 8.040618237332491e-07,
      "loss": 0.0026,
      "reward": 1.436111108958721,
      "reward_std": 0.3447755578905344,
      "rewards/accuracy_reward": 0.5305555699393153,
      "rewards/format_reward": 0.9055555611848831,
      "step": 215
    },
    {
      "completion_length": 480.86389236450196,
      "epoch": 0.704,
      "grad_norm": 0.06451098620891571,
      "kl": 0.0659393310546875,
      "learning_rate": 7.30651083891141e-07,
      "loss": 0.0026,
      "reward": 1.5166666820645331,
      "reward_std": 0.34606473073363303,
      "rewards/accuracy_reward": 0.5916666787117719,
      "rewards/format_reward": 0.9250000044703484,
      "step": 220
    },
    {
      "completion_length": 507.2722282409668,
      "epoch": 0.72,
      "grad_norm": 0.05560953915119171,
      "kl": 0.06065673828125,
      "learning_rate": 6.596610003707959e-07,
      "loss": 0.0024,
      "reward": 1.5138888970017432,
      "reward_std": 0.28480762280523775,
      "rewards/accuracy_reward": 0.6055555680766702,
      "rewards/format_reward": 0.9083333387970924,
      "step": 225
    },
    {
      "completion_length": 514.2500030517579,
      "epoch": 0.736,
      "grad_norm": 0.06804082542657852,
      "kl": 0.0566162109375,
      "learning_rate": 5.913149342387704e-07,
      "loss": 0.0023,
      "reward": 1.5194444492459298,
      "reward_std": 0.2549963690340519,
      "rewards/accuracy_reward": 0.5833333430811762,
      "rewards/format_reward": 0.9361111149191856,
      "step": 230
    },
    {
      "completion_length": 468.55556259155276,
      "epoch": 0.752,
      "grad_norm": 0.06439075618982315,
      "kl": 0.0638336181640625,
      "learning_rate": 5.258279275047247e-07,
      "loss": 0.0026,
      "reward": 1.5250000029802322,
      "reward_std": 0.28257471323013306,
      "rewards/accuracy_reward": 0.5750000124797225,
      "rewards/format_reward": 0.9500000029802322,
      "step": 235
    },
    {
      "completion_length": 444.02500381469724,
      "epoch": 0.768,
      "grad_norm": 0.055538810789585114,
      "kl": 0.063116455078125,
      "learning_rate": 4.63406026519703e-07,
      "loss": 0.0025,
      "reward": 1.575,
      "reward_std": 0.2524180270731449,
      "rewards/accuracy_reward": 0.633333345502615,
      "rewards/format_reward": 0.9416666701436043,
      "step": 240
    },
    {
      "completion_length": 477.8666702270508,
      "epoch": 0.784,
      "grad_norm": 0.055306848138570786,
      "kl": 0.0677734375,
      "learning_rate": 4.042456336780838e-07,
      "loss": 0.0027,
      "reward": 1.4888888821005821,
      "reward_std": 0.2729522068053484,
      "rewards/accuracy_reward": 0.5527777902781963,
      "rewards/format_reward": 0.9361111149191856,
      "step": 245
    },
    {
      "completion_length": 463.327783203125,
      "epoch": 0.8,
      "grad_norm": 0.0591103695333004,
      "kl": 0.0603515625,
      "learning_rate": 3.4853288946298335e-07,
      "loss": 0.0024,
      "reward": 1.475000011920929,
      "reward_std": 0.29443012587726114,
      "rewards/accuracy_reward": 0.5361111234873533,
      "rewards/format_reward": 0.938888892531395,
      "step": 250
    },
    {
      "completion_length": 450.82778282165526,
      "epoch": 0.816,
      "grad_norm": 0.06738787144422531,
      "kl": 0.0596343994140625,
      "learning_rate": 2.9644308677943315e-07,
      "loss": 0.0024,
      "reward": 1.5000000134110452,
      "reward_std": 0.28128554075956347,
      "rewards/accuracy_reward": 0.5722222346812487,
      "rewards/format_reward": 0.9277777820825577,
      "step": 255
    },
    {
      "completion_length": 455.39445037841796,
      "epoch": 0.832,
      "grad_norm": 0.06288747489452362,
      "kl": 0.068408203125,
      "learning_rate": 2.48140119418046e-07,
      "loss": 0.0027,
      "reward": 1.4805555552244187,
      "reward_std": 0.2825747117400169,
      "rewards/accuracy_reward": 0.5305555703118443,
      "rewards/format_reward": 0.9500000029802322,
      "step": 260
    },
    {
      "completion_length": 470.1777805328369,
      "epoch": 0.848,
      "grad_norm": 0.07291523367166519,
      "kl": 0.062567138671875,
      "learning_rate": 2.0377596638451812e-07,
      "loss": 0.0025,
      "reward": 1.4444444566965102,
      "reward_std": 0.3079201437532902,
      "rewards/accuracy_reward": 0.5027777899056673,
      "rewards/format_reward": 0.9416666701436043,
      "step": 265
    },
    {
      "completion_length": 468.23333740234375,
      "epoch": 0.864,
      "grad_norm": 0.07961365580558777,
      "kl": 0.07003173828125,
      "learning_rate": 1.634902137174483e-07,
      "loss": 0.0028,
      "reward": 1.4638888970017434,
      "reward_std": 0.29924137964844705,
      "rewards/accuracy_reward": 0.541666678339243,
      "rewards/format_reward": 0.922222226858139,
      "step": 270
    },
    {
      "completion_length": 431.6388931274414,
      "epoch": 0.88,
      "grad_norm": 0.061992090195417404,
      "kl": 0.06956787109375,
      "learning_rate": 1.274096152990203e-07,
      "loss": 0.0028,
      "reward": 1.4666666746139527,
      "reward_std": 0.28609679341316224,
      "rewards/accuracy_reward": 0.5222222328186035,
      "rewards/format_reward": 0.9444444477558136,
      "step": 275
    },
    {
      "completion_length": 435.9138957977295,
      "epoch": 0.896,
      "grad_norm": 0.08779603242874146,
      "kl": 0.0734649658203125,
      "learning_rate": 9.564769404039419e-08,
      "loss": 0.0029,
      "reward": 1.425000011920929,
      "reward_std": 0.2979522071778774,
      "rewards/accuracy_reward": 0.48611112125217915,
      "rewards/format_reward": 0.938888892531395,
      "step": 280
    },
    {
      "completion_length": 432.5888919830322,
      "epoch": 0.912,
      "grad_norm": 0.0677870661020279,
      "kl": 0.074609375,
      "learning_rate": 6.830438469662892e-08,
      "loss": 0.003,
      "reward": 1.5027777940034865,
      "reward_std": 0.2764742888510227,
      "rewards/accuracy_reward": 0.5444444557651877,
      "rewards/format_reward": 0.9583333358168602,
      "step": 285
    },
    {
      "completion_length": 415.408337020874,
      "epoch": 0.928,
      "grad_norm": 0.08877945691347122,
      "kl": 0.079901123046875,
      "learning_rate": 4.546571943496969e-08,
      "loss": 0.0032,
      "reward": 1.5222222372889518,
      "reward_std": 0.29666303619742396,
      "rewards/accuracy_reward": 0.5972222350537777,
      "rewards/format_reward": 0.9250000044703484,
      "step": 290
    },
    {
      "completion_length": 470.35555839538574,
      "epoch": 0.944,
      "grad_norm": 0.06486877799034119,
      "kl": 0.062762451171875,
      "learning_rate": 2.72035571458224e-08,
      "loss": 0.0025,
      "reward": 1.4722222238779068,
      "reward_std": 0.26685178354382516,
      "rewards/accuracy_reward": 0.5361111244186759,
      "rewards/format_reward": 0.9361111149191856,
      "step": 295
    },
    {
      "completion_length": 442.53889656066895,
      "epoch": 0.96,
      "grad_norm": 0.06168290227651596,
      "kl": 0.06766357421875,
      "learning_rate": 1.357535734809795e-08,
      "loss": 0.0027,
      "reward": 1.4666666641831398,
      "reward_std": 0.31109679453074934,
      "rewards/accuracy_reward": 0.5361111233010888,
      "rewards/format_reward": 0.930555559694767,
      "step": 300
    },
    {
      "completion_length": 447.8833366394043,
      "epoch": 0.976,
      "grad_norm": 0.06104287877678871,
      "kl": 0.0698974609375,
      "learning_rate": 4.623999400308054e-09,
      "loss": 0.0028,
      "reward": 1.5027777954936028,
      "reward_std": 0.2777634594589472,
      "rewards/accuracy_reward": 0.5472222346812486,
      "rewards/format_reward": 0.9555555582046509,
      "step": 305
    },
    {
      "completion_length": 406.5805576324463,
      "epoch": 0.992,
      "grad_norm": 0.06284002214670181,
      "kl": 0.0779296875,
      "learning_rate": 3.77647586240204e-10,
      "loss": 0.0031,
      "reward": 1.4805555641651154,
      "reward_std": 0.2690846938639879,
      "rewards/accuracy_reward": 0.5388889022171497,
      "rewards/format_reward": 0.9416666693985463,
      "step": 310
    },
    {
      "completion_length": 461.6458377838135,
      "epoch": 0.9984,
      "kl": 0.06114959716796875,
      "reward": 1.4236111268401146,
      "reward_std": 0.29425740987062454,
      "rewards/accuracy_reward": 0.5000000135041773,
      "rewards/format_reward": 0.9236111156642437,
      "step": 312,
      "total_flos": 0.0,
      "train_loss": 0.002609439611100802,
      "train_runtime": 17087.0679,
      "train_samples_per_second": 0.439,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 5,
  "max_steps": 312,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}