{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.002, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.004, "grad_norm": 0.6894801259040833, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0, "reward": 0.5625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 1845.75, "epoch": 0.006, "grad_norm": 0.0036829786840826273, "kl": 4.696846008300781e-05, "learning_rate": 6e-08, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.008, "grad_norm": 0.0038558689411729574, "kl": 9.775161743164062e-05, "learning_rate": 8e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.01, "grad_norm": 0.7485816478729248, "kl": 0.00010919570922851562, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1985.5, "epoch": 0.012, "grad_norm": 0.7458791732788086, "kl": 7.796287536621094e-05, "learning_rate": 1.2e-07, "loss": -0.023, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.014, "grad_norm": 0.003527791704982519, "kl": 6.628036499023438e-05, "learning_rate": 1.4e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.016, "grad_norm": 0.8426324129104614, "kl": 7.104873657226562e-05, "learning_rate": 1.6e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.018, "grad_norm": 0.7430940270423889, "kl": 9.5367431640625e-05, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.02, "grad_norm": 0.003660305170342326, "kl": 5.626678466796875e-05, "learning_rate": 2e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.022, "grad_norm": 0.9972493648529053, "kl": 0.0001544952392578125, "learning_rate": 2.1999999999999998e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.024, "grad_norm": 0.003556522075086832, "kl": 6.723403930664062e-05, "learning_rate": 2.4e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 1682.25, "epoch": 0.026, "grad_norm": 0.0050394581630826, "kl": 8.845329284667969e-05, "learning_rate": 2.6e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.028, "grad_norm": 0.004075606819242239, "kl": 0.0001049041748046875, "learning_rate": 2.8e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.03, "grad_norm": 0.6532822251319885, "kl": 7.2479248046875e-05, "learning_rate": 3e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.032, "grad_norm": 0.7762453556060791, "kl": 0.00011777877807617188, "learning_rate": 3.2e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.034, "grad_norm": 0.7456417679786682, "kl": 8.58306884765625e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 1770.5, "epoch": 0.036, "grad_norm": 0.0034950117114931345, "kl": 3.5762786865234375e-05, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.038, "grad_norm": 0.003600472817197442, "kl": 6.866455078125e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1856.25, "epoch": 0.04, "grad_norm": 0.8460555076599121, "kl": 8.106231689453125e-05, "learning_rate": 4e-07, "loss": 0.0814, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.042, "grad_norm": 0.0038911281153559685, "kl": 0.0001068115234375, "learning_rate": 4.1999999999999995e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.044, "grad_norm": 0.7139887809753418, "kl": 0.0001125335693359375, "learning_rate": 4.3999999999999997e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 1855.25, "epoch": 0.046, "grad_norm": 0.005301118828356266, "kl": 5.6862831115722656e-05, "learning_rate": 4.6e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.048, "grad_norm": 0.0036839963868260384, "kl": 7.62939453125e-05, "learning_rate": 4.8e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.05, "grad_norm": 0.003639479400590062, "kl": 8.678436279296875e-05, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.052, "grad_norm": 0.8071563243865967, "kl": 8.535385131835938e-05, "learning_rate": 5.2e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.054, "grad_norm": 0.0037776094395667315, "kl": 0.00011014938354492188, "learning_rate": 5.4e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 1608.75, "epoch": 0.056, "grad_norm": 0.6625211834907532, "kl": 4.398822784423828e-05, "learning_rate": 5.6e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 1841.25, "epoch": 0.058, "grad_norm": 0.0036761562805622816, "kl": 8.678436279296875e-05, "learning_rate": 5.8e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.06, "grad_norm": 0.0033802345860749483, "kl": 5.7697296142578125e-05, "learning_rate": 6e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.062, "grad_norm": 0.003240807680413127, "kl": 6.29425048828125e-05, "learning_rate": 6.2e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.064, "grad_norm": 0.0036673492286354303, "kl": 7.152557373046875e-05, "learning_rate": 6.4e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 1735.75, "epoch": 0.066, "grad_norm": 0.8675172328948975, "kl": 2.682209014892578e-05, "learning_rate": 6.6e-07, "loss": -0.0631, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.068, "grad_norm": 0.8184774518013, "kl": 9.775161743164062e-05, "learning_rate": 6.800000000000001e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.07, "grad_norm": 0.003660478862002492, "kl": 0.00010251998901367188, "learning_rate": 7e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.072, "grad_norm": 0.004034126177430153, "kl": 6.4849853515625e-05, "learning_rate": 7.2e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.074, "grad_norm": 0.7589001655578613, "kl": 0.00010824203491210938, "learning_rate": 7.4e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.076, "grad_norm": 0.6517212986946106, "kl": 8.20159912109375e-05, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.078, "grad_norm": 0.8392653465270996, "kl": 0.0001430511474609375, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.08, "grad_norm": 0.990591824054718, "kl": 0.0001125335693359375, "learning_rate": 8e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.082, "grad_norm": 0.7600575685501099, "kl": 0.00014066696166992188, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.084, "grad_norm": 0.004982170183211565, "kl": 0.00013589859008789062, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.086, "grad_norm": 0.6588786840438843, "kl": 0.00019502639770507812, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.088, "grad_norm": 0.876471996307373, "kl": 0.00020933151245117188, "learning_rate": 8.799999999999999e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 1857.0, "epoch": 0.09, "grad_norm": 0.007592031732201576, "kl": 0.00015783309936523438, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.092, "grad_norm": 0.649493932723999, "kl": 0.00015401840209960938, "learning_rate": 9.2e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.094, "grad_norm": 0.0060058352537453175, "kl": 0.00024318695068359375, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 1703.5, "epoch": 0.096, "grad_norm": 0.7367803454399109, "kl": 0.00028061866760253906, "learning_rate": 9.6e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.098, "grad_norm": 0.7102321982383728, "kl": 0.0003223419189453125, "learning_rate": 9.8e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.1, "grad_norm": 0.5775962471961975, "kl": 0.00010824203491210938, "learning_rate": 1e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.102, "grad_norm": 0.0037765211891382933, "kl": 7.62939453125e-05, "learning_rate": 9.999890338174275e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 1991.5, "epoch": 0.104, "grad_norm": 0.8791236281394958, "kl": 0.00046634674072265625, "learning_rate": 9.999561358041868e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.106, "grad_norm": 0.006120009813457727, "kl": 0.0004935264587402344, "learning_rate": 9.999013075636804e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.108, "grad_norm": 0.006084068212658167, "kl": 0.0002951622009277344, "learning_rate": 9.998245517681593e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 1732.5, "epoch": 0.11, "grad_norm": 0.008206584490835667, "kl": 0.00015926361083984375, "learning_rate": 9.997258721585931e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 2031.5, "epoch": 0.112, "grad_norm": 0.06737767159938812, "kl": 0.0014257431030273438, "learning_rate": 9.996052735444862e-07, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.114, "grad_norm": 0.6304325461387634, "kl": 0.0003185272216796875, "learning_rate": 9.994627618036452e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.116, "grad_norm": 0.6972232460975647, "kl": 0.000560760498046875, "learning_rate": 9.992983438818915e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.118, "grad_norm": 0.026448842138051987, "kl": 0.0009822845458984375, "learning_rate": 9.991120277927223e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.12, "grad_norm": 0.006011964753270149, "kl": 0.0007162094116210938, "learning_rate": 9.989038226169207e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.122, "grad_norm": 0.6680987477302551, "kl": 0.0005612373352050781, "learning_rate": 9.98673738502114e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 1933.75, "epoch": 0.124, "grad_norm": 0.9241335988044739, "kl": 0.0006771087646484375, "learning_rate": 9.98421786662277e-07, "loss": -0.0443, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.126, "grad_norm": 0.009985378012061119, "kl": 0.0005092620849609375, "learning_rate": 9.981479793771866e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.128, "grad_norm": 0.03178563341498375, "kl": 0.0007648468017578125, "learning_rate": 9.97852329991824e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 1685.75, "epoch": 0.13, "grad_norm": 0.9064852595329285, "kl": 0.00341796875, "learning_rate": 9.975348529157229e-07, "loss": -0.0454, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.132, "grad_norm": 0.7919860482215881, "kl": 0.001010894775390625, "learning_rate": 9.971955636222684e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 1887.25, "epoch": 0.134, "grad_norm": 0.8760141134262085, "kl": 0.000766754150390625, "learning_rate": 9.968344786479415e-07, "loss": 0.0659, "reward": 0.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.136, "grad_norm": 0.004815725143998861, "kl": 0.0005044937133789062, "learning_rate": 9.964516155915151e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 1797.25, "epoch": 0.138, "grad_norm": 0.8277981877326965, "kl": 0.0012836456298828125, "learning_rate": 9.960469931131936e-07, "loss": 0.1147, "reward": 0.9375, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.75, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 2000.25, "epoch": 0.14, "grad_norm": 0.8641435503959656, "kl": 0.0011048316955566406, "learning_rate": 9.956206309337066e-07, "loss": 0.0173, "reward": 0.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.142, "grad_norm": 0.009465116076171398, "kl": 0.00019884109497070312, "learning_rate": 9.951725498333448e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 1618.25, "epoch": 0.144, "grad_norm": 0.8773428797721863, "kl": 0.0106658935546875, "learning_rate": 9.947027716509488e-07, "loss": 0.0216, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.146, "grad_norm": 0.0040911422111094, "kl": 0.0006923675537109375, "learning_rate": 9.942113192828444e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.148, "grad_norm": 0.0059630973264575005, "kl": 0.000446319580078125, "learning_rate": 9.93698216681727e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 1836.75, "epoch": 0.15, "grad_norm": 1.146507978439331, "kl": 0.0004787445068359375, "learning_rate": 9.931634888554935e-07, "loss": -0.0918, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.152, "grad_norm": 0.7911372184753418, "kl": 0.00030231475830078125, "learning_rate": 9.926071618660237e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.154, "grad_norm": 0.8156778216362, "kl": 0.0006389617919921875, "learning_rate": 9.9202926282791e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 1915.25, "epoch": 0.156, "grad_norm": 0.005794746335595846, "kl": 0.0005588531494140625, "learning_rate": 9.91429819907136e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 1918.0, "epoch": 0.158, "grad_norm": 1.0309467315673828, "kl": 0.0005645751953125, "learning_rate": 9.908088623197048e-07, "loss": -0.0514, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.16, "grad_norm": 0.9030879735946655, "kl": 0.0043182373046875, "learning_rate": 9.901664203302124e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.162, "grad_norm": 0.8063321113586426, "kl": 0.00075531005859375, "learning_rate": 9.895025252503755e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.164, "grad_norm": 0.01648077741265297, "kl": 0.00039196014404296875, "learning_rate": 9.888172094375033e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 1855.0, "epoch": 0.166, "grad_norm": 0.7015650272369385, "kl": 0.0013475418090820312, "learning_rate": 9.881105062929221e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.168, "grad_norm": 0.006011520978063345, "kl": 0.00021600723266601562, "learning_rate": 9.873824502603459e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 1855.5, "epoch": 0.17, "grad_norm": 0.9796985983848572, "kl": 0.00054168701171875, "learning_rate": 9.866330768241983e-07, "loss": 0.0819, "reward": 0.5, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.172, "grad_norm": 0.006532273255288601, "kl": 0.0004496574401855469, "learning_rate": 9.85862422507884e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.174, "grad_norm": 0.007916338741779327, "kl": 0.0005373954772949219, "learning_rate": 9.850705248720068e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.176, "grad_norm": 0.006668766029179096, "kl": 0.00024175643920898438, "learning_rate": 9.8425742251254e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 1533.5, "epoch": 0.178, "grad_norm": 0.01681283488869667, "kl": 0.0006351470947265625, "learning_rate": 9.83423155058946e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.18, "grad_norm": 0.6811097860336304, "kl": 0.0013413429260253906, "learning_rate": 9.825677631722435e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.182, "grad_norm": 0.7697494626045227, "kl": 0.0030584335327148438, "learning_rate": 9.816912885430258e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 1913.25, "epoch": 0.184, "grad_norm": 1.1320937871932983, "kl": 0.001331329345703125, "learning_rate": 9.807937738894303e-07, "loss": 0.0536, "reward": 0.4375, "reward_std": 0.4419417306780815, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 1742.75, "epoch": 0.186, "grad_norm": 0.6808719635009766, "kl": 0.0010509490966796875, "learning_rate": 9.798752629550546e-07, "loss": -0.1501, "reward": 0.3125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.188, "grad_norm": 0.007471662946045399, "kl": 0.0004634857177734375, "learning_rate": 9.78935800506826e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.19, "grad_norm": 0.010327517054975033, "kl": 0.000568389892578125, "learning_rate": 9.779754323328192e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.192, "grad_norm": 0.6696183681488037, "kl": 0.0009899139404296875, "learning_rate": 9.769942052400235e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.194, "grad_norm": 0.8246662020683289, "kl": 0.0017538070678710938, "learning_rate": 9.759921670520634e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 1928.5, "epoch": 0.196, "grad_norm": 0.955489456653595, "kl": 0.0009098052978515625, "learning_rate": 9.749693666068663e-07, "loss": 0.0467, "reward": 0.375, "reward_std": 0.3535533770918846, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.198, "grad_norm": 0.03343038633465767, "kl": 0.0007419586181640625, "learning_rate": 9.739258537542835e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.2, "grad_norm": 0.006429341156035662, "kl": 0.001178741455078125, "learning_rate": 9.728616793536587e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.202, "grad_norm": 0.00873623974621296, "kl": 0.0005130767822265625, "learning_rate": 9.717768952713511e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 1959.0, "epoch": 0.204, "grad_norm": 0.017306441441178322, "kl": 0.000946044921875, "learning_rate": 9.706715543782064e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.206, "grad_norm": 0.7247556447982788, "kl": 0.0008764266967773438, "learning_rate": 9.695457105469804e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.208, "grad_norm": 0.008871670812368393, "kl": 0.00033664703369140625, "learning_rate": 9.683994186497132e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.21, "grad_norm": 0.007749219890683889, "kl": 0.00040531158447265625, "learning_rate": 9.672327345550543e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.212, "grad_norm": 0.010708320885896683, "kl": 0.0010166168212890625, "learning_rate": 9.66045715125541e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.214, "grad_norm": 0.007295151706784964, "kl": 0.0003333091735839844, "learning_rate": 9.648384182148252e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.216, "grad_norm": 0.7922310829162598, "kl": 0.000408172607421875, "learning_rate": 9.636109026648554e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.218, "grad_norm": 0.007899758405983448, "kl": 0.0006399154663085938, "learning_rate": 9.623632283030077e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.22, "grad_norm": 0.9048980474472046, "kl": 0.001056671142578125, "learning_rate": 9.610954559391704e-07, "loss": 0.0, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.222, "grad_norm": 0.010189698077738285, "kl": 0.00031948089599609375, "learning_rate": 9.598076473627796e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 1794.5, "epoch": 0.224, "grad_norm": 1.0683528184890747, "kl": 0.0034933090209960938, "learning_rate": 9.58499865339809e-07, "loss": -0.1162, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.226, "grad_norm": 0.011858138255774975, "kl": 0.000347137451171875, "learning_rate": 9.571721736097088e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.228, "grad_norm": 0.012356019578874111, "kl": 0.000885009765625, "learning_rate": 9.55824636882301e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 1752.5, "epoch": 0.23, "grad_norm": 1.0531798601150513, "kl": 0.001102447509765625, "learning_rate": 9.54457320834625e-07, "loss": 0.1434, "reward": 0.1875, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.232, "grad_norm": 0.012715999968349934, "kl": 0.00096893310546875, "learning_rate": 9.530702921077358e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 2033.5, "epoch": 0.234, "grad_norm": 0.7506431937217712, "kl": 0.0050945281982421875, "learning_rate": 9.516636183034564e-07, "loss": 0.0002, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.236, "grad_norm": 0.020277904346585274, "kl": 0.00078582763671875, "learning_rate": 9.502373679810839e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 1624.5, "epoch": 0.238, "grad_norm": 0.9856612086296082, "kl": 0.0022125244140625, "learning_rate": 9.487916106540465e-07, "loss": -0.0067, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.24, "grad_norm": 0.007632279768586159, "kl": 0.000583648681640625, "learning_rate": 9.473264167865171e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.242, "grad_norm": 0.007833893410861492, "kl": 0.0008401870727539062, "learning_rate": 9.458418577899774e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.244, "grad_norm": 0.00713867275044322, "kl": 0.0011892318725585938, "learning_rate": 9.443380060197385e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.246, "grad_norm": 0.011064039543271065, "kl": 0.0003185272216796875, "learning_rate": 9.428149347714143e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.248, "grad_norm": 0.009095462039113045, "kl": 0.0006237030029296875, "learning_rate": 9.412727182773486e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.25, "grad_norm": 0.007876625284552574, "kl": 0.00144195556640625, "learning_rate": 9.397114317029974e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.252, "grad_norm": 0.05358020216226578, "kl": 0.0010623931884765625, "learning_rate": 9.381311511432658e-07, "loss": 0.0, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.254, "grad_norm": 0.030433854088187218, "kl": 0.0003848075866699219, "learning_rate": 9.36531953618799e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.256, "grad_norm": 0.852528989315033, "kl": 0.0011425018310546875, "learning_rate": 9.34913917072228e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.258, "grad_norm": 0.013770471327006817, "kl": 0.0007495880126953125, "learning_rate": 9.332771203643714e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.26, "grad_norm": 0.7055062055587769, "kl": 0.0012664794921875, "learning_rate": 9.316216432703916e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.262, "grad_norm": 0.7149041295051575, "kl": 0.002803802490234375, "learning_rate": 9.299475664759068e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 1480.25, "epoch": 0.264, "grad_norm": 0.014864159747958183, "kl": 0.031280517578125, "learning_rate": 9.282549715730579e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.266, "grad_norm": 0.8071376085281372, "kl": 0.00067901611328125, "learning_rate": 9.265439410565328e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 2047.25, "epoch": 0.268, "grad_norm": 0.009836402721703053, "kl": 0.0012359619140625, "learning_rate": 9.248145583195447e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 2012.25, "epoch": 0.27, "grad_norm": 0.007896827533841133, "kl": 0.0009002685546875, "learning_rate": 9.230669076497687e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.272, "grad_norm": 0.0065347570925951, "kl": 0.000980377197265625, "learning_rate": 9.213010742252327e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.274, "grad_norm": 0.8659334778785706, "kl": 0.00147247314453125, "learning_rate": 9.195171441101668e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.276, "grad_norm": 0.009184672497212887, "kl": 0.001373291015625, "learning_rate": 9.177152042508077e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.278, "grad_norm": 0.00881196279078722, "kl": 0.001476287841796875, "learning_rate": 9.158953424711624e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 1876.0, "epoch": 0.28, "grad_norm": 0.7441470623016357, "kl": 0.0006885528564453125, "learning_rate": 9.140576474687263e-07, "loss": 0.06, "reward": 0.3125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.282, "grad_norm": 0.008356408216059208, "kl": 0.00101470947265625, "learning_rate": 9.122022088101613e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1991.5, "epoch": 0.284, "grad_norm": 0.9580811262130737, "kl": 0.006805419921875, "learning_rate": 9.103291169269299e-07, "loss": 0.0003, "reward": 0.5625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.286, "grad_norm": 0.008105803281068802, "kl": 0.002086639404296875, "learning_rate": 9.084384631108882e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.288, "grad_norm": 0.007048532832413912, "kl": 0.001430511474609375, "learning_rate": 9.065303395098358e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 1714.5, "epoch": 0.29, "grad_norm": 0.881592869758606, "kl": 0.001071929931640625, "learning_rate": 9.046048391230247e-07, "loss": 0.1707, "reward": 0.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.292, "grad_norm": 0.008095495402812958, "kl": 0.00165557861328125, "learning_rate": 9.026620557966279e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.294, "grad_norm": 0.019286898896098137, "kl": 0.00104522705078125, "learning_rate": 9.007020842191634e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.296, "grad_norm": 0.007973677478730679, "kl": 0.0015087127685546875, "learning_rate": 8.987250199168808e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.298, "grad_norm": 0.007949120365083218, "kl": 0.0004978179931640625, "learning_rate": 8.967309592491052e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.3, "grad_norm": 0.007726718205958605, "kl": 0.001689910888671875, "learning_rate": 8.9471999940354e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.302, "grad_norm": 0.00826709158718586, "kl": 0.0013580322265625, "learning_rate": 8.926922383915315e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.304, "grad_norm": 0.007963276468217373, "kl": 0.00203704833984375, "learning_rate": 8.906477750432903e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.306, "grad_norm": 0.008207273669540882, "kl": 0.0008563995361328125, "learning_rate": 8.88586709003076e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.308, "grad_norm": 0.010204545222222805, "kl": 0.0006160736083984375, "learning_rate": 8.865091407243394e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.31, "grad_norm": 0.00880539882928133, "kl": 0.001453399658203125, "learning_rate": 8.844151714648274e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.312, "grad_norm": 0.010449129156768322, "kl": 0.0010318756103515625, "learning_rate": 8.823049032816478e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.314, "grad_norm": 0.010188284330070019, "kl": 0.000789642333984375, "learning_rate": 8.801784390262943e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 1963.5, "epoch": 0.316, "grad_norm": 0.7694103717803955, "kl": 0.000640869140625, "learning_rate": 8.780358823396352e-07, "loss": 0.0318, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 1807.0, "epoch": 0.318, "grad_norm": 0.8925904631614685, "kl": 0.001445770263671875, "learning_rate": 8.758773376468604e-07, "loss": -0.1087, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.32, "grad_norm": 0.7490972280502319, "kl": 0.01116180419921875, "learning_rate": 8.737029101523929e-07, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.322, "grad_norm": 0.008408155292272568, "kl": 0.0014629364013671875, "learning_rate": 8.715127058347614e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.324, "grad_norm": 0.8539191484451294, "kl": 0.0008831024169921875, "learning_rate": 8.693068314414344e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.326, "grad_norm": 0.009831397794187069, "kl": 0.000682830810546875, "learning_rate": 8.670853944836176e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.328, "grad_norm": 0.011773304082453251, "kl": 0.001129150390625, "learning_rate": 8.648485032310144e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.33, "grad_norm": 0.7276328206062317, "kl": 0.001377105712890625, "learning_rate": 8.625962667065487e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 1512.75, "epoch": 0.332, "grad_norm": 0.008593901991844177, "kl": 0.0006933212280273438, "learning_rate": 8.603287946810513e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.334, "grad_norm": 0.011681273579597473, "kl": 0.000759124755859375, "learning_rate": 8.580461976679099e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.336, "grad_norm": 0.9719880819320679, "kl": 0.00112152099609375, "learning_rate": 8.557485869176825e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 1850.75, "epoch": 0.338, "grad_norm": 0.017566794529557228, "kl": 0.002292633056640625, "learning_rate": 8.534360744126753e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 2016.0, "epoch": 0.34, "grad_norm": 1.1285712718963623, "kl": 0.00118255615234375, "learning_rate": 8.511087728614862e-07, "loss": 0.0114, "reward": 0.3125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.342, "grad_norm": 0.009710059501230717, "kl": 0.001407623291015625, "learning_rate": 8.487667956935087e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.344, "grad_norm": 0.009776381775736809, "kl": 0.0014629364013671875, "learning_rate": 8.464102570534061e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.346, "grad_norm": 0.00871388241648674, "kl": 0.000507354736328125, "learning_rate": 8.440392717955475e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.348, "grad_norm": 0.00912429578602314, "kl": 0.000762939453125, "learning_rate": 8.416539554784089e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.35, "grad_norm": 0.011816666461527348, "kl": 0.0008449554443359375, "learning_rate": 8.392544243589427e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.352, "grad_norm": 0.00985003262758255, "kl": 0.001529693603515625, "learning_rate": 8.368407953869103e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.354, "grad_norm": 0.00919476430863142, "kl": 0.00167083740234375, "learning_rate": 8.344131861991828e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.356, "grad_norm": 0.01162977609783411, "kl": 0.00091552734375, "learning_rate": 8.319717151140072e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 1970.5, "epoch": 0.358, "grad_norm": 0.01332745049148798, "kl": 0.001888275146484375, "learning_rate": 8.295165011252396e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 1730.75, "epoch": 0.36, "grad_norm": 0.013342260383069515, "kl": 0.00089263916015625, "learning_rate": 8.270476638965461e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.362, "grad_norm": 0.012930563651025295, "kl": 0.001483917236328125, "learning_rate": 8.245653237555705e-07, "loss": 0.0001, "reward": 0.625, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.364, "grad_norm": 0.03247583284974098, "kl": 0.00113677978515625, "learning_rate": 8.220696016880687e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.366, "grad_norm": 0.8804360032081604, "kl": 0.0019130706787109375, "learning_rate": 8.195606193320136e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.368, "grad_norm": 0.007631482556462288, "kl": 0.001468658447265625, "learning_rate": 8.170384989716657e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.37, "grad_norm": 0.008286840282380581, "kl": 0.001613616943359375, "learning_rate": 8.145033635316128e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.372, "grad_norm": 0.016586236655712128, "kl": 0.0007076263427734375, "learning_rate": 8.119553365707802e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.374, "grad_norm": 0.010034182108938694, "kl": 0.0008754730224609375, "learning_rate": 8.093945422764069e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.376, "grad_norm": 0.7020014524459839, "kl": 0.0012836456298828125, "learning_rate": 8.068211054579943e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.378, "grad_norm": 1.163500189781189, "kl": 0.0016021728515625, "learning_rate": 8.04235151541222e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.38, "grad_norm": 0.8533800840377808, "kl": 0.0010223388671875, "learning_rate": 8.01636806561836e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.382, "grad_norm": 0.8227788805961609, "kl": 0.0030975341796875, "learning_rate": 7.990261971595048e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.384, "grad_norm": 0.011001263745129108, "kl": 0.0008344650268554688, "learning_rate": 7.964034505716476e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.386, "grad_norm": 0.009392702020704746, "kl": 0.001972198486328125, "learning_rate": 7.93768694627233e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 1788.25, "epoch": 0.388, "grad_norm": 1.1597681045532227, "kl": 0.01373291015625, "learning_rate": 7.911220577405484e-07, "loss": 0.1207, "reward": 0.1875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.39, "grad_norm": 0.00962373148649931, "kl": 0.0009326934814453125, "learning_rate": 7.884636689049422e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 2025.0, "epoch": 0.392, "grad_norm": 0.009538036771118641, "kl": 0.00116729736328125, "learning_rate": 7.857936576865356e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.394, "grad_norm": 0.01050383411347866, "kl": 0.0007953643798828125, "learning_rate": 7.831121542179086e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.396, "grad_norm": 0.009053800255060196, "kl": 0.001438140869140625, "learning_rate": 7.804192891917571e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.398, "grad_norm": 0.00897100381553173, "kl": 0.001312255859375, "learning_rate": 7.777151938545235e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.4, "grad_norm": 0.01025310903787613, "kl": 0.001468658447265625, "learning_rate": 7.75e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.402, "grad_norm": 0.054522059857845306, "kl": 0.0010223388671875, "learning_rate": 7.72273839962904e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 1953.5, "epoch": 0.404, "grad_norm": 0.037722665816545486, "kl": 0.0011157989501953125, "learning_rate": 7.695368466124296e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.406, "grad_norm": 0.008431609719991684, "kl": 0.0020599365234375, "learning_rate": 7.667891533457718e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 1929.0, "epoch": 0.408, "grad_norm": 0.9533175826072693, "kl": 0.001682281494140625, "learning_rate": 7.640308940816239e-07, "loss": 0.0001, "reward": 0.5625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 2036.0, "epoch": 0.41, "grad_norm": 0.013962327502667904, "kl": 0.0011119842529296875, "learning_rate": 7.612622032536507e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.412, "grad_norm": 0.010255957953631878, "kl": 0.0009479522705078125, "learning_rate": 7.584832158039378e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.414, "grad_norm": 0.01276308298110962, "kl": 0.0014495849609375, "learning_rate": 7.556940671764124e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 1809.5, "epoch": 0.416, "grad_norm": 0.01625184714794159, "kl": 0.00093841552734375, "learning_rate": 7.528948933102438e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.418, "grad_norm": 0.010438801720738411, "kl": 0.0012836456298828125, "learning_rate": 7.500858306332172e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.42, "grad_norm": 0.011556737124919891, "kl": 0.00215911865234375, "learning_rate": 7.472670160550848e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.422, "grad_norm": 0.009882017970085144, "kl": 0.0005950927734375, "learning_rate": 7.444385869608921e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 1786.0, "epoch": 0.424, "grad_norm": 0.8873146176338196, "kl": 0.002838134765625, "learning_rate": 7.416006812042827e-07, "loss": -0.0358, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.426, "grad_norm": 0.01222989521920681, "kl": 0.0014190673828125, "learning_rate": 7.387534371007797e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.428, "grad_norm": 0.009303831495344639, "kl": 0.00127410888671875, "learning_rate": 7.358969934210438e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.43, "grad_norm": 0.0105022257193923, "kl": 0.0008087158203125, "learning_rate": 7.330314893841101e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.432, "grad_norm": 0.763167679309845, "kl": 0.001544952392578125, "learning_rate": 7.301570646506027e-07, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.434, "grad_norm": 0.010868730954825878, "kl": 0.001415252685546875, "learning_rate": 7.27273859315928e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.436, "grad_norm": 0.016456812620162964, "kl": 0.0006694793701171875, "learning_rate": 7.243820139034464e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 1947.75, "epoch": 0.438, "grad_norm": 0.5713726282119751, "kl": 0.0039825439453125, "learning_rate": 7.214816693576234e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.44, "grad_norm": 0.008825725875794888, "kl": 0.000873565673828125, "learning_rate": 7.185729670371604e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.442, "grad_norm": 0.015775861218571663, "kl": 0.0016326904296875, "learning_rate": 7.156560487081051e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 1892.75, "epoch": 0.444, "grad_norm": 0.011794094927608967, "kl": 0.002288818359375, "learning_rate": 7.127310565369415e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.446, "grad_norm": 0.8902475833892822, "kl": 0.0021953582763671875, "learning_rate": 7.097981330836616e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 1976.5, "epoch": 0.448, "grad_norm": 0.01060924306511879, "kl": 0.00159454345703125, "learning_rate": 7.068574212948169e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 1884.0, "epoch": 0.45, "grad_norm": 0.7874477505683899, "kl": 0.00165557861328125, "learning_rate": 7.039090644965509e-07, "loss": 0.0675, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.452, "grad_norm": 0.009480384178459644, "kl": 0.001316070556640625, "learning_rate": 7.009532063876148e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.454, "grad_norm": 0.8971606492996216, "kl": 0.00217437744140625, "learning_rate": 6.979899910323624e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 1950.75, "epoch": 0.456, "grad_norm": 0.013516876846551895, "kl": 0.001598358154296875, "learning_rate": 6.950195628537299e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.458, "grad_norm": 0.012519976124167442, "kl": 0.0012359619140625, "learning_rate": 6.920420666261961e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 1604.75, "epoch": 0.46, "grad_norm": 1.0533803701400757, "kl": 0.0024871826171875, "learning_rate": 6.890576474687263e-07, "loss": 0.1645, "reward": 0.75, "reward_std": 0.7071067541837692, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.462, "grad_norm": 0.04883728548884392, "kl": 0.0012359619140625, "learning_rate": 6.860664508377001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.464, "grad_norm": 0.01617475040256977, "kl": 0.00106048583984375, "learning_rate": 6.83068622519821e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.466, "grad_norm": 0.01098883431404829, "kl": 0.00160980224609375, "learning_rate": 6.800643086250121e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.468, "grad_norm": 0.010283930227160454, "kl": 0.001750946044921875, "learning_rate": 6.770536555792944e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.47, "grad_norm": 0.7373048663139343, "kl": 0.00131988525390625, "learning_rate": 6.740368101176495e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.472, "grad_norm": 0.01158232893794775, "kl": 0.001461029052734375, "learning_rate": 6.710139192768694e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.474, "grad_norm": 0.8555125594139099, "kl": 0.001068115234375, "learning_rate": 6.679851303883891e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.476, "grad_norm": 0.8464781641960144, "kl": 0.006557464599609375, "learning_rate": 6.649505910711058e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.478, "grad_norm": 0.14680787920951843, "kl": 0.003086090087890625, "learning_rate": 6.619104492241847e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.48, "grad_norm": 0.8916466236114502, "kl": 0.010036468505859375, "learning_rate": 6.588648530198504e-07, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.482, "grad_norm": 0.012613446451723576, "kl": 0.001827239990234375, "learning_rate": 6.558139508961654e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.484, "grad_norm": 0.012667362578213215, "kl": 0.001438140869140625, "learning_rate": 6.527578915497951e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.486, "grad_norm": 0.011633522808551788, "kl": 0.0007953643798828125, "learning_rate": 6.496968239287603e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.488, "grad_norm": 0.010322234593331814, "kl": 0.00118255615234375, "learning_rate": 6.466308972251785e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.49, "grad_norm": 0.013764915056526661, "kl": 0.001338958740234375, "learning_rate": 6.435602608679916e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.492, "grad_norm": 0.016423719003796577, "kl": 0.0011043548583984375, "learning_rate": 6.404850645156841e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.494, "grad_norm": 0.011886836029589176, "kl": 0.0008907318115234375, "learning_rate": 6.374054580489873e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.496, "grad_norm": 0.013753964565694332, "kl": 0.0009822845458984375, "learning_rate": 6.343215915635761e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.498, "grad_norm": 0.012257426045835018, "kl": 0.00170135498046875, "learning_rate": 6.31233615362752e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.5, "grad_norm": 0.7262430191040039, "kl": 0.00194549560546875, "learning_rate": 6.281416799501187e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.502, "grad_norm": 0.7270606756210327, "kl": 0.0007076263427734375, "learning_rate": 6.25045936022246e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.504, "grad_norm": 0.017829036340117455, "kl": 0.0005779266357421875, "learning_rate": 6.219465344613258e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.506, "grad_norm": 0.0317312628030777, "kl": 0.0013294219970703125, "learning_rate": 6.188436263278172e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 1972.25, "epoch": 0.508, "grad_norm": 0.026640823110938072, "kl": 0.000835418701171875, "learning_rate": 6.157373628530852e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.51, "grad_norm": 0.012969509698450565, "kl": 0.001567840576171875, "learning_rate": 6.126278954320294e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.512, "grad_norm": 0.012548292055726051, "kl": 0.0010986328125, "learning_rate": 6.095153756157051e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.514, "grad_norm": 0.008928977884352207, "kl": 0.000873565673828125, "learning_rate": 6.06399955103937e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.516, "grad_norm": 0.8101487755775452, "kl": 0.0009307861328125, "learning_rate": 6.032817857379256e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.518, "grad_norm": 0.8978201746940613, "kl": 0.0007915496826171875, "learning_rate": 6.001610194928464e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.52, "grad_norm": 1.092624306678772, "kl": 0.00101470947265625, "learning_rate": 5.97037808470444e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.522, "grad_norm": 0.7928449511528015, "kl": 0.0011920928955078125, "learning_rate": 5.939123048916173e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 1534.75, "epoch": 0.524, "grad_norm": 1.0032625198364258, "kl": 0.0021257400512695312, "learning_rate": 5.907846610890011e-07, "loss": 0.0615, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.526, "grad_norm": 0.011720544658601284, "kl": 0.000644683837890625, "learning_rate": 5.87655029499542e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.528, "grad_norm": 0.011771513149142265, "kl": 0.0007781982421875, "learning_rate": 5.845235626570683e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.53, "grad_norm": 0.013503954745829105, "kl": 0.001155853271484375, "learning_rate": 5.813904131848564e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.532, "grad_norm": 0.09234623610973358, "kl": 0.0018482208251953125, "learning_rate": 5.78255733788191e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.534, "grad_norm": 0.011625738814473152, "kl": 0.0007114410400390625, "learning_rate": 5.751196772469237e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 1589.75, "epoch": 0.536, "grad_norm": 0.9924006462097168, "kl": 0.0024566650390625, "learning_rate": 5.71982396408026e-07, "loss": -0.0413, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.538, "grad_norm": 0.726823627948761, "kl": 0.001861572265625, "learning_rate": 5.688440441781398e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.54, "grad_norm": 0.011368845589458942, "kl": 0.001186370849609375, "learning_rate": 5.657047735161255e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.542, "grad_norm": 0.014150974340736866, "kl": 0.0012617111206054688, "learning_rate": 5.625647374256061e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.544, "grad_norm": 0.03309360519051552, "kl": 0.00139617919921875, "learning_rate": 5.594240889475106e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.546, "grad_norm": 0.027406711131334305, "kl": 0.002048492431640625, "learning_rate": 5.562829811526154e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 1862.75, "epoch": 0.548, "grad_norm": 0.872232973575592, "kl": 0.001857757568359375, "learning_rate": 5.531415671340826e-07, "loss": 0.0781, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 1893.5, "epoch": 0.55, "grad_norm": 0.013754754327237606, "kl": 0.00106048583984375, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.552, "grad_norm": 0.8218861222267151, "kl": 0.0015106201171875, "learning_rate": 5.468584328659172e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.554, "grad_norm": 0.01286914199590683, "kl": 0.001323699951171875, "learning_rate": 5.437170188473847e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.556, "grad_norm": 0.8357925415039062, "kl": 0.00146484375, "learning_rate": 5.405759110524894e-07, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.558, "grad_norm": 0.011115595698356628, "kl": 0.001163482666015625, "learning_rate": 5.37435262574394e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.56, "grad_norm": 0.7368820309638977, "kl": 0.0016918182373046875, "learning_rate": 5.342952264838747e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 1845.5, "epoch": 0.562, "grad_norm": 0.8080930709838867, "kl": 0.002285003662109375, "learning_rate": 5.311559558218603e-07, "loss": 0.0872, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.564, "grad_norm": 0.6212303638458252, "kl": 0.0017833709716796875, "learning_rate": 5.28017603591974e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.566, "grad_norm": 0.8698393106460571, "kl": 0.001556396484375, "learning_rate": 5.248803227530763e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.568, "grad_norm": 0.6195830702781677, "kl": 0.001125335693359375, "learning_rate": 5.21744266211809e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.57, "grad_norm": 0.014679288491606712, "kl": 0.00112152099609375, "learning_rate": 5.186095868151436e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.572, "grad_norm": 0.011439023539423943, "kl": 0.000751495361328125, "learning_rate": 5.154764373429315e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.574, "grad_norm": 0.013943897560238838, "kl": 0.00086212158203125, "learning_rate": 5.123449705004581e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.576, "grad_norm": 0.011966060847043991, "kl": 0.001087188720703125, "learning_rate": 5.09215338910999e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.578, "grad_norm": 0.7932072877883911, "kl": 0.002170562744140625, "learning_rate": 5.060876951083828e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.58, "grad_norm": 0.01169038936495781, "kl": 0.001129150390625, "learning_rate": 5.02962191529556e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.582, "grad_norm": 0.85643470287323, "kl": 0.002471923828125, "learning_rate": 4.998389805071536e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.584, "grad_norm": 1.001603364944458, "kl": 0.01709747314453125, "learning_rate": 4.967182142620745e-07, "loss": 0.0007, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 1886.25, "epoch": 0.586, "grad_norm": 0.7774127721786499, "kl": 0.001667022705078125, "learning_rate": 4.93600044896063e-07, "loss": 0.0664, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.588, "grad_norm": 1.0563451051712036, "kl": 0.00324249267578125, "learning_rate": 4.904846243842949e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.59, "grad_norm": 0.012082475237548351, "kl": 0.00081634521484375, "learning_rate": 4.873721045679706e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 2003.5, "epoch": 0.592, "grad_norm": 0.590258002281189, "kl": 0.001506805419921875, "learning_rate": 4.842626371469149e-07, "loss": 0.0161, "reward": 0.625, "reward_std": 0.5303300619125366, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.594, "grad_norm": 0.7182537913322449, "kl": 0.00550079345703125, "learning_rate": 4.811563736721829e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 1947.25, "epoch": 0.596, "grad_norm": 1.0634351968765259, "kl": 0.0025787353515625, "learning_rate": 4.780534655386743e-07, "loss": 0.0387, "reward": 0.6875, "reward_std": 0.6187184080481529, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.598, "grad_norm": 0.054920367896556854, "kl": 0.00203704833984375, "learning_rate": 4.749540639777539e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.6, "grad_norm": 0.012694926001131535, "kl": 0.000698089599609375, "learning_rate": 4.7185832004988133e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1795.5, "epoch": 0.602, "grad_norm": 0.759120523929596, "kl": 0.0024261474609375, "learning_rate": 4.68766384637248e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.604, "grad_norm": 0.010465751402080059, "kl": 0.001453399658203125, "learning_rate": 4.656784084364238e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 1853.0, "epoch": 0.606, "grad_norm": 0.8468216061592102, "kl": 0.00121307373046875, "learning_rate": 4.6259454195101267e-07, "loss": -0.0831, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.608, "grad_norm": 0.9815247654914856, "kl": 0.001689910888671875, "learning_rate": 4.59514935484316e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 1868.75, "epoch": 0.61, "grad_norm": 0.8037129044532776, "kl": 0.000972747802734375, "learning_rate": 4.5643973913200837e-07, "loss": 0.075, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.612, "grad_norm": 0.011851584538817406, "kl": 0.001220703125, "learning_rate": 4.5336910277482155e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.614, "grad_norm": 0.012452667579054832, "kl": 0.000675201416015625, "learning_rate": 4.503031760712397e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.616, "grad_norm": 0.8061856627464294, "kl": 0.001941680908203125, "learning_rate": 4.4724210845020494e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.618, "grad_norm": 0.011598404496908188, "kl": 0.0009021759033203125, "learning_rate": 4.441860491038345e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.62, "grad_norm": 0.013049165718257427, "kl": 0.0008697509765625, "learning_rate": 4.4113514698014953e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.622, "grad_norm": 0.01000931765884161, "kl": 0.0010738372802734375, "learning_rate": 4.3808955077581546e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.624, "grad_norm": 0.017103025689721107, "kl": 0.00090789794921875, "learning_rate": 4.350494089288943e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.626, "grad_norm": 0.015600494109094143, "kl": 0.001026153564453125, "learning_rate": 4.3201486961161093e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.628, "grad_norm": 0.68843674659729, "kl": 0.0009326934814453125, "learning_rate": 4.2898608072313045e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.63, "grad_norm": 0.02028859592974186, "kl": 0.001399993896484375, "learning_rate": 4.2596318988235037e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.632, "grad_norm": 0.010004539042711258, "kl": 0.0006055831909179688, "learning_rate": 4.2294634442070553e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.634, "grad_norm": 0.012407040223479271, "kl": 0.001323699951171875, "learning_rate": 4.1993569137498776e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.636, "grad_norm": 0.7615489363670349, "kl": 0.001617431640625, "learning_rate": 4.1693137748017915e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.638, "grad_norm": 0.7873152494430542, "kl": 0.00064849853515625, "learning_rate": 4.1393354916230005e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.64, "grad_norm": 0.012965940870344639, "kl": 0.000850677490234375, "learning_rate": 4.1094235253127374e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.642, "grad_norm": 0.013525367714464664, "kl": 0.000804901123046875, "learning_rate": 4.079579333738039e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.644, "grad_norm": 0.8479946255683899, "kl": 0.0014972686767578125, "learning_rate": 4.0498043714627006e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.646, "grad_norm": 0.7696375250816345, "kl": 0.002063751220703125, "learning_rate": 4.020100089676376e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.648, "grad_norm": 0.012891444377601147, "kl": 0.00150299072265625, "learning_rate": 3.9904679361238526e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.65, "grad_norm": 1.1341594457626343, "kl": 0.002437591552734375, "learning_rate": 3.9609093550344907e-07, "loss": 0.0001, "reward": 0.375, "reward_std": 0.5303300842642784, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.652, "grad_norm": 0.015124933794140816, "kl": 0.0007419586181640625, "learning_rate": 3.931425787051832e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.654, "grad_norm": 0.7730603814125061, "kl": 0.002941131591796875, "learning_rate": 3.902018669163384e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.656, "grad_norm": 0.02122451364994049, "kl": 0.000957489013671875, "learning_rate": 3.872689434630585e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.658, "grad_norm": 0.01871700957417488, "kl": 0.0010528564453125, "learning_rate": 3.843439512918949e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.66, "grad_norm": 0.0117810582742095, "kl": 0.0014801025390625, "learning_rate": 3.8142703296283953e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 1353.5, "epoch": 0.662, "grad_norm": 0.017292601987719536, "kl": 0.0030651092529296875, "learning_rate": 3.785183306423767e-07, "loss": 0.0001, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 1955.25, "epoch": 0.664, "grad_norm": 0.7610009908676147, "kl": 0.001689910888671875, "learning_rate": 3.7561798609655373e-07, "loss": 0.0353, "reward": 0.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.666, "grad_norm": 0.013269172981381416, "kl": 0.0008087158203125, "learning_rate": 3.72726140684072e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.668, "grad_norm": 0.01594419591128826, "kl": 0.000812530517578125, "learning_rate": 3.6984293534939737e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.67, "grad_norm": 0.010347824543714523, "kl": 0.000751495361328125, "learning_rate": 3.6696851061588994e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.672, "grad_norm": 0.014122740365564823, "kl": 0.0012054443359375, "learning_rate": 3.641030065789562e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.674, "grad_norm": 0.013101037591695786, "kl": 0.001033782958984375, "learning_rate": 3.612465628992203e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.676, "grad_norm": 0.666260838508606, "kl": 0.00324249267578125, "learning_rate": 3.5839931879571725e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 2022.75, "epoch": 0.678, "grad_norm": 0.7250146269798279, "kl": 0.00144195556640625, "learning_rate": 3.555614130391079e-07, "loss": 0.009, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 2003.25, "epoch": 0.68, "grad_norm": 0.7383328676223755, "kl": 0.00299072265625, "learning_rate": 3.5273298394491515e-07, "loss": -0.016, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.682, "grad_norm": 0.011795282363891602, "kl": 0.000850677490234375, "learning_rate": 3.4991416936678276e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.684, "grad_norm": 0.9384401440620422, "kl": 0.002079010009765625, "learning_rate": 3.471051066897562e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.686, "grad_norm": 0.010377887636423111, "kl": 0.0008144378662109375, "learning_rate": 3.4430593282358777e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.688, "grad_norm": 0.7026439309120178, "kl": 0.001007080078125, "learning_rate": 3.4151678419606233e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.69, "grad_norm": 0.011138451285660267, "kl": 0.0008373260498046875, "learning_rate": 3.387377967463493e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.692, "grad_norm": 0.7246440649032593, "kl": 0.0035762786865234375, "learning_rate": 3.359691059183761e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.694, "grad_norm": 0.01353926956653595, "kl": 0.001087188720703125, "learning_rate": 3.3321084665422803e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 1851.25, "epoch": 0.696, "grad_norm": 1.2388075590133667, "kl": 0.00121307373046875, "learning_rate": 3.3046315338757026e-07, "loss": 0.0841, "reward": 0.1875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 2005.75, "epoch": 0.698, "grad_norm": 0.7063978314399719, "kl": 0.002201080322265625, "learning_rate": 3.2772616003709616e-07, "loss": 0.0153, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.7, "grad_norm": 0.009749515913426876, "kl": 0.001163482666015625, "learning_rate": 3.250000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.702, "grad_norm": 0.01226428709924221, "kl": 0.00107574462890625, "learning_rate": 3.222848061454764e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.704, "grad_norm": 0.01303025335073471, "kl": 0.001399993896484375, "learning_rate": 3.195807108082429e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.706, "grad_norm": 0.028528396040201187, "kl": 0.001117706298828125, "learning_rate": 3.168878457820915e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 1785.25, "epoch": 0.708, "grad_norm": 0.9730527997016907, "kl": 0.001430511474609375, "learning_rate": 3.142063423134644e-07, "loss": 0.122, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.71, "grad_norm": 0.013550005853176117, "kl": 0.001338958740234375, "learning_rate": 3.115363310950578e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.712, "grad_norm": 0.010767250321805477, "kl": 0.00067138671875, "learning_rate": 3.0887794225945143e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.714, "grad_norm": 0.012552580796182156, "kl": 0.00140380859375, "learning_rate": 3.062313053727671e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.716, "grad_norm": 0.6516157984733582, "kl": 0.00159454345703125, "learning_rate": 3.0359654942835247e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.718, "grad_norm": 0.012717018835246563, "kl": 0.0008392333984375, "learning_rate": 3.0097380284049523e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.72, "grad_norm": 0.014254845678806305, "kl": 0.001018524169921875, "learning_rate": 2.9836319343816397e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.722, "grad_norm": 0.017017841339111328, "kl": 0.0009784698486328125, "learning_rate": 2.9576484845877793e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 1976.75, "epoch": 0.724, "grad_norm": 0.8839628100395203, "kl": 0.002727508544921875, "learning_rate": 2.931788945420058e-07, "loss": 0.0265, "reward": 0.3125, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.726, "grad_norm": 0.010562002658843994, "kl": 0.00103759765625, "learning_rate": 2.9060545772359305e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.728, "grad_norm": 0.013268781825900078, "kl": 0.0009174346923828125, "learning_rate": 2.8804466342921987e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 1927.75, "epoch": 0.73, "grad_norm": 0.8109666705131531, "kl": 0.0020313262939453125, "learning_rate": 2.854966364683872e-07, "loss": 0.0471, "reward": 0.875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.732, "grad_norm": 0.012277526780962944, "kl": 0.000827789306640625, "learning_rate": 2.829615010283344e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.734, "grad_norm": 0.7054362893104553, "kl": 0.0069103240966796875, "learning_rate": 2.8043938066798645e-07, "loss": 0.0003, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.736, "grad_norm": 0.012917861342430115, "kl": 0.0012969970703125, "learning_rate": 2.7793039831193133e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.738, "grad_norm": 0.011064048856496811, "kl": 0.0007419586181640625, "learning_rate": 2.7543467624442956e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.74, "grad_norm": 0.014153026975691319, "kl": 0.00079345703125, "learning_rate": 2.729523361034538e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 1957.5, "epoch": 0.742, "grad_norm": 0.02183438278734684, "kl": 0.002010345458984375, "learning_rate": 2.7048349887476037e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 1967.0, "epoch": 0.744, "grad_norm": 0.6841965317726135, "kl": 0.00157928466796875, "learning_rate": 2.6802828488599294e-07, "loss": 0.0304, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.746, "grad_norm": 0.010570400394499302, "kl": 0.00075531005859375, "learning_rate": 2.655868138008171e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.748, "grad_norm": 0.013351581990718842, "kl": 0.000873565673828125, "learning_rate": 2.631592046130896e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 1565.75, "epoch": 0.75, "grad_norm": 0.9974377751350403, "kl": 0.008136749267578125, "learning_rate": 2.6074557564105724e-07, "loss": -0.3145, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 1881.75, "epoch": 0.752, "grad_norm": 1.1940348148345947, "kl": 0.0038909912109375, "learning_rate": 2.583460445215911e-07, "loss": 0.0687, "reward": 0.3125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.754, "grad_norm": 0.01335156336426735, "kl": 0.0006580352783203125, "learning_rate": 2.5596072820445254e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.756, "grad_norm": 0.011012405157089233, "kl": 0.00142669677734375, "learning_rate": 2.5358974294659373e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.758, "grad_norm": 0.16783788800239563, "kl": 0.00443267822265625, "learning_rate": 2.512332043064913e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.76, "grad_norm": 0.011893996968865395, "kl": 0.001003265380859375, "learning_rate": 2.488912271385139e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.762, "grad_norm": 0.7219942212104797, "kl": 0.00760650634765625, "learning_rate": 2.465639255873246e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.764, "grad_norm": 0.04678433761000633, "kl": 0.0009021759033203125, "learning_rate": 2.4425141308231765e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.766, "grad_norm": 0.011990766040980816, "kl": 0.0006542205810546875, "learning_rate": 2.4195380233209006e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.768, "grad_norm": 0.011609113775193691, "kl": 0.0009002685546875, "learning_rate": 2.3967120531894857e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.77, "grad_norm": 0.029996510595083237, "kl": 0.001007080078125, "learning_rate": 2.374037332934512e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.772, "grad_norm": 0.7205003499984741, "kl": 0.001007080078125, "learning_rate": 2.3515149676898552e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.774, "grad_norm": 0.01278019044548273, "kl": 0.000644683837890625, "learning_rate": 2.3291460551638237e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.776, "grad_norm": 0.050757694989442825, "kl": 0.0013141632080078125, "learning_rate": 2.306931685585657e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.778, "grad_norm": 0.8608619570732117, "kl": 0.00222015380859375, "learning_rate": 2.2848729416523859e-07, "loss": 0.0001, "reward": 0.625, "reward_std": 0.5303300768136978, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.78, "grad_norm": 0.01890842616558075, "kl": 0.0016078948974609375, "learning_rate": 2.2629708984760706e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 1757.5, "epoch": 0.782, "grad_norm": 0.9615358710289001, "kl": 0.00257110595703125, "learning_rate": 2.2412266235313973e-07, "loss": 0.1401, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 2038.25, "epoch": 0.784, "grad_norm": 0.7946398854255676, "kl": 0.0009212493896484375, "learning_rate": 2.2196411766036487e-07, "loss": 0.0034, "reward": 0.8125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.3125, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.786, "grad_norm": 0.01277601532638073, "kl": 0.001102447509765625, "learning_rate": 2.1982156097370557e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 1800.5, "epoch": 0.788, "grad_norm": 0.9031627774238586, "kl": 0.00704193115234375, "learning_rate": 2.1769509671835223e-07, "loss": 0.1129, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.79, "grad_norm": 0.017206581309437752, "kl": 0.001468658447265625, "learning_rate": 2.1558482853517253e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 1820.75, "epoch": 0.792, "grad_norm": 0.9008931517601013, "kl": 0.0047149658203125, "learning_rate": 2.134908592756607e-07, "loss": 0.0002, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 2010.25, "epoch": 0.794, "grad_norm": 0.8031813502311707, "kl": 0.00140380859375, "learning_rate": 2.1141329099692406e-07, "loss": -0.0135, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.796, "grad_norm": 0.015710551291704178, "kl": 0.0016117095947265625, "learning_rate": 2.0935222495670968e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.798, "grad_norm": 0.7256066203117371, "kl": 0.00640869140625, "learning_rate": 2.0730776160846853e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.8, "grad_norm": 0.011895825155079365, "kl": 0.00131988525390625, "learning_rate": 2.0528000059645995e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.802, "grad_norm": 0.01262570358812809, "kl": 0.00177001953125, "learning_rate": 2.032690407508949e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.804, "grad_norm": 0.014729364775121212, "kl": 0.00090789794921875, "learning_rate": 2.0127498008311922e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.806, "grad_norm": 0.012951449491083622, "kl": 0.00146484375, "learning_rate": 1.9929791578083655e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.808, "grad_norm": 0.6974371671676636, "kl": 0.0015716552734375, "learning_rate": 1.9733794420337213e-07, "loss": 0.0001, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.81, "grad_norm": 0.012584330514073372, "kl": 0.001598358154296875, "learning_rate": 1.9539516087697517e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.812, "grad_norm": 0.011754573322832584, "kl": 0.00107574462890625, "learning_rate": 1.934696604901642e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.814, "grad_norm": 0.02457410842180252, "kl": 0.0015869140625, "learning_rate": 1.915615368891117e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.816, "grad_norm": 0.7841270565986633, "kl": 0.00121307373046875, "learning_rate": 1.8967088307307e-07, "loss": 0.0, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.818, "grad_norm": 0.01260992232710123, "kl": 0.0014495849609375, "learning_rate": 1.8779779118983867e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.82, "grad_norm": 0.011464129202067852, "kl": 0.0010471343994140625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 1915.0, "epoch": 0.822, "grad_norm": 0.8047151565551758, "kl": 0.00250244140625, "learning_rate": 1.8410465752883758e-07, "loss": -0.0527, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.824, "grad_norm": 0.0124558350071311, "kl": 0.00130462646484375, "learning_rate": 1.822847957491922e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.826, "grad_norm": 0.01380992028862238, "kl": 0.0008144378662109375, "learning_rate": 1.804828558898332e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.828, "grad_norm": 0.9551144242286682, "kl": 0.011430740356445312, "learning_rate": 1.7869892577476722e-07, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.83, "grad_norm": 0.012345471419394016, "kl": 0.001438140869140625, "learning_rate": 1.7693309235023127e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 1821.0, "epoch": 0.832, "grad_norm": 0.7977136969566345, "kl": 0.001773834228515625, "learning_rate": 1.7518544168045524e-07, "loss": 0.0981, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.834, "grad_norm": 0.011019429191946983, "kl": 0.0006771087646484375, "learning_rate": 1.7345605894346726e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.836, "grad_norm": 0.015206689946353436, "kl": 0.0016021728515625, "learning_rate": 1.7174502842694212e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.838, "grad_norm": 0.04012497141957283, "kl": 0.000911712646484375, "learning_rate": 1.7005243352409333e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.84, "grad_norm": 0.01337823923677206, "kl": 0.0009021759033203125, "learning_rate": 1.6837835672960831e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.842, "grad_norm": 0.010002349503338337, "kl": 0.0014495849609375, "learning_rate": 1.6672287963562852e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.844, "grad_norm": 0.716344952583313, "kl": 0.00653076171875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.846, "grad_norm": 0.037377193570137024, "kl": 0.001346588134765625, "learning_rate": 1.6346804638120098e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.848, "grad_norm": 0.011378524824976921, "kl": 0.0007991790771484375, "learning_rate": 1.6186884885673413e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.85, "grad_norm": 0.012089181691408157, "kl": 0.00154876708984375, "learning_rate": 1.6028856829700258e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.852, "grad_norm": 0.7250344753265381, "kl": 0.002223968505859375, "learning_rate": 1.5872728172265146e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1880.5, "epoch": 0.854, "grad_norm": 0.8333255648612976, "kl": 0.0015106201171875, "learning_rate": 1.5718506522858572e-07, "loss": 0.0692, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 1326.75, "epoch": 0.856, "grad_norm": 1.4510780572891235, "kl": 0.0024261474609375, "learning_rate": 1.5566199398026147e-07, "loss": 0.0319, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.858, "grad_norm": 0.01227201521396637, "kl": 0.0006866455078125, "learning_rate": 1.5415814221002265e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.86, "grad_norm": 0.012068121694028378, "kl": 0.0009098052978515625, "learning_rate": 1.5267358321348285e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.862, "grad_norm": 0.012486455962061882, "kl": 0.000980377197265625, "learning_rate": 1.5120838934595337e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.864, "grad_norm": 0.013689450919628143, "kl": 0.00124359130859375, "learning_rate": 1.4976263201891613e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.866, "grad_norm": 0.5863283276557922, "kl": 0.00223541259765625, "learning_rate": 1.483363816965435e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.868, "grad_norm": 0.01356459315866232, "kl": 0.00078582763671875, "learning_rate": 1.469297078922642e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.87, "grad_norm": 0.009759028442203999, "kl": 0.0011749267578125, "learning_rate": 1.4554267916537495e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.872, "grad_norm": 0.02292313612997532, "kl": 0.0009002685546875, "learning_rate": 1.4417536311769885e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.874, "grad_norm": 0.013289421796798706, "kl": 0.00077056884765625, "learning_rate": 1.4282782639029128e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.876, "grad_norm": 0.0265584085136652, "kl": 0.00119781494140625, "learning_rate": 1.4150013466019114e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 2008.25, "epoch": 0.878, "grad_norm": 0.013609832152724266, "kl": 0.000782012939453125, "learning_rate": 1.4019235263722034e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.88, "grad_norm": 0.010987906716763973, "kl": 0.00115203857421875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.882, "grad_norm": 0.01149643212556839, "kl": 0.0007152557373046875, "learning_rate": 1.3763677169699217e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.884, "grad_norm": 0.012063896283507347, "kl": 0.000858306884765625, "learning_rate": 1.3638909733514452e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.886, "grad_norm": 0.6539027094841003, "kl": 0.0033817291259765625, "learning_rate": 1.351615817851748e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.888, "grad_norm": 0.683651864528656, "kl": 0.001430511474609375, "learning_rate": 1.3395428487445914e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.89, "grad_norm": 0.014244834892451763, "kl": 0.001583099365234375, "learning_rate": 1.3276726544494571e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 1551.25, "epoch": 0.892, "grad_norm": 0.04096902534365654, "kl": 0.003017425537109375, "learning_rate": 1.316005813502869e-07, "loss": 0.0001, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.5, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.894, "grad_norm": 0.013455760665237904, "kl": 0.00115203857421875, "learning_rate": 1.3045428945301953e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 1822.0, "epoch": 0.896, "grad_norm": 1.1271547079086304, "kl": 0.0069122314453125, "learning_rate": 1.2932844562179352e-07, "loss": -0.0998, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.898, "grad_norm": 0.6208535432815552, "kl": 0.0009765625, "learning_rate": 1.2822310472864885e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.9, "grad_norm": 0.009892701171338558, "kl": 0.0006608963012695312, "learning_rate": 1.2713832064634125e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.902, "grad_norm": 0.0119470888748765, "kl": 0.000751495361328125, "learning_rate": 1.260741462457165e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 1717.0, "epoch": 0.904, "grad_norm": 0.015732314437627792, "kl": 0.001445770263671875, "learning_rate": 1.2503063339313356e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.906, "grad_norm": 0.7161591649055481, "kl": 0.001621246337890625, "learning_rate": 1.2400783294793668e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.908, "grad_norm": 0.009790318086743355, "kl": 0.0006418228149414062, "learning_rate": 1.2300579475997657e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.91, "grad_norm": 0.01178540289402008, "kl": 0.0012798309326171875, "learning_rate": 1.220245676671809e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 1967.75, "epoch": 0.912, "grad_norm": 0.918179452419281, "kl": 0.00170135498046875, "learning_rate": 1.2106419949317388e-07, "loss": 0.0301, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.914, "grad_norm": 0.7712357640266418, "kl": 0.00215911865234375, "learning_rate": 1.2012473704494537e-07, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 1705.5, "epoch": 0.916, "grad_norm": 0.8978808522224426, "kl": 0.00180816650390625, "learning_rate": 1.1920622611056974e-07, "loss": 0.0779, "reward": 0.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.918, "grad_norm": 0.8227681517601013, "kl": 0.00136566162109375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.92, "grad_norm": 0.009116187691688538, "kl": 0.00128936767578125, "learning_rate": 1.1743223682775649e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.922, "grad_norm": 0.014834447763860226, "kl": 0.00095367431640625, "learning_rate": 1.1657684494105386e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.924, "grad_norm": 0.013181531801819801, "kl": 0.0008907318115234375, "learning_rate": 1.1574257748745986e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.926, "grad_norm": 0.010957694612443447, "kl": 0.0006504058837890625, "learning_rate": 1.1492947512799328e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.928, "grad_norm": 0.8815539479255676, "kl": 0.0010204315185546875, "learning_rate": 1.1413757749211602e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.93, "grad_norm": 0.010624675080180168, "kl": 0.0008087158203125, "learning_rate": 1.1336692317580158e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.932, "grad_norm": 0.013844382017850876, "kl": 0.001033782958984375, "learning_rate": 1.1261754973965422e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.934, "grad_norm": 0.012338408268988132, "kl": 0.000690460205078125, "learning_rate": 1.1188949370707787e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 1994.0, "epoch": 0.936, "grad_norm": 0.7254545092582703, "kl": 0.0010471343994140625, "learning_rate": 1.1118279056249653e-07, "loss": 0.0197, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.938, "grad_norm": 0.01179241482168436, "kl": 0.0006885528564453125, "learning_rate": 1.1049747474962444e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.94, "grad_norm": 0.6890900135040283, "kl": 0.001373291015625, "learning_rate": 1.0983357966978745e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.942, "grad_norm": 0.5639128684997559, "kl": 0.00131988525390625, "learning_rate": 1.0919113768029517e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.944, "grad_norm": 0.012823720462620258, "kl": 0.001026153564453125, "learning_rate": 1.0857018009286381e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.946, "grad_norm": 0.009964537806808949, "kl": 0.0007228851318359375, "learning_rate": 1.0797073717209013e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.948, "grad_norm": 0.7185282707214355, "kl": 0.0028629302978515625, "learning_rate": 1.0739283813397639e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.95, "grad_norm": 0.012274986132979393, "kl": 0.00087738037109375, "learning_rate": 1.068365111445064e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 1744.25, "epoch": 0.952, "grad_norm": 0.8758648633956909, "kl": 0.002716064453125, "learning_rate": 1.063017833182728e-07, "loss": 0.1492, "reward": 0.1875, "reward_std": 0.2651650309562683, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.954, "grad_norm": 0.014645918272435665, "kl": 0.000720977783203125, "learning_rate": 1.0578868071715544e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.956, "grad_norm": 0.013319121673703194, "kl": 0.0009250640869140625, "learning_rate": 1.0529722834905125e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.958, "grad_norm": 0.009549077600240707, "kl": 0.0014190673828125, "learning_rate": 1.0482745016665526e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.96, "grad_norm": 0.015292295254766941, "kl": 0.0009307861328125, "learning_rate": 1.0437936906629334e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.962, "grad_norm": 1.031322717666626, "kl": 0.003509521484375, "learning_rate": 1.0395300688680625e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.125, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.964, "grad_norm": 0.011882193386554718, "kl": 0.001026153564453125, "learning_rate": 1.0354838440848501e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 1847.0, "epoch": 0.966, "grad_norm": 0.6594410538673401, "kl": 0.002811431884765625, "learning_rate": 1.0316552135205837e-07, "loss": 0.0864, "reward": 0.4375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.4375, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.968, "grad_norm": 0.013277575373649597, "kl": 0.000789642333984375, "learning_rate": 1.0280443637773163e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.97, "grad_norm": 0.7425467371940613, "kl": 0.00323486328125, "learning_rate": 1.0246514708427701e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.0625, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.972, "grad_norm": 0.01078079268336296, "kl": 0.00154876708984375, "learning_rate": 1.0214767000817596e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.974, "grad_norm": 0.012354187667369843, "kl": 0.0005855560302734375, "learning_rate": 1.0185202062281336e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.976, "grad_norm": 0.7250139117240906, "kl": 0.0037212371826171875, "learning_rate": 1.0157821333772304e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.978, "grad_norm": 0.014553495682775974, "kl": 0.00141143798828125, "learning_rate": 1.013262614978859e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.98, "grad_norm": 0.01307358592748642, "kl": 0.000835418701171875, "learning_rate": 1.0109617738307911e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 1908.5, "epoch": 0.982, "grad_norm": 0.7941016554832458, "kl": 0.0010204315185546875, "learning_rate": 1.0088797220727779e-07, "loss": 0.0558, "reward": 0.375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.375, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.984, "grad_norm": 0.008943392895162106, "kl": 0.000537872314453125, "learning_rate": 1.0070165611810855e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.986, "grad_norm": 0.012137016281485558, "kl": 0.001377105712890625, "learning_rate": 1.005372381963547e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.988, "grad_norm": 0.01392819918692112, "kl": 0.0009784698486328125, "learning_rate": 1.0039472645551372e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.99, "grad_norm": 0.010369054041802883, "kl": 0.000942230224609375, "learning_rate": 1.002741278414069e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.992, "grad_norm": 0.011974328197538853, "kl": 0.0006275177001953125, "learning_rate": 1.0017544823184055e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.994, "grad_norm": 0.01267942413687706, "kl": 0.00104522705078125, "learning_rate": 1.0009869243631952e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.996, "grad_norm": 0.876571536064148, "kl": 0.003925323486328125, "learning_rate": 1.000438641958131e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 0.998, "grad_norm": 0.6709615588188171, "kl": 0.001743316650390625, "learning_rate": 1.0001096618257236e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.1875, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 2048.0, "epoch": 1.0, "grad_norm": 0.014002146199345589, "kl": 0.0007781982421875, "learning_rate": 1e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.25, "step": 500 }, { "epoch": 1.0, "step": 500, "total_flos": 0.0, "train_loss": 0.0033100851627775683, "train_runtime": 14646.384, "train_samples_per_second": 0.068, "train_steps_per_second": 0.034 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }