{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.10202898550724637, "eval_steps": 100, "global_step": 33, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 303.61742544174194, "epoch": 0.015458937198067632, "grad_norm": 0.02257479541003704, "kl": 0.9676864624023438, "learning_rate": 1.9941379571543597e-05, "loss": 0.0386, "reward": 0.7926339689642191, "reward_std": 0.3637773351743817, "rewards/accuracy_reward": 0.11194196966243908, "rewards/format_reward": 0.6806919939815999, "step": 5 }, { "completion_length": 193.77601299285888, "epoch": 0.030917874396135265, "grad_norm": 0.02067318558692932, "kl": 0.1245941162109375, "learning_rate": 1.796093065705644e-05, "loss": 0.005, "reward": 1.0407366551458836, "reward_std": 0.2885524293407798, "rewards/accuracy_reward": 0.11953125573927537, "rewards/format_reward": 0.9212053991854191, "step": 10 }, { "completion_length": 223.56373710632323, "epoch": 0.0463768115942029, "grad_norm": 0.014934813603758812, "kl": 0.16761474609375, "learning_rate": 1.3701381553399147e-05, "loss": 0.0067, "reward": 1.0725446939468384, "reward_std": 0.3380734449252486, "rewards/accuracy_reward": 0.16651786545990035, "rewards/format_reward": 0.906026828289032, "step": 15 }, { "completion_length": 189.5631784439087, "epoch": 0.06183574879227053, "grad_norm": 0.015098211355507374, "kl": 0.175286865234375, "learning_rate": 8.382180034472353e-06, "loss": 0.007, "reward": 1.1564732655882835, "reward_std": 0.2815748773515224, "rewards/accuracy_reward": 0.19162947330623864, "rewards/format_reward": 0.964843787997961, "step": 20 }, { "completion_length": 190.79632415771485, "epoch": 0.07729468599033816, "grad_norm": 0.013767687603831291, "kl": 0.16260986328125, "learning_rate": 3.5261371521817247e-06, "loss": 0.0065, "reward": 1.1904018431901933, "reward_std": 0.306734830327332, "rewards/accuracy_reward": 0.22790179681032896, "rewards/format_reward": 0.9625000387430191, "step": 25 }, { "completion_length": 203.4788038253784, "epoch": 0.0927536231884058, "grad_norm": 0.014600388705730438, "kl": 0.143023681640625, "learning_rate": 5.234682881719766e-07, "loss": 0.0057, "reward": 1.206808091700077, "reward_std": 0.2881101544946432, "rewards/accuracy_reward": 0.23984376154839993, "rewards/format_reward": 0.9669643275439739, "step": 30 }, { "completion_length": 215.08613300323486, "epoch": 0.10202898550724637, "kl": 0.13948567708333334, "reward": 1.2220982710520427, "reward_std": 0.3094604279225071, "rewards/accuracy_reward": 0.252976200543344, "rewards/format_reward": 0.9691220708191395, "step": 33, "total_flos": 0.0, "train_loss": 0.011041131547906181, "train_runtime": 4812.1769, "train_samples_per_second": 1.505, "train_steps_per_second": 0.007 } ], "logging_steps": 5, "max_steps": 33, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }