{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 100, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 614.1698936462402, "epoch": 0.08528784648187633, "grad_norm": 1.6420103311538696, "kl": 0.00021245479583740235, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.6379464596509934, "reward_std": 0.3210102315992117, "rewards/accuracy_reward": 0.6375000312924385, "rewards/format_reward": 0.00044642859138548373, "step": 5 }, { "completion_length": 624.264535522461, "epoch": 0.17057569296375266, "grad_norm": 1526.457275390625, "kl": 0.14271965026855468, "learning_rate": 2.956412726139078e-06, "loss": 0.0057, "reward": 0.6959821730852127, "reward_std": 0.28622329905629157, "rewards/accuracy_reward": 0.6955357447266579, "rewards/format_reward": 0.00044642859138548373, "step": 10 }, { "completion_length": 616.0730186462403, "epoch": 0.255863539445629, "grad_norm": 0.11626364290714264, "kl": 0.0034526824951171876, "learning_rate": 2.7836719084521715e-06, "loss": 0.0001, "reward": 0.7558036059141159, "reward_std": 0.22891067173331975, "rewards/accuracy_reward": 0.7555803924798965, "rewards/format_reward": 0.00022321429569274187, "step": 15 }, { "completion_length": 602.7257011413574, "epoch": 0.3411513859275053, "grad_norm": 0.3678954541683197, "kl": 0.0035373687744140623, "learning_rate": 2.4946839873611927e-06, "loss": 0.0001, "reward": 0.7736607491970062, "reward_std": 0.1879386292770505, "rewards/accuracy_reward": 0.7736607491970062, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 606.3076171875, "epoch": 0.42643923240938164, "grad_norm": 0.08185073733329773, "kl": 0.008475685119628906, "learning_rate": 2.1156192081791355e-06, "loss": 0.0003, "reward": 0.7618303924798966, "reward_std": 0.18782664239406585, "rewards/accuracy_reward": 0.7618303924798966, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 615.8890892028809, "epoch": 0.511727078891258, "grad_norm": 0.1566159576177597, "kl": 0.0039794921875, "learning_rate": 1.6808050203829845e-06, "loss": 0.0002, "reward": 0.7493303924798965, "reward_std": 0.18519791485741735, "rewards/accuracy_reward": 0.7493303924798965, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 610.3998023986817, "epoch": 0.5970149253731343, "grad_norm": 0.07447274774312973, "kl": 0.004097747802734375, "learning_rate": 1.2296174432791415e-06, "loss": 0.0002, "reward": 0.7495536029338836, "reward_std": 0.1825955007225275, "rewards/accuracy_reward": 0.7495536029338836, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 595.3044914245605, "epoch": 0.6823027718550106, "grad_norm": 0.07800718396902084, "kl": 0.0040496826171875, "learning_rate": 8.029152419343472e-07, "loss": 0.0002, "reward": 0.7689732506871223, "reward_std": 0.17793030026368797, "rewards/accuracy_reward": 0.7689732506871223, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 614.7270347595215, "epoch": 0.767590618336887, "grad_norm": 0.07937229424715042, "kl": 0.004020309448242188, "learning_rate": 4.3933982822017883e-07, "loss": 0.0002, "reward": 0.7448661088943481, "reward_std": 0.20116904862225055, "rewards/accuracy_reward": 0.7446428954601287, "rewards/format_reward": 0.00022321429569274187, "step": 45 }, { "completion_length": 618.0094017028808, "epoch": 0.8528784648187633, "grad_norm": 0.074642114341259, "kl": 0.003514862060546875, "learning_rate": 1.718159615201853e-07, "loss": 0.0001, "reward": 0.743750037252903, "reward_std": 0.18902343986555933, "rewards/accuracy_reward": 0.743750037252903, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 605.6493591308594, "epoch": 0.9381663113006397, "grad_norm": 0.08093011379241943, "kl": 0.0038265228271484376, "learning_rate": 2.4570139579284723e-08, "loss": 0.0002, "reward": 0.7801339656114579, "reward_std": 0.19253778588026763, "rewards/accuracy_reward": 0.7801339656114579, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 596.0093282063802, "epoch": 0.9893390191897654, "kl": 0.003498077392578125, "reward": 0.7726934800545374, "reward_std": 0.18784288999934992, "rewards/accuracy_reward": 0.7726934800545374, "rewards/format_reward": 0.0, "step": 58, "total_flos": 0.0, "train_loss": 0.0014648210027220997, "train_runtime": 13165.6558, "train_samples_per_second": 0.57, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }