{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9884169884169884, "eval_steps": 100, "global_step": 32, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 291.36697788238524, "epoch": 0.15444015444015444, "grad_norm": 91.9554297295678, "kl": 0.3540959358215332, "learning_rate": 1.9937122098932428e-05, "loss": 0.0142, "reward": 0.6166573823895305, "reward_std": 0.452694922266528, "rewards/accuracy_reward": 0.10803571995347738, "rewards/cosine_scaled_reward": -0.10179503666004167, "rewards/format_reward": 0.5000000233761966, "rewards/reasoning_steps_reward": 0.110416672937572, "step": 5 }, { "completion_length": 43.16875188350677, "epoch": 0.3088803088803089, "grad_norm": 3.33245795174105, "kl": 0.8302490234375, "learning_rate": 1.78183148246803e-05, "loss": 0.0332, "reward": 0.9400907799601554, "reward_std": 0.19169574869779354, "rewards/accuracy_reward": 0.024107144121080636, "rewards/cosine_scaled_reward": 0.0037812213704455644, "rewards/format_reward": 0.9107143148779869, "rewards/reasoning_steps_reward": 0.0014880953822284937, "step": 10 }, { "completion_length": 15.207143580913543, "epoch": 0.46332046332046334, "grad_norm": 0.8155119970711466, "kl": 1.6966796875, "learning_rate": 1.3302790619551673e-05, "loss": 0.0679, "reward": 0.9947075441479682, "reward_std": 0.005296425729534348, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0017210765145136975, "rewards/format_reward": 0.9964285731315613, "rewards/reasoning_steps_reward": 0.0, "step": 15 }, { "completion_length": 14.96160796880722, "epoch": 0.6177606177606177, "grad_norm": 0.5795656668267815, "kl": 1.59228515625, "learning_rate": 7.774790660436857e-06, "loss": 0.0637, "reward": 0.9939029954373837, "reward_std": 0.006596084152567983, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0016327791643561795, "rewards/format_reward": 0.9946428596973419, "rewards/reasoning_steps_reward": 0.0008928571827709675, "step": 20 }, { "completion_length": 14.393750703334808, "epoch": 0.7722007722007722, "grad_norm": 4.257970780643175, "kl": 1.46865234375, "learning_rate": 2.9289321881345257e-06, "loss": 0.0588, "reward": 0.989137577265501, "reward_std": 0.013341282614055672, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0019339090387802571, "rewards/format_reward": 0.9910714328289032, "rewards/reasoning_steps_reward": 0.0, "step": 25 }, { "completion_length": 13.3160719871521, "epoch": 0.9266409266409267, "grad_norm": 0.5366189541461951, "kl": 1.44560546875, "learning_rate": 2.507208781817638e-07, "loss": 0.0578, "reward": 0.9885662972927094, "reward_std": 0.014164349375778328, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0016123341440106743, "rewards/format_reward": 0.9901785761117935, "rewards/reasoning_steps_reward": 0.0, "step": 30 }, { "completion_length": 12.924107730388641, "epoch": 0.9884169884169884, "kl": 1.577392578125, "reward": 0.9940091818571091, "reward_std": 0.006472988553753112, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0015265825350070372, "rewards/format_reward": 0.9955357164144516, "rewards/reasoning_steps_reward": 0.0, "step": 32, "total_flos": 0.0, "train_loss": 0.049734286265447736, "train_runtime": 1467.0643, "train_samples_per_second": 2.469, "train_steps_per_second": 0.022 } ], "logging_steps": 5, "max_steps": 32, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }