evgmaslov commited on
Commit
e560113
·
verified ·
1 Parent(s): 43873de

Training in progress, step 400, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ca83b99c572fe3930a823ab9ebf3b760a9cf710347a389b91c9256b2672ba8a
3
  size 17314248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14c1eeb2ad5423caa21025a6a684dc0bf9072a4b688fb237f1f2b015d80e9ae3
3
  size 17314248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3306fc503dd1ddbb8e3c363858f250e287d133ce5155291b4de29495a0bdfe3e
3
  size 34683834
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bbfe7b7ae50c79de2e7120affee22135eff234066f6d1ebd66fe41d48140b8e
3
  size 34683834
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8a54a2c72077c4a4f36878dea3326f2f54f1f77a30598502ed23524073db14b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8102a3ba3b25aac71d1873eea057b9ed5de58c89e96b429b43e2e254af6ee0df
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f270a3d78b8ab17a630c4ea775ddd0ff2a59bd22b3c01efd944df9aafc311546
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b696588ce6f23c499cf522a405429b637ff807b937e94016abb9e0f03b2cbd3d
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73a32be6e2d5a25843231ae04605874e4211895956911e62c82947e6bf374f75
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c25bf2f0f98848653386296025ff0a18b764a6915188669f8184cb58f0c63f05
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d99349dff39894ce85207c93efba1b4a9a79951583aea9195025073db5dd1193
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d3c9b6a50b613c2bf664942879c5b9e05f946793e1e06e7c8f239224a6db00a
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:22977592a282620c3f8e357127424e89090348024e0f63dd93982d62b8514053
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2feeabad28633664463cc743362089847201a9933ed476045f81ea765962f5c
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d90c581d1f1944a418056cf8e2a56636f1d605ecde7bd68b22717cfdcf36d039
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbe859ec227593d4e5ae780100dd3341b69207d36460a1285c2b178adb79a3d8
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9b901258296aa1cd1b84ed5049f9096ad41e176f5d05a183239c6f116d191cc4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80e71c8d9bdad100d9d7225aed23a3e514aebaf48e91154466a3b7c333031896
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40a0d5aa8e343ebcaa6ed3c99ba601e5f939b625317c6656b2cff2f5ed37cae6
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d1616ffb7a967870e2e11d0459bca8e44cce795e5850676f198ba8b3dc642b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2425222312045271,
5
  "eval_steps": 500,
6
- "global_step": 300,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -5107,6 +5107,1706 @@
5107
  "rewards/prompt_consistency_reward": 0.7500000596046448,
5108
  "rewards/walls_orthogonality_reward": 0.8199405074119568,
5109
  "step": 300
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5110
  }
5111
  ],
5112
  "logging_steps": 1,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.32336297493936944,
5
  "eval_steps": 500,
6
+ "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
5107
  "rewards/prompt_consistency_reward": 0.7500000596046448,
5108
  "rewards/walls_orthogonality_reward": 0.8199405074119568,
5109
  "step": 300
5110
+ },
5111
+ {
5112
+ "completion_length": 190.55357360839844,
5113
+ "epoch": 0.2433306386418755,
5114
+ "grad_norm": 0.03148113191127777,
5115
+ "kl": 0.02835637889802456,
5116
+ "learning_rate": 6.053354890864995e-06,
5117
+ "loss": 0.0011,
5118
+ "reward": 4.4354376792907715,
5119
+ "reward_std": 1.5036461353302002,
5120
+ "rewards/answer_format_reward": 0.8617773056030273,
5121
+ "rewards/common_format_reward": 0.9843750596046448,
5122
+ "rewards/doors_consistency_reward": 0.6428571939468384,
5123
+ "rewards/geometry_consistency_reward": 0.6428571939468384,
5124
+ "rewards/prompt_consistency_reward": 0.6428571939468384,
5125
+ "rewards/walls_orthogonality_reward": 0.660714328289032,
5126
+ "step": 301
5127
+ },
5128
+ {
5129
+ "completion_length": 199.5357208251953,
5130
+ "epoch": 0.24413904607922393,
5131
+ "grad_norm": 0.021602505818009377,
5132
+ "kl": 0.041999828070402145,
5133
+ "learning_rate": 6.046887631366209e-06,
5134
+ "loss": 0.0017,
5135
+ "reward": 4.528274059295654,
5136
+ "reward_std": 1.408253788948059,
5137
+ "rewards/answer_format_reward": 0.8266369104385376,
5138
+ "rewards/common_format_reward": 0.9709821939468384,
5139
+ "rewards/doors_consistency_reward": 0.6785714626312256,
5140
+ "rewards/geometry_consistency_reward": 0.6785714626312256,
5141
+ "rewards/prompt_consistency_reward": 0.6785714626312256,
5142
+ "rewards/walls_orthogonality_reward": 0.6949405074119568,
5143
+ "step": 302
5144
+ },
5145
+ {
5146
+ "completion_length": 190.4107208251953,
5147
+ "epoch": 0.24494745351657235,
5148
+ "grad_norm": 0.020547445863485336,
5149
+ "kl": 0.04101761057972908,
5150
+ "learning_rate": 6.040420371867421e-06,
5151
+ "loss": 0.0016,
5152
+ "reward": 4.422584533691406,
5153
+ "reward_std": 1.5716291666030884,
5154
+ "rewards/answer_format_reward": 0.841369092464447,
5155
+ "rewards/common_format_reward": 0.9933035969734192,
5156
+ "rewards/doors_consistency_reward": 0.6428571939468384,
5157
+ "rewards/geometry_consistency_reward": 0.6428571939468384,
5158
+ "rewards/prompt_consistency_reward": 0.6428571939468384,
5159
+ "rewards/walls_orthogonality_reward": 0.6593406796455383,
5160
+ "step": 303
5161
+ },
5162
+ {
5163
+ "completion_length": 187.98214721679688,
5164
+ "epoch": 0.24575586095392077,
5165
+ "grad_norm": 0.026932047680020332,
5166
+ "kl": 0.031999826431274414,
5167
+ "learning_rate": 6.033953112368633e-06,
5168
+ "loss": 0.0013,
5169
+ "reward": 4.860727787017822,
5170
+ "reward_std": 1.4902468919754028,
5171
+ "rewards/answer_format_reward": 0.9069940447807312,
5172
+ "rewards/common_format_reward": 1.0,
5173
+ "rewards/doors_consistency_reward": 0.7455357313156128,
5174
+ "rewards/geometry_consistency_reward": 0.7142857313156128,
5175
+ "rewards/prompt_consistency_reward": 0.7142857313156128,
5176
+ "rewards/walls_orthogonality_reward": 0.7796266674995422,
5177
+ "step": 304
5178
+ },
5179
+ {
5180
+ "completion_length": 236.58929443359375,
5181
+ "epoch": 0.2465642683912692,
5182
+ "grad_norm": 0.023645374923944473,
5183
+ "kl": 0.03161970525979996,
5184
+ "learning_rate": 6.027485852869846e-06,
5185
+ "loss": 0.0013,
5186
+ "reward": 4.254626750946045,
5187
+ "reward_std": 2.2542381286621094,
5188
+ "rewards/answer_format_reward": 0.8254464268684387,
5189
+ "rewards/common_format_reward": 0.9508929252624512,
5190
+ "rewards/doors_consistency_reward": 0.6227678656578064,
5191
+ "rewards/geometry_consistency_reward": 0.6071428656578064,
5192
+ "rewards/prompt_consistency_reward": 0.6071428656578064,
5193
+ "rewards/walls_orthogonality_reward": 0.6412338018417358,
5194
+ "step": 305
5195
+ },
5196
+ {
5197
+ "completion_length": 218.50001525878906,
5198
+ "epoch": 0.24737267582861763,
5199
+ "grad_norm": 0.02419598028063774,
5200
+ "kl": 0.03213595971465111,
5201
+ "learning_rate": 6.0210185933710585e-06,
5202
+ "loss": 0.0013,
5203
+ "reward": 4.889243125915527,
5204
+ "reward_std": 1.724419355392456,
5205
+ "rewards/answer_format_reward": 0.8929634690284729,
5206
+ "rewards/common_format_reward": 0.9464285969734192,
5207
+ "rewards/doors_consistency_reward": 0.7656250596046448,
5208
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
5209
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
5210
+ "rewards/walls_orthogonality_reward": 0.7842262387275696,
5211
+ "step": 306
5212
+ },
5213
+ {
5214
+ "completion_length": 221.58929443359375,
5215
+ "epoch": 0.24818108326596605,
5216
+ "grad_norm": 0.020586064085364342,
5217
+ "kl": 0.03023657388985157,
5218
+ "learning_rate": 6.0145513338722715e-06,
5219
+ "loss": 0.0012,
5220
+ "reward": 4.495535850524902,
5221
+ "reward_std": 1.938279628753662,
5222
+ "rewards/answer_format_reward": 0.8697917461395264,
5223
+ "rewards/common_format_reward": 0.9754464626312256,
5224
+ "rewards/doors_consistency_reward": 0.6428571939468384,
5225
+ "rewards/geometry_consistency_reward": 0.6428571939468384,
5226
+ "rewards/prompt_consistency_reward": 0.6428571939468384,
5227
+ "rewards/walls_orthogonality_reward": 0.7217262387275696,
5228
+ "step": 307
5229
+ },
5230
+ {
5231
+ "completion_length": 186.0178680419922,
5232
+ "epoch": 0.24898949070331447,
5233
+ "grad_norm": 0.02355961874127388,
5234
+ "kl": 0.02844185009598732,
5235
+ "learning_rate": 6.0080840743734845e-06,
5236
+ "loss": 0.0011,
5237
+ "reward": 4.813243865966797,
5238
+ "reward_std": 1.640316128730774,
5239
+ "rewards/answer_format_reward": 0.8571428060531616,
5240
+ "rewards/common_format_reward": 1.0,
5241
+ "rewards/doors_consistency_reward": 0.7433035969734192,
5242
+ "rewards/geometry_consistency_reward": 0.7142857313156128,
5243
+ "rewards/prompt_consistency_reward": 0.7142857313156128,
5244
+ "rewards/walls_orthogonality_reward": 0.7842261791229248,
5245
+ "step": 308
5246
+ },
5247
+ {
5248
+ "completion_length": 212.58929443359375,
5249
+ "epoch": 0.2497978981406629,
5250
+ "grad_norm": 0.03075120411813259,
5251
+ "kl": 0.03946709632873535,
5252
+ "learning_rate": 6.001616814874697e-06,
5253
+ "loss": 0.0016,
5254
+ "reward": 4.2298126220703125,
5255
+ "reward_std": 2.1380577087402344,
5256
+ "rewards/answer_format_reward": 0.843537449836731,
5257
+ "rewards/common_format_reward": 0.9508929252624512,
5258
+ "rewards/doors_consistency_reward": 0.6004464626312256,
5259
+ "rewards/geometry_consistency_reward": 0.5892857313156128,
5260
+ "rewards/prompt_consistency_reward": 0.5892857313156128,
5261
+ "rewards/walls_orthogonality_reward": 0.6563644409179688,
5262
+ "step": 309
5263
+ },
5264
+ {
5265
+ "completion_length": 200.46429443359375,
5266
+ "epoch": 0.25060630557801133,
5267
+ "grad_norm": 0.020126184448599815,
5268
+ "kl": 0.03708888962864876,
5269
+ "learning_rate": 5.995149555375909e-06,
5270
+ "loss": 0.0015,
5271
+ "reward": 5.089418888092041,
5272
+ "reward_std": 1.4195140600204468,
5273
+ "rewards/answer_format_reward": 0.9408801198005676,
5274
+ "rewards/common_format_reward": 1.0,
5275
+ "rewards/doors_consistency_reward": 0.7812500596046448,
5276
+ "rewards/geometry_consistency_reward": 0.7678571939468384,
5277
+ "rewards/prompt_consistency_reward": 0.7678571939468384,
5278
+ "rewards/walls_orthogonality_reward": 0.8315747380256653,
5279
+ "step": 310
5280
+ },
5281
+ {
5282
+ "completion_length": 191.17857360839844,
5283
+ "epoch": 0.2514147130153597,
5284
+ "grad_norm": 0.029773026704788208,
5285
+ "kl": 0.04655873402953148,
5286
+ "learning_rate": 5.988682295877122e-06,
5287
+ "loss": 0.0019,
5288
+ "reward": 3.4205567836761475,
5289
+ "reward_std": 1.718083381652832,
5290
+ "rewards/answer_format_reward": 0.808949887752533,
5291
+ "rewards/common_format_reward": 0.9955357313156128,
5292
+ "rewards/doors_consistency_reward": 0.3883928656578064,
5293
+ "rewards/geometry_consistency_reward": 0.3750000298023224,
5294
+ "rewards/prompt_consistency_reward": 0.3750000298023224,
5295
+ "rewards/walls_orthogonality_reward": 0.4776785671710968,
5296
+ "step": 311
5297
+ },
5298
+ {
5299
+ "completion_length": 197.57144165039062,
5300
+ "epoch": 0.25222312045270817,
5301
+ "grad_norm": 0.02616206184029579,
5302
+ "kl": 0.03850039467215538,
5303
+ "learning_rate": 5.982215036378335e-06,
5304
+ "loss": 0.0015,
5305
+ "reward": 4.84484338760376,
5306
+ "reward_std": 1.5422664880752563,
5307
+ "rewards/answer_format_reward": 0.9211416840553284,
5308
+ "rewards/common_format_reward": 1.0,
5309
+ "rewards/doors_consistency_reward": 0.7500000596046448,
5310
+ "rewards/geometry_consistency_reward": 0.6964285969734192,
5311
+ "rewards/prompt_consistency_reward": 0.6964285969734192,
5312
+ "rewards/walls_orthogonality_reward": 0.7808441519737244,
5313
+ "step": 312
5314
+ },
5315
+ {
5316
+ "completion_length": 208.0178680419922,
5317
+ "epoch": 0.25303152789005656,
5318
+ "grad_norm": 0.020660288631916046,
5319
+ "kl": 0.04136011749505997,
5320
+ "learning_rate": 5.975747776879547e-06,
5321
+ "loss": 0.0017,
5322
+ "reward": 4.912982940673828,
5323
+ "reward_std": 1.6619864702224731,
5324
+ "rewards/answer_format_reward": 0.9121244549751282,
5325
+ "rewards/common_format_reward": 0.988839328289032,
5326
+ "rewards/doors_consistency_reward": 0.7455357313156128,
5327
+ "rewards/geometry_consistency_reward": 0.7202380895614624,
5328
+ "rewards/prompt_consistency_reward": 0.7321428656578064,
5329
+ "rewards/walls_orthogonality_reward": 0.8141025900840759,
5330
+ "step": 313
5331
+ },
5332
+ {
5333
+ "completion_length": 194.4107208251953,
5334
+ "epoch": 0.253839935327405,
5335
+ "grad_norm": 0.0314546674489975,
5336
+ "kl": 0.04389024153351784,
5337
+ "learning_rate": 5.96928051738076e-06,
5338
+ "loss": 0.0018,
5339
+ "reward": 4.55452823638916,
5340
+ "reward_std": 1.9581618309020996,
5341
+ "rewards/answer_format_reward": 0.8705357313156128,
5342
+ "rewards/common_format_reward": 0.9933035969734192,
5343
+ "rewards/doors_consistency_reward": 0.660714328289032,
5344
+ "rewards/geometry_consistency_reward": 0.660714328289032,
5345
+ "rewards/prompt_consistency_reward": 0.660714328289032,
5346
+ "rewards/walls_orthogonality_reward": 0.708545982837677,
5347
+ "step": 314
5348
+ },
5349
+ {
5350
+ "completion_length": 187.69644165039062,
5351
+ "epoch": 0.25464834276475345,
5352
+ "grad_norm": 0.03105827234685421,
5353
+ "kl": 0.047631118446588516,
5354
+ "learning_rate": 5.962813257881972e-06,
5355
+ "loss": 0.0019,
5356
+ "reward": 4.598660945892334,
5357
+ "reward_std": 1.9349331855773926,
5358
+ "rewards/answer_format_reward": 0.8843750953674316,
5359
+ "rewards/common_format_reward": 1.0,
5360
+ "rewards/doors_consistency_reward": 0.6785714626312256,
5361
+ "rewards/geometry_consistency_reward": 0.6785714626312256,
5362
+ "rewards/prompt_consistency_reward": 0.6785714626312256,
5363
+ "rewards/walls_orthogonality_reward": 0.6785714626312256,
5364
+ "step": 315
5365
+ },
5366
+ {
5367
+ "completion_length": 197.2678680419922,
5368
+ "epoch": 0.25545675020210185,
5369
+ "grad_norm": 0.02918073907494545,
5370
+ "kl": 0.05234503000974655,
5371
+ "learning_rate": 5.956345998383184e-06,
5372
+ "loss": 0.0021,
5373
+ "reward": 4.496049880981445,
5374
+ "reward_std": 1.9937241077423096,
5375
+ "rewards/answer_format_reward": 0.8920599818229675,
5376
+ "rewards/common_format_reward": 1.0,
5377
+ "rewards/doors_consistency_reward": 0.625,
5378
+ "rewards/geometry_consistency_reward": 0.625,
5379
+ "rewards/prompt_consistency_reward": 0.625,
5380
+ "rewards/walls_orthogonality_reward": 0.728989839553833,
5381
+ "step": 316
5382
+ },
5383
+ {
5384
+ "completion_length": 206.5178680419922,
5385
+ "epoch": 0.2562651576394503,
5386
+ "grad_norm": 0.02770458720624447,
5387
+ "kl": 0.041484441608190536,
5388
+ "learning_rate": 5.949878738884398e-06,
5389
+ "loss": 0.0017,
5390
+ "reward": 4.8770551681518555,
5391
+ "reward_std": 1.6818500757217407,
5392
+ "rewards/answer_format_reward": 0.8656463623046875,
5393
+ "rewards/common_format_reward": 0.9955357313156128,
5394
+ "rewards/doors_consistency_reward": 0.7500000596046448,
5395
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
5396
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
5397
+ "rewards/walls_orthogonality_reward": 0.7658730745315552,
5398
+ "step": 317
5399
+ },
5400
+ {
5401
+ "completion_length": 198.17857360839844,
5402
+ "epoch": 0.2570735650767987,
5403
+ "grad_norm": 0.021601324900984764,
5404
+ "kl": 0.04682294651865959,
5405
+ "learning_rate": 5.9434114793856104e-06,
5406
+ "loss": 0.0019,
5407
+ "reward": 4.784970760345459,
5408
+ "reward_std": 1.1434282064437866,
5409
+ "rewards/answer_format_reward": 0.9174107313156128,
5410
+ "rewards/common_format_reward": 0.9843750596046448,
5411
+ "rewards/doors_consistency_reward": 0.7254464626312256,
5412
+ "rewards/geometry_consistency_reward": 0.6964285969734192,
5413
+ "rewards/prompt_consistency_reward": 0.6964285969734192,
5414
+ "rewards/walls_orthogonality_reward": 0.7648809552192688,
5415
+ "step": 318
5416
+ },
5417
+ {
5418
+ "completion_length": 192.94644165039062,
5419
+ "epoch": 0.25788197251414713,
5420
+ "grad_norm": 0.03130035102367401,
5421
+ "kl": 0.03954165056347847,
5422
+ "learning_rate": 5.936944219886823e-06,
5423
+ "loss": 0.0016,
5424
+ "reward": 4.616085052490234,
5425
+ "reward_std": 1.9991627931594849,
5426
+ "rewards/answer_format_reward": 0.8848214149475098,
5427
+ "rewards/common_format_reward": 0.9843750596046448,
5428
+ "rewards/doors_consistency_reward": 0.6785714626312256,
5429
+ "rewards/geometry_consistency_reward": 0.6785714626312256,
5430
+ "rewards/prompt_consistency_reward": 0.6785714626312256,
5431
+ "rewards/walls_orthogonality_reward": 0.7111742496490479,
5432
+ "step": 319
5433
+ },
5434
+ {
5435
+ "completion_length": 197.98214721679688,
5436
+ "epoch": 0.2586903799514956,
5437
+ "grad_norm": 0.023986609652638435,
5438
+ "kl": 0.04991472512483597,
5439
+ "learning_rate": 5.930476960388035e-06,
5440
+ "loss": 0.002,
5441
+ "reward": 5.19819974899292,
5442
+ "reward_std": 1.4133762121200562,
5443
+ "rewards/answer_format_reward": 0.906037449836731,
5444
+ "rewards/common_format_reward": 1.0,
5445
+ "rewards/doors_consistency_reward": 0.816964328289032,
5446
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
5447
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
5448
+ "rewards/walls_orthogonality_reward": 0.8680555820465088,
5449
+ "step": 320
5450
+ },
5451
+ {
5452
+ "completion_length": 231.58929443359375,
5453
+ "epoch": 0.25949878738884397,
5454
+ "grad_norm": 0.023219523951411247,
5455
+ "kl": 0.04335100203752518,
5456
+ "learning_rate": 5.924009700889248e-06,
5457
+ "loss": 0.0017,
5458
+ "reward": 4.176699161529541,
5459
+ "reward_std": 2.019928216934204,
5460
+ "rewards/answer_format_reward": 0.8363308310508728,
5461
+ "rewards/common_format_reward": 0.973214328289032,
5462
+ "rewards/doors_consistency_reward": 0.59375,
5463
+ "rewards/geometry_consistency_reward": 0.535714328289032,
5464
+ "rewards/prompt_consistency_reward": 0.535714328289032,
5465
+ "rewards/walls_orthogonality_reward": 0.7019751667976379,
5466
+ "step": 321
5467
+ },
5468
+ {
5469
+ "completion_length": 197.58929443359375,
5470
+ "epoch": 0.2603071948261924,
5471
+ "grad_norm": 0.029109537601470947,
5472
+ "kl": 0.037719499319791794,
5473
+ "learning_rate": 5.917542441390461e-06,
5474
+ "loss": 0.0015,
5475
+ "reward": 4.509999752044678,
5476
+ "reward_std": 1.7329912185668945,
5477
+ "rewards/answer_format_reward": 0.902529776096344,
5478
+ "rewards/common_format_reward": 0.9843750596046448,
5479
+ "rewards/doors_consistency_reward": 0.6584821939468384,
5480
+ "rewards/geometry_consistency_reward": 0.6071428656578064,
5481
+ "rewards/prompt_consistency_reward": 0.6071428656578064,
5482
+ "rewards/walls_orthogonality_reward": 0.7503270506858826,
5483
+ "step": 322
5484
+ },
5485
+ {
5486
+ "completion_length": 189.7857208251953,
5487
+ "epoch": 0.2611156022635408,
5488
+ "grad_norm": 0.03885766863822937,
5489
+ "kl": 0.06143519654870033,
5490
+ "learning_rate": 5.911075181891673e-06,
5491
+ "loss": 0.0025,
5492
+ "reward": 4.793899059295654,
5493
+ "reward_std": 1.778590440750122,
5494
+ "rewards/answer_format_reward": 0.8616071939468384,
5495
+ "rewards/common_format_reward": 0.9709821939468384,
5496
+ "rewards/doors_consistency_reward": 0.7321428656578064,
5497
+ "rewards/geometry_consistency_reward": 0.7321428656578064,
5498
+ "rewards/prompt_consistency_reward": 0.7321428656578064,
5499
+ "rewards/walls_orthogonality_reward": 0.7648809552192688,
5500
+ "step": 323
5501
+ },
5502
+ {
5503
+ "completion_length": 191.6607208251953,
5504
+ "epoch": 0.26192400970088925,
5505
+ "grad_norm": 0.029916733503341675,
5506
+ "kl": 0.04059479385614395,
5507
+ "learning_rate": 5.904607922392886e-06,
5508
+ "loss": 0.0016,
5509
+ "reward": 4.733205795288086,
5510
+ "reward_std": 1.691454291343689,
5511
+ "rewards/answer_format_reward": 0.8589498400688171,
5512
+ "rewards/common_format_reward": 0.9843750596046448,
5513
+ "rewards/doors_consistency_reward": 0.7142857313156128,
5514
+ "rewards/geometry_consistency_reward": 0.7142857313156128,
5515
+ "rewards/prompt_consistency_reward": 0.7142857313156128,
5516
+ "rewards/walls_orthogonality_reward": 0.74702388048172,
5517
+ "step": 324
5518
+ },
5519
+ {
5520
+ "completion_length": 219.50001525878906,
5521
+ "epoch": 0.2627324171382377,
5522
+ "grad_norm": 0.02192886359989643,
5523
+ "kl": 0.044606756418943405,
5524
+ "learning_rate": 5.898140662894098e-06,
5525
+ "loss": 0.0018,
5526
+ "reward": 4.616021633148193,
5527
+ "reward_std": 1.9295297861099243,
5528
+ "rewards/answer_format_reward": 0.8615576028823853,
5529
+ "rewards/common_format_reward": 0.9754464626312256,
5530
+ "rewards/doors_consistency_reward": 0.707589328289032,
5531
+ "rewards/geometry_consistency_reward": 0.6785714626312256,
5532
+ "rewards/prompt_consistency_reward": 0.6785714626312256,
5533
+ "rewards/walls_orthogonality_reward": 0.7142857313156128,
5534
+ "step": 325
5535
+ },
5536
+ {
5537
+ "completion_length": 210.0178680419922,
5538
+ "epoch": 0.2635408245755861,
5539
+ "grad_norm": 0.03145517036318779,
5540
+ "kl": 0.04279856011271477,
5541
+ "learning_rate": 5.89167340339531e-06,
5542
+ "loss": 0.0017,
5543
+ "reward": 4.566220283508301,
5544
+ "reward_std": 2.05240535736084,
5545
+ "rewards/answer_format_reward": 0.8370535969734192,
5546
+ "rewards/common_format_reward": 0.9508929252624512,
5547
+ "rewards/doors_consistency_reward": 0.6785714626312256,
5548
+ "rewards/geometry_consistency_reward": 0.6785714626312256,
5549
+ "rewards/prompt_consistency_reward": 0.6785714626312256,
5550
+ "rewards/walls_orthogonality_reward": 0.742559552192688,
5551
+ "step": 326
5552
+ },
5553
+ {
5554
+ "completion_length": 221.6607208251953,
5555
+ "epoch": 0.26434923201293453,
5556
+ "grad_norm": 0.020835530012845993,
5557
+ "kl": 0.04281467944383621,
5558
+ "learning_rate": 5.885206143896524e-06,
5559
+ "loss": 0.0017,
5560
+ "reward": 4.998511791229248,
5561
+ "reward_std": 1.8436694145202637,
5562
+ "rewards/answer_format_reward": 0.9278274178504944,
5563
+ "rewards/common_format_reward": 0.9799107313156128,
5564
+ "rewards/doors_consistency_reward": 0.7767857313156128,
5565
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
5566
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
5567
+ "rewards/walls_orthogonality_reward": 0.8139880895614624,
5568
+ "step": 327
5569
+ },
5570
+ {
5571
+ "completion_length": 219.7678680419922,
5572
+ "epoch": 0.2651576394502829,
5573
+ "grad_norm": 0.020769478753209114,
5574
+ "kl": 0.033965181559324265,
5575
+ "learning_rate": 5.878738884397736e-06,
5576
+ "loss": 0.0014,
5577
+ "reward": 4.68898868560791,
5578
+ "reward_std": 1.2390503883361816,
5579
+ "rewards/answer_format_reward": 0.871279776096344,
5580
+ "rewards/common_format_reward": 0.988839328289032,
5581
+ "rewards/doors_consistency_reward": 0.7053571939468384,
5582
+ "rewards/geometry_consistency_reward": 0.6964285969734192,
5583
+ "rewards/prompt_consistency_reward": 0.6964285969734192,
5584
+ "rewards/walls_orthogonality_reward": 0.730654776096344,
5585
+ "step": 328
5586
+ },
5587
+ {
5588
+ "completion_length": 189.8928680419922,
5589
+ "epoch": 0.26596604688763137,
5590
+ "grad_norm": 0.02480081096291542,
5591
+ "kl": 0.03892555460333824,
5592
+ "learning_rate": 5.8722716248989485e-06,
5593
+ "loss": 0.0016,
5594
+ "reward": 4.891368865966797,
5595
+ "reward_std": 1.646767497062683,
5596
+ "rewards/answer_format_reward": 0.8779762387275696,
5597
+ "rewards/common_format_reward": 0.9821429252624512,
5598
+ "rewards/doors_consistency_reward": 0.7633928656578064,
5599
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
5600
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
5601
+ "rewards/walls_orthogonality_reward": 0.7678571939468384,
5602
+ "step": 329
5603
+ },
5604
+ {
5605
+ "completion_length": 196.2857208251953,
5606
+ "epoch": 0.2667744543249798,
5607
+ "grad_norm": 0.0251334086060524,
5608
+ "kl": 0.03177637606859207,
5609
+ "learning_rate": 5.8658043654001615e-06,
5610
+ "loss": 0.0013,
5611
+ "reward": 4.941220760345459,
5612
+ "reward_std": 1.2456035614013672,
5613
+ "rewards/answer_format_reward": 0.8988094925880432,
5614
+ "rewards/common_format_reward": 0.9709821939468384,
5615
+ "rewards/doors_consistency_reward": 0.7678571939468384,
5616
+ "rewards/geometry_consistency_reward": 0.7678571939468384,
5617
+ "rewards/prompt_consistency_reward": 0.7678571939468384,
5618
+ "rewards/walls_orthogonality_reward": 0.7678571939468384,
5619
+ "step": 330
5620
+ },
5621
+ {
5622
+ "completion_length": 199.1428680419922,
5623
+ "epoch": 0.2675828617623282,
5624
+ "grad_norm": 0.019027192145586014,
5625
+ "kl": 0.04209236800670624,
5626
+ "learning_rate": 5.859337105901374e-06,
5627
+ "loss": 0.0017,
5628
+ "reward": 4.462798118591309,
5629
+ "reward_std": 1.8810534477233887,
5630
+ "rewards/answer_format_reward": 0.8422619104385376,
5631
+ "rewards/common_format_reward": 0.9464285969734192,
5632
+ "rewards/doors_consistency_reward": 0.660714328289032,
5633
+ "rewards/geometry_consistency_reward": 0.660714328289032,
5634
+ "rewards/prompt_consistency_reward": 0.660714328289032,
5635
+ "rewards/walls_orthogonality_reward": 0.691964328289032,
5636
+ "step": 331
5637
+ },
5638
+ {
5639
+ "completion_length": 189.69644165039062,
5640
+ "epoch": 0.26839126919967665,
5641
+ "grad_norm": 0.030452800914645195,
5642
+ "kl": 0.05848146602511406,
5643
+ "learning_rate": 5.852869846402587e-06,
5644
+ "loss": 0.0023,
5645
+ "reward": 4.55026388168335,
5646
+ "reward_std": 1.8818013668060303,
5647
+ "rewards/answer_format_reward": 0.8790178894996643,
5648
+ "rewards/common_format_reward": 1.0,
5649
+ "rewards/doors_consistency_reward": 0.6741071939468384,
5650
+ "rewards/geometry_consistency_reward": 0.6428571939468384,
5651
+ "rewards/prompt_consistency_reward": 0.6428571939468384,
5652
+ "rewards/walls_orthogonality_reward": 0.7114240527153015,
5653
+ "step": 332
5654
+ },
5655
+ {
5656
+ "completion_length": 187.73214721679688,
5657
+ "epoch": 0.26919967663702504,
5658
+ "grad_norm": 0.029216378927230835,
5659
+ "kl": 0.051482971757650375,
5660
+ "learning_rate": 5.8464025869038e-06,
5661
+ "loss": 0.0021,
5662
+ "reward": 5.06615686416626,
5663
+ "reward_std": 1.7191449403762817,
5664
+ "rewards/answer_format_reward": 0.9166028499603271,
5665
+ "rewards/common_format_reward": 0.9754464626312256,
5666
+ "rewards/doors_consistency_reward": 0.785714328289032,
5667
+ "rewards/geometry_consistency_reward": 0.785714328289032,
5668
+ "rewards/prompt_consistency_reward": 0.785714328289032,
5669
+ "rewards/walls_orthogonality_reward": 0.816964328289032,
5670
+ "step": 333
5671
+ },
5672
+ {
5673
+ "completion_length": 198.10714721679688,
5674
+ "epoch": 0.2700080840743735,
5675
+ "grad_norm": 0.028731094673275948,
5676
+ "kl": 0.043341681361198425,
5677
+ "learning_rate": 5.839935327405012e-06,
5678
+ "loss": 0.0017,
5679
+ "reward": 4.845174312591553,
5680
+ "reward_std": 1.9085485935211182,
5681
+ "rewards/answer_format_reward": 0.9091623425483704,
5682
+ "rewards/common_format_reward": 0.9799107313156128,
5683
+ "rewards/doors_consistency_reward": 0.7566964626312256,
5684
+ "rewards/geometry_consistency_reward": 0.7023809552192688,
5685
+ "rewards/prompt_consistency_reward": 0.7142857313156128,
5686
+ "rewards/walls_orthogonality_reward": 0.7827381491661072,
5687
+ "step": 334
5688
+ },
5689
+ {
5690
+ "completion_length": 187.57144165039062,
5691
+ "epoch": 0.27081649151172194,
5692
+ "grad_norm": 0.02899596467614174,
5693
+ "kl": 0.05083652213215828,
5694
+ "learning_rate": 5.833468067906224e-06,
5695
+ "loss": 0.002,
5696
+ "reward": 4.886160850524902,
5697
+ "reward_std": 1.7213134765625,
5698
+ "rewards/answer_format_reward": 0.8883929252624512,
5699
+ "rewards/common_format_reward": 0.9843750596046448,
5700
+ "rewards/doors_consistency_reward": 0.7500000596046448,
5701
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
5702
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
5703
+ "rewards/walls_orthogonality_reward": 0.7633928656578064,
5704
+ "step": 335
5705
+ },
5706
+ {
5707
+ "completion_length": 202.2678680419922,
5708
+ "epoch": 0.2716248989490703,
5709
+ "grad_norm": 0.02415960468351841,
5710
+ "kl": 0.047176457941532135,
5711
+ "learning_rate": 5.827000808407437e-06,
5712
+ "loss": 0.0019,
5713
+ "reward": 4.804342269897461,
5714
+ "reward_std": 1.8799443244934082,
5715
+ "rewards/answer_format_reward": 0.8566964268684387,
5716
+ "rewards/common_format_reward": 0.9709821939468384,
5717
+ "rewards/doors_consistency_reward": 0.7477678656578064,
5718
+ "rewards/geometry_consistency_reward": 0.7321428656578064,
5719
+ "rewards/prompt_consistency_reward": 0.7321428656578064,
5720
+ "rewards/walls_orthogonality_reward": 0.7646104693412781,
5721
+ "step": 336
5722
+ },
5723
+ {
5724
+ "completion_length": 188.17857360839844,
5725
+ "epoch": 0.27243330638641877,
5726
+ "grad_norm": 0.02066534198820591,
5727
+ "kl": 0.042303018271923065,
5728
+ "learning_rate": 5.82053354890865e-06,
5729
+ "loss": 0.0017,
5730
+ "reward": 5.346726417541504,
5731
+ "reward_std": 1.3656119108200073,
5732
+ "rewards/answer_format_reward": 0.9181548357009888,
5733
+ "rewards/common_format_reward": 1.0,
5734
+ "rewards/doors_consistency_reward": 0.8571429252624512,
5735
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
5736
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
5737
+ "rewards/walls_orthogonality_reward": 0.8571429252624512,
5738
+ "step": 337
5739
+ },
5740
+ {
5741
+ "completion_length": 192.57144165039062,
5742
+ "epoch": 0.27324171382376716,
5743
+ "grad_norm": 0.03734434396028519,
5744
+ "kl": 0.04174423590302467,
5745
+ "learning_rate": 5.814066289409862e-06,
5746
+ "loss": 0.0017,
5747
+ "reward": 5.010425090789795,
5748
+ "reward_std": 1.6323578357696533,
5749
+ "rewards/answer_format_reward": 0.9164541363716125,
5750
+ "rewards/common_format_reward": 1.0,
5751
+ "rewards/doors_consistency_reward": 0.7790178656578064,
5752
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
5753
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
5754
+ "rewards/walls_orthogonality_reward": 0.8149529695510864,
5755
+ "step": 338
5756
+ },
5757
+ {
5758
+ "completion_length": 198.1428680419922,
5759
+ "epoch": 0.2740501212611156,
5760
+ "grad_norm": 0.019321925938129425,
5761
+ "kl": 0.029333505779504776,
5762
+ "learning_rate": 5.807599029911075e-06,
5763
+ "loss": 0.0012,
5764
+ "reward": 5.27888298034668,
5765
+ "reward_std": 1.504868984222412,
5766
+ "rewards/answer_format_reward": 0.9255952835083008,
5767
+ "rewards/common_format_reward": 0.9866071939468384,
5768
+ "rewards/doors_consistency_reward": 0.8370535969734192,
5769
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
5770
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
5771
+ "rewards/walls_orthogonality_reward": 0.8867694735527039,
5772
+ "step": 339
5773
+ },
5774
+ {
5775
+ "completion_length": 223.23214721679688,
5776
+ "epoch": 0.274858528698464,
5777
+ "grad_norm": 0.024310695007443428,
5778
+ "kl": 0.04415176063776016,
5779
+ "learning_rate": 5.8011317704122874e-06,
5780
+ "loss": 0.0018,
5781
+ "reward": 4.777827739715576,
5782
+ "reward_std": 2.0516269207000732,
5783
+ "rewards/answer_format_reward": 0.8827381134033203,
5784
+ "rewards/common_format_reward": 0.957589328289032,
5785
+ "rewards/doors_consistency_reward": 0.7276785969734192,
5786
+ "rewards/geometry_consistency_reward": 0.7142857313156128,
5787
+ "rewards/prompt_consistency_reward": 0.7142857313156128,
5788
+ "rewards/walls_orthogonality_reward": 0.7812500596046448,
5789
+ "step": 340
5790
+ },
5791
+ {
5792
+ "completion_length": 211.05357360839844,
5793
+ "epoch": 0.27566693613581245,
5794
+ "grad_norm": 0.019032742828130722,
5795
+ "kl": 0.041281454265117645,
5796
+ "learning_rate": 5.7946645109135e-06,
5797
+ "loss": 0.0017,
5798
+ "reward": 4.716002941131592,
5799
+ "reward_std": 1.4471924304962158,
5800
+ "rewards/answer_format_reward": 0.8913690447807312,
5801
+ "rewards/common_format_reward": 0.9866071939468384,
5802
+ "rewards/doors_consistency_reward": 0.7053571939468384,
5803
+ "rewards/geometry_consistency_reward": 0.660714328289032,
5804
+ "rewards/prompt_consistency_reward": 0.660714328289032,
5805
+ "rewards/walls_orthogonality_reward": 0.8112409114837646,
5806
+ "step": 341
5807
+ },
5808
+ {
5809
+ "completion_length": 211.94644165039062,
5810
+ "epoch": 0.2764753435731609,
5811
+ "grad_norm": 0.15315896272659302,
5812
+ "kl": 0.10882477462291718,
5813
+ "learning_rate": 5.7881972514147135e-06,
5814
+ "loss": 0.0044,
5815
+ "reward": 4.928571701049805,
5816
+ "reward_std": 2.0339837074279785,
5817
+ "rewards/answer_format_reward": 0.87202388048172,
5818
+ "rewards/common_format_reward": 0.9687500596046448,
5819
+ "rewards/doors_consistency_reward": 0.7678571939468384,
5820
+ "rewards/geometry_consistency_reward": 0.7678571939468384,
5821
+ "rewards/prompt_consistency_reward": 0.7678571939468384,
5822
+ "rewards/walls_orthogonality_reward": 0.7842262387275696,
5823
+ "step": 342
5824
+ },
5825
+ {
5826
+ "completion_length": 186.57144165039062,
5827
+ "epoch": 0.2772837510105093,
5828
+ "grad_norm": 0.026153016835451126,
5829
+ "kl": 0.039734289050102234,
5830
+ "learning_rate": 5.781729991915926e-06,
5831
+ "loss": 0.0016,
5832
+ "reward": 5.36919641494751,
5833
+ "reward_std": 1.3187062740325928,
5834
+ "rewards/answer_format_reward": 0.9354166984558105,
5835
+ "rewards/common_format_reward": 0.988839328289032,
5836
+ "rewards/doors_consistency_reward": 0.8571429252624512,
5837
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
5838
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
5839
+ "rewards/walls_orthogonality_reward": 0.8735119104385376,
5840
+ "step": 343
5841
+ },
5842
+ {
5843
+ "completion_length": 188.4107208251953,
5844
+ "epoch": 0.27809215844785773,
5845
+ "grad_norm": 0.015121434815227985,
5846
+ "kl": 0.04189826175570488,
5847
+ "learning_rate": 5.775262732417138e-06,
5848
+ "loss": 0.0017,
5849
+ "reward": 5.461309909820557,
5850
+ "reward_std": 0.9177731275558472,
5851
+ "rewards/answer_format_reward": 0.9568452835083008,
5852
+ "rewards/common_format_reward": 0.9955357313156128,
5853
+ "rewards/doors_consistency_reward": 0.8705357313156128,
5854
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
5855
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
5856
+ "rewards/walls_orthogonality_reward": 0.9241071939468384,
5857
+ "step": 344
5858
+ },
5859
+ {
5860
+ "completion_length": 189.23214721679688,
5861
+ "epoch": 0.2789005658852061,
5862
+ "grad_norm": 0.026444820687174797,
5863
+ "kl": 0.047124091535806656,
5864
+ "learning_rate": 5.76879547291835e-06,
5865
+ "loss": 0.0019,
5866
+ "reward": 4.651190757751465,
5867
+ "reward_std": 2.0851433277130127,
5868
+ "rewards/answer_format_reward": 0.8906250596046448,
5869
+ "rewards/common_format_reward": 0.9843750596046448,
5870
+ "rewards/doors_consistency_reward": 0.6785714626312256,
5871
+ "rewards/geometry_consistency_reward": 0.6785714626312256,
5872
+ "rewards/prompt_consistency_reward": 0.6785714626312256,
5873
+ "rewards/walls_orthogonality_reward": 0.7404762506484985,
5874
+ "step": 345
5875
+ },
5876
+ {
5877
+ "completion_length": 198.21429443359375,
5878
+ "epoch": 0.27970897332255457,
5879
+ "grad_norm": 0.017690090462565422,
5880
+ "kl": 0.03677573427557945,
5881
+ "learning_rate": 5.762328213419563e-06,
5882
+ "loss": 0.0015,
5883
+ "reward": 5.299851417541504,
5884
+ "reward_std": 1.3098024129867554,
5885
+ "rewards/answer_format_reward": 0.9382440447807312,
5886
+ "rewards/common_format_reward": 0.988839328289032,
5887
+ "rewards/doors_consistency_reward": 0.8549107313156128,
5888
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
5889
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
5890
+ "rewards/walls_orthogonality_reward": 0.8750000596046448,
5891
+ "step": 346
5892
+ },
5893
+ {
5894
+ "completion_length": 207.67857360839844,
5895
+ "epoch": 0.280517380759903,
5896
+ "grad_norm": 0.02095929905772209,
5897
+ "kl": 0.042624641209840775,
5898
+ "learning_rate": 5.755860953920776e-06,
5899
+ "loss": 0.0017,
5900
+ "reward": 5.204761505126953,
5901
+ "reward_std": 1.383862853050232,
5902
+ "rewards/answer_format_reward": 0.9473214745521545,
5903
+ "rewards/common_format_reward": 0.9933035969734192,
5904
+ "rewards/doors_consistency_reward": 0.8191964626312256,
5905
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
5906
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
5907
+ "rewards/walls_orthogonality_reward": 0.8377977013587952,
5908
+ "step": 347
5909
+ },
5910
+ {
5911
+ "completion_length": 226.08929443359375,
5912
+ "epoch": 0.2813257881972514,
5913
+ "grad_norm": 0.04316651448607445,
5914
+ "kl": 0.049189042299985886,
5915
+ "learning_rate": 5.749393694421988e-06,
5916
+ "loss": 0.002,
5917
+ "reward": 4.90625,
5918
+ "reward_std": 1.8971381187438965,
5919
+ "rewards/answer_format_reward": 0.8630953431129456,
5920
+ "rewards/common_format_reward": 0.9553571939468384,
5921
+ "rewards/doors_consistency_reward": 0.7678571939468384,
5922
+ "rewards/geometry_consistency_reward": 0.7678571939468384,
5923
+ "rewards/prompt_consistency_reward": 0.7678571939468384,
5924
+ "rewards/walls_orthogonality_reward": 0.7842261791229248,
5925
+ "step": 348
5926
+ },
5927
+ {
5928
+ "completion_length": 187.05357360839844,
5929
+ "epoch": 0.28213419563459985,
5930
+ "grad_norm": 0.019362403079867363,
5931
+ "kl": 0.036033060401678085,
5932
+ "learning_rate": 5.742926434923201e-06,
5933
+ "loss": 0.0014,
5934
+ "reward": 5.147931098937988,
5935
+ "reward_std": 1.363024115562439,
5936
+ "rewards/answer_format_reward": 0.944940447807312,
5937
+ "rewards/common_format_reward": 1.0,
5938
+ "rewards/doors_consistency_reward": 0.7834821939468384,
5939
+ "rewards/geometry_consistency_reward": 0.7678571939468384,
5940
+ "rewards/prompt_consistency_reward": 0.7678571939468384,
5941
+ "rewards/walls_orthogonality_reward": 0.8837933540344238,
5942
+ "step": 349
5943
+ },
5944
+ {
5945
+ "completion_length": 189.83929443359375,
5946
+ "epoch": 0.28294260307194824,
5947
+ "grad_norm": 0.021790187805891037,
5948
+ "kl": 0.03598780557513237,
5949
+ "learning_rate": 5.736459175424413e-06,
5950
+ "loss": 0.0014,
5951
+ "reward": 5.563988208770752,
5952
+ "reward_std": 0.8080568909645081,
5953
+ "rewards/answer_format_reward": 0.9657738208770752,
5954
+ "rewards/common_format_reward": 0.9933035969734192,
5955
+ "rewards/doors_consistency_reward": 0.9084821939468384,
5956
+ "rewards/geometry_consistency_reward": 0.8928571939468384,
5957
+ "rewards/prompt_consistency_reward": 0.8928571939468384,
5958
+ "rewards/walls_orthogonality_reward": 0.910714328289032,
5959
+ "step": 350
5960
+ },
5961
+ {
5962
+ "completion_length": 188.17857360839844,
5963
+ "epoch": 0.2837510105092967,
5964
+ "grad_norm": 0.018759772181510925,
5965
+ "kl": 0.033785514533519745,
5966
+ "learning_rate": 5.729991915925626e-06,
5967
+ "loss": 0.0014,
5968
+ "reward": 5.249851226806641,
5969
+ "reward_std": 1.5347867012023926,
5970
+ "rewards/answer_format_reward": 0.9514881372451782,
5971
+ "rewards/common_format_reward": 0.988839328289032,
5972
+ "rewards/doors_consistency_reward": 0.8035714626312256,
5973
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
5974
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
5975
+ "rewards/walls_orthogonality_reward": 0.8988096117973328,
5976
+ "step": 351
5977
+ },
5978
+ {
5979
+ "completion_length": 187.83929443359375,
5980
+ "epoch": 0.28455941794664513,
5981
+ "grad_norm": 0.025501880794763565,
5982
+ "kl": 0.034323111176490784,
5983
+ "learning_rate": 5.723524656426839e-06,
5984
+ "loss": 0.0014,
5985
+ "reward": 5.418899059295654,
5986
+ "reward_std": 1.1246989965438843,
5987
+ "rewards/answer_format_reward": 0.9270833134651184,
5988
+ "rewards/common_format_reward": 0.9754464626312256,
5989
+ "rewards/doors_consistency_reward": 0.8750000596046448,
5990
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
5991
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
5992
+ "rewards/walls_orthogonality_reward": 0.891369104385376,
5993
+ "step": 352
5994
+ },
5995
+ {
5996
+ "completion_length": 206.32144165039062,
5997
+ "epoch": 0.2853678253839935,
5998
+ "grad_norm": 0.024359941482543945,
5999
+ "kl": 0.05647893622517586,
6000
+ "learning_rate": 5.7170573969280515e-06,
6001
+ "loss": 0.0023,
6002
+ "reward": 5.103848457336426,
6003
+ "reward_std": 1.5120278596878052,
6004
+ "rewards/answer_format_reward": 0.9055059552192688,
6005
+ "rewards/common_format_reward": 0.9687500596046448,
6006
+ "rewards/doors_consistency_reward": 0.8035714626312256,
6007
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
6008
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
6009
+ "rewards/walls_orthogonality_reward": 0.8188775777816772,
6010
+ "step": 353
6011
+ },
6012
+ {
6013
+ "completion_length": 185.32144165039062,
6014
+ "epoch": 0.28617623282134197,
6015
+ "grad_norm": 0.014006072655320168,
6016
+ "kl": 0.032776981592178345,
6017
+ "learning_rate": 5.710590137429264e-06,
6018
+ "loss": 0.0013,
6019
+ "reward": 5.7990498542785645,
6020
+ "reward_std": 0.5683748126029968,
6021
+ "rewards/answer_format_reward": 0.9821429252624512,
6022
+ "rewards/common_format_reward": 1.0,
6023
+ "rewards/doors_consistency_reward": 0.9464285969734192,
6024
+ "rewards/geometry_consistency_reward": 0.9464285969734192,
6025
+ "rewards/prompt_consistency_reward": 0.9464285969734192,
6026
+ "rewards/walls_orthogonality_reward": 0.9776206612586975,
6027
+ "step": 354
6028
+ },
6029
+ {
6030
+ "completion_length": 185.80357360839844,
6031
+ "epoch": 0.28698464025869036,
6032
+ "grad_norm": 0.015259346924722195,
6033
+ "kl": 0.04929060861468315,
6034
+ "learning_rate": 5.704122877930477e-06,
6035
+ "loss": 0.002,
6036
+ "reward": 5.380952835083008,
6037
+ "reward_std": 1.1333816051483154,
6038
+ "rewards/answer_format_reward": 0.9471726417541504,
6039
+ "rewards/common_format_reward": 0.988839328289032,
6040
+ "rewards/doors_consistency_reward": 0.8571429252624512,
6041
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
6042
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
6043
+ "rewards/walls_orthogonality_reward": 0.8735119700431824,
6044
+ "step": 355
6045
+ },
6046
+ {
6047
+ "completion_length": 203.32144165039062,
6048
+ "epoch": 0.2877930476960388,
6049
+ "grad_norm": 0.01948886550962925,
6050
+ "kl": 0.0409274660050869,
6051
+ "learning_rate": 5.69765561843169e-06,
6052
+ "loss": 0.0016,
6053
+ "reward": 4.833333969116211,
6054
+ "reward_std": 1.8322272300720215,
6055
+ "rewards/answer_format_reward": 0.9047620296478271,
6056
+ "rewards/common_format_reward": 0.9687500596046448,
6057
+ "rewards/doors_consistency_reward": 0.7321428656578064,
6058
+ "rewards/geometry_consistency_reward": 0.7321428656578064,
6059
+ "rewards/prompt_consistency_reward": 0.7321428656578064,
6060
+ "rewards/walls_orthogonality_reward": 0.7633928656578064,
6061
+ "step": 356
6062
+ },
6063
+ {
6064
+ "completion_length": 192.0357208251953,
6065
+ "epoch": 0.28860145513338725,
6066
+ "grad_norm": 0.05670643970370293,
6067
+ "kl": 0.06968912482261658,
6068
+ "learning_rate": 5.691188358932902e-06,
6069
+ "loss": 0.0028,
6070
+ "reward": 4.667113304138184,
6071
+ "reward_std": 2.0373687744140625,
6072
+ "rewards/answer_format_reward": 0.8739583492279053,
6073
+ "rewards/common_format_reward": 0.9910714626312256,
6074
+ "rewards/doors_consistency_reward": 0.6964285969734192,
6075
+ "rewards/geometry_consistency_reward": 0.6964285969734192,
6076
+ "rewards/prompt_consistency_reward": 0.6964285969734192,
6077
+ "rewards/walls_orthogonality_reward": 0.7127977013587952,
6078
+ "step": 357
6079
+ },
6080
+ {
6081
+ "completion_length": 194.92857360839844,
6082
+ "epoch": 0.28940986257073564,
6083
+ "grad_norm": 0.02134573645889759,
6084
+ "kl": 0.039342574775218964,
6085
+ "learning_rate": 5.684721099434115e-06,
6086
+ "loss": 0.0016,
6087
+ "reward": 5.02336311340332,
6088
+ "reward_std": 1.5655544996261597,
6089
+ "rewards/answer_format_reward": 0.9244047999382019,
6090
+ "rewards/common_format_reward": 0.9843750596046448,
6091
+ "rewards/doors_consistency_reward": 0.7946428656578064,
6092
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
6093
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
6094
+ "rewards/walls_orthogonality_reward": 0.819940447807312,
6095
+ "step": 358
6096
+ },
6097
+ {
6098
+ "completion_length": 190.55357360839844,
6099
+ "epoch": 0.2902182700080841,
6100
+ "grad_norm": 0.029027465730905533,
6101
+ "kl": 0.0330372117459774,
6102
+ "learning_rate": 5.678253839935327e-06,
6103
+ "loss": 0.0013,
6104
+ "reward": 5.3899617195129395,
6105
+ "reward_std": 1.2358527183532715,
6106
+ "rewards/answer_format_reward": 0.9315476417541504,
6107
+ "rewards/common_format_reward": 0.9709821939468384,
6108
+ "rewards/doors_consistency_reward": 0.8571429252624512,
6109
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
6110
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
6111
+ "rewards/walls_orthogonality_reward": 0.9160028100013733,
6112
+ "step": 359
6113
+ },
6114
+ {
6115
+ "completion_length": 198.10714721679688,
6116
+ "epoch": 0.2910266774454325,
6117
+ "grad_norm": 0.021329175680875778,
6118
+ "kl": 0.0386892668902874,
6119
+ "learning_rate": 5.671786580436539e-06,
6120
+ "loss": 0.0015,
6121
+ "reward": 5.020833969116211,
6122
+ "reward_std": 1.72552490234375,
6123
+ "rewards/answer_format_reward": 0.8244048357009888,
6124
+ "rewards/common_format_reward": 0.9821429252624512,
6125
+ "rewards/doors_consistency_reward": 0.8035714626312256,
6126
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
6127
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
6128
+ "rewards/walls_orthogonality_reward": 0.8035714626312256,
6129
+ "step": 360
6130
+ },
6131
+ {
6132
+ "completion_length": 189.1607208251953,
6133
+ "epoch": 0.2918350848827809,
6134
+ "grad_norm": 0.027224717661738396,
6135
+ "kl": 0.04358064383268356,
6136
+ "learning_rate": 5.665319320937753e-06,
6137
+ "loss": 0.0017,
6138
+ "reward": 5.206845283508301,
6139
+ "reward_std": 1.5427162647247314,
6140
+ "rewards/answer_format_reward": 0.8928571939468384,
6141
+ "rewards/common_format_reward": 0.9799107313156128,
6142
+ "rewards/doors_consistency_reward": 0.8370535969734192,
6143
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
6144
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
6145
+ "rewards/walls_orthogonality_reward": 0.8541666865348816,
6146
+ "step": 361
6147
+ },
6148
+ {
6149
+ "completion_length": 209.12501525878906,
6150
+ "epoch": 0.2926434923201294,
6151
+ "grad_norm": 0.02416359819471836,
6152
+ "kl": 0.043135691434144974,
6153
+ "learning_rate": 5.658852061438965e-06,
6154
+ "loss": 0.0017,
6155
+ "reward": 5.290923118591309,
6156
+ "reward_std": 1.3799084424972534,
6157
+ "rewards/answer_format_reward": 0.9285714626312256,
6158
+ "rewards/common_format_reward": 0.9709821939468384,
6159
+ "rewards/doors_consistency_reward": 0.8392857313156128,
6160
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6161
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6162
+ "rewards/walls_orthogonality_reward": 0.8735119700431824,
6163
+ "step": 362
6164
+ },
6165
+ {
6166
+ "completion_length": 203.37501525878906,
6167
+ "epoch": 0.29345189975747776,
6168
+ "grad_norm": 0.02054131217300892,
6169
+ "kl": 0.034598227590322495,
6170
+ "learning_rate": 5.6523848019401775e-06,
6171
+ "loss": 0.0014,
6172
+ "reward": 4.853287696838379,
6173
+ "reward_std": 2.001997947692871,
6174
+ "rewards/answer_format_reward": 0.8779762387275696,
6175
+ "rewards/common_format_reward": 0.9531250596046448,
6176
+ "rewards/doors_consistency_reward": 0.7321428656578064,
6177
+ "rewards/geometry_consistency_reward": 0.7321428656578064,
6178
+ "rewards/prompt_consistency_reward": 0.7321428656578064,
6179
+ "rewards/walls_orthogonality_reward": 0.825757622718811,
6180
+ "step": 363
6181
+ },
6182
+ {
6183
+ "completion_length": 202.92857360839844,
6184
+ "epoch": 0.2942603071948262,
6185
+ "grad_norm": 0.027310578152537346,
6186
+ "kl": 0.044535476714372635,
6187
+ "learning_rate": 5.6459175424413905e-06,
6188
+ "loss": 0.0018,
6189
+ "reward": 4.909489631652832,
6190
+ "reward_std": 1.6785434484481812,
6191
+ "rewards/answer_format_reward": 0.9011905193328857,
6192
+ "rewards/common_format_reward": 0.9754464626312256,
6193
+ "rewards/doors_consistency_reward": 0.7500000596046448,
6194
+ "rewards/geometry_consistency_reward": 0.7500000596046448,
6195
+ "rewards/prompt_consistency_reward": 0.7500000596046448,
6196
+ "rewards/walls_orthogonality_reward": 0.7828525900840759,
6197
+ "step": 364
6198
+ },
6199
+ {
6200
+ "completion_length": 185.25001525878906,
6201
+ "epoch": 0.2950687146321746,
6202
+ "grad_norm": 0.0148573387414217,
6203
+ "kl": 0.044098082929849625,
6204
+ "learning_rate": 5.639450282942603e-06,
6205
+ "loss": 0.0018,
6206
+ "reward": 5.429789066314697,
6207
+ "reward_std": 0.9370274543762207,
6208
+ "rewards/answer_format_reward": 0.9769346117973328,
6209
+ "rewards/common_format_reward": 1.0,
6210
+ "rewards/doors_consistency_reward": 0.8683035969734192,
6211
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6212
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6213
+ "rewards/walls_orthogonality_reward": 0.9059794545173645,
6214
+ "step": 365
6215
+ },
6216
+ {
6217
+ "completion_length": 190.55357360839844,
6218
+ "epoch": 0.29587712206952305,
6219
+ "grad_norm": 0.01718834601342678,
6220
+ "kl": 0.039552826434373856,
6221
+ "learning_rate": 5.632983023443816e-06,
6222
+ "loss": 0.0016,
6223
+ "reward": 5.2567901611328125,
6224
+ "reward_std": 1.1837490797042847,
6225
+ "rewards/answer_format_reward": 0.9613096117973328,
6226
+ "rewards/common_format_reward": 1.0,
6227
+ "rewards/doors_consistency_reward": 0.8191964626312256,
6228
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
6229
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
6230
+ "rewards/walls_orthogonality_reward": 0.8691413402557373,
6231
+ "step": 366
6232
+ },
6233
+ {
6234
+ "completion_length": 186.42857360839844,
6235
+ "epoch": 0.29668552950687144,
6236
+ "grad_norm": 0.02943878434598446,
6237
+ "kl": 0.0521557442843914,
6238
+ "learning_rate": 5.626515763945028e-06,
6239
+ "loss": 0.0021,
6240
+ "reward": 5.131811141967773,
6241
+ "reward_std": 1.5309616327285767,
6242
+ "rewards/answer_format_reward": 0.929315447807312,
6243
+ "rewards/common_format_reward": 0.9955357313156128,
6244
+ "rewards/doors_consistency_reward": 0.7991071939468384,
6245
+ "rewards/geometry_consistency_reward": 0.785714328289032,
6246
+ "rewards/prompt_consistency_reward": 0.785714328289032,
6247
+ "rewards/walls_orthogonality_reward": 0.8364240527153015,
6248
+ "step": 367
6249
+ },
6250
+ {
6251
+ "completion_length": 198.9107208251953,
6252
+ "epoch": 0.2974939369442199,
6253
+ "grad_norm": 0.02020343765616417,
6254
+ "kl": 0.04528440162539482,
6255
+ "learning_rate": 5.620048504446241e-06,
6256
+ "loss": 0.0018,
6257
+ "reward": 5.265996932983398,
6258
+ "reward_std": 1.344224214553833,
6259
+ "rewards/answer_format_reward": 0.9475446939468384,
6260
+ "rewards/common_format_reward": 1.0,
6261
+ "rewards/doors_consistency_reward": 0.8214285969734192,
6262
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
6263
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
6264
+ "rewards/walls_orthogonality_reward": 0.8541666865348816,
6265
+ "step": 368
6266
+ },
6267
+ {
6268
+ "completion_length": 193.62501525878906,
6269
+ "epoch": 0.29830234438156833,
6270
+ "grad_norm": 0.018543804064393044,
6271
+ "kl": 0.03651433065533638,
6272
+ "learning_rate": 5.613581244947453e-06,
6273
+ "loss": 0.0015,
6274
+ "reward": 5.712798118591309,
6275
+ "reward_std": 0.7763786911964417,
6276
+ "rewards/answer_format_reward": 0.9568452835083008,
6277
+ "rewards/common_format_reward": 0.9821429252624512,
6278
+ "rewards/doors_consistency_reward": 0.9464285969734192,
6279
+ "rewards/geometry_consistency_reward": 0.9345238208770752,
6280
+ "rewards/prompt_consistency_reward": 0.9464285969734192,
6281
+ "rewards/walls_orthogonality_reward": 0.9464285969734192,
6282
+ "step": 369
6283
+ },
6284
+ {
6285
+ "completion_length": 185.96429443359375,
6286
+ "epoch": 0.2991107518189167,
6287
+ "grad_norm": 0.013525901362299919,
6288
+ "kl": 0.04148541018366814,
6289
+ "learning_rate": 5.607113985448665e-06,
6290
+ "loss": 0.0017,
6291
+ "reward": 5.389880657196045,
6292
+ "reward_std": 0.9092532992362976,
6293
+ "rewards/answer_format_reward": 0.9434524178504944,
6294
+ "rewards/common_format_reward": 1.0,
6295
+ "rewards/doors_consistency_reward": 0.8571429252624512,
6296
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
6297
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
6298
+ "rewards/walls_orthogonality_reward": 0.8750000596046448,
6299
+ "step": 370
6300
+ },
6301
+ {
6302
+ "completion_length": 190.32144165039062,
6303
+ "epoch": 0.29991915925626517,
6304
+ "grad_norm": 0.02009180374443531,
6305
+ "kl": 0.03104526363313198,
6306
+ "learning_rate": 5.600646725949879e-06,
6307
+ "loss": 0.0012,
6308
+ "reward": 5.639562606811523,
6309
+ "reward_std": 0.7105973958969116,
6310
+ "rewards/answer_format_reward": 0.961309552192688,
6311
+ "rewards/common_format_reward": 0.988839328289032,
6312
+ "rewards/doors_consistency_reward": 0.9241071939468384,
6313
+ "rewards/geometry_consistency_reward": 0.910714328289032,
6314
+ "rewards/prompt_consistency_reward": 0.910714328289032,
6315
+ "rewards/walls_orthogonality_reward": 0.9438775777816772,
6316
+ "step": 371
6317
+ },
6318
+ {
6319
+ "completion_length": 205.42857360839844,
6320
+ "epoch": 0.30072756669361356,
6321
+ "grad_norm": 0.026932930573821068,
6322
+ "kl": 0.03607700765132904,
6323
+ "learning_rate": 5.594179466451091e-06,
6324
+ "loss": 0.0014,
6325
+ "reward": 5.0540452003479,
6326
+ "reward_std": 1.4534943103790283,
6327
+ "rewards/answer_format_reward": 0.8958333134651184,
6328
+ "rewards/common_format_reward": 0.9665179252624512,
6329
+ "rewards/doors_consistency_reward": 0.785714328289032,
6330
+ "rewards/geometry_consistency_reward": 0.785714328289032,
6331
+ "rewards/prompt_consistency_reward": 0.785714328289032,
6332
+ "rewards/walls_orthogonality_reward": 0.8345509767532349,
6333
+ "step": 372
6334
+ },
6335
+ {
6336
+ "completion_length": 185.0357208251953,
6337
+ "epoch": 0.301535974130962,
6338
+ "grad_norm": 0.013487469404935837,
6339
+ "kl": 0.038467638194561005,
6340
+ "learning_rate": 5.587712206952303e-06,
6341
+ "loss": 0.0015,
6342
+ "reward": 5.34956693649292,
6343
+ "reward_std": 1.1325931549072266,
6344
+ "rewards/answer_format_reward": 0.9404762387275696,
6345
+ "rewards/common_format_reward": 0.9866071939468384,
6346
+ "rewards/doors_consistency_reward": 0.8392857313156128,
6347
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6348
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6349
+ "rewards/walls_orthogonality_reward": 0.9046266078948975,
6350
+ "step": 373
6351
+ },
6352
+ {
6353
+ "completion_length": 195.92857360839844,
6354
+ "epoch": 0.30234438156831045,
6355
+ "grad_norm": 0.02199031412601471,
6356
+ "kl": 0.038897089660167694,
6357
+ "learning_rate": 5.581244947453516e-06,
6358
+ "loss": 0.0016,
6359
+ "reward": 5.29923152923584,
6360
+ "reward_std": 1.3177196979522705,
6361
+ "rewards/answer_format_reward": 0.9301835298538208,
6362
+ "rewards/common_format_reward": 0.9955357313156128,
6363
+ "rewards/doors_consistency_reward": 0.8392857313156128,
6364
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6365
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6366
+ "rewards/walls_orthogonality_reward": 0.8556548357009888,
6367
+ "step": 374
6368
+ },
6369
+ {
6370
+ "completion_length": 189.42857360839844,
6371
+ "epoch": 0.30315278900565884,
6372
+ "grad_norm": 0.03327542170882225,
6373
+ "kl": 0.03342190384864807,
6374
+ "learning_rate": 5.5747776879547286e-06,
6375
+ "loss": 0.0013,
6376
+ "reward": 5.373749256134033,
6377
+ "reward_std": 1.151526689529419,
6378
+ "rewards/answer_format_reward": 0.9594494700431824,
6379
+ "rewards/common_format_reward": 0.9910714626312256,
6380
+ "rewards/doors_consistency_reward": 0.8549107313156128,
6381
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6382
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6383
+ "rewards/walls_orthogonality_reward": 0.8897457718849182,
6384
+ "step": 375
6385
+ },
6386
+ {
6387
+ "completion_length": 183.71429443359375,
6388
+ "epoch": 0.3039611964430073,
6389
+ "grad_norm": 0.043908070772886276,
6390
+ "kl": 0.03916674479842186,
6391
+ "learning_rate": 5.5683104284559416e-06,
6392
+ "loss": 0.0016,
6393
+ "reward": 5.370039939880371,
6394
+ "reward_std": 1.4789761304855347,
6395
+ "rewards/answer_format_reward": 0.9270833134651184,
6396
+ "rewards/common_format_reward": 0.9821429252624512,
6397
+ "rewards/doors_consistency_reward": 0.8571429252624512,
6398
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
6399
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
6400
+ "rewards/walls_orthogonality_reward": 0.8893849849700928,
6401
+ "step": 376
6402
+ },
6403
+ {
6404
+ "completion_length": 193.30357360839844,
6405
+ "epoch": 0.3047696038803557,
6406
+ "grad_norm": 0.02499815821647644,
6407
+ "kl": 0.03329702839255333,
6408
+ "learning_rate": 5.561843168957155e-06,
6409
+ "loss": 0.0013,
6410
+ "reward": 5.333482265472412,
6411
+ "reward_std": 1.466332197189331,
6412
+ "rewards/answer_format_reward": 0.9458333849906921,
6413
+ "rewards/common_format_reward": 0.9821429252624512,
6414
+ "rewards/doors_consistency_reward": 0.8549107313156128,
6415
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6416
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6417
+ "rewards/walls_orthogonality_reward": 0.87202388048172,
6418
+ "step": 377
6419
+ },
6420
+ {
6421
+ "completion_length": 191.0357208251953,
6422
+ "epoch": 0.3055780113177041,
6423
+ "grad_norm": 0.020524630323052406,
6424
+ "kl": 0.03136897832155228,
6425
+ "learning_rate": 5.555375909458367e-06,
6426
+ "loss": 0.0013,
6427
+ "reward": 5.382760047912598,
6428
+ "reward_std": 1.3234719038009644,
6429
+ "rewards/answer_format_reward": 0.9489796161651611,
6430
+ "rewards/common_format_reward": 0.9977679252624512,
6431
+ "rewards/doors_consistency_reward": 0.8392857313156128,
6432
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6433
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6434
+ "rewards/walls_orthogonality_reward": 0.918154776096344,
6435
+ "step": 378
6436
+ },
6437
+ {
6438
+ "completion_length": 188.44644165039062,
6439
+ "epoch": 0.30638641875505257,
6440
+ "grad_norm": 0.021576276049017906,
6441
+ "kl": 0.04590163379907608,
6442
+ "learning_rate": 5.548908649959579e-06,
6443
+ "loss": 0.0018,
6444
+ "reward": 5.5309529304504395,
6445
+ "reward_std": 1.0420526266098022,
6446
+ "rewards/answer_format_reward": 0.9431548118591309,
6447
+ "rewards/common_format_reward": 1.0,
6448
+ "rewards/doors_consistency_reward": 0.8928571939468384,
6449
+ "rewards/geometry_consistency_reward": 0.8928571939468384,
6450
+ "rewards/prompt_consistency_reward": 0.8928571939468384,
6451
+ "rewards/walls_orthogonality_reward": 0.9092261791229248,
6452
+ "step": 379
6453
+ },
6454
+ {
6455
+ "completion_length": 197.17857360839844,
6456
+ "epoch": 0.30719482619240096,
6457
+ "grad_norm": 0.03557493910193443,
6458
+ "kl": 0.035080306231975555,
6459
+ "learning_rate": 5.542441390460792e-06,
6460
+ "loss": 0.0014,
6461
+ "reward": 5.494048118591309,
6462
+ "reward_std": 1.340869426727295,
6463
+ "rewards/answer_format_reward": 0.938988208770752,
6464
+ "rewards/common_format_reward": 0.9910714626312256,
6465
+ "rewards/doors_consistency_reward": 0.8883929252624512,
6466
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
6467
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
6468
+ "rewards/walls_orthogonality_reward": 0.9255952835083008,
6469
+ "step": 380
6470
+ },
6471
+ {
6472
+ "completion_length": 191.1428680419922,
6473
+ "epoch": 0.3080032336297494,
6474
+ "grad_norm": 0.023244433104991913,
6475
+ "kl": 0.046850427985191345,
6476
+ "learning_rate": 5.535974130962005e-06,
6477
+ "loss": 0.0019,
6478
+ "reward": 5.303869247436523,
6479
+ "reward_std": 1.5219762325286865,
6480
+ "rewards/answer_format_reward": 0.9244047999382019,
6481
+ "rewards/common_format_reward": 0.9910714626312256,
6482
+ "rewards/doors_consistency_reward": 0.8392857313156128,
6483
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6484
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6485
+ "rewards/walls_orthogonality_reward": 0.8705357313156128,
6486
+ "step": 381
6487
+ },
6488
+ {
6489
+ "completion_length": 186.62501525878906,
6490
+ "epoch": 0.3088116410670978,
6491
+ "grad_norm": 0.020258748903870583,
6492
+ "kl": 0.04046356678009033,
6493
+ "learning_rate": 5.529506871463217e-06,
6494
+ "loss": 0.0016,
6495
+ "reward": 5.550460338592529,
6496
+ "reward_std": 0.9017711877822876,
6497
+ "rewards/answer_format_reward": 0.9761905670166016,
6498
+ "rewards/common_format_reward": 1.0,
6499
+ "rewards/doors_consistency_reward": 0.8750000596046448,
6500
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
6501
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
6502
+ "rewards/walls_orthogonality_reward": 0.9492694735527039,
6503
+ "step": 382
6504
+ },
6505
+ {
6506
+ "completion_length": 197.44644165039062,
6507
+ "epoch": 0.30962004850444624,
6508
+ "grad_norm": 0.023141799494624138,
6509
+ "kl": 0.0362992063164711,
6510
+ "learning_rate": 5.52303961196443e-06,
6511
+ "loss": 0.0015,
6512
+ "reward": 5.497555732727051,
6513
+ "reward_std": 1.248864769935608,
6514
+ "rewards/answer_format_reward": 0.9343112111091614,
6515
+ "rewards/common_format_reward": 0.9754464626312256,
6516
+ "rewards/doors_consistency_reward": 0.8928571939468384,
6517
+ "rewards/geometry_consistency_reward": 0.8928571939468384,
6518
+ "rewards/prompt_consistency_reward": 0.8928571939468384,
6519
+ "rewards/walls_orthogonality_reward": 0.9092262387275696,
6520
+ "step": 383
6521
+ },
6522
+ {
6523
+ "completion_length": 186.85714721679688,
6524
+ "epoch": 0.3104284559417947,
6525
+ "grad_norm": 0.01589200645685196,
6526
+ "kl": 0.03602542355656624,
6527
+ "learning_rate": 5.516572352465642e-06,
6528
+ "loss": 0.0014,
6529
+ "reward": 5.519792079925537,
6530
+ "reward_std": 1.0965229272842407,
6531
+ "rewards/answer_format_reward": 0.953869104385376,
6532
+ "rewards/common_format_reward": 1.0,
6533
+ "rewards/doors_consistency_reward": 0.8906250596046448,
6534
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
6535
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
6536
+ "rewards/walls_orthogonality_reward": 0.9252976179122925,
6537
+ "step": 384
6538
+ },
6539
+ {
6540
+ "completion_length": 194.00001525878906,
6541
+ "epoch": 0.3112368633791431,
6542
+ "grad_norm": 0.020183054730296135,
6543
+ "kl": 0.045087799429893494,
6544
+ "learning_rate": 5.5101050929668545e-06,
6545
+ "loss": 0.0018,
6546
+ "reward": 5.593005657196045,
6547
+ "reward_std": 1.0252506732940674,
6548
+ "rewards/answer_format_reward": 0.9531250596046448,
6549
+ "rewards/common_format_reward": 0.9821429252624512,
6550
+ "rewards/doors_consistency_reward": 0.910714328289032,
6551
+ "rewards/geometry_consistency_reward": 0.910714328289032,
6552
+ "rewards/prompt_consistency_reward": 0.910714328289032,
6553
+ "rewards/walls_orthogonality_reward": 0.9255952835083008,
6554
+ "step": 385
6555
+ },
6556
+ {
6557
+ "completion_length": 189.17857360839844,
6558
+ "epoch": 0.3120452708164915,
6559
+ "grad_norm": 0.02077249437570572,
6560
+ "kl": 0.03618603199720383,
6561
+ "learning_rate": 5.503637833468068e-06,
6562
+ "loss": 0.0014,
6563
+ "reward": 5.324617862701416,
6564
+ "reward_std": 1.528430700302124,
6565
+ "rewards/answer_format_reward": 0.9600340723991394,
6566
+ "rewards/common_format_reward": 0.9933035969734192,
6567
+ "rewards/doors_consistency_reward": 0.8370535969734192,
6568
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
6569
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
6570
+ "rewards/walls_orthogonality_reward": 0.891369104385376,
6571
+ "step": 386
6572
+ },
6573
+ {
6574
+ "completion_length": 188.08929443359375,
6575
+ "epoch": 0.3128536782538399,
6576
+ "grad_norm": 0.036322079598903656,
6577
+ "kl": 0.03895815834403038,
6578
+ "learning_rate": 5.4971705739692805e-06,
6579
+ "loss": 0.0016,
6580
+ "reward": 5.3370537757873535,
6581
+ "reward_std": 1.0119202136993408,
6582
+ "rewards/answer_format_reward": 0.9434524178504944,
6583
+ "rewards/common_format_reward": 0.9955357313156128,
6584
+ "rewards/doors_consistency_reward": 0.863839328289032,
6585
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
6586
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
6587
+ "rewards/walls_orthogonality_reward": 0.891369104385376,
6588
+ "step": 387
6589
+ },
6590
+ {
6591
+ "completion_length": 186.80357360839844,
6592
+ "epoch": 0.31366208569118836,
6593
+ "grad_norm": 0.022763868793845177,
6594
+ "kl": 0.04497779905796051,
6595
+ "learning_rate": 5.490703314470493e-06,
6596
+ "loss": 0.0018,
6597
+ "reward": 4.907108306884766,
6598
+ "reward_std": 1.844080924987793,
6599
+ "rewards/answer_format_reward": 0.9367559552192688,
6600
+ "rewards/common_format_reward": 1.0,
6601
+ "rewards/doors_consistency_reward": 0.7142857313156128,
6602
+ "rewards/geometry_consistency_reward": 0.7142857313156128,
6603
+ "rewards/prompt_consistency_reward": 0.7142857313156128,
6604
+ "rewards/walls_orthogonality_reward": 0.8274954557418823,
6605
+ "step": 388
6606
+ },
6607
+ {
6608
+ "completion_length": 188.30357360839844,
6609
+ "epoch": 0.3144704931285368,
6610
+ "grad_norm": 0.015382261015474796,
6611
+ "kl": 0.0341610386967659,
6612
+ "learning_rate": 5.484236054971706e-06,
6613
+ "loss": 0.0014,
6614
+ "reward": 5.46837854385376,
6615
+ "reward_std": 1.1567938327789307,
6616
+ "rewards/answer_format_reward": 0.9587054252624512,
6617
+ "rewards/common_format_reward": 1.0,
6618
+ "rewards/doors_consistency_reward": 0.8861607313156128,
6619
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
6620
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
6621
+ "rewards/walls_orthogonality_reward": 0.9092262387275696,
6622
+ "step": 389
6623
+ },
6624
+ {
6625
+ "completion_length": 195.9107208251953,
6626
+ "epoch": 0.3152789005658852,
6627
+ "grad_norm": 0.024375399574637413,
6628
+ "kl": 0.052713505923748016,
6629
+ "learning_rate": 5.477768795472918e-06,
6630
+ "loss": 0.0021,
6631
+ "reward": 5.223214626312256,
6632
+ "reward_std": 1.3531911373138428,
6633
+ "rewards/answer_format_reward": 0.9166666269302368,
6634
+ "rewards/common_format_reward": 0.9910714626312256,
6635
+ "rewards/doors_consistency_reward": 0.8214285969734192,
6636
+ "rewards/geometry_consistency_reward": 0.8214285969734192,
6637
+ "rewards/prompt_consistency_reward": 0.8214285969734192,
6638
+ "rewards/walls_orthogonality_reward": 0.8511905074119568,
6639
+ "step": 390
6640
+ },
6641
+ {
6642
+ "completion_length": 185.3928680419922,
6643
+ "epoch": 0.31608730800323365,
6644
+ "grad_norm": 0.023237893357872963,
6645
+ "kl": 0.06520064920186996,
6646
+ "learning_rate": 5.471301535974131e-06,
6647
+ "loss": 0.0026,
6648
+ "reward": 5.475446701049805,
6649
+ "reward_std": 1.1848540306091309,
6650
+ "rewards/answer_format_reward": 0.976190447807312,
6651
+ "rewards/common_format_reward": 1.0,
6652
+ "rewards/doors_consistency_reward": 0.8816964626312256,
6653
+ "rewards/geometry_consistency_reward": 0.8392857313156128,
6654
+ "rewards/prompt_consistency_reward": 0.8392857313156128,
6655
+ "rewards/walls_orthogonality_reward": 0.938988208770752,
6656
+ "step": 391
6657
+ },
6658
+ {
6659
+ "completion_length": 185.21429443359375,
6660
+ "epoch": 0.31689571544058204,
6661
+ "grad_norm": 0.07059354335069656,
6662
+ "kl": 0.07985740900039673,
6663
+ "learning_rate": 5.464834276475343e-06,
6664
+ "loss": 0.0032,
6665
+ "reward": 5.628983974456787,
6666
+ "reward_std": 0.9163791537284851,
6667
+ "rewards/answer_format_reward": 0.9592262506484985,
6668
+ "rewards/common_format_reward": 0.9955357313156128,
6669
+ "rewards/doors_consistency_reward": 0.910714328289032,
6670
+ "rewards/geometry_consistency_reward": 0.910714328289032,
6671
+ "rewards/prompt_consistency_reward": 0.910714328289032,
6672
+ "rewards/walls_orthogonality_reward": 0.9420787692070007,
6673
+ "step": 392
6674
+ },
6675
+ {
6676
+ "completion_length": 191.75001525878906,
6677
+ "epoch": 0.3177041228779305,
6678
+ "grad_norm": 0.015440837480127811,
6679
+ "kl": 0.03730151057243347,
6680
+ "learning_rate": 5.458367016976556e-06,
6681
+ "loss": 0.0015,
6682
+ "reward": 5.5479912757873535,
6683
+ "reward_std": 0.7770728468894958,
6684
+ "rewards/answer_format_reward": 0.9683780670166016,
6685
+ "rewards/common_format_reward": 1.0,
6686
+ "rewards/doors_consistency_reward": 0.9040179252624512,
6687
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
6688
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
6689
+ "rewards/walls_orthogonality_reward": 0.9255953431129456,
6690
+ "step": 393
6691
+ },
6692
+ {
6693
+ "completion_length": 208.5178680419922,
6694
+ "epoch": 0.3185125303152789,
6695
+ "grad_norm": 0.01981918141245842,
6696
+ "kl": 0.04848418012261391,
6697
+ "learning_rate": 5.451899757477768e-06,
6698
+ "loss": 0.0019,
6699
+ "reward": 5.014285564422607,
6700
+ "reward_std": 1.9335200786590576,
6701
+ "rewards/answer_format_reward": 0.8773809671401978,
6702
+ "rewards/common_format_reward": 0.973214328289032,
6703
+ "rewards/doors_consistency_reward": 0.785714328289032,
6704
+ "rewards/geometry_consistency_reward": 0.7738096117973328,
6705
+ "rewards/prompt_consistency_reward": 0.785714328289032,
6706
+ "rewards/walls_orthogonality_reward": 0.8184522986412048,
6707
+ "step": 394
6708
+ },
6709
+ {
6710
+ "completion_length": 191.0357208251953,
6711
+ "epoch": 0.3193209377526273,
6712
+ "grad_norm": 0.04134137183427811,
6713
+ "kl": 0.03147723153233528,
6714
+ "learning_rate": 5.44543249797898e-06,
6715
+ "loss": 0.0013,
6716
+ "reward": 5.494643211364746,
6717
+ "reward_std": 1.1319756507873535,
6718
+ "rewards/answer_format_reward": 0.9633929133415222,
6719
+ "rewards/common_format_reward": 1.0,
6720
+ "rewards/doors_consistency_reward": 0.8750000596046448,
6721
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
6722
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
6723
+ "rewards/walls_orthogonality_reward": 0.9062500596046448,
6724
+ "step": 395
6725
+ },
6726
+ {
6727
+ "completion_length": 187.6428680419922,
6728
+ "epoch": 0.32012934518997577,
6729
+ "grad_norm": 0.027345867827534676,
6730
+ "kl": 0.042445890605449677,
6731
+ "learning_rate": 5.438965238480194e-06,
6732
+ "loss": 0.0017,
6733
+ "reward": 5.201515197753906,
6734
+ "reward_std": 1.6370927095413208,
6735
+ "rewards/answer_format_reward": 0.940625011920929,
6736
+ "rewards/common_format_reward": 0.9776785969734192,
6737
+ "rewards/doors_consistency_reward": 0.8147321939468384,
6738
+ "rewards/geometry_consistency_reward": 0.785714328289032,
6739
+ "rewards/prompt_consistency_reward": 0.785714328289032,
6740
+ "rewards/walls_orthogonality_reward": 0.8970509171485901,
6741
+ "step": 396
6742
+ },
6743
+ {
6744
+ "completion_length": 194.8928680419922,
6745
+ "epoch": 0.32093775262732416,
6746
+ "grad_norm": 0.011695443652570248,
6747
+ "kl": 0.03162195906043053,
6748
+ "learning_rate": 5.432497978981406e-06,
6749
+ "loss": 0.0013,
6750
+ "reward": 5.499255657196045,
6751
+ "reward_std": 0.8614192605018616,
6752
+ "rewards/answer_format_reward": 0.9553571939468384,
6753
+ "rewards/common_format_reward": 0.9977679252624512,
6754
+ "rewards/doors_consistency_reward": 0.8883929252624512,
6755
+ "rewards/geometry_consistency_reward": 0.8750000596046448,
6756
+ "rewards/prompt_consistency_reward": 0.8750000596046448,
6757
+ "rewards/walls_orthogonality_reward": 0.9077380895614624,
6758
+ "step": 397
6759
+ },
6760
+ {
6761
+ "completion_length": 229.82144165039062,
6762
+ "epoch": 0.3217461600646726,
6763
+ "grad_norm": 0.01699819415807724,
6764
+ "kl": 0.03885875642299652,
6765
+ "learning_rate": 5.426030719482619e-06,
6766
+ "loss": 0.0016,
6767
+ "reward": 5.430166244506836,
6768
+ "reward_std": 1.1690788269042969,
6769
+ "rewards/answer_format_reward": 0.9562075138092041,
6770
+ "rewards/common_format_reward": 0.9977679252624512,
6771
+ "rewards/doors_consistency_reward": 0.8571429252624512,
6772
+ "rewards/geometry_consistency_reward": 0.8571429252624512,
6773
+ "rewards/prompt_consistency_reward": 0.8571429252624512,
6774
+ "rewards/walls_orthogonality_reward": 0.9047619104385376,
6775
+ "step": 398
6776
+ },
6777
+ {
6778
+ "completion_length": 186.62501525878906,
6779
+ "epoch": 0.322554567502021,
6780
+ "grad_norm": 0.01640297658741474,
6781
+ "kl": 0.04830015078186989,
6782
+ "learning_rate": 5.419563459983832e-06,
6783
+ "loss": 0.0019,
6784
+ "reward": 5.215774059295654,
6785
+ "reward_std": 1.1917263269424438,
6786
+ "rewards/answer_format_reward": 0.9375000596046448,
6787
+ "rewards/common_format_reward": 1.0,
6788
+ "rewards/doors_consistency_reward": 0.8035714626312256,
6789
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
6790
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
6791
+ "rewards/walls_orthogonality_reward": 0.8675596117973328,
6792
+ "step": 399
6793
+ },
6794
+ {
6795
+ "completion_length": 202.32144165039062,
6796
+ "epoch": 0.32336297493936944,
6797
+ "grad_norm": 0.015151679515838623,
6798
+ "kl": 0.037056345492601395,
6799
+ "learning_rate": 5.413096200485044e-06,
6800
+ "loss": 0.0015,
6801
+ "reward": 5.114969730377197,
6802
+ "reward_std": 1.2965799570083618,
6803
+ "rewards/answer_format_reward": 0.8924320340156555,
6804
+ "rewards/common_format_reward": 0.9687500596046448,
6805
+ "rewards/doors_consistency_reward": 0.8035714626312256,
6806
+ "rewards/geometry_consistency_reward": 0.8035714626312256,
6807
+ "rewards/prompt_consistency_reward": 0.8035714626312256,
6808
+ "rewards/walls_orthogonality_reward": 0.8430736064910889,
6809
+ "step": 400
6810
  }
6811
  ],
6812
  "logging_steps": 1,