xhb120633 committed on
Commit
fa37a92
·
verified ·
1 Parent(s): 3401757

Model save

Browse files
Files changed (6) hide show
  1. README.md +68 -0
  2. all_results.json +8 -0
  3. config.json +1 -1
  4. generation_config.json +14 -0
  5. train_results.json +8 -0
  6. trainer_state.json +1242 -0
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-3B-Instruct
3
+ library_name: transformers
4
+ model_name: qwen-2.5-3b-r1-c13k-individual
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for qwen-2.5-3b-r1-c13k-individual
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="xhb120633/qwen-2.5-3b-r1-c13k-individual", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.1
38
+ - Transformers: 4.48.1
39
+ - Pytorch: 2.5.1
40
+ - Datasets: 3.1.0
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 7.065190569642255e-05,
4
+ "train_runtime": 24090.8459,
5
+ "train_samples": 3176,
6
+ "train_samples_per_second": 1.992,
7
+ "train_steps_per_second": 0.021
8
+ }
config.json CHANGED
@@ -23,7 +23,7 @@
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.48.1",
26
- "use_cache": false,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
 
23
  "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.48.1",
26
+ "use_cache": true,
27
  "use_sliding_window": false,
28
  "vocab_size": 151936
29
  }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.48.1"
14
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 7.065190569642255e-05,
4
+ "train_runtime": 24090.8459,
5
+ "train_samples": 3176,
6
+ "train_samples_per_second": 1.992,
7
+ "train_steps_per_second": 0.021
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.2594458438287153,
5
+ "eval_steps": 500,
6
+ "global_step": 500,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 722.9521026611328,
13
+ "epoch": 0.012594458438287154,
14
+ "grad_norm": 0.07664977191949898,
15
+ "kl": 0.00014810562133789064,
16
+ "learning_rate": 3.333333333333333e-07,
17
+ "loss": 0.0,
18
+ "reward": 0.46451360136270525,
19
+ "reward_std": 0.10854418443050236,
20
+ "rewards/wrapped_prediction_reward_func": 0.46451360136270525,
21
+ "step": 5
22
+ },
23
+ {
24
+ "completion_length": 730.1583602905273,
25
+ "epoch": 0.02518891687657431,
26
+ "grad_norm": 0.08074394124833935,
27
+ "kl": 0.00022151470184326172,
28
+ "learning_rate": 6.666666666666666e-07,
29
+ "loss": 0.0,
30
+ "reward": 0.46406558752059934,
31
+ "reward_std": 0.09366982954088598,
32
+ "rewards/wrapped_prediction_reward_func": 0.46406558752059934,
33
+ "step": 10
34
+ },
35
+ {
36
+ "completion_length": 732.6021026611328,
37
+ "epoch": 0.037783375314861464,
38
+ "grad_norm": 0.07971439169776136,
39
+ "kl": 0.00023424625396728516,
40
+ "learning_rate": 1e-06,
41
+ "loss": 0.0,
42
+ "reward": 0.4597253814339638,
43
+ "reward_std": 0.09797948987688869,
44
+ "rewards/wrapped_prediction_reward_func": 0.4597253814339638,
45
+ "step": 15
46
+ },
47
+ {
48
+ "completion_length": 706.5271057128906,
49
+ "epoch": 0.05037783375314862,
50
+ "grad_norm": 0.07675378344787744,
51
+ "kl": 0.00025036334991455076,
52
+ "learning_rate": 9.997377845227574e-07,
53
+ "loss": 0.0,
54
+ "reward": 0.47573257163167,
55
+ "reward_std": 0.09649912532186136,
56
+ "rewards/wrapped_prediction_reward_func": 0.47573257163167,
57
+ "step": 20
58
+ },
59
+ {
60
+ "completion_length": 709.5146011352539,
61
+ "epoch": 0.06297229219143577,
62
+ "grad_norm": 0.07792838886271755,
63
+ "kl": 0.00035452842712402344,
64
+ "learning_rate": 9.989514131188558e-07,
65
+ "loss": 0.0,
66
+ "reward": 0.4748247370123863,
67
+ "reward_std": 0.08561795827117749,
68
+ "rewards/wrapped_prediction_reward_func": 0.4748247370123863,
69
+ "step": 25
70
+ },
71
+ {
72
+ "completion_length": 746.320849609375,
73
+ "epoch": 0.07556675062972293,
74
+ "grad_norm": 0.0755043003664804,
75
+ "kl": 0.0006094932556152344,
76
+ "learning_rate": 9.97641710583307e-07,
77
+ "loss": 0.0,
78
+ "reward": 0.459003009647131,
79
+ "reward_std": 0.09646977222291753,
80
+ "rewards/wrapped_prediction_reward_func": 0.459003009647131,
81
+ "step": 30
82
+ },
83
+ {
84
+ "completion_length": 739.545849609375,
85
+ "epoch": 0.08816120906801007,
86
+ "grad_norm": 0.06831186280995111,
87
+ "kl": 0.00098114013671875,
88
+ "learning_rate": 9.958100506132126e-07,
89
+ "loss": 0.0,
90
+ "reward": 0.491700629144907,
91
+ "reward_std": 0.08609421639703214,
92
+ "rewards/wrapped_prediction_reward_func": 0.491700629144907,
93
+ "step": 35
94
+ },
95
+ {
96
+ "completion_length": 735.9604385375976,
97
+ "epoch": 0.10075566750629723,
98
+ "grad_norm": 0.06820840594292561,
99
+ "kl": 0.00146484375,
100
+ "learning_rate": 9.934583543669453e-07,
101
+ "loss": 0.0,
102
+ "reward": 0.4918279483914375,
103
+ "reward_std": 0.10381489984574728,
104
+ "rewards/wrapped_prediction_reward_func": 0.4918279483914375,
105
+ "step": 40
106
+ },
107
+ {
108
+ "completion_length": 761.6375183105469,
109
+ "epoch": 0.11335012594458438,
110
+ "grad_norm": 0.07306576306189429,
111
+ "kl": 0.0023061752319335936,
112
+ "learning_rate": 9.905890884491194e-07,
113
+ "loss": 0.0,
114
+ "reward": 0.4687023714184761,
115
+ "reward_std": 0.10100088955368847,
116
+ "rewards/wrapped_prediction_reward_func": 0.4687023714184761,
117
+ "step": 45
118
+ },
119
+ {
120
+ "completion_length": 762.1875213623047,
121
+ "epoch": 0.12594458438287154,
122
+ "grad_norm": 0.0803081141044505,
123
+ "kl": 0.0034528732299804687,
124
+ "learning_rate": 9.872052623234631e-07,
125
+ "loss": 0.0,
126
+ "reward": 0.47996846288442613,
127
+ "reward_std": 0.09802622515708208,
128
+ "rewards/wrapped_prediction_reward_func": 0.47996846288442613,
129
+ "step": 50
130
+ },
131
+ {
132
+ "completion_length": 790.0812713623047,
133
+ "epoch": 0.1385390428211587,
134
+ "grad_norm": 0.0704359174357588,
135
+ "kl": 0.003958320617675782,
136
+ "learning_rate": 9.833104251563055e-07,
137
+ "loss": 0.0,
138
+ "reward": 0.48190433755517004,
139
+ "reward_std": 0.08568599077407271,
140
+ "rewards/wrapped_prediction_reward_func": 0.48190433755517004,
141
+ "step": 55
142
+ },
143
+ {
144
+ "completion_length": 768.5521057128906,
145
+ "epoch": 0.15113350125944586,
146
+ "grad_norm": 0.06457778900067923,
147
+ "kl": 0.005392837524414063,
148
+ "learning_rate": 9.789086620939935e-07,
149
+ "loss": 0.0,
150
+ "reward": 0.48327849954366686,
151
+ "reward_std": 0.09457279161288171,
152
+ "rewards/wrapped_prediction_reward_func": 0.48327849954366686,
153
+ "step": 60
154
+ },
155
+ {
156
+ "completion_length": 770.9166870117188,
157
+ "epoch": 0.163727959697733,
158
+ "grad_norm": 0.06812633405378012,
159
+ "kl": 0.006523513793945312,
160
+ "learning_rate": 9.740045899781352e-07,
161
+ "loss": 0.0,
162
+ "reward": 0.5019405260682106,
163
+ "reward_std": 0.07568035275908187,
164
+ "rewards/wrapped_prediction_reward_func": 0.5019405260682106,
165
+ "step": 65
166
+ },
167
+ {
168
+ "completion_length": 749.1146057128906,
169
+ "epoch": 0.17632241813602015,
170
+ "grad_norm": 0.06685526422767288,
171
+ "kl": 0.007244110107421875,
172
+ "learning_rate": 9.686033525031719e-07,
173
+ "loss": 0.0,
174
+ "reward": 0.501540158689022,
175
+ "reward_std": 0.07553914005402476,
176
+ "rewards/wrapped_prediction_reward_func": 0.501540158689022,
177
+ "step": 70
178
+ },
179
+ {
180
+ "completion_length": 761.0937683105469,
181
+ "epoch": 0.1889168765743073,
182
+ "grad_norm": 0.0722156044539115,
183
+ "kl": 0.007537460327148438,
184
+ "learning_rate": 9.62710614821352e-07,
185
+ "loss": 0.0,
186
+ "reward": 0.4996995717287064,
187
+ "reward_std": 0.0968896305654198,
188
+ "rewards/wrapped_prediction_reward_func": 0.4996995717287064,
189
+ "step": 75
190
+ },
191
+ {
192
+ "completion_length": 767.5000213623047,
193
+ "epoch": 0.20151133501259447,
194
+ "grad_norm": 0.078058173076431,
195
+ "kl": 0.0085418701171875,
196
+ "learning_rate": 9.5633255760077e-07,
197
+ "loss": 0.0,
198
+ "reward": 0.4846237309277058,
199
+ "reward_std": 0.10277664107270539,
200
+ "rewards/wrapped_prediction_reward_func": 0.4846237309277058,
201
+ "step": 80
202
+ },
203
+ {
204
+ "completion_length": 737.7312728881836,
205
+ "epoch": 0.2141057934508816,
206
+ "grad_norm": 0.07298119554807302,
207
+ "kl": 0.010308837890625,
208
+ "learning_rate": 9.494758705426976e-07,
209
+ "loss": 0.0,
210
+ "reward": 0.5145397499203682,
211
+ "reward_std": 0.09095026951981708,
212
+ "rewards/wrapped_prediction_reward_func": 0.5145397499203682,
213
+ "step": 85
214
+ },
215
+ {
216
+ "completion_length": 780.7896041870117,
217
+ "epoch": 0.22670025188916876,
218
+ "grad_norm": 0.07249771083699665,
219
+ "kl": 0.0119293212890625,
220
+ "learning_rate": 9.421477453650117e-07,
221
+ "loss": 0.0,
222
+ "reward": 0.5147036135196685,
223
+ "reward_std": 0.09051896830787882,
224
+ "rewards/wrapped_prediction_reward_func": 0.5147036135196685,
225
+ "step": 90
226
+ },
227
+ {
228
+ "completion_length": 761.1833541870117,
229
+ "epoch": 0.23929471032745592,
230
+ "grad_norm": 0.07426543152283624,
231
+ "kl": 0.01405029296875,
232
+ "learning_rate": 9.343558682590755e-07,
233
+ "loss": 0.0,
234
+ "reward": 0.5229489721357823,
235
+ "reward_std": 0.1045097156893462,
236
+ "rewards/wrapped_prediction_reward_func": 0.5229489721357823,
237
+ "step": 95
238
+ },
239
+ {
240
+ "completion_length": 747.8270980834961,
241
+ "epoch": 0.2518891687657431,
242
+ "grad_norm": 0.08009584068360569,
243
+ "kl": 0.016656494140625,
244
+ "learning_rate": 9.261084118279846e-07,
245
+ "loss": 0.0,
246
+ "reward": 0.5308140270411968,
247
+ "reward_std": 0.10999625454423949,
248
+ "rewards/wrapped_prediction_reward_func": 0.5308140270411968,
249
+ "step": 100
250
+ },
251
+ {
252
+ "completion_length": 755.6771011352539,
253
+ "epoch": 0.26448362720403024,
254
+ "grad_norm": 0.0717796741561669,
255
+ "kl": 0.01963958740234375,
256
+ "learning_rate": 9.174140265146355e-07,
257
+ "loss": 0.0,
258
+ "reward": 0.5279295884072781,
259
+ "reward_std": 0.12174326665699482,
260
+ "rewards/wrapped_prediction_reward_func": 0.5279295884072781,
261
+ "step": 105
262
+ },
263
+ {
264
+ "completion_length": 741.0396072387696,
265
+ "epoch": 0.2770780856423174,
266
+ "grad_norm": 0.06788337142864205,
267
+ "kl": 0.0238006591796875,
268
+ "learning_rate": 9.082818315286054e-07,
269
+ "loss": 0.0,
270
+ "reward": 0.5241287641227246,
271
+ "reward_std": 0.08864832047838718,
272
+ "rewards/wrapped_prediction_reward_func": 0.5241287641227246,
273
+ "step": 110
274
+ },
275
+ {
276
+ "completion_length": 785.7208526611328,
277
+ "epoch": 0.28967254408060455,
278
+ "grad_norm": 0.0701122609050844,
279
+ "kl": 0.025714111328125,
280
+ "learning_rate": 8.987214052813603e-07,
281
+ "loss": 0.0,
282
+ "reward": 0.5118412531912326,
283
+ "reward_std": 0.11687530118506402,
284
+ "rewards/wrapped_prediction_reward_func": 0.5118412531912326,
285
+ "step": 115
286
+ },
287
+ {
288
+ "completion_length": 785.070849609375,
289
+ "epoch": 0.3022670025188917,
290
+ "grad_norm": 0.07472239714054472,
291
+ "kl": 0.0284881591796875,
292
+ "learning_rate": 8.887427753398247e-07,
293
+ "loss": 0.0,
294
+ "reward": 0.5511950686573982,
295
+ "reward_std": 0.07816179413348437,
296
+ "rewards/wrapped_prediction_reward_func": 0.5511950686573982,
297
+ "step": 120
298
+ },
299
+ {
300
+ "completion_length": 833.4291870117188,
301
+ "epoch": 0.3148614609571788,
302
+ "grad_norm": 0.07422495442339502,
303
+ "kl": 0.027734375,
304
+ "learning_rate": 8.783564079088476e-07,
305
+ "loss": 0.0,
306
+ "reward": 0.5402487128973007,
307
+ "reward_std": 0.13062875589821488,
308
+ "rewards/wrapped_prediction_reward_func": 0.5402487128973007,
309
+ "step": 125
310
+ },
311
+ {
312
+ "completion_length": 814.3541854858398,
313
+ "epoch": 0.327455919395466,
314
+ "grad_norm": 0.06823337736452399,
315
+ "kl": 0.02915191650390625,
316
+ "learning_rate": 8.675731968536002e-07,
317
+ "loss": 0.0,
318
+ "reward": 0.530899728089571,
319
+ "reward_std": 0.11218150332570076,
320
+ "rewards/wrapped_prediction_reward_func": 0.530899728089571,
321
+ "step": 130
322
+ },
323
+ {
324
+ "completion_length": 812.7791900634766,
325
+ "epoch": 0.34005037783375314,
326
+ "grad_norm": 0.07003585561947805,
327
+ "kl": 0.03168487548828125,
328
+ "learning_rate": 8.564044522734146e-07,
329
+ "loss": 0.0,
330
+ "reward": 0.5358106784522534,
331
+ "reward_std": 0.10542880366556347,
332
+ "rewards/wrapped_prediction_reward_func": 0.5358106784522534,
333
+ "step": 135
334
+ },
335
+ {
336
+ "completion_length": 817.2354385375977,
337
+ "epoch": 0.3526448362720403,
338
+ "grad_norm": 0.07278698573023756,
339
+ "kl": 0.03175811767578125,
340
+ "learning_rate": 8.448618886390521e-07,
341
+ "loss": 0.0,
342
+ "reward": 0.549003791064024,
343
+ "reward_std": 0.0949756694608368,
344
+ "rewards/wrapped_prediction_reward_func": 0.549003791064024,
345
+ "step": 140
346
+ },
347
+ {
348
+ "completion_length": 761.7271057128906,
349
+ "epoch": 0.36523929471032746,
350
+ "grad_norm": 0.0726454943691525,
351
+ "kl": 0.0360321044921875,
352
+ "learning_rate": 8.329576125058405e-07,
353
+ "loss": 0.0,
354
+ "reward": 0.5476619251072407,
355
+ "reward_std": 0.08561794870765879,
356
+ "rewards/wrapped_prediction_reward_func": 0.5476619251072407,
357
+ "step": 145
358
+ },
359
+ {
360
+ "completion_length": 784.9791854858398,
361
+ "epoch": 0.3778337531486146,
362
+ "grad_norm": 0.0734552135965303,
363
+ "kl": 0.03544921875,
364
+ "learning_rate": 8.207041098155699e-07,
365
+ "loss": 0.0,
366
+ "reward": 0.5536449149250984,
367
+ "reward_std": 0.09187358255730942,
368
+ "rewards/wrapped_prediction_reward_func": 0.5536449149250984,
369
+ "step": 150
370
+ },
371
+ {
372
+ "completion_length": 750.6729354858398,
373
+ "epoch": 0.3904282115869018,
374
+ "grad_norm": 0.08269464276856019,
375
+ "kl": 0.0421600341796875,
376
+ "learning_rate": 8.081142328004636e-07,
377
+ "loss": 0.0,
378
+ "reward": 0.560660557448864,
379
+ "reward_std": 0.08419233104214072,
380
+ "rewards/wrapped_prediction_reward_func": 0.560660557448864,
381
+ "step": 155
382
+ },
383
+ {
384
+ "completion_length": 736.0833541870118,
385
+ "epoch": 0.40302267002518893,
386
+ "grad_norm": 0.07934493494179942,
387
+ "kl": 0.2113433837890625,
388
+ "learning_rate": 7.952011865029613e-07,
389
+ "loss": 0.0001,
390
+ "reward": 0.5728602230548858,
391
+ "reward_std": 0.08108794330037199,
392
+ "rewards/wrapped_prediction_reward_func": 0.5728602230548858,
393
+ "step": 160
394
+ },
395
+ {
396
+ "completion_length": 751.6396011352539,
397
+ "epoch": 0.4156171284634761,
398
+ "grad_norm": 0.077727529432095,
399
+ "kl": 0.0558685302734375,
400
+ "learning_rate": 7.819785149254532e-07,
401
+ "loss": 0.0,
402
+ "reward": 0.5558124162256718,
403
+ "reward_std": 0.07910064500756561,
404
+ "rewards/wrapped_prediction_reward_func": 0.5558124162256718,
405
+ "step": 165
406
+ },
407
+ {
408
+ "completion_length": 794.9833511352539,
409
+ "epoch": 0.4282115869017632,
410
+ "grad_norm": 0.08198281407723804,
411
+ "kl": 0.063677978515625,
412
+ "learning_rate": 7.684600868244919e-07,
413
+ "loss": 0.0001,
414
+ "reward": 0.5688013978302479,
415
+ "reward_std": 0.08710555551806465,
416
+ "rewards/wrapped_prediction_reward_func": 0.5688013978302479,
417
+ "step": 170
418
+ },
419
+ {
420
+ "completion_length": 765.7625213623047,
421
+ "epoch": 0.44080604534005036,
422
+ "grad_norm": 0.07454384571252022,
423
+ "kl": 0.0582550048828125,
424
+ "learning_rate": 7.546600811643816e-07,
425
+ "loss": 0.0001,
426
+ "reward": 0.5501547582447529,
427
+ "reward_std": 0.06000117462826893,
428
+ "rewards/wrapped_prediction_reward_func": 0.5501547582447529,
429
+ "step": 175
430
+ },
431
+ {
432
+ "completion_length": 804.5687713623047,
433
+ "epoch": 0.4534005037783375,
434
+ "grad_norm": 0.0759962714852466,
435
+ "kl": 0.051287841796875,
436
+ "learning_rate": 7.405929722454025e-07,
437
+ "loss": 0.0,
438
+ "reward": 0.5598295979201794,
439
+ "reward_std": 0.08419673591852188,
440
+ "rewards/wrapped_prediction_reward_func": 0.5598295979201794,
441
+ "step": 180
442
+ },
443
+ {
444
+ "completion_length": 762.9896026611328,
445
+ "epoch": 0.4659949622166247,
446
+ "grad_norm": 0.05921819318149447,
447
+ "kl": 0.059259033203125,
448
+ "learning_rate": 7.262735145222695e-07,
449
+ "loss": 0.0001,
450
+ "reward": 0.5768641114234925,
451
+ "reward_std": 0.0767961086006835,
452
+ "rewards/wrapped_prediction_reward_func": 0.5768641114234925,
453
+ "step": 185
454
+ },
455
+ {
456
+ "completion_length": 756.3416885375976,
457
+ "epoch": 0.47858942065491183,
458
+ "grad_norm": 0.07564514416368207,
459
+ "kl": 0.0628173828125,
460
+ "learning_rate": 7.117167271287452e-07,
461
+ "loss": 0.0,
462
+ "reward": 0.5682952009141445,
463
+ "reward_std": 0.07079082436393946,
464
+ "rewards/wrapped_prediction_reward_func": 0.5682952009141445,
465
+ "step": 190
466
+ },
467
+ {
468
+ "completion_length": 792.5271087646485,
469
+ "epoch": 0.491183879093199,
470
+ "grad_norm": 0.05939427024749797,
471
+ "kl": 0.0619049072265625,
472
+ "learning_rate": 6.969378781246436e-07,
473
+ "loss": 0.0,
474
+ "reward": 0.555230350792408,
475
+ "reward_std": 0.05828617985825986,
476
+ "rewards/wrapped_prediction_reward_func": 0.555230350792408,
477
+ "step": 195
478
+ },
479
+ {
480
+ "completion_length": 805.2312698364258,
481
+ "epoch": 0.5037783375314862,
482
+ "grad_norm": 0.053418514095113,
483
+ "kl": 0.0571746826171875,
484
+ "learning_rate": 6.819524684817438e-07,
485
+ "loss": 0.0001,
486
+ "reward": 0.5742501869797707,
487
+ "reward_std": 0.04643813910661265,
488
+ "rewards/wrapped_prediction_reward_func": 0.5742501869797707,
489
+ "step": 200
490
+ },
491
+ {
492
+ "completion_length": 817.6541931152344,
493
+ "epoch": 0.5163727959697733,
494
+ "grad_norm": 0.059367612614114815,
495
+ "kl": 0.0566925048828125,
496
+ "learning_rate": 6.667762158254103e-07,
497
+ "loss": 0.0,
498
+ "reward": 0.5665707983076572,
499
+ "reward_std": 0.05952281908830628,
500
+ "rewards/wrapped_prediction_reward_func": 0.5665707983076572,
501
+ "step": 205
502
+ },
503
+ {
504
+ "completion_length": 785.7229415893555,
505
+ "epoch": 0.5289672544080605,
506
+ "grad_norm": 0.06832063353240915,
507
+ "kl": 0.063702392578125,
508
+ "learning_rate": 6.514250379489753e-07,
509
+ "loss": 0.0001,
510
+ "reward": 0.5705230697989464,
511
+ "reward_std": 0.055411939066834746,
512
+ "rewards/wrapped_prediction_reward_func": 0.5705230697989464,
513
+ "step": 210
514
+ },
515
+ {
516
+ "completion_length": 845.8625198364258,
517
+ "epoch": 0.5415617128463476,
518
+ "grad_norm": 0.059630081848652705,
519
+ "kl": 0.055462646484375,
520
+ "learning_rate": 6.359150361181714e-07,
521
+ "loss": 0.0001,
522
+ "reward": 0.5717524968087673,
523
+ "reward_std": 0.058589562051929535,
524
+ "rewards/wrapped_prediction_reward_func": 0.5717524968087673,
525
+ "step": 215
526
+ },
527
+ {
528
+ "completion_length": 842.5708572387696,
529
+ "epoch": 0.5541561712846348,
530
+ "grad_norm": 0.06370639502021583,
531
+ "kl": 0.0563568115234375,
532
+ "learning_rate": 6.202624781831268e-07,
533
+ "loss": 0.0001,
534
+ "reward": 0.5911780953407287,
535
+ "reward_std": 0.05995338945649564,
536
+ "rewards/wrapped_prediction_reward_func": 0.5911780953407287,
537
+ "step": 220
538
+ },
539
+ {
540
+ "completion_length": 819.0875198364258,
541
+ "epoch": 0.5667506297229219,
542
+ "grad_norm": 0.05972945064618038,
543
+ "kl": 0.1108428955078125,
544
+ "learning_rate": 6.044837815156376e-07,
545
+ "loss": 0.0001,
546
+ "reward": 0.5578226193785667,
547
+ "reward_std": 0.04124137028120458,
548
+ "rewards/wrapped_prediction_reward_func": 0.5578226193785667,
549
+ "step": 225
550
+ },
551
+ {
552
+ "completion_length": 815.4666931152344,
553
+ "epoch": 0.5793450881612091,
554
+ "grad_norm": 0.05137674910063656,
555
+ "kl": 0.0625457763671875,
556
+ "learning_rate": 5.885954957896115e-07,
557
+ "loss": 0.0001,
558
+ "reward": 0.5861903376877308,
559
+ "reward_std": 0.04839982387493365,
560
+ "rewards/wrapped_prediction_reward_func": 0.5861903376877308,
561
+ "step": 230
562
+ },
563
+ {
564
+ "completion_length": 802.9166900634766,
565
+ "epoch": 0.5919395465994962,
566
+ "grad_norm": 0.052644946129698264,
567
+ "kl": 0.065142822265625,
568
+ "learning_rate": 5.726142856227452e-07,
569
+ "loss": 0.0001,
570
+ "reward": 0.5952158778905868,
571
+ "reward_std": 0.05475405912147835,
572
+ "rewards/wrapped_prediction_reward_func": 0.5952158778905868,
573
+ "step": 235
574
+ },
575
+ {
576
+ "completion_length": 839.5187728881835,
577
+ "epoch": 0.6045340050377834,
578
+ "grad_norm": 0.0590405309579095,
579
+ "kl": 0.0682373046875,
580
+ "learning_rate": 5.565569130976422e-07,
581
+ "loss": 0.0,
582
+ "reward": 0.5894757807254791,
583
+ "reward_std": 0.06357808914035559,
584
+ "rewards/wrapped_prediction_reward_func": 0.5894757807254791,
585
+ "step": 240
586
+ },
587
+ {
588
+ "completion_length": 835.6958541870117,
589
+ "epoch": 0.6171284634760705,
590
+ "grad_norm": 0.047712245488407155,
591
+ "kl": 0.064862060546875,
592
+ "learning_rate": 5.404402201807021e-07,
593
+ "loss": 0.0001,
594
+ "reward": 0.5752220213413238,
595
+ "reward_std": 0.04643567528109997,
596
+ "rewards/wrapped_prediction_reward_func": 0.5752220213413238,
597
+ "step": 245
598
+ },
599
+ {
600
+ "completion_length": 821.4750274658203,
601
+ "epoch": 0.6297229219143576,
602
+ "grad_norm": 0.05442474889712706,
603
+ "kl": 0.0702850341796875,
604
+ "learning_rate": 5.242811110572242e-07,
605
+ "loss": 0.0001,
606
+ "reward": 0.5673072084784507,
607
+ "reward_std": 0.03941638254909776,
608
+ "rewards/wrapped_prediction_reward_func": 0.5673072084784507,
609
+ "step": 250
610
+ },
611
+ {
612
+ "completion_length": 781.9187728881836,
613
+ "epoch": 0.6423173803526449,
614
+ "grad_norm": 0.05809885115112598,
615
+ "kl": 0.070941162109375,
616
+ "learning_rate": 5.080965344012508e-07,
617
+ "loss": 0.0001,
618
+ "reward": 0.5991133064031601,
619
+ "reward_std": 0.04335737858200446,
620
+ "rewards/wrapped_prediction_reward_func": 0.5991133064031601,
621
+ "step": 255
622
+ },
623
+ {
624
+ "completion_length": 817.3083541870117,
625
+ "epoch": 0.654911838790932,
626
+ "grad_norm": 0.03682776713225834,
627
+ "kl": 0.0657196044921875,
628
+ "learning_rate": 4.919034655987492e-07,
629
+ "loss": 0.0001,
630
+ "reward": 0.5929536901414394,
631
+ "reward_std": 0.06003379854373634,
632
+ "rewards/wrapped_prediction_reward_func": 0.5929536901414394,
633
+ "step": 260
634
+ },
635
+ {
636
+ "completion_length": 808.1771072387695,
637
+ "epoch": 0.6675062972292192,
638
+ "grad_norm": 0.05594643798861689,
639
+ "kl": 0.071002197265625,
640
+ "learning_rate": 4.75718888942776e-07,
641
+ "loss": 0.0001,
642
+ "reward": 0.5693795874714851,
643
+ "reward_std": 0.04352557165548206,
644
+ "rewards/wrapped_prediction_reward_func": 0.5693795874714851,
645
+ "step": 265
646
+ },
647
+ {
648
+ "completion_length": 850.0666854858398,
649
+ "epoch": 0.6801007556675063,
650
+ "grad_norm": 0.05396780654610799,
651
+ "kl": 0.066058349609375,
652
+ "learning_rate": 4.595597798192979e-07,
653
+ "loss": 0.0001,
654
+ "reward": 0.5841031737625599,
655
+ "reward_std": 0.054474068916169925,
656
+ "rewards/wrapped_prediction_reward_func": 0.5841031737625599,
657
+ "step": 270
658
+ },
659
+ {
660
+ "completion_length": 807.5771041870117,
661
+ "epoch": 0.6926952141057935,
662
+ "grad_norm": 0.05692079461397134,
663
+ "kl": 0.072412109375,
664
+ "learning_rate": 4.434430869023579e-07,
665
+ "loss": 0.0001,
666
+ "reward": 0.6022952854633331,
667
+ "reward_std": 0.05330510977655649,
668
+ "rewards/wrapped_prediction_reward_func": 0.6022952854633331,
669
+ "step": 275
670
+ },
671
+ {
672
+ "completion_length": 846.0312774658203,
673
+ "epoch": 0.7052896725440806,
674
+ "grad_norm": 0.04955339683095041,
675
+ "kl": 0.065411376953125,
676
+ "learning_rate": 4.2738571437725496e-07,
677
+ "loss": 0.0001,
678
+ "reward": 0.5892672084271908,
679
+ "reward_std": 0.05133460226934403,
680
+ "rewards/wrapped_prediction_reward_func": 0.5892672084271908,
681
+ "step": 280
682
+ },
683
+ {
684
+ "completion_length": 781.9396057128906,
685
+ "epoch": 0.7178841309823678,
686
+ "grad_norm": 0.056799969581110395,
687
+ "kl": 0.071551513671875,
688
+ "learning_rate": 4.1140450421038866e-07,
689
+ "loss": 0.0001,
690
+ "reward": 0.5798937246203423,
691
+ "reward_std": 0.051263501844368876,
692
+ "rewards/wrapped_prediction_reward_func": 0.5798937246203423,
693
+ "step": 285
694
+ },
695
+ {
696
+ "completion_length": 797.3041900634765,
697
+ "epoch": 0.7304785894206549,
698
+ "grad_norm": 0.050561172649658176,
699
+ "kl": 0.077020263671875,
700
+ "learning_rate": 3.955162184843624e-07,
701
+ "loss": 0.0001,
702
+ "reward": 0.5860190108418465,
703
+ "reward_std": 0.04162131273187697,
704
+ "rewards/wrapped_prediction_reward_func": 0.5860190108418465,
705
+ "step": 290
706
+ },
707
+ {
708
+ "completion_length": 822.4833526611328,
709
+ "epoch": 0.743073047858942,
710
+ "grad_norm": 0.0434111466318861,
711
+ "kl": 0.0822509765625,
712
+ "learning_rate": 3.7973752181687327e-07,
713
+ "loss": 0.0001,
714
+ "reward": 0.5851313889026641,
715
+ "reward_std": 0.05500690509798005,
716
+ "rewards/wrapped_prediction_reward_func": 0.5851313889026641,
717
+ "step": 295
718
+ },
719
+ {
720
+ "completion_length": 855.5312805175781,
721
+ "epoch": 0.7556675062972292,
722
+ "grad_norm": 0.051422594050945035,
723
+ "kl": 0.0636383056640625,
724
+ "learning_rate": 3.640849638818285e-07,
725
+ "loss": 0.0001,
726
+ "reward": 0.5980591103434563,
727
+ "reward_std": 0.036254264996387064,
728
+ "rewards/wrapped_prediction_reward_func": 0.5980591103434563,
729
+ "step": 300
730
+ },
731
+ {
732
+ "completion_length": 792.4396087646485,
733
+ "epoch": 0.7682619647355163,
734
+ "grad_norm": 0.03997352927393124,
735
+ "kl": 0.0724853515625,
736
+ "learning_rate": 3.485749620510247e-07,
737
+ "loss": 0.0001,
738
+ "reward": 0.6093361288309097,
739
+ "reward_std": 0.03319716795813292,
740
+ "rewards/wrapped_prediction_reward_func": 0.6093361288309097,
741
+ "step": 305
742
+ },
743
+ {
744
+ "completion_length": 812.2625213623047,
745
+ "epoch": 0.7808564231738035,
746
+ "grad_norm": 0.058958040086389545,
747
+ "kl": 0.072906494140625,
748
+ "learning_rate": 3.3322378417458977e-07,
749
+ "loss": 0.0001,
750
+ "reward": 0.613701456785202,
751
+ "reward_std": 0.041399752866709605,
752
+ "rewards/wrapped_prediction_reward_func": 0.613701456785202,
753
+ "step": 310
754
+ },
755
+ {
756
+ "completion_length": 830.6354431152344,
757
+ "epoch": 0.7934508816120907,
758
+ "grad_norm": 0.03534980235662457,
759
+ "kl": 0.067303466796875,
760
+ "learning_rate": 3.1804753151825627e-07,
761
+ "loss": 0.0001,
762
+ "reward": 0.5734647884964943,
763
+ "reward_std": 0.051193205296294765,
764
+ "rewards/wrapped_prediction_reward_func": 0.5734647884964943,
765
+ "step": 315
766
+ },
767
+ {
768
+ "completion_length": 813.2833557128906,
769
+ "epoch": 0.8060453400503779,
770
+ "grad_norm": 0.06257699195249379,
771
+ "kl": 0.068646240234375,
772
+ "learning_rate": 3.030621218753565e-07,
773
+ "loss": 0.0001,
774
+ "reward": 0.6155773043632508,
775
+ "reward_std": 0.039483394497074185,
776
+ "rewards/wrapped_prediction_reward_func": 0.6155773043632508,
777
+ "step": 320
778
+ },
779
+ {
780
+ "completion_length": 798.3354370117188,
781
+ "epoch": 0.818639798488665,
782
+ "grad_norm": 0.047751764206928074,
783
+ "kl": 0.0704833984375,
784
+ "learning_rate": 2.8828327287125507e-07,
785
+ "loss": 0.0001,
786
+ "reward": 0.608567351102829,
787
+ "reward_std": 0.03854468032368459,
788
+ "rewards/wrapped_prediction_reward_func": 0.608567351102829,
789
+ "step": 325
790
+ },
791
+ {
792
+ "completion_length": 828.9812713623047,
793
+ "epoch": 0.8312342569269522,
794
+ "grad_norm": 0.04585758374215142,
795
+ "kl": 0.0710540771484375,
796
+ "learning_rate": 2.7372648547773056e-07,
797
+ "loss": 0.0001,
798
+ "reward": 0.5888618856668473,
799
+ "reward_std": 0.03441393570974469,
800
+ "rewards/wrapped_prediction_reward_func": 0.5888618856668473,
801
+ "step": 330
802
+ },
803
+ {
804
+ "completion_length": 821.6791931152344,
805
+ "epoch": 0.8438287153652393,
806
+ "grad_norm": 0.04800095851199432,
807
+ "kl": 0.070928955078125,
808
+ "learning_rate": 2.5940702775459744e-07,
809
+ "loss": 0.0001,
810
+ "reward": 0.5711221612989903,
811
+ "reward_std": 0.037968299351632595,
812
+ "rewards/wrapped_prediction_reward_func": 0.5711221612989903,
813
+ "step": 335
814
+ },
815
+ {
816
+ "completion_length": 825.0750167846679,
817
+ "epoch": 0.8564231738035264,
818
+ "grad_norm": 0.032868154882953755,
819
+ "kl": 0.074005126953125,
820
+ "learning_rate": 2.4533991883561867e-07,
821
+ "loss": 0.0002,
822
+ "reward": 0.5961314618587494,
823
+ "reward_std": 0.02036965051665902,
824
+ "rewards/wrapped_prediction_reward_func": 0.5961314618587494,
825
+ "step": 340
826
+ },
827
+ {
828
+ "completion_length": 789.3958572387695,
829
+ "epoch": 0.8690176322418136,
830
+ "grad_norm": 0.04730084864351471,
831
+ "kl": 0.0718597412109375,
832
+ "learning_rate": 2.3153991317550808e-07,
833
+ "loss": 0.0001,
834
+ "reward": 0.5693817652761937,
835
+ "reward_std": 0.027137306291842835,
836
+ "rewards/wrapped_prediction_reward_func": 0.5693817652761937,
837
+ "step": 345
838
+ },
839
+ {
840
+ "completion_length": 825.4916915893555,
841
+ "epoch": 0.8816120906801007,
842
+ "grad_norm": 0.05929565464749555,
843
+ "kl": 0.0830902099609375,
844
+ "learning_rate": 2.180214850745467e-07,
845
+ "loss": 0.0001,
846
+ "reward": 0.5720083937048912,
847
+ "reward_std": 0.04108033460797742,
848
+ "rewards/wrapped_prediction_reward_func": 0.5720083937048912,
849
+ "step": 350
850
+ },
851
+ {
852
+ "completion_length": 794.3854446411133,
853
+ "epoch": 0.8942065491183879,
854
+ "grad_norm": 0.042508843892672864,
855
+ "kl": 0.0805755615234375,
856
+ "learning_rate": 2.0479881349703882e-07,
857
+ "loss": 0.0001,
858
+ "reward": 0.6149648398160934,
859
+ "reward_std": 0.028531424299580978,
860
+ "rewards/wrapped_prediction_reward_func": 0.6149648398160934,
861
+ "step": 355
862
+ },
863
+ {
864
+ "completion_length": 828.1271102905273,
865
+ "epoch": 0.906801007556675,
866
+ "grad_norm": 0.13929773259081887,
867
+ "kl": 0.1125732421875,
868
+ "learning_rate": 1.918857671995363e-07,
869
+ "loss": 0.0001,
870
+ "reward": 0.5847930148243904,
871
+ "reward_std": 0.044799246825277805,
872
+ "rewards/wrapped_prediction_reward_func": 0.5847930148243904,
873
+ "step": 360
874
+ },
875
+ {
876
+ "completion_length": 843.9896087646484,
877
+ "epoch": 0.9193954659949622,
878
+ "grad_norm": 0.044992966442867634,
879
+ "kl": 0.0656494140625,
880
+ "learning_rate": 1.7929589018443014e-07,
881
+ "loss": 0.0001,
882
+ "reward": 0.5992885082960129,
883
+ "reward_std": 0.029869268811307848,
884
+ "rewards/wrapped_prediction_reward_func": 0.5992885082960129,
885
+ "step": 365
886
+ },
887
+ {
888
+ "completion_length": 791.6791900634765,
889
+ "epoch": 0.9319899244332494,
890
+ "grad_norm": 0.03456117446845568,
891
+ "kl": 0.07545166015625,
892
+ "learning_rate": 1.6704238749415955e-07,
893
+ "loss": 0.0002,
894
+ "reward": 0.5825795724987983,
895
+ "reward_std": 0.021053510857746004,
896
+ "rewards/wrapped_prediction_reward_func": 0.5825795724987983,
897
+ "step": 370
898
+ },
899
+ {
900
+ "completion_length": 795.2250274658203,
901
+ "epoch": 0.9445843828715366,
902
+ "grad_norm": 0.057925315822869305,
903
+ "kl": 0.083660888671875,
904
+ "learning_rate": 1.5513811136094785e-07,
905
+ "loss": 0.0001,
906
+ "reward": 0.5829600065946579,
907
+ "reward_std": 0.034487396385520695,
908
+ "rewards/wrapped_prediction_reward_func": 0.5829600065946579,
909
+ "step": 375
910
+ },
911
+ {
912
+ "completion_length": 846.0875274658204,
913
+ "epoch": 0.9571788413098237,
914
+ "grad_norm": 0.038214011850815244,
915
+ "kl": 0.072222900390625,
916
+ "learning_rate": 1.435955477265855e-07,
917
+ "loss": 0.0001,
918
+ "reward": 0.6045917004346848,
919
+ "reward_std": 0.05368394823744893,
920
+ "rewards/wrapped_prediction_reward_func": 0.6045917004346848,
921
+ "step": 380
922
+ },
923
+ {
924
+ "completion_length": 815.0458526611328,
925
+ "epoch": 0.9697732997481109,
926
+ "grad_norm": 0.030377786493206754,
927
+ "kl": 0.078546142578125,
928
+ "learning_rate": 1.3242680314639993e-07,
929
+ "loss": 0.0001,
930
+ "reward": 0.6047316044569016,
931
+ "reward_std": 0.024706879258155824,
932
+ "rewards/wrapped_prediction_reward_func": 0.6047316044569016,
933
+ "step": 385
934
+ },
935
+ {
936
+ "completion_length": 823.7146057128906,
937
+ "epoch": 0.982367758186398,
938
+ "grad_norm": 0.04345292422364211,
939
+ "kl": 0.07333984375,
940
+ "learning_rate": 1.2164359209115232e-07,
941
+ "loss": 0.0001,
942
+ "reward": 0.5892785102128982,
943
+ "reward_std": 0.03435427148942836,
944
+ "rewards/wrapped_prediction_reward_func": 0.5892785102128982,
945
+ "step": 390
946
+ },
947
+ {
948
+ "completion_length": 854.6458587646484,
949
+ "epoch": 0.9949622166246851,
950
+ "grad_norm": 0.05065816791479697,
951
+ "kl": 0.0732177734375,
952
+ "learning_rate": 1.1125722466017545e-07,
953
+ "loss": 0.0001,
954
+ "reward": 0.5809489727020264,
955
+ "reward_std": 0.052034072624519465,
956
+ "rewards/wrapped_prediction_reward_func": 0.5809489727020264,
957
+ "step": 395
958
+ },
959
+ {
960
+ "completion_length": 810.3968933105468,
961
+ "epoch": 1.0075566750629723,
962
+ "grad_norm": 0.03509442740105049,
963
+ "kl": 0.075762939453125,
964
+ "learning_rate": 1.0127859471863969e-07,
965
+ "loss": 0.0001,
966
+ "reward": 0.6079857856035232,
967
+ "reward_std": 0.02203769111074507,
968
+ "rewards/wrapped_prediction_reward_func": 0.6079857856035232,
969
+ "step": 400
970
+ },
971
+ {
972
+ "completion_length": 790.1479385375976,
973
+ "epoch": 1.0201511335012594,
974
+ "grad_norm": 0.03530726181544221,
975
+ "kl": 0.0824920654296875,
976
+ "learning_rate": 9.171816847139447e-08,
977
+ "loss": 0.0001,
978
+ "reward": 0.6192175649106503,
979
+ "reward_std": 0.0369229513395112,
980
+ "rewards/wrapped_prediction_reward_func": 0.6192175649106503,
981
+ "step": 405
982
+ },
983
+ {
984
+ "completion_length": 793.627099609375,
985
+ "epoch": 1.0327455919395465,
986
+ "grad_norm": 0.032582398864671,
987
+ "kl": 0.077081298828125,
988
+ "learning_rate": 8.258597348536451e-08,
989
+ "loss": 0.0001,
990
+ "reward": 0.6033682949841023,
991
+ "reward_std": 0.034924795664846894,
992
+ "rewards/wrapped_prediction_reward_func": 0.6033682949841023,
993
+ "step": 410
994
+ },
995
+ {
996
+ "completion_length": 820.5625213623047,
997
+ "epoch": 1.0453400503778338,
998
+ "grad_norm": 0.03594193252721311,
999
+ "kl": 0.0728424072265625,
1000
+ "learning_rate": 7.389158817201541e-08,
1001
+ "loss": 0.0001,
1002
+ "reward": 0.5938129395246505,
1003
+ "reward_std": 0.025039249239489435,
1004
+ "rewards/wrapped_prediction_reward_func": 0.5938129395246505,
1005
+ "step": 415
1006
+ },
1007
+ {
1008
+ "completion_length": 791.6812713623046,
1009
+ "epoch": 1.057934508816121,
1010
+ "grad_norm": 0.2660556227962935,
1011
+ "kl": 0.178399658203125,
1012
+ "learning_rate": 6.564413174092443e-08,
1013
+ "loss": 0.0001,
1014
+ "reward": 0.5894127897918224,
1015
+ "reward_std": 0.026753197464859112,
1016
+ "rewards/wrapped_prediction_reward_func": 0.5894127897918224,
1017
+ "step": 420
1018
+ },
1019
+ {
1020
+ "completion_length": 816.7833587646485,
1021
+ "epoch": 1.070528967254408,
1022
+ "grad_norm": 0.04448961501333409,
1023
+ "kl": 0.074652099609375,
1024
+ "learning_rate": 5.785225463498828e-08,
1025
+ "loss": 0.0002,
1026
+ "reward": 0.6132952854037285,
1027
+ "reward_std": 0.04002904643421061,
1028
+ "rewards/wrapped_prediction_reward_func": 0.6132952854037285,
1029
+ "step": 425
1030
+ },
1031
+ {
1032
+ "completion_length": 848.4500198364258,
1033
+ "epoch": 1.0831234256926952,
1034
+ "grad_norm": 0.048148418977785004,
1035
+ "kl": 0.073956298828125,
1036
+ "learning_rate": 5.052412945730239e-08,
1037
+ "loss": 0.0001,
1038
+ "reward": 0.5875438123941421,
1039
+ "reward_std": 0.043150549242272976,
1040
+ "rewards/wrapped_prediction_reward_func": 0.5875438123941421,
1041
+ "step": 430
1042
+ },
1043
+ {
1044
+ "completion_length": 830.6750244140625,
1045
+ "epoch": 1.0957178841309823,
1046
+ "grad_norm": 0.05194640005960778,
1047
+ "kl": 0.082000732421875,
1048
+ "learning_rate": 4.366744239922998e-08,
1049
+ "loss": 0.0002,
1050
+ "reward": 0.6154551565647125,
1051
+ "reward_std": 0.040564915537834166,
1052
+ "rewards/wrapped_prediction_reward_func": 0.6154551565647125,
1053
+ "step": 435
1054
+ },
1055
+ {
1056
+ "completion_length": 836.8979400634765,
1057
+ "epoch": 1.1083123425692696,
1058
+ "grad_norm": 0.01997423624486904,
1059
+ "kl": 0.080609130859375,
1060
+ "learning_rate": 3.7289385178647935e-08,
1061
+ "loss": 0.0001,
1062
+ "reward": 0.60478435754776,
1063
+ "reward_std": 0.026485644932836293,
1064
+ "rewards/wrapped_prediction_reward_func": 0.60478435754776,
1065
+ "step": 440
1066
+ },
1067
+ {
1068
+ "completion_length": 832.4625228881836,
1069
+ "epoch": 1.1209068010075567,
1070
+ "grad_norm": 0.017870734412291134,
1071
+ "kl": 0.0710479736328125,
1072
+ "learning_rate": 3.1396647496828244e-08,
1073
+ "loss": 0.0001,
1074
+ "reward": 0.6108729064464569,
1075
+ "reward_std": 0.03767378572374582,
1076
+ "rewards/wrapped_prediction_reward_func": 0.6108729064464569,
1077
+ "step": 445
1078
+ },
1079
+ {
1080
+ "completion_length": 882.7062759399414,
1081
+ "epoch": 1.1335012594458438,
1082
+ "grad_norm": 0.05015364292113255,
1083
+ "kl": 0.0659332275390625,
1084
+ "learning_rate": 2.5995410021864783e-08,
1085
+ "loss": 0.0001,
1086
+ "reward": 0.5940894782543182,
1087
+ "reward_std": 0.04104239339358173,
1088
+ "rewards/wrapped_prediction_reward_func": 0.5940894782543182,
1089
+ "step": 450
1090
+ },
1091
+ {
1092
+ "completion_length": 820.8708557128906,
1093
+ "epoch": 1.146095717884131,
1094
+ "grad_norm": 0.03890549939275878,
1095
+ "kl": 0.079840087890625,
1096
+ "learning_rate": 2.109133790600648e-08,
1097
+ "loss": 0.0002,
1098
+ "reward": 0.6024712681770324,
1099
+ "reward_std": 0.03324083940824494,
1100
+ "rewards/wrapped_prediction_reward_func": 0.6024712681770324,
1101
+ "step": 455
1102
+ },
1103
+ {
1104
+ "completion_length": 816.364598083496,
1105
+ "epoch": 1.1586901763224182,
1106
+ "grad_norm": 0.03882316928789839,
1107
+ "kl": 0.07708740234375,
1108
+ "learning_rate": 1.6689574843694432e-08,
1109
+ "loss": 0.0001,
1110
+ "reward": 0.575566939264536,
1111
+ "reward_std": 0.049419730342924593,
1112
+ "rewards/wrapped_prediction_reward_func": 0.575566939264536,
1113
+ "step": 460
1114
+ },
1115
+ {
1116
+ "completion_length": 854.1916915893555,
1117
+ "epoch": 1.1712846347607053,
1118
+ "grad_norm": 0.018046225222633556,
1119
+ "kl": 0.06839599609375,
1120
+ "learning_rate": 1.2794737676536993e-08,
1121
+ "loss": 0.0001,
1122
+ "reward": 0.5933746129274369,
1123
+ "reward_std": 0.02758802007883787,
1124
+ "rewards/wrapped_prediction_reward_func": 0.5933746129274369,
1125
+ "step": 465
1126
+ },
1127
+ {
1128
+ "completion_length": 877.6416931152344,
1129
+ "epoch": 1.1838790931989924,
1130
+ "grad_norm": 0.04393883375775292,
1131
+ "kl": 0.064520263671875,
1132
+ "learning_rate": 9.410911550880474e-09,
1133
+ "loss": 0.0001,
1134
+ "reward": 0.5791490338742733,
1135
+ "reward_std": 0.02540514359716326,
1136
+ "rewards/wrapped_prediction_reward_func": 0.5791490338742733,
1137
+ "step": 470
1138
+ },
1139
+ {
1140
+ "completion_length": 823.2521118164062,
1141
+ "epoch": 1.1964735516372795,
1142
+ "grad_norm": 0.019200187518650894,
1143
+ "kl": 0.072186279296875,
1144
+ "learning_rate": 6.541645633054649e-09,
1145
+ "loss": 0.0002,
1146
+ "reward": 0.577340692281723,
1147
+ "reward_std": 0.024969300418160856,
1148
+ "rewards/wrapped_prediction_reward_func": 0.577340692281723,
1149
+ "step": 475
1150
+ },
1151
+ {
1152
+ "completion_length": 835.3208541870117,
1153
+ "epoch": 1.2090680100755669,
1154
+ "grad_norm": 0.03179886042365874,
1155
+ "kl": 0.07593994140625,
1156
+ "learning_rate": 4.189949386787462e-09,
1157
+ "loss": 0.0,
1158
+ "reward": 0.6053503692150116,
1159
+ "reward_std": 0.05023272330872715,
1160
+ "rewards/wrapped_prediction_reward_func": 0.6053503692150116,
1161
+ "step": 480
1162
+ },
1163
+ {
1164
+ "completion_length": 800.9750183105468,
1165
+ "epoch": 1.221662468513854,
1166
+ "grad_norm": 0.031246606223002214,
1167
+ "kl": 0.090142822265625,
1168
+ "learning_rate": 2.3582894166930267e-09,
1169
+ "loss": 0.0001,
1170
+ "reward": 0.6070939235389232,
1171
+ "reward_std": 0.03144086834508926,
1172
+ "rewards/wrapped_prediction_reward_func": 0.6070939235389232,
1173
+ "step": 485
1174
+ },
1175
+ {
1176
+ "completion_length": 848.7771072387695,
1177
+ "epoch": 1.234256926952141,
1178
+ "grad_norm": 0.05601067005700773,
1179
+ "kl": 0.07252197265625,
1180
+ "learning_rate": 1.0485868811441756e-09,
1181
+ "loss": 0.0001,
1182
+ "reward": 0.5712633952498436,
1183
+ "reward_std": 0.04291044136043638,
1184
+ "rewards/wrapped_prediction_reward_func": 0.5712633952498436,
1185
+ "step": 490
1186
+ },
1187
+ {
1188
+ "completion_length": 771.6208572387695,
1189
+ "epoch": 1.2468513853904282,
1190
+ "grad_norm": 0.03724004112267365,
1191
+ "kl": 0.085308837890625,
1192
+ "learning_rate": 2.6221547724253333e-10,
1193
+ "loss": 0.0001,
1194
+ "reward": 0.6077594101428986,
1195
+ "reward_std": 0.026009649853222072,
1196
+ "rewards/wrapped_prediction_reward_func": 0.6077594101428986,
1197
+ "step": 495
1198
+ },
1199
+ {
1200
+ "completion_length": 796.8458557128906,
1201
+ "epoch": 1.2594458438287153,
1202
+ "grad_norm": 0.053721210517970415,
1203
+ "kl": 0.0768798828125,
1204
+ "learning_rate": 0.0,
1205
+ "loss": 0.0002,
1206
+ "reward": 0.601429545879364,
1207
+ "reward_std": 0.024498459114693106,
1208
+ "rewards/wrapped_prediction_reward_func": 0.601429545879364,
1209
+ "step": 500
1210
+ },
1211
+ {
1212
+ "epoch": 1.2594458438287153,
1213
+ "step": 500,
1214
+ "total_flos": 0.0,
1215
+ "train_loss": 7.065190569642255e-05,
1216
+ "train_runtime": 24090.8459,
1217
+ "train_samples_per_second": 1.992,
1218
+ "train_steps_per_second": 0.021
1219
+ }
1220
+ ],
1221
+ "logging_steps": 5,
1222
+ "max_steps": 500,
1223
+ "num_input_tokens_seen": 0,
1224
+ "num_train_epochs": 2,
1225
+ "save_steps": 100,
1226
+ "stateful_callbacks": {
1227
+ "TrainerControl": {
1228
+ "args": {
1229
+ "should_epoch_stop": false,
1230
+ "should_evaluate": false,
1231
+ "should_log": false,
1232
+ "should_save": true,
1233
+ "should_training_stop": true
1234
+ },
1235
+ "attributes": {}
1236
+ }
1237
+ },
1238
+ "total_flos": 0.0,
1239
+ "train_batch_size": 2,
1240
+ "trial_name": null,
1241
+ "trial_params": null
1242
+ }