nimishbongale committed on
Commit
47d67b0
·
verified ·
1 Parent(s): e809a72

Initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 896,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 4864,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 21,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 14,
17
+ "num_hidden_layers": 24,
18
+ "num_key_value_heads": 2,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": null,
21
+ "rope_theta": 1000000.0,
22
+ "sliding_window": null,
23
+ "tie_word_embeddings": true,
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.48.2",
26
+ "use_cache": true,
27
+ "use_sliding_window": false,
28
+ "vocab_size": 151936
29
+ }
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.48.2"
14
+ }
gsm8k_eval_results_20250213_072325.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6079bd27eb9298907dab7fc1d166799d630eb11bda56288cc92458eab3e8c9b5
3
+ size 988097824
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d0caaf3f9e957498d461b3ed1ff6f815e90cd4a1acb8259109b03ef511ea586
3
+ size 1004201594
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f8e2ab643ff24c4ece7fa9a20797cdd01676f4ac256543d0994d6804130f174
3
+ size 14180
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b4f4158f4aa521674283d4b8b46f8ac1f00ed4f557bfc5666c5b0f7b81fa1d1
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63a2951d5edfa5cc0a2346ef872f8c77a2920274cfc3b503b04e3799104dee80
3
+ size 11422060
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "extra_special_tokens": {},
203
+ "model_max_length": 512,
204
+ "pad_token": "<|im_end|>",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
trainer_state.json ADDED
@@ -0,0 +1,1333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.21401819154628143,
5
+ "eval_steps": 500,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 167.21875,
13
+ "epoch": 0.002140181915462814,
14
+ "grad_norm": 3.21875,
15
+ "kl": 0.0,
16
+ "learning_rate": 2.1276595744680852e-07,
17
+ "loss": -0.0,
18
+ "reward": 0.671875,
19
+ "reward_std": 0.7394911348819733,
20
+ "rewards/correctness_reward_func": 0.28125,
21
+ "rewards/format_reward_func": 0.390625,
22
+ "step": 1
23
+ },
24
+ {
25
+ "completion_length": 151.84375,
26
+ "epoch": 0.004280363830925628,
27
+ "grad_norm": 3.109375,
28
+ "kl": 0.0,
29
+ "learning_rate": 4.2553191489361704e-07,
30
+ "loss": -0.0,
31
+ "reward": 0.84375,
32
+ "reward_std": 0.6551200225949287,
33
+ "rewards/correctness_reward_func": 0.34375,
34
+ "rewards/format_reward_func": 0.5,
35
+ "step": 2
36
+ },
37
+ {
38
+ "completion_length": 152.3125,
39
+ "epoch": 0.006420545746388443,
40
+ "grad_norm": 2.734375,
41
+ "kl": 0.0007652118656551465,
42
+ "learning_rate": 6.382978723404255e-07,
43
+ "loss": 0.0,
44
+ "reward": 0.734375,
45
+ "reward_std": 0.6524592041969299,
46
+ "rewards/correctness_reward_func": 0.21875,
47
+ "rewards/format_reward_func": 0.515625,
48
+ "step": 3
49
+ },
50
+ {
51
+ "completion_length": 175.328125,
52
+ "epoch": 0.008560727661851257,
53
+ "grad_norm": 2.84375,
54
+ "kl": 0.0007007253007031977,
55
+ "learning_rate": 8.510638297872341e-07,
56
+ "loss": 0.0,
57
+ "reward": 0.84375,
58
+ "reward_std": 0.7802355140447617,
59
+ "rewards/correctness_reward_func": 0.40625,
60
+ "rewards/format_reward_func": 0.4375,
61
+ "step": 4
62
+ },
63
+ {
64
+ "completion_length": 152.84375,
65
+ "epoch": 0.010700909577314071,
66
+ "grad_norm": 3.0625,
67
+ "kl": 0.0006465155893238261,
68
+ "learning_rate": 1.0638297872340427e-06,
69
+ "loss": 0.0,
70
+ "reward": 0.875,
71
+ "reward_std": 0.6669624000787735,
72
+ "rewards/correctness_reward_func": 0.3125,
73
+ "rewards/format_reward_func": 0.5625,
74
+ "step": 5
75
+ },
76
+ {
77
+ "completion_length": 179.5,
78
+ "epoch": 0.012841091492776886,
79
+ "grad_norm": 3.15625,
80
+ "kl": 0.0007112839666660875,
81
+ "learning_rate": 1.276595744680851e-06,
82
+ "loss": 0.0,
83
+ "reward": 0.6875,
84
+ "reward_std": 0.7003459334373474,
85
+ "rewards/correctness_reward_func": 0.25,
86
+ "rewards/format_reward_func": 0.4375,
87
+ "step": 6
88
+ },
89
+ {
90
+ "completion_length": 169.140625,
91
+ "epoch": 0.0149812734082397,
92
+ "grad_norm": 3.5,
93
+ "kl": 0.000702101708156988,
94
+ "learning_rate": 1.4893617021276596e-06,
95
+ "loss": 0.0,
96
+ "reward": 0.84375,
97
+ "reward_std": 0.7666550725698471,
98
+ "rewards/correctness_reward_func": 0.34375,
99
+ "rewards/format_reward_func": 0.5,
100
+ "step": 7
101
+ },
102
+ {
103
+ "completion_length": 146.390625,
104
+ "epoch": 0.017121455323702513,
105
+ "grad_norm": 3.25,
106
+ "kl": 0.00070322556712199,
107
+ "learning_rate": 1.7021276595744682e-06,
108
+ "loss": 0.0,
109
+ "reward": 0.765625,
110
+ "reward_std": 0.514876589179039,
111
+ "rewards/correctness_reward_func": 0.15625,
112
+ "rewards/format_reward_func": 0.609375,
113
+ "step": 8
114
+ },
115
+ {
116
+ "completion_length": 141.40625,
117
+ "epoch": 0.019261637239165328,
118
+ "grad_norm": 2.875,
119
+ "kl": 0.0008593319798819721,
120
+ "learning_rate": 1.9148936170212767e-06,
121
+ "loss": 0.0,
122
+ "reward": 1.03125,
123
+ "reward_std": 0.7832296937704086,
124
+ "rewards/correctness_reward_func": 0.46875,
125
+ "rewards/format_reward_func": 0.5625,
126
+ "step": 9
127
+ },
128
+ {
129
+ "completion_length": 152.21875,
130
+ "epoch": 0.021401819154628143,
131
+ "grad_norm": 3.71875,
132
+ "kl": 0.0011522448767209426,
133
+ "learning_rate": 2.1276595744680853e-06,
134
+ "loss": 0.0,
135
+ "reward": 0.75,
136
+ "reward_std": 0.7914351224899292,
137
+ "rewards/correctness_reward_func": 0.1875,
138
+ "rewards/format_reward_func": 0.5625,
139
+ "step": 10
140
+ },
141
+ {
142
+ "completion_length": 118.953125,
143
+ "epoch": 0.023542001070090957,
144
+ "grad_norm": 2.75,
145
+ "kl": 0.0009655868489062414,
146
+ "learning_rate": 2.340425531914894e-06,
147
+ "loss": 0.0,
148
+ "reward": 1.3125,
149
+ "reward_std": 0.6371002793312073,
150
+ "rewards/correctness_reward_func": 0.53125,
151
+ "rewards/format_reward_func": 0.78125,
152
+ "step": 11
153
+ },
154
+ {
155
+ "completion_length": 135.90625,
156
+ "epoch": 0.025682182985553772,
157
+ "grad_norm": 3.3125,
158
+ "kl": 0.0013183261326048523,
159
+ "learning_rate": 2.553191489361702e-06,
160
+ "loss": 0.0,
161
+ "reward": 0.890625,
162
+ "reward_std": 0.6318541243672371,
163
+ "rewards/correctness_reward_func": 0.25,
164
+ "rewards/format_reward_func": 0.640625,
165
+ "step": 12
166
+ },
167
+ {
168
+ "completion_length": 141.265625,
169
+ "epoch": 0.027822364901016586,
170
+ "grad_norm": 2.390625,
171
+ "kl": 0.0016906778037082404,
172
+ "learning_rate": 2.765957446808511e-06,
173
+ "loss": 0.0,
174
+ "reward": 1.015625,
175
+ "reward_std": 0.6712641417980194,
176
+ "rewards/correctness_reward_func": 0.25,
177
+ "rewards/format_reward_func": 0.765625,
178
+ "step": 13
179
+ },
180
+ {
181
+ "completion_length": 140.015625,
182
+ "epoch": 0.0299625468164794,
183
+ "grad_norm": 2.796875,
184
+ "kl": 0.0024730846926104277,
185
+ "learning_rate": 2.978723404255319e-06,
186
+ "loss": 0.0,
187
+ "reward": 1.0625,
188
+ "reward_std": 0.657661646604538,
189
+ "rewards/correctness_reward_func": 0.3125,
190
+ "rewards/format_reward_func": 0.75,
191
+ "step": 14
192
+ },
193
+ {
194
+ "completion_length": 139.65625,
195
+ "epoch": 0.03210272873194221,
196
+ "grad_norm": 2.84375,
197
+ "kl": 0.003585253667552024,
198
+ "learning_rate": 3.191489361702128e-06,
199
+ "loss": 0.0,
200
+ "reward": 1.109375,
201
+ "reward_std": 0.6227563470602036,
202
+ "rewards/correctness_reward_func": 0.3125,
203
+ "rewards/format_reward_func": 0.796875,
204
+ "step": 15
205
+ },
206
+ {
207
+ "completion_length": 140.671875,
208
+ "epoch": 0.03424291064740503,
209
+ "grad_norm": 2.734375,
210
+ "kl": 0.004379586665891111,
211
+ "learning_rate": 3.4042553191489363e-06,
212
+ "loss": 0.0,
213
+ "reward": 1.0,
214
+ "reward_std": 0.603929728269577,
215
+ "rewards/correctness_reward_func": 0.25,
216
+ "rewards/format_reward_func": 0.75,
217
+ "step": 16
218
+ },
219
+ {
220
+ "completion_length": 118.703125,
221
+ "epoch": 0.03638309256286784,
222
+ "grad_norm": 3.25,
223
+ "kl": 0.00417952478164807,
224
+ "learning_rate": 3.6170212765957453e-06,
225
+ "loss": 0.0,
226
+ "reward": 1.1875,
227
+ "reward_std": 0.6465111523866653,
228
+ "rewards/correctness_reward_func": 0.34375,
229
+ "rewards/format_reward_func": 0.84375,
230
+ "step": 17
231
+ },
232
+ {
233
+ "completion_length": 120.28125,
234
+ "epoch": 0.038523274478330656,
235
+ "grad_norm": 2.484375,
236
+ "kl": 0.005598044954240322,
237
+ "learning_rate": 3.8297872340425535e-06,
238
+ "loss": 0.0,
239
+ "reward": 1.09375,
240
+ "reward_std": 0.5800373703241348,
241
+ "rewards/correctness_reward_func": 0.25,
242
+ "rewards/format_reward_func": 0.84375,
243
+ "step": 18
244
+ },
245
+ {
246
+ "completion_length": 127.890625,
247
+ "epoch": 0.04066345639379347,
248
+ "grad_norm": 3.0,
249
+ "kl": 0.005727470270358026,
250
+ "learning_rate": 4.042553191489362e-06,
251
+ "loss": 0.0,
252
+ "reward": 1.15625,
253
+ "reward_std": 0.6766257882118225,
254
+ "rewards/correctness_reward_func": 0.34375,
255
+ "rewards/format_reward_func": 0.8125,
256
+ "step": 19
257
+ },
258
+ {
259
+ "completion_length": 121.703125,
260
+ "epoch": 0.042803638309256285,
261
+ "grad_norm": 2.109375,
262
+ "kl": 0.005017031449824572,
263
+ "learning_rate": 4.255319148936171e-06,
264
+ "loss": 0.0,
265
+ "reward": 1.203125,
266
+ "reward_std": 0.38406607508659363,
267
+ "rewards/correctness_reward_func": 0.28125,
268
+ "rewards/format_reward_func": 0.921875,
269
+ "step": 20
270
+ },
271
+ {
272
+ "completion_length": 147.4375,
273
+ "epoch": 0.0449438202247191,
274
+ "grad_norm": 2.171875,
275
+ "kl": 0.0043895336566492915,
276
+ "learning_rate": 4.468085106382979e-06,
277
+ "loss": 0.0,
278
+ "reward": 1.203125,
279
+ "reward_std": 0.568247452378273,
280
+ "rewards/correctness_reward_func": 0.34375,
281
+ "rewards/format_reward_func": 0.859375,
282
+ "step": 21
283
+ },
284
+ {
285
+ "completion_length": 129.40625,
286
+ "epoch": 0.047084002140181914,
287
+ "grad_norm": 1.984375,
288
+ "kl": 0.006280150264501572,
289
+ "learning_rate": 4.680851063829788e-06,
290
+ "loss": 0.0,
291
+ "reward": 1.3125,
292
+ "reward_std": 0.5328208804130554,
293
+ "rewards/correctness_reward_func": 0.34375,
294
+ "rewards/format_reward_func": 0.96875,
295
+ "step": 22
296
+ },
297
+ {
298
+ "completion_length": 115.859375,
299
+ "epoch": 0.04922418405564473,
300
+ "grad_norm": 3.03125,
301
+ "kl": 0.004840813227929175,
302
+ "learning_rate": 4.893617021276596e-06,
303
+ "loss": 0.0,
304
+ "reward": 1.21875,
305
+ "reward_std": 0.555421955883503,
306
+ "rewards/correctness_reward_func": 0.28125,
307
+ "rewards/format_reward_func": 0.9375,
308
+ "step": 23
309
+ },
310
+ {
311
+ "completion_length": 136.6875,
312
+ "epoch": 0.051364365971107544,
313
+ "grad_norm": 2.515625,
314
+ "kl": 0.005280128796584904,
315
+ "learning_rate": 5.106382978723404e-06,
316
+ "loss": 0.0,
317
+ "reward": 1.265625,
318
+ "reward_std": 0.6034187823534012,
319
+ "rewards/correctness_reward_func": 0.34375,
320
+ "rewards/format_reward_func": 0.921875,
321
+ "step": 24
322
+ },
323
+ {
324
+ "completion_length": 128.9375,
325
+ "epoch": 0.05350454788657036,
326
+ "grad_norm": 2.453125,
327
+ "kl": 0.008012514561414719,
328
+ "learning_rate": 5.319148936170213e-06,
329
+ "loss": 0.0,
330
+ "reward": 1.71875,
331
+ "reward_std": 0.9615881741046906,
332
+ "rewards/correctness_reward_func": 0.8125,
333
+ "rewards/format_reward_func": 0.90625,
334
+ "step": 25
335
+ },
336
+ {
337
+ "completion_length": 119.9375,
338
+ "epoch": 0.05564472980203317,
339
+ "grad_norm": 2.53125,
340
+ "kl": 0.006216021254658699,
341
+ "learning_rate": 5.531914893617022e-06,
342
+ "loss": 0.0,
343
+ "reward": 1.53125,
344
+ "reward_std": 0.6540063470602036,
345
+ "rewards/correctness_reward_func": 0.5625,
346
+ "rewards/format_reward_func": 0.96875,
347
+ "step": 26
348
+ },
349
+ {
350
+ "completion_length": 143.375,
351
+ "epoch": 0.05778491171749599,
352
+ "grad_norm": 2.5625,
353
+ "kl": 0.006388432695530355,
354
+ "learning_rate": 5.744680851063831e-06,
355
+ "loss": 0.0,
356
+ "reward": 1.296875,
357
+ "reward_std": 0.6433176919817924,
358
+ "rewards/correctness_reward_func": 0.40625,
359
+ "rewards/format_reward_func": 0.890625,
360
+ "step": 27
361
+ },
362
+ {
363
+ "completion_length": 131.15625,
364
+ "epoch": 0.0599250936329588,
365
+ "grad_norm": 2.453125,
366
+ "kl": 0.008788998704403639,
367
+ "learning_rate": 5.957446808510638e-06,
368
+ "loss": 0.0,
369
+ "reward": 1.546875,
370
+ "reward_std": 0.6077659651637077,
371
+ "rewards/correctness_reward_func": 0.59375,
372
+ "rewards/format_reward_func": 0.953125,
373
+ "step": 28
374
+ },
375
+ {
376
+ "completion_length": 124.25,
377
+ "epoch": 0.06206527554842162,
378
+ "grad_norm": 1.890625,
379
+ "kl": 0.0065922129433602095,
380
+ "learning_rate": 6.170212765957447e-06,
381
+ "loss": 0.0,
382
+ "reward": 1.4375,
383
+ "reward_std": 0.6141257882118225,
384
+ "rewards/correctness_reward_func": 0.46875,
385
+ "rewards/format_reward_func": 0.96875,
386
+ "step": 29
387
+ },
388
+ {
389
+ "completion_length": 128.421875,
390
+ "epoch": 0.06420545746388442,
391
+ "grad_norm": 2.203125,
392
+ "kl": 0.008303154725581408,
393
+ "learning_rate": 6.382978723404256e-06,
394
+ "loss": 0.0,
395
+ "reward": 1.625,
396
+ "reward_std": 0.7354267686605453,
397
+ "rewards/correctness_reward_func": 0.65625,
398
+ "rewards/format_reward_func": 0.96875,
399
+ "step": 30
400
+ },
401
+ {
402
+ "completion_length": 130.25,
403
+ "epoch": 0.06634563937934725,
404
+ "grad_norm": 1.859375,
405
+ "kl": 0.01200802018865943,
406
+ "learning_rate": 6.595744680851064e-06,
407
+ "loss": 0.0001,
408
+ "reward": 1.703125,
409
+ "reward_std": 0.5409187823534012,
410
+ "rewards/correctness_reward_func": 0.71875,
411
+ "rewards/format_reward_func": 0.984375,
412
+ "step": 31
413
+ },
414
+ {
415
+ "completion_length": 134.3125,
416
+ "epoch": 0.06848582129481005,
417
+ "grad_norm": 2.4375,
418
+ "kl": 0.012481397716328502,
419
+ "learning_rate": 6.808510638297873e-06,
420
+ "loss": 0.0001,
421
+ "reward": 1.5625,
422
+ "reward_std": 0.5936799347400665,
423
+ "rewards/correctness_reward_func": 0.625,
424
+ "rewards/format_reward_func": 0.9375,
425
+ "step": 32
426
+ },
427
+ {
428
+ "completion_length": 140.578125,
429
+ "epoch": 0.07062600321027288,
430
+ "grad_norm": 2.203125,
431
+ "kl": 0.011200629058293998,
432
+ "learning_rate": 7.021276595744682e-06,
433
+ "loss": 0.0001,
434
+ "reward": 1.46875,
435
+ "reward_std": 0.5570628941059113,
436
+ "rewards/correctness_reward_func": 0.5,
437
+ "rewards/format_reward_func": 0.96875,
438
+ "step": 33
439
+ },
440
+ {
441
+ "completion_length": 126.640625,
442
+ "epoch": 0.07276618512573568,
443
+ "grad_norm": 2.328125,
444
+ "kl": 0.011981034418568015,
445
+ "learning_rate": 7.234042553191491e-06,
446
+ "loss": 0.0001,
447
+ "reward": 1.671875,
448
+ "reward_std": 0.6420939117670059,
449
+ "rewards/correctness_reward_func": 0.6875,
450
+ "rewards/format_reward_func": 0.984375,
451
+ "step": 34
452
+ },
453
+ {
454
+ "completion_length": 141.046875,
455
+ "epoch": 0.0749063670411985,
456
+ "grad_norm": 2.0,
457
+ "kl": 0.01834311173297465,
458
+ "learning_rate": 7.446808510638298e-06,
459
+ "loss": 0.0001,
460
+ "reward": 1.75,
461
+ "reward_std": 0.5,
462
+ "rewards/correctness_reward_func": 0.75,
463
+ "rewards/format_reward_func": 1.0,
464
+ "step": 35
465
+ },
466
+ {
467
+ "completion_length": 129.703125,
468
+ "epoch": 0.07704654895666131,
469
+ "grad_norm": 2.15625,
470
+ "kl": 0.02186211384832859,
471
+ "learning_rate": 7.659574468085107e-06,
472
+ "loss": 0.0001,
473
+ "reward": 1.859375,
474
+ "reward_std": 0.6324251294136047,
475
+ "rewards/correctness_reward_func": 0.875,
476
+ "rewards/format_reward_func": 0.984375,
477
+ "step": 36
478
+ },
479
+ {
480
+ "completion_length": 147.5,
481
+ "epoch": 0.07918673087212413,
482
+ "grad_norm": 2.109375,
483
+ "kl": 0.018696403596550226,
484
+ "learning_rate": 7.872340425531916e-06,
485
+ "loss": 0.0001,
486
+ "reward": 1.78125,
487
+ "reward_std": 0.6237945631146431,
488
+ "rewards/correctness_reward_func": 0.875,
489
+ "rewards/format_reward_func": 0.90625,
490
+ "step": 37
491
+ },
492
+ {
493
+ "completion_length": 138.984375,
494
+ "epoch": 0.08132691278758694,
495
+ "grad_norm": 2.375,
496
+ "kl": 0.02475315798074007,
497
+ "learning_rate": 8.085106382978723e-06,
498
+ "loss": 0.0001,
499
+ "reward": 1.78125,
500
+ "reward_std": 0.5818375647068024,
501
+ "rewards/correctness_reward_func": 0.78125,
502
+ "rewards/format_reward_func": 1.0,
503
+ "step": 38
504
+ },
505
+ {
506
+ "completion_length": 138.875,
507
+ "epoch": 0.08346709470304976,
508
+ "grad_norm": 2.21875,
509
+ "kl": 0.02588808024302125,
510
+ "learning_rate": 8.297872340425532e-06,
511
+ "loss": 0.0001,
512
+ "reward": 1.53125,
513
+ "reward_std": 0.46814728528261185,
514
+ "rewards/correctness_reward_func": 0.59375,
515
+ "rewards/format_reward_func": 0.9375,
516
+ "step": 39
517
+ },
518
+ {
519
+ "completion_length": 132.46875,
520
+ "epoch": 0.08560727661851257,
521
+ "grad_norm": 2.5,
522
+ "kl": 0.03315168898552656,
523
+ "learning_rate": 8.510638297872341e-06,
524
+ "loss": 0.0002,
525
+ "reward": 1.609375,
526
+ "reward_std": 0.6949251294136047,
527
+ "rewards/correctness_reward_func": 0.625,
528
+ "rewards/format_reward_func": 0.984375,
529
+ "step": 40
530
+ },
531
+ {
532
+ "completion_length": 143.25,
533
+ "epoch": 0.08774745853397539,
534
+ "grad_norm": 2.34375,
535
+ "kl": 0.027149769477546215,
536
+ "learning_rate": 8.72340425531915e-06,
537
+ "loss": 0.0001,
538
+ "reward": 1.75,
539
+ "reward_std": 0.6540063470602036,
540
+ "rewards/correctness_reward_func": 0.78125,
541
+ "rewards/format_reward_func": 0.96875,
542
+ "step": 41
543
+ },
544
+ {
545
+ "completion_length": 149.1875,
546
+ "epoch": 0.0898876404494382,
547
+ "grad_norm": 2.734375,
548
+ "kl": 0.033755607437342405,
549
+ "learning_rate": 8.936170212765958e-06,
550
+ "loss": 0.0002,
551
+ "reward": 1.890625,
552
+ "reward_std": 0.9243821352720261,
553
+ "rewards/correctness_reward_func": 0.9375,
554
+ "rewards/format_reward_func": 0.953125,
555
+ "step": 42
556
+ },
557
+ {
558
+ "completion_length": 140.765625,
559
+ "epoch": 0.09202782236490102,
560
+ "grad_norm": 1.65625,
561
+ "kl": 0.03690116386860609,
562
+ "learning_rate": 9.148936170212767e-06,
563
+ "loss": 0.0002,
564
+ "reward": 1.71875,
565
+ "reward_std": 0.33183756470680237,
566
+ "rewards/correctness_reward_func": 0.71875,
567
+ "rewards/format_reward_func": 1.0,
568
+ "step": 43
569
+ },
570
+ {
571
+ "completion_length": 119.640625,
572
+ "epoch": 0.09416800428036383,
573
+ "grad_norm": 2.984375,
574
+ "kl": 0.042608937714248896,
575
+ "learning_rate": 9.361702127659576e-06,
576
+ "loss": 0.0002,
577
+ "reward": 2.0,
578
+ "reward_std": 0.6443375647068024,
579
+ "rewards/correctness_reward_func": 1.0,
580
+ "rewards/format_reward_func": 1.0,
581
+ "step": 44
582
+ },
583
+ {
584
+ "completion_length": 133.3125,
585
+ "epoch": 0.09630818619582665,
586
+ "grad_norm": 2.5625,
587
+ "kl": 0.05142616247758269,
588
+ "learning_rate": 9.574468085106385e-06,
589
+ "loss": 0.0003,
590
+ "reward": 1.5625,
591
+ "reward_std": 0.6292316764593124,
592
+ "rewards/correctness_reward_func": 0.59375,
593
+ "rewards/format_reward_func": 0.96875,
594
+ "step": 45
595
+ },
596
+ {
597
+ "completion_length": 137.921875,
598
+ "epoch": 0.09844836811128946,
599
+ "grad_norm": 4.65625,
600
+ "kl": 0.05740292742848396,
601
+ "learning_rate": 9.787234042553192e-06,
602
+ "loss": 0.0003,
603
+ "reward": 1.5625,
604
+ "reward_std": 0.39597851037979126,
605
+ "rewards/correctness_reward_func": 0.65625,
606
+ "rewards/format_reward_func": 0.90625,
607
+ "step": 46
608
+ },
609
+ {
610
+ "completion_length": 139.5625,
611
+ "epoch": 0.10058855002675228,
612
+ "grad_norm": 2.96875,
613
+ "kl": 0.05779360141605139,
614
+ "learning_rate": 1e-05,
615
+ "loss": 0.0003,
616
+ "reward": 1.75,
617
+ "reward_std": 0.6911139190196991,
618
+ "rewards/correctness_reward_func": 0.96875,
619
+ "rewards/format_reward_func": 0.78125,
620
+ "step": 47
621
+ },
622
+ {
623
+ "completion_length": 162.4375,
624
+ "epoch": 0.10272873194221509,
625
+ "grad_norm": 2.75,
626
+ "kl": 0.057580760680139065,
627
+ "learning_rate": 9.99986012530635e-06,
628
+ "loss": 0.0003,
629
+ "reward": 1.734375,
630
+ "reward_std": 0.6755875647068024,
631
+ "rewards/correctness_reward_func": 0.75,
632
+ "rewards/format_reward_func": 0.984375,
633
+ "step": 48
634
+ },
635
+ {
636
+ "completion_length": 117.859375,
637
+ "epoch": 0.10486891385767791,
638
+ "grad_norm": 1.625,
639
+ "kl": 0.05675748083740473,
640
+ "learning_rate": 9.999440509051367e-06,
641
+ "loss": 0.0003,
642
+ "reward": 2.03125,
643
+ "reward_std": 0.30542195588350296,
644
+ "rewards/correctness_reward_func": 1.0625,
645
+ "rewards/format_reward_func": 0.96875,
646
+ "step": 49
647
+ },
648
+ {
649
+ "completion_length": 112.28125,
650
+ "epoch": 0.10700909577314072,
651
+ "grad_norm": 3.203125,
652
+ "kl": 0.05932612717151642,
653
+ "learning_rate": 9.998741174712534e-06,
654
+ "loss": 0.0003,
655
+ "reward": 1.96875,
656
+ "reward_std": 0.5953208804130554,
657
+ "rewards/correctness_reward_func": 1.0625,
658
+ "rewards/format_reward_func": 0.90625,
659
+ "step": 50
660
+ },
661
+ {
662
+ "completion_length": 111.375,
663
+ "epoch": 0.10914927768860354,
664
+ "grad_norm": 2.75,
665
+ "kl": 0.08433620352298021,
666
+ "learning_rate": 9.997762161417517e-06,
667
+ "loss": 0.0004,
668
+ "reward": 1.953125,
669
+ "reward_std": 0.5074251294136047,
670
+ "rewards/correctness_reward_func": 0.96875,
671
+ "rewards/format_reward_func": 0.984375,
672
+ "step": 51
673
+ },
674
+ {
675
+ "completion_length": 123.78125,
676
+ "epoch": 0.11128945960406635,
677
+ "grad_norm": 3.015625,
678
+ "kl": 0.06248955149203539,
679
+ "learning_rate": 9.996503523941994e-06,
680
+ "loss": 0.0003,
681
+ "reward": 2.125,
682
+ "reward_std": 0.4858439266681671,
683
+ "rewards/correctness_reward_func": 1.125,
684
+ "rewards/format_reward_func": 1.0,
685
+ "step": 52
686
+ },
687
+ {
688
+ "completion_length": 119.46875,
689
+ "epoch": 0.11342964151952915,
690
+ "grad_norm": 2.625,
691
+ "kl": 0.061760940589010715,
692
+ "learning_rate": 9.994965332706574e-06,
693
+ "loss": 0.0003,
694
+ "reward": 2.0625,
695
+ "reward_std": 0.5,
696
+ "rewards/correctness_reward_func": 1.0625,
697
+ "rewards/format_reward_func": 1.0,
698
+ "step": 53
699
+ },
700
+ {
701
+ "completion_length": 127.265625,
702
+ "epoch": 0.11556982343499198,
703
+ "grad_norm": 1.96875,
704
+ "kl": 0.05814269371330738,
705
+ "learning_rate": 9.993147673772869e-06,
706
+ "loss": 0.0003,
707
+ "reward": 1.90625,
708
+ "reward_std": 0.3125,
709
+ "rewards/correctness_reward_func": 0.90625,
710
+ "rewards/format_reward_func": 1.0,
711
+ "step": 54
712
+ },
713
+ {
714
+ "completion_length": 125.84375,
715
+ "epoch": 0.11771000535045478,
716
+ "grad_norm": 2.875,
717
+ "kl": 0.08824395015835762,
718
+ "learning_rate": 9.991050648838676e-06,
719
+ "loss": 0.0004,
720
+ "reward": 2.03125,
721
+ "reward_std": 0.7261751294136047,
722
+ "rewards/correctness_reward_func": 1.03125,
723
+ "rewards/format_reward_func": 1.0,
724
+ "step": 55
725
+ },
726
+ {
727
+ "completion_length": 121.390625,
728
+ "epoch": 0.1198501872659176,
729
+ "grad_norm": 2.796875,
730
+ "kl": 0.06532257050275803,
731
+ "learning_rate": 9.98867437523228e-06,
732
+ "loss": 0.0003,
733
+ "reward": 1.625,
734
+ "reward_std": 0.46650634706020355,
735
+ "rewards/correctness_reward_func": 0.625,
736
+ "rewards/format_reward_func": 1.0,
737
+ "step": 56
738
+ },
739
+ {
740
+ "completion_length": 129.5625,
741
+ "epoch": 0.12199036918138041,
742
+ "grad_norm": 3.328125,
743
+ "kl": 0.06475972291082144,
744
+ "learning_rate": 9.986018985905901e-06,
745
+ "loss": 0.0003,
746
+ "reward": 2.015625,
747
+ "reward_std": 0.5409187823534012,
748
+ "rewards/correctness_reward_func": 1.03125,
749
+ "rewards/format_reward_func": 0.984375,
750
+ "step": 57
751
+ },
752
+ {
753
+ "completion_length": 133.15625,
754
+ "epoch": 0.12413055109684323,
755
+ "grad_norm": 2.0625,
756
+ "kl": 0.05745814461261034,
757
+ "learning_rate": 9.983084629428244e-06,
758
+ "loss": 0.0003,
759
+ "reward": 1.984375,
760
+ "reward_std": 0.42558756470680237,
761
+ "rewards/correctness_reward_func": 1.0,
762
+ "rewards/format_reward_func": 0.984375,
763
+ "step": 58
764
+ },
765
+ {
766
+ "completion_length": 124.671875,
767
+ "epoch": 0.12627073301230604,
768
+ "grad_norm": 2.9375,
769
+ "kl": 0.07612972147762775,
770
+ "learning_rate": 9.979871469976197e-06,
771
+ "loss": 0.0004,
772
+ "reward": 1.59375,
773
+ "reward_std": 0.6346687823534012,
774
+ "rewards/correctness_reward_func": 0.59375,
775
+ "rewards/format_reward_func": 1.0,
776
+ "step": 59
777
+ },
778
+ {
779
+ "completion_length": 109.921875,
780
+ "epoch": 0.12841091492776885,
781
+ "grad_norm": 3.0,
782
+ "kl": 0.06931339204311371,
783
+ "learning_rate": 9.976379687325633e-06,
784
+ "loss": 0.0003,
785
+ "reward": 1.96875,
786
+ "reward_std": 0.6540063470602036,
787
+ "rewards/correctness_reward_func": 0.96875,
788
+ "rewards/format_reward_func": 1.0,
789
+ "step": 60
790
+ },
791
+ {
792
+ "completion_length": 125.53125,
793
+ "epoch": 0.13055109684323168,
794
+ "grad_norm": 2.546875,
795
+ "kl": 0.05608335882425308,
796
+ "learning_rate": 9.972609476841368e-06,
797
+ "loss": 0.0003,
798
+ "reward": 2.125,
799
+ "reward_std": 0.5915063470602036,
800
+ "rewards/correctness_reward_func": 1.125,
801
+ "rewards/format_reward_func": 1.0,
802
+ "step": 61
803
+ },
804
+ {
805
+ "completion_length": 127.8125,
806
+ "epoch": 0.1326912787586945,
807
+ "grad_norm": 2.59375,
808
+ "kl": 0.14377161115407944,
809
+ "learning_rate": 9.968561049466214e-06,
810
+ "loss": 0.0007,
811
+ "reward": 1.84375,
812
+ "reward_std": 0.5290063321590424,
813
+ "rewards/correctness_reward_func": 0.84375,
814
+ "rewards/format_reward_func": 1.0,
815
+ "step": 62
816
+ },
817
+ {
818
+ "completion_length": 124.75,
819
+ "epoch": 0.1348314606741573,
820
+ "grad_norm": 1.6328125,
821
+ "kl": 0.06179473642259836,
822
+ "learning_rate": 9.964234631709188e-06,
823
+ "loss": 0.0003,
824
+ "reward": 1.90625,
825
+ "reward_std": 0.2596687823534012,
826
+ "rewards/correctness_reward_func": 0.90625,
827
+ "rewards/format_reward_func": 1.0,
828
+ "step": 63
829
+ },
830
+ {
831
+ "completion_length": 125.515625,
832
+ "epoch": 0.1369716425896201,
833
+ "grad_norm": 2.875,
834
+ "kl": 0.08027161657810211,
835
+ "learning_rate": 9.959630465632833e-06,
836
+ "loss": 0.0004,
837
+ "reward": 2.15625,
838
+ "reward_std": 0.6011751294136047,
839
+ "rewards/correctness_reward_func": 1.15625,
840
+ "rewards/format_reward_func": 1.0,
841
+ "step": 64
842
+ },
843
+ {
844
+ "completion_length": 114.890625,
845
+ "epoch": 0.13911182450508294,
846
+ "grad_norm": 2.90625,
847
+ "kl": 0.13545920699834824,
848
+ "learning_rate": 9.954748808839675e-06,
849
+ "loss": 0.0007,
850
+ "reward": 1.859375,
851
+ "reward_std": 0.4826504588127136,
852
+ "rewards/correctness_reward_func": 0.875,
853
+ "rewards/format_reward_func": 0.984375,
854
+ "step": 65
855
+ },
856
+ {
857
+ "completion_length": 121.125,
858
+ "epoch": 0.14125200642054575,
859
+ "grad_norm": 1.9921875,
860
+ "kl": 0.07298032753169537,
861
+ "learning_rate": 9.949589934457815e-06,
862
+ "loss": 0.0004,
863
+ "reward": 2.03125,
864
+ "reward_std": 0.3846687823534012,
865
+ "rewards/correctness_reward_func": 1.03125,
866
+ "rewards/format_reward_func": 1.0,
867
+ "step": 66
868
+ },
869
+ {
870
+ "completion_length": 122.625,
871
+ "epoch": 0.14339218833600856,
872
+ "grad_norm": 3.359375,
873
+ "kl": 0.10324916429817677,
874
+ "learning_rate": 9.944154131125643e-06,
875
+ "loss": 0.0005,
876
+ "reward": 2.078125,
877
+ "reward_std": 0.5409187823534012,
878
+ "rewards/correctness_reward_func": 1.09375,
879
+ "rewards/format_reward_func": 0.984375,
880
+ "step": 67
881
+ },
882
+ {
883
+ "completion_length": 133.40625,
884
+ "epoch": 0.14553237025147137,
885
+ "grad_norm": 2.28125,
886
+ "kl": 0.10124836303293705,
887
+ "learning_rate": 9.938441702975689e-06,
888
+ "loss": 0.0005,
889
+ "reward": 1.71875,
890
+ "reward_std": 0.40400634706020355,
891
+ "rewards/correctness_reward_func": 0.71875,
892
+ "rewards/format_reward_func": 1.0,
893
+ "step": 68
894
+ },
895
+ {
896
+ "completion_length": 141.984375,
897
+ "epoch": 0.1476725521669342,
898
+ "grad_norm": 2.921875,
899
+ "kl": 0.08171628974378109,
900
+ "learning_rate": 9.932452969617607e-06,
901
+ "loss": 0.0004,
902
+ "reward": 1.96875,
903
+ "reward_std": 0.41325797885656357,
904
+ "rewards/correctness_reward_func": 1.0,
905
+ "rewards/format_reward_func": 0.96875,
906
+ "step": 69
907
+ },
908
+ {
909
+ "completion_length": 125.609375,
910
+ "epoch": 0.149812734082397,
911
+ "grad_norm": 2.53125,
912
+ "kl": 0.08117420598864555,
913
+ "learning_rate": 9.926188266120297e-06,
914
+ "loss": 0.0004,
915
+ "reward": 1.8125,
916
+ "reward_std": 0.41367512941360474,
917
+ "rewards/correctness_reward_func": 0.8125,
918
+ "rewards/format_reward_func": 1.0,
919
+ "step": 70
920
+ },
921
+ {
922
+ "completion_length": 122.296875,
923
+ "epoch": 0.15195291599785982,
924
+ "grad_norm": 2.875,
925
+ "kl": 0.08432869054377079,
926
+ "learning_rate": 9.91964794299315e-06,
927
+ "loss": 0.0004,
928
+ "reward": 2.03125,
929
+ "reward_std": 0.5818375647068024,
930
+ "rewards/correctness_reward_func": 1.03125,
931
+ "rewards/format_reward_func": 1.0,
932
+ "step": 71
933
+ },
934
+ {
935
+ "completion_length": 136.359375,
936
+ "epoch": 0.15409309791332262,
937
+ "grad_norm": 2.296875,
938
+ "kl": 0.10322786308825016,
939
+ "learning_rate": 9.912832366166443e-06,
940
+ "loss": 0.0005,
941
+ "reward": 1.5625,
942
+ "reward_std": 0.26933756470680237,
943
+ "rewards/correctness_reward_func": 0.5625,
944
+ "rewards/format_reward_func": 1.0,
945
+ "step": 72
946
+ },
947
+ {
948
+ "completion_length": 113.921875,
949
+ "epoch": 0.15623327982878546,
950
+ "grad_norm": 3.84375,
951
+ "kl": 0.13962376862764359,
952
+ "learning_rate": 9.905741916970863e-06,
953
+ "loss": 0.0007,
954
+ "reward": 2.09375,
955
+ "reward_std": 0.5290063470602036,
956
+ "rewards/correctness_reward_func": 1.09375,
957
+ "rewards/format_reward_func": 1.0,
958
+ "step": 73
959
+ },
960
+ {
961
+ "completion_length": 120.5625,
962
+ "epoch": 0.15837346174424827,
963
+ "grad_norm": 3.125,
964
+ "kl": 0.10202482901513577,
965
+ "learning_rate": 9.898376992116179e-06,
966
+ "loss": 0.0005,
967
+ "reward": 1.796875,
968
+ "reward_std": 0.5892626941204071,
969
+ "rewards/correctness_reward_func": 0.8125,
970
+ "rewards/format_reward_func": 0.984375,
971
+ "step": 74
972
+ },
973
+ {
974
+ "completion_length": 132.390625,
975
+ "epoch": 0.16051364365971107,
976
+ "grad_norm": 2.875,
977
+ "kl": 0.09189064055681229,
978
+ "learning_rate": 9.890738003669029e-06,
979
+ "loss": 0.0005,
980
+ "reward": 2.125,
981
+ "reward_std": 0.6636751294136047,
982
+ "rewards/correctness_reward_func": 1.125,
983
+ "rewards/format_reward_func": 1.0,
984
+ "step": 75
985
+ },
986
+ {
987
+ "completion_length": 127.546875,
988
+ "epoch": 0.16265382557517388,
989
+ "grad_norm": 3.3125,
990
+ "kl": 0.09288744628429413,
991
+ "learning_rate": 9.882825379029883e-06,
992
+ "loss": 0.0005,
993
+ "reward": 1.84375,
994
+ "reward_std": 0.7068375498056412,
995
+ "rewards/correctness_reward_func": 0.84375,
996
+ "rewards/format_reward_func": 1.0,
997
+ "step": 76
998
+ },
999
+ {
1000
+ "completion_length": 116.90625,
1001
+ "epoch": 0.1647940074906367,
1002
+ "grad_norm": 2.59375,
1003
+ "kl": 0.10366734117269516,
1004
+ "learning_rate": 9.874639560909118e-06,
1005
+ "loss": 0.0005,
1006
+ "reward": 1.625,
1007
+ "reward_std": 0.375,
1008
+ "rewards/correctness_reward_func": 0.625,
1009
+ "rewards/format_reward_func": 1.0,
1010
+ "step": 77
1011
+ },
1012
+ {
1013
+ "completion_length": 145.9375,
1014
+ "epoch": 0.16693418940609953,
1015
+ "grad_norm": 3.125,
1016
+ "kl": 0.14030247181653976,
1017
+ "learning_rate": 9.866181007302258e-06,
1018
+ "loss": 0.0007,
1019
+ "reward": 1.640625,
1020
+ "reward_std": 0.43525634706020355,
1021
+ "rewards/correctness_reward_func": 0.65625,
1022
+ "rewards/format_reward_func": 0.984375,
1023
+ "step": 78
1024
+ },
1025
+ {
1026
+ "completion_length": 114.1875,
1027
+ "epoch": 0.16907437132156233,
1028
+ "grad_norm": 4.15625,
1029
+ "kl": 0.10448622144758701,
1030
+ "learning_rate": 9.857450191464337e-06,
1031
+ "loss": 0.0005,
1032
+ "reward": 1.96875,
1033
+ "reward_std": 0.7790063321590424,
1034
+ "rewards/correctness_reward_func": 0.96875,
1035
+ "rewards/format_reward_func": 1.0,
1036
+ "step": 79
1037
+ },
1038
+ {
1039
+ "completion_length": 135.78125,
1040
+ "epoch": 0.17121455323702514,
1041
+ "grad_norm": 2.421875,
1042
+ "kl": 0.08989986591041088,
1043
+ "learning_rate": 9.848447601883436e-06,
1044
+ "loss": 0.0004,
1045
+ "reward": 1.40625,
1046
+ "reward_std": 0.3582531735301018,
1047
+ "rewards/correctness_reward_func": 0.4375,
1048
+ "rewards/format_reward_func": 0.96875,
1049
+ "step": 80
1050
+ },
1051
+ {
1052
+ "completion_length": 141.953125,
1053
+ "epoch": 0.17335473515248795,
1054
+ "grad_norm": 2.953125,
1055
+ "kl": 0.07188372500240803,
1056
+ "learning_rate": 9.839173742253334e-06,
1057
+ "loss": 0.0004,
1058
+ "reward": 1.96875,
1059
+ "reward_std": 0.6733439117670059,
1060
+ "rewards/correctness_reward_func": 1.0,
1061
+ "rewards/format_reward_func": 0.96875,
1062
+ "step": 81
1063
+ },
1064
+ {
1065
+ "completion_length": 118.359375,
1066
+ "epoch": 0.17549491706795078,
1067
+ "grad_norm": 1.7109375,
1068
+ "kl": 0.07351219840347767,
1069
+ "learning_rate": 9.829629131445342e-06,
1070
+ "loss": 0.0004,
1071
+ "reward": 2.21875,
1072
+ "reward_std": 0.27900634706020355,
1073
+ "rewards/correctness_reward_func": 1.21875,
1074
+ "rewards/format_reward_func": 1.0,
1075
+ "step": 82
1076
+ },
1077
+ {
1078
+ "completion_length": 127.109375,
1079
+ "epoch": 0.1776350989834136,
1080
+ "grad_norm": 1.359375,
1081
+ "kl": 0.07365736179053783,
1082
+ "learning_rate": 9.819814303479268e-06,
1083
+ "loss": 0.0004,
1084
+ "reward": 2.1875,
1085
+ "reward_std": 0.26933756470680237,
1086
+ "rewards/correctness_reward_func": 1.1875,
1087
+ "rewards/format_reward_func": 1.0,
1088
+ "step": 83
1089
+ },
1090
+ {
1091
+ "completion_length": 139.078125,
1092
+ "epoch": 0.1797752808988764,
1093
+ "grad_norm": 2.484375,
1094
+ "kl": 0.07714031636714935,
1095
+ "learning_rate": 9.80972980749353e-06,
1096
+ "loss": 0.0004,
1097
+ "reward": 1.90625,
1098
+ "reward_std": 0.6540063470602036,
1099
+ "rewards/correctness_reward_func": 0.90625,
1100
+ "rewards/format_reward_func": 1.0,
1101
+ "step": 84
1102
+ },
1103
+ {
1104
+ "completion_length": 128.390625,
1105
+ "epoch": 0.1819154628143392,
1106
+ "grad_norm": 1.4453125,
1107
+ "kl": 0.056750121526420116,
1108
+ "learning_rate": 9.799376207714446e-06,
1109
+ "loss": 0.0003,
1110
+ "reward": 2.125,
1111
+ "reward_std": 0.25,
1112
+ "rewards/correctness_reward_func": 1.15625,
1113
+ "rewards/format_reward_func": 0.96875,
1114
+ "step": 85
1115
+ },
1116
+ {
1117
+ "completion_length": 160.234375,
1118
+ "epoch": 0.18405564472980204,
1119
+ "grad_norm": 1.9296875,
1120
+ "kl": 0.06542882043868303,
1121
+ "learning_rate": 9.788754083424654e-06,
1122
+ "loss": 0.0003,
1123
+ "reward": 1.734375,
1124
+ "reward_std": 0.42558756470680237,
1125
+ "rewards/correctness_reward_func": 0.75,
1126
+ "rewards/format_reward_func": 0.984375,
1127
+ "step": 86
1128
+ },
1129
+ {
1130
+ "completion_length": 120.546875,
1131
+ "epoch": 0.18619582664526485,
1132
+ "grad_norm": 2.703125,
1133
+ "kl": 0.14421744085848331,
1134
+ "learning_rate": 9.777864028930705e-06,
1135
+ "loss": 0.0007,
1136
+ "reward": 2.140625,
1137
+ "reward_std": 0.6324251294136047,
1138
+ "rewards/correctness_reward_func": 1.15625,
1139
+ "rewards/format_reward_func": 0.984375,
1140
+ "step": 87
1141
+ },
1142
+ {
1143
+ "completion_length": 137.296875,
1144
+ "epoch": 0.18833600856072766,
1145
+ "grad_norm": 2.59375,
1146
+ "kl": 0.0669497437775135,
1147
+ "learning_rate": 9.766706653529814e-06,
1148
+ "loss": 0.0003,
1149
+ "reward": 2.078125,
1150
+ "reward_std": 0.7763455435633659,
1151
+ "rewards/correctness_reward_func": 1.09375,
1152
+ "rewards/format_reward_func": 0.984375,
1153
+ "step": 88
1154
+ },
1155
+ {
1156
+ "completion_length": 145.4375,
1157
+ "epoch": 0.19047619047619047,
1158
+ "grad_norm": 2.609375,
1159
+ "kl": 0.07120031677186489,
1160
+ "learning_rate": 9.755282581475769e-06,
1161
+ "loss": 0.0004,
1162
+ "reward": 1.59375,
1163
+ "reward_std": 0.7261751294136047,
1164
+ "rewards/correctness_reward_func": 0.59375,
1165
+ "rewards/format_reward_func": 1.0,
1166
+ "step": 89
1167
+ },
1168
+ {
1169
+ "completion_length": 137.59375,
1170
+ "epoch": 0.1926163723916533,
1171
+ "grad_norm": 2.484375,
1172
+ "kl": 0.07096913084387779,
1173
+ "learning_rate": 9.743592451944e-06,
1174
+ "loss": 0.0004,
1175
+ "reward": 1.828125,
1176
+ "reward_std": 0.6324251145124435,
1177
+ "rewards/correctness_reward_func": 0.84375,
1178
+ "rewards/format_reward_func": 0.984375,
1179
+ "step": 90
1180
+ },
1181
+ {
1182
+ "completion_length": 117.359375,
1183
+ "epoch": 0.1947565543071161,
1184
+ "grad_norm": 2.046875,
1185
+ "kl": 0.06865348853170872,
1186
+ "learning_rate": 9.731636918995821e-06,
1187
+ "loss": 0.0003,
1188
+ "reward": 2.3125,
1189
+ "reward_std": 0.39433756470680237,
1190
+ "rewards/correctness_reward_func": 1.3125,
1191
+ "rewards/format_reward_func": 1.0,
1192
+ "step": 91
1193
+ },
1194
+ {
1195
+ "completion_length": 130.734375,
1196
+ "epoch": 0.19689673622257892,
1197
+ "grad_norm": 2.453125,
1198
+ "kl": 0.08213793113827705,
1199
+ "learning_rate": 9.719416651541839e-06,
1200
+ "loss": 0.0004,
1201
+ "reward": 2.1875,
1202
+ "reward_std": 0.5818375647068024,
1203
+ "rewards/correctness_reward_func": 1.21875,
1204
+ "rewards/format_reward_func": 0.96875,
1205
+ "step": 92
1206
+ },
1207
+ {
1208
+ "completion_length": 143.015625,
1209
+ "epoch": 0.19903691813804172,
1210
+ "grad_norm": 1.2578125,
1211
+ "kl": 0.07020723912864923,
1212
+ "learning_rate": 9.706932333304518e-06,
1213
+ "loss": 0.0004,
1214
+ "reward": 1.78125,
1215
+ "reward_std": 0.2596687823534012,
1216
+ "rewards/correctness_reward_func": 0.78125,
1217
+ "rewards/format_reward_func": 1.0,
1218
+ "step": 93
1219
+ },
1220
+ {
1221
+ "completion_length": 119.96875,
1222
+ "epoch": 0.20117710005350456,
1223
+ "grad_norm": 2.375,
1224
+ "kl": 0.07560589909553528,
1225
+ "learning_rate": 9.694184662779931e-06,
1226
+ "loss": 0.0004,
1227
+ "reward": 2.125,
1228
+ "reward_std": 0.46650633215904236,
1229
+ "rewards/correctness_reward_func": 1.125,
1230
+ "rewards/format_reward_func": 1.0,
1231
+ "step": 94
1232
+ },
1233
+ {
1234
+ "completion_length": 139.21875,
1235
+ "epoch": 0.20331728196896737,
1236
+ "grad_norm": 2.078125,
1237
+ "kl": 0.09026694297790527,
1238
+ "learning_rate": 9.681174353198687e-06,
1239
+ "loss": 0.0005,
1240
+ "reward": 2.1875,
1241
+ "reward_std": 0.5386751443147659,
1242
+ "rewards/correctness_reward_func": 1.1875,
1243
+ "rewards/format_reward_func": 1.0,
1244
+ "step": 95
1245
+ },
1246
+ {
1247
+ "completion_length": 151.328125,
1248
+ "epoch": 0.20545746388443017,
1249
+ "grad_norm": 2.25,
1250
+ "kl": 0.06551774125546217,
1251
+ "learning_rate": 9.667902132486009e-06,
1252
+ "loss": 0.0003,
1253
+ "reward": 1.75,
1254
+ "reward_std": 0.4471687823534012,
1255
+ "rewards/correctness_reward_func": 0.75,
1256
+ "rewards/format_reward_func": 1.0,
1257
+ "step": 96
1258
+ },
1259
+ {
1260
+ "completion_length": 138.640625,
1261
+ "epoch": 0.20759764579989298,
1262
+ "grad_norm": 2.40625,
1263
+ "kl": 0.07146297954022884,
1264
+ "learning_rate": 9.654368743221022e-06,
1265
+ "loss": 0.0004,
1266
+ "reward": 1.796875,
1267
+ "reward_std": 0.6604816764593124,
1268
+ "rewards/correctness_reward_func": 0.8125,
1269
+ "rewards/format_reward_func": 0.984375,
1270
+ "step": 97
1271
+ },
1272
+ {
1273
+ "completion_length": 140.9375,
1274
+ "epoch": 0.20973782771535582,
1275
+ "grad_norm": 2.1875,
1276
+ "kl": 0.09298614785075188,
1277
+ "learning_rate": 9.640574942595195e-06,
1278
+ "loss": 0.0005,
1279
+ "reward": 2.09375,
1280
+ "reward_std": 0.5818375647068024,
1281
+ "rewards/correctness_reward_func": 1.09375,
1282
+ "rewards/format_reward_func": 1.0,
1283
+ "step": 98
1284
+ },
1285
+ {
1286
+ "completion_length": 151.171875,
1287
+ "epoch": 0.21187800963081863,
1288
+ "grad_norm": 2.5625,
1289
+ "kl": 0.06539157032966614,
1290
+ "learning_rate": 9.626521502369984e-06,
1291
+ "loss": 0.0003,
1292
+ "reward": 2.3125,
1293
+ "reward_std": 0.6443375647068024,
1294
+ "rewards/correctness_reward_func": 1.34375,
1295
+ "rewards/format_reward_func": 0.96875,
1296
+ "step": 99
1297
+ },
1298
+ {
1299
+ "completion_length": 154.21875,
1300
+ "epoch": 0.21401819154628143,
1301
+ "grad_norm": 2.171875,
1302
+ "kl": 0.07382548321038485,
1303
+ "learning_rate": 9.612209208833648e-06,
1304
+ "loss": 0.0004,
1305
+ "reward": 1.9375,
1306
+ "reward_std": 0.6292316764593124,
1307
+ "rewards/correctness_reward_func": 0.96875,
1308
+ "rewards/format_reward_func": 0.96875,
1309
+ "step": 100
1310
+ }
1311
+ ],
1312
+ "logging_steps": 1,
1313
+ "max_steps": 467,
1314
+ "num_input_tokens_seen": 0,
1315
+ "num_train_epochs": 1,
1316
+ "save_steps": 100,
1317
+ "stateful_callbacks": {
1318
+ "TrainerControl": {
1319
+ "args": {
1320
+ "should_epoch_stop": false,
1321
+ "should_evaluate": false,
1322
+ "should_log": false,
1323
+ "should_save": true,
1324
+ "should_training_stop": false
1325
+ },
1326
+ "attributes": {}
1327
+ }
1328
+ },
1329
+ "total_flos": 0.0,
1330
+ "train_batch_size": 4,
1331
+ "trial_name": null,
1332
+ "trial_params": null
1333
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a891607919cc3a4f5685413eac493180dbc7f55029869807ee308abd66d3a017
3
+ size 5752
vocab.json ADDED
The diff for this file is too large to render. See raw diff