taozihuahua commited on
Commit
44254b8
·
verified ·
1 Parent(s): c3f27a0

Model save

Browse files
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "eval_loss": 0.020811351016163826,
3
- "eval_runtime": 63.4077,
4
  "eval_samples": 99,
5
- "eval_samples_per_second": 1.561,
6
- "eval_steps_per_second": 0.016,
7
  "total_flos": 0.0,
8
- "train_loss": 5.24096154554993,
9
- "train_runtime": 41343.1213,
10
  "train_samples": 72441,
11
- "train_samples_per_second": 1.752,
12
  "train_steps_per_second": 0.007
13
  }
 
1
  {
2
+ "eval_loss": 0.010948318056762218,
3
+ "eval_runtime": 54.3753,
4
  "eval_samples": 99,
5
+ "eval_samples_per_second": 1.821,
6
+ "eval_steps_per_second": 0.018,
7
  "total_flos": 0.0,
8
+ "train_loss": 0.011041131547906181,
9
+ "train_runtime": 4812.1769,
10
  "train_samples": 72441,
11
+ "train_samples_per_second": 1.505,
12
  "train_steps_per_second": 0.007
13
  }
eval_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "eval_loss": 0.020811351016163826,
3
- "eval_runtime": 63.4077,
4
  "eval_samples": 99,
5
- "eval_samples_per_second": 1.561,
6
- "eval_steps_per_second": 0.016
7
  }
 
1
  {
2
+ "eval_loss": 0.010948318056762218,
3
+ "eval_runtime": 54.3753,
4
  "eval_samples": 99,
5
+ "eval_samples_per_second": 1.821,
6
+ "eval_steps_per_second": 0.018
7
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a567c7e870058f8f9ae67443741d123f396ffc0117407058f39592fcd797838b
3
  size 3554214752
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f3560b38352a4c95bb07ebcf72fa472b76368a8a6b30545579079832d974f0
3
  size 3554214752
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 5.24096154554993,
4
- "train_runtime": 41343.1213,
5
  "train_samples": 72441,
6
- "train_samples_per_second": 1.752,
7
  "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.011041131547906181,
4
+ "train_runtime": 4812.1769,
5
  "train_samples": 72441,
6
+ "train_samples_per_second": 1.505,
7
  "train_steps_per_second": 0.007
8
  }
trainer_state.json CHANGED
@@ -1,787 +1,109 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 100,
6
- "global_step": 283,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 381.54387645721437,
13
- "epoch": 0.0176678445229682,
14
- "grad_norm": 0.041543133556842804,
15
- "kl": 0.0009280920028686523,
16
- "learning_rate": 3.448275862068966e-06,
17
- "loss": 0.0,
18
- "reward": 0.6472098495811224,
19
- "reward_std": 0.4381619215011597,
20
- "rewards/accuracy_reward": 0.1501116138417274,
21
- "rewards/format_reward": 0.4970982350409031,
22
  "step": 5
23
  },
24
  {
25
- "completion_length": 236.32568130493163,
26
- "epoch": 0.0353356890459364,
27
- "grad_norm": 0.0530264675617218,
28
- "kl": 0.33028106689453124,
29
- "learning_rate": 6.896551724137932e-06,
30
- "loss": 0.0132,
31
- "reward": 0.9049107544124126,
32
- "reward_std": 0.32910554837435485,
33
- "rewards/accuracy_reward": 0.08258928953437135,
34
- "rewards/format_reward": 0.8223214663565159,
35
  "step": 10
36
  },
37
  {
38
- "completion_length": 176.49442777633666,
39
- "epoch": 0.053003533568904596,
40
- "grad_norm": 0.037097301334142685,
41
- "kl": 0.0668792724609375,
42
- "learning_rate": 1.0344827586206898e-05,
43
- "loss": 0.0027,
44
- "reward": 0.9232143245637416,
45
- "reward_std": 0.3268056377768517,
46
- "rewards/accuracy_reward": 0.08292411086149513,
47
- "rewards/format_reward": 0.8402902185916901,
48
  "step": 15
49
  },
50
  {
51
- "completion_length": 102.76585273742675,
52
- "epoch": 0.0706713780918728,
53
- "grad_norm": 0.022500403225421906,
54
- "kl": 0.141815185546875,
55
- "learning_rate": 1.3793103448275863e-05,
56
- "loss": 0.0057,
57
- "reward": 1.0299107559025287,
58
- "reward_std": 0.23527481253258883,
59
- "rewards/accuracy_reward": 0.09564732620492578,
60
- "rewards/format_reward": 0.9342634320259094,
61
  "step": 20
62
  },
63
  {
64
- "completion_length": 109.1071478843689,
65
- "epoch": 0.08833922261484099,
66
- "grad_norm": 0.018457548692822456,
67
- "kl": 0.186077880859375,
68
- "learning_rate": 1.7241379310344828e-05,
69
- "loss": 0.0074,
70
- "reward": 1.116406300663948,
71
- "reward_std": 0.20077424766495824,
72
- "rewards/accuracy_reward": 0.13683036325965076,
73
- "rewards/format_reward": 0.9795759305357933,
74
  "step": 25
75
  },
76
  {
77
- "completion_length": 171.74520902633668,
78
- "epoch": 0.10600706713780919,
79
- "grad_norm": 0.021204829216003418,
80
- "kl": 0.21072998046875,
81
- "learning_rate": 1.999923511388017e-05,
82
- "loss": 0.0084,
83
- "reward": 1.1837054029107095,
84
- "reward_std": 0.26000751489773394,
85
- "rewards/accuracy_reward": 0.20747768841683864,
86
- "rewards/format_reward": 0.9762277185916901,
87
  "step": 30
88
  },
89
  {
90
- "completion_length": 190.65045490264893,
91
- "epoch": 0.12367491166077739,
92
- "grad_norm": 0.03574452921748161,
93
- "kl": 0.203460693359375,
94
- "learning_rate": 1.9972476383747748e-05,
95
- "loss": 0.0081,
96
- "reward": 1.239620591700077,
97
- "reward_std": 0.2629278709180653,
98
- "rewards/accuracy_reward": 0.25837054699659345,
99
- "rewards/format_reward": 0.9812500409781932,
100
- "step": 35
101
- },
102
- {
103
- "completion_length": 183.1122851371765,
104
- "epoch": 0.1413427561837456,
105
- "grad_norm": 0.0162435844540596,
106
- "kl": 0.210980224609375,
107
- "learning_rate": 1.9907590277344582e-05,
108
- "loss": 0.0084,
109
- "reward": 1.2477679178118706,
110
- "reward_std": 0.2695736668072641,
111
- "rewards/accuracy_reward": 0.26607143888249996,
112
- "rewards/format_reward": 0.9816964700818062,
113
- "step": 40
114
- },
115
- {
116
- "completion_length": 238.26184139251708,
117
- "epoch": 0.15901060070671377,
118
- "grad_norm": 0.4197707176208496,
119
- "kl": 7394.361682128906,
120
- "learning_rate": 1.9804824871166254e-05,
121
- "loss": 295.67,
122
- "reward": 1.234151841700077,
123
- "reward_std": 0.2784772401675582,
124
- "rewards/accuracy_reward": 0.26450893925502894,
125
- "rewards/format_reward": 0.9696428991854191,
126
- "step": 45
127
- },
128
- {
129
- "completion_length": 220.17757682800294,
130
- "epoch": 0.17667844522968199,
131
- "grad_norm": 0.015375903807580471,
132
- "kl": 0.17265625,
133
- "learning_rate": 1.9664573064143604e-05,
134
- "loss": 0.0069,
135
- "reward": 1.2408482730388641,
136
- "reward_std": 0.2870653722435236,
137
- "rewards/accuracy_reward": 0.26808036947622893,
138
- "rewards/format_reward": 0.9727678991854191,
139
- "step": 50
140
- },
141
- {
142
- "completion_length": 231.17356071472167,
143
- "epoch": 0.19434628975265017,
144
- "grad_norm": 0.2615525424480438,
145
- "kl": 0.564453125,
146
- "learning_rate": 1.948737107548771e-05,
147
- "loss": 0.0225,
148
- "reward": 1.2294643357396127,
149
- "reward_std": 0.3038607369177043,
150
- "rewards/accuracy_reward": 0.277008942887187,
151
- "rewards/format_reward": 0.9524553999304771,
152
- "step": 55
153
- },
154
- {
155
- "completion_length": 263.1629590988159,
156
- "epoch": 0.21201413427561838,
157
- "grad_norm": 0.6179373264312744,
158
- "kl": 4.37230224609375,
159
- "learning_rate": 1.9273896394584103e-05,
160
- "loss": 0.1748,
161
- "reward": 1.0507812924683093,
162
- "reward_std": 0.4720887938514352,
163
- "rewards/accuracy_reward": 0.2530134045518935,
164
- "rewards/format_reward": 0.7977678947150707,
165
- "step": 60
166
- },
167
- {
168
- "completion_length": 202.77768750190734,
169
- "epoch": 0.22968197879858657,
170
- "grad_norm": 0.04027915000915527,
171
- "kl": 0.9251708984375,
172
- "learning_rate": 1.9024965190774262e-05,
173
- "loss": 0.037,
174
- "reward": 0.9822545044124127,
175
- "reward_std": 0.5017600081861019,
176
- "rewards/accuracy_reward": 0.2608259034343064,
177
- "rewards/format_reward": 0.7214286051690578,
178
- "step": 65
179
- },
180
- {
181
- "completion_length": 116.15926914215088,
182
- "epoch": 0.24734982332155478,
183
- "grad_norm": 0.03792489692568779,
184
- "kl": 0.6725341796875,
185
- "learning_rate": 1.8741529192927528e-05,
186
- "loss": 0.0269,
187
- "reward": 1.1287946909666062,
188
- "reward_std": 0.4154288250952959,
189
- "rewards/accuracy_reward": 0.28281251303851607,
190
- "rewards/format_reward": 0.8459821805357933,
191
- "step": 70
192
- },
193
- {
194
- "completion_length": 168.19241857528687,
195
- "epoch": 0.26501766784452296,
196
- "grad_norm": 0.22333890199661255,
197
- "kl": 1.2605224609375,
198
- "learning_rate": 1.8424672050733577e-05,
199
- "loss": 0.0504,
200
- "reward": 1.0388393312692643,
201
- "reward_std": 0.46060552783310416,
202
- "rewards/accuracy_reward": 0.26049108393490317,
203
- "rewards/format_reward": 0.7783482506871223,
204
- "step": 75
205
- },
206
- {
207
- "completion_length": 191.00815525054932,
208
- "epoch": 0.2826855123674912,
209
- "grad_norm": 0.03778946399688721,
210
- "kl": 1.990380859375,
211
- "learning_rate": 1.8075605191627242e-05,
212
- "loss": 0.0796,
213
- "reward": 1.128348258137703,
214
- "reward_std": 0.4131897557526827,
215
- "rewards/accuracy_reward": 0.2717634063214064,
216
- "rewards/format_reward": 0.8565848588943481,
217
- "step": 80
218
- },
219
- {
220
- "completion_length": 104.5110532283783,
221
- "epoch": 0.3003533568904594,
222
- "grad_norm": 0.3576684594154358,
223
- "kl": 0.60687255859375,
224
- "learning_rate": 1.7695663189185703e-05,
225
- "loss": 0.0243,
226
- "reward": 1.2685268357396127,
227
- "reward_std": 0.23934431467205286,
228
- "rewards/accuracy_reward": 0.28872769260779024,
229
- "rewards/format_reward": 0.9797991432249546,
230
- "step": 85
231
- },
232
- {
233
- "completion_length": 261.205256652832,
234
- "epoch": 0.31802120141342755,
235
- "grad_norm": 0.16514165699481964,
236
- "kl": 0.769873046875,
237
- "learning_rate": 1.7286298660705877e-05,
238
- "loss": 0.0308,
239
- "reward": 1.2081473730504513,
240
- "reward_std": 0.3599423123523593,
241
- "rewards/accuracy_reward": 0.2926339415833354,
242
- "rewards/format_reward": 0.9155134320259094,
243
- "step": 90
244
- },
245
- {
246
- "completion_length": 337.4015787124634,
247
- "epoch": 0.33568904593639576,
248
- "grad_norm": 0.044928278774023056,
249
- "kl": 0.8098876953125,
250
- "learning_rate": 1.6849076713469914e-05,
251
- "loss": 0.0324,
252
- "reward": 1.1027902282774449,
253
- "reward_std": 0.47088346295058725,
254
- "rewards/accuracy_reward": 0.27622769204899666,
255
- "rewards/format_reward": 0.8265625357627868,
256
- "step": 95
257
- },
258
- {
259
- "completion_length": 254.90804748535157,
260
- "epoch": 0.35335689045936397,
261
- "grad_norm": 0.15292082726955414,
262
- "kl": 0.64703369140625,
263
- "learning_rate": 1.6385668960932143e-05,
264
- "loss": 0.0259,
265
- "reward": 1.1753348790109157,
266
- "reward_std": 0.39564149640500546,
267
- "rewards/accuracy_reward": 0.2820312621071935,
268
- "rewards/format_reward": 0.8933036133646965,
269
- "step": 100
270
- },
271
- {
272
- "epoch": 0.35335689045936397,
273
- "eval_completion_length": 208.94378662109375,
274
- "eval_kl": 0.6962890625,
275
- "eval_loss": 0.028458675369620323,
276
- "eval_reward": 1.254464328289032,
277
- "eval_reward_std": 0.361492395401001,
278
- "eval_rewards/accuracy_reward": 0.3147321566939354,
279
- "eval_rewards/format_reward": 0.9397321790456772,
280
- "eval_runtime": 45.1904,
281
- "eval_samples_per_second": 2.191,
282
- "eval_steps_per_second": 0.022,
283
- "step": 100
284
- },
285
- {
286
- "completion_length": 194.93382568359374,
287
- "epoch": 0.3710247349823322,
288
- "grad_norm": 0.044945139437913895,
289
- "kl": 0.80966796875,
290
- "learning_rate": 1.5897847131705194e-05,
291
- "loss": 0.0324,
292
- "reward": 1.2772321939468383,
293
- "reward_std": 0.30906384270638226,
294
- "rewards/accuracy_reward": 0.3136160854250193,
295
- "rewards/format_reward": 0.9636161163449287,
296
- "step": 105
297
- },
298
- {
299
- "completion_length": 181.32556705474855,
300
- "epoch": 0.38869257950530034,
301
- "grad_norm": 0.0390905924141407,
302
- "kl": 0.6416748046875,
303
- "learning_rate": 1.5387476295779737e-05,
304
- "loss": 0.0257,
305
- "reward": 1.294196479022503,
306
- "reward_std": 0.26708812834694984,
307
- "rewards/accuracy_reward": 0.3142857293598354,
308
- "rewards/format_reward": 0.9799107529222966,
309
- "step": 110
310
- },
311
- {
312
- "completion_length": 188.77099075317383,
313
- "epoch": 0.40636042402826855,
314
- "grad_norm": 0.045743752270936966,
315
- "kl": 0.4034912109375,
316
- "learning_rate": 1.4856507733875837e-05,
317
- "loss": 0.0161,
318
- "reward": 1.2922991663217545,
319
- "reward_std": 0.2678883237764239,
320
- "rewards/accuracy_reward": 0.3164062639698386,
321
- "rewards/format_reward": 0.9758929014205933,
322
- "step": 115
323
- },
324
- {
325
- "completion_length": 196.3886251449585,
326
- "epoch": 0.42402826855123676,
327
- "grad_norm": 0.022538496181368828,
328
- "kl": 0.26793212890625,
329
- "learning_rate": 1.4306971477188223e-05,
330
- "loss": 0.0107,
331
- "reward": 1.3000000566244125,
332
- "reward_std": 0.28592034000903366,
333
- "rewards/accuracy_reward": 0.32600448057055476,
334
- "rewards/format_reward": 0.9739955767989159,
335
- "step": 120
336
- },
337
- {
338
- "completion_length": 207.80346927642822,
339
- "epoch": 0.4416961130742049,
340
- "grad_norm": 0.02644180692732334,
341
- "kl": 0.4103271484375,
342
- "learning_rate": 1.3740968546047935e-05,
343
- "loss": 0.0164,
344
- "reward": 1.2799107655882835,
345
- "reward_std": 0.30040734391659496,
346
- "rewards/accuracy_reward": 0.31573662012815473,
347
- "rewards/format_reward": 0.964174148440361,
348
- "step": 125
349
- },
350
- {
351
- "completion_length": 208.03773231506347,
352
- "epoch": 0.45936395759717313,
353
- "grad_norm": 0.03774217143654823,
354
- "kl": 0.49677734375,
355
- "learning_rate": 1.3160662917174045e-05,
356
- "loss": 0.0199,
357
- "reward": 1.297544701397419,
358
- "reward_std": 0.311822210252285,
359
- "rewards/accuracy_reward": 0.3420759094879031,
360
- "rewards/format_reward": 0.9554687917232514,
361
- "step": 130
362
- },
363
- {
364
- "completion_length": 224.91462955474853,
365
- "epoch": 0.47703180212014135,
366
- "grad_norm": 0.09880122542381287,
367
- "kl": 0.2965087890625,
368
- "learning_rate": 1.2568273250226681e-05,
369
- "loss": 0.0119,
370
- "reward": 1.2757813021540643,
371
- "reward_std": 0.346415382437408,
372
- "rewards/accuracy_reward": 0.3440848354250193,
373
- "rewards/format_reward": 0.9316964715719223,
374
- "step": 135
375
- },
376
- {
377
- "completion_length": 244.44800243377685,
378
- "epoch": 0.49469964664310956,
379
- "grad_norm": 0.025601347908377647,
380
- "kl": 0.39385986328125,
381
- "learning_rate": 1.1966064405292887e-05,
382
- "loss": 0.0158,
383
- "reward": 1.2787946984171867,
384
- "reward_std": 0.3541004925966263,
385
- "rewards/accuracy_reward": 0.34386162348091603,
386
- "rewards/format_reward": 0.9349330797791481,
387
- "step": 140
388
- },
389
- {
390
- "completion_length": 239.1896318435669,
391
- "epoch": 0.5123674911660777,
392
- "grad_norm": 0.018517782911658287,
393
- "kl": 0.194818115234375,
394
- "learning_rate": 1.1356338783736256e-05,
395
- "loss": 0.0078,
396
- "reward": 1.34799113124609,
397
- "reward_std": 0.2732340093702078,
398
- "rewards/accuracy_reward": 0.373772338218987,
399
- "rewards/format_reward": 0.9742187909781933,
400
- "step": 145
401
- },
402
- {
403
- "completion_length": 245.8727783203125,
404
- "epoch": 0.5300353356890459,
405
- "grad_norm": 0.04347226396203041,
406
- "kl": 0.2514892578125,
407
- "learning_rate": 1.0741427525516463e-05,
408
- "loss": 0.0101,
409
- "reward": 1.35837060213089,
410
- "reward_std": 0.2775158016011119,
411
- "rewards/accuracy_reward": 0.380691983550787,
412
- "rewards/format_reward": 0.9776786111295224,
413
- "step": 150
414
- },
415
- {
416
- "completion_length": 262.62690925598145,
417
- "epoch": 0.5477031802120141,
418
- "grad_norm": 0.045237280428409576,
419
- "kl": 0.2498046875,
420
- "learning_rate": 1.012368159663363e-05,
421
- "loss": 0.01,
422
- "reward": 1.3261161297559738,
423
- "reward_std": 0.31663199458271263,
424
- "rewards/accuracy_reward": 0.3674107290804386,
425
- "rewards/format_reward": 0.9587053962051868,
426
- "step": 155
427
- },
428
- {
429
- "completion_length": 294.43438892364503,
430
- "epoch": 0.5653710247349824,
431
- "grad_norm": 0.016969213262200356,
432
- "kl": 0.36796875,
433
- "learning_rate": 9.505462800772612e-06,
434
- "loss": 0.0147,
435
- "reward": 1.2720982685685158,
436
- "reward_std": 0.3833419343456626,
437
- "rewards/accuracy_reward": 0.35625001713633536,
438
- "rewards/format_reward": 0.915848258882761,
439
- "step": 160
440
- },
441
- {
442
- "completion_length": 316.5702146530151,
443
- "epoch": 0.5830388692579506,
444
- "grad_norm": 0.033270638436079025,
445
- "kl": 0.193212890625,
446
- "learning_rate": 8.889134749511956e-06,
447
- "loss": 0.0077,
448
- "reward": 1.2236607670783997,
449
- "reward_std": 0.40494529381394384,
450
- "rewards/accuracy_reward": 0.3286830514669418,
451
- "rewards/format_reward": 0.8949777193367481,
452
- "step": 165
453
- },
454
- {
455
- "completion_length": 311.8881841659546,
456
- "epoch": 0.6007067137809188,
457
- "grad_norm": 0.078031025826931,
458
- "kl": 0.239520263671875,
459
- "learning_rate": 8.277053825620836e-06,
460
- "loss": 0.0096,
461
- "reward": 1.2588170230388642,
462
- "reward_std": 0.382186346501112,
463
- "rewards/accuracy_reward": 0.3531250162050128,
464
- "rewards/format_reward": 0.9056920081377029,
465
- "step": 170
466
- },
467
- {
468
- "completion_length": 285.3632930755615,
469
- "epoch": 0.6183745583038869,
470
- "grad_norm": 0.02530003897845745,
471
- "kl": 0.252392578125,
472
- "learning_rate": 7.671560173993588e-06,
473
- "loss": 0.0101,
474
- "reward": 1.30345988124609,
475
- "reward_std": 0.3494835808873177,
476
- "rewards/accuracy_reward": 0.37667412403970957,
477
- "rewards/format_reward": 0.926785758882761,
478
- "step": 175
479
- },
480
- {
481
- "completion_length": 272.0703248977661,
482
- "epoch": 0.6360424028268551,
483
- "grad_norm": 0.012884252704679966,
484
- "kl": 0.174432373046875,
485
- "learning_rate": 7.07496875466589e-06,
486
- "loss": 0.007,
487
- "reward": 1.316741129755974,
488
- "reward_std": 0.3391346026211977,
489
- "rewards/accuracy_reward": 0.38404019717127086,
490
- "rewards/format_reward": 0.9327009372413159,
491
- "step": 180
492
- },
493
- {
494
- "completion_length": 291.5253484725952,
495
- "epoch": 0.6537102473498233,
496
- "grad_norm": 0.023922132328152657,
497
- "kl": 0.19571533203125,
498
- "learning_rate": 6.489560492119225e-06,
499
- "loss": 0.0078,
500
- "reward": 1.2797991648316382,
501
- "reward_std": 0.3811326840892434,
502
- "rewards/accuracy_reward": 0.372767873480916,
503
- "rewards/format_reward": 0.9070312924683094,
504
- "step": 185
505
- },
506
- {
507
- "completion_length": 290.21173191070557,
508
- "epoch": 0.6713780918727915,
509
- "grad_norm": 0.016599828377366066,
510
- "kl": 0.208367919921875,
511
- "learning_rate": 5.9175735547120975e-06,
512
- "loss": 0.0083,
513
- "reward": 1.2723214849829674,
514
- "reward_std": 0.4036080051213503,
515
- "rewards/accuracy_reward": 0.37020091004669664,
516
- "rewards/format_reward": 0.9021205767989159,
517
- "step": 190
518
- },
519
- {
520
- "completion_length": 279.97177658081057,
521
- "epoch": 0.6890459363957597,
522
- "grad_norm": 0.013091727159917355,
523
- "kl": 0.19798583984375,
524
- "learning_rate": 5.361194797579108e-06,
525
- "loss": 0.0079,
526
- "reward": 1.2968750521540642,
527
- "reward_std": 0.37299707978963853,
528
- "rewards/accuracy_reward": 0.3811384104192257,
529
- "rewards/format_reward": 0.9157366521656514,
530
- "step": 195
531
- },
532
- {
533
- "completion_length": 269.786619758606,
534
- "epoch": 0.7067137809187279,
535
- "grad_norm": 0.014325141906738281,
536
- "kl": 0.288134765625,
537
- "learning_rate": 4.8225514017138205e-06,
538
- "loss": 0.0115,
539
- "reward": 1.3161830991506576,
540
- "reward_std": 0.3630652824416757,
541
- "rewards/accuracy_reward": 0.3881696589291096,
542
- "rewards/format_reward": 0.9280134350061416,
543
- "step": 200
544
- },
545
- {
546
- "epoch": 0.7067137809187279,
547
- "eval_completion_length": 264.20691680908203,
548
- "eval_kl": 0.18017578125,
549
- "eval_loss": 0.006405743304640055,
550
- "eval_reward": 1.3392857611179352,
551
- "eval_reward_std": 0.36248352378606796,
552
- "eval_rewards/accuracy_reward": 0.408482164144516,
553
- "eval_rewards/format_reward": 0.9308036118745804,
554
- "eval_runtime": 48.8588,
555
- "eval_samples_per_second": 2.026,
556
- "eval_steps_per_second": 0.02,
557
- "step": 200
558
- },
559
- {
560
- "completion_length": 276.60916366577146,
561
- "epoch": 0.7243816254416962,
562
- "grad_norm": 0.01561010256409645,
563
- "kl": 0.180450439453125,
564
- "learning_rate": 4.303702741201431e-06,
565
- "loss": 0.0072,
566
- "reward": 1.3011161342263222,
567
- "reward_std": 0.35275506041944027,
568
- "rewards/accuracy_reward": 0.37209823057055474,
569
- "rewards/format_reward": 0.929017896950245,
570
- "step": 205
571
- },
572
- {
573
- "completion_length": 293.17601852416993,
574
- "epoch": 0.7420494699646644,
575
- "grad_norm": 0.029141413047909737,
576
- "kl": 0.1777587890625,
577
- "learning_rate": 3.8066325096949153e-06,
578
- "loss": 0.0071,
579
- "reward": 1.277678619325161,
580
- "reward_std": 0.3869540646672249,
581
- "rewards/accuracy_reward": 0.3700893035158515,
582
- "rewards/format_reward": 0.9075893275439739,
583
- "step": 210
584
- },
585
- {
586
- "completion_length": 318.67546100616454,
587
- "epoch": 0.7597173144876325,
588
- "grad_norm": 0.012996065430343151,
589
- "kl": 0.203765869140625,
590
- "learning_rate": 3.3332411362372063e-06,
591
- "loss": 0.0082,
592
- "reward": 1.2314732655882836,
593
- "reward_std": 0.43429264314472676,
594
- "rewards/accuracy_reward": 0.3552455535158515,
595
- "rewards/format_reward": 0.8762277223169803,
596
- "step": 215
597
- },
598
- {
599
- "completion_length": 307.49554920196533,
600
- "epoch": 0.7773851590106007,
601
- "grad_norm": 0.02377905510365963,
602
- "kl": 0.28052978515625,
603
- "learning_rate": 2.8853385194256677e-06,
604
- "loss": 0.0112,
605
- "reward": 1.2501116588711738,
606
- "reward_std": 0.427118194103241,
607
- "rewards/accuracy_reward": 0.36294644642621277,
608
- "rewards/format_reward": 0.8871652230620384,
609
- "step": 220
610
- },
611
- {
612
- "completion_length": 303.62668800354004,
613
- "epoch": 0.7950530035335689,
614
- "grad_norm": 0.05803314596414566,
615
- "kl": 0.244720458984375,
616
- "learning_rate": 2.464637107698046e-06,
617
- "loss": 0.0098,
618
- "reward": 1.2681920170783996,
619
- "reward_std": 0.4080910187214613,
620
- "rewards/accuracy_reward": 0.36863841097801925,
621
- "rewards/format_reward": 0.8995536133646965,
622
- "step": 225
623
- },
624
- {
625
- "completion_length": 286.2454355239868,
626
- "epoch": 0.8127208480565371,
627
- "grad_norm": 0.01800759695470333,
628
- "kl": 0.18123779296875,
629
- "learning_rate": 2.072745352195794e-06,
630
- "loss": 0.0072,
631
- "reward": 1.2926339864730836,
632
- "reward_std": 0.3678985072299838,
633
- "rewards/accuracy_reward": 0.3722098364494741,
634
- "rewards/format_reward": 0.9204241506755352,
635
- "step": 230
636
- },
637
- {
638
- "completion_length": 283.0959957122803,
639
- "epoch": 0.8303886925795053,
640
- "grad_norm": 0.015781087800860405,
641
- "kl": 0.1795166015625,
642
- "learning_rate": 1.7111615572361628e-06,
643
- "loss": 0.0072,
644
- "reward": 1.2987723737955092,
645
- "reward_std": 0.37512697763741015,
646
- "rewards/accuracy_reward": 0.37700894437730315,
647
- "rewards/format_reward": 0.9217634335160255,
648
- "step": 235
649
- },
650
- {
651
- "completion_length": 298.8237842559814,
652
- "epoch": 0.8480565371024735,
653
- "grad_norm": 0.024129491299390793,
654
- "kl": 0.21593017578125,
655
- "learning_rate": 1.381268151904298e-06,
656
- "loss": 0.0086,
657
- "reward": 1.2565848737955094,
658
- "reward_std": 0.4109664471819997,
659
- "rewards/accuracy_reward": 0.36138394568115473,
660
- "rewards/format_reward": 0.8952009342610836,
661
- "step": 240
662
- },
663
- {
664
- "completion_length": 312.69075107574463,
665
- "epoch": 0.8657243816254417,
666
- "grad_norm": 0.021995313465595245,
667
- "kl": 0.28687744140625,
668
- "learning_rate": 1.0843264046665558e-06,
669
- "loss": 0.0115,
670
- "reward": 1.2415179193019867,
671
- "reward_std": 0.4253259660676122,
672
- "rewards/accuracy_reward": 0.35926340874284507,
673
- "rewards/format_reward": 0.8822545059025287,
674
- "step": 245
675
- },
676
- {
677
- "completion_length": 307.51753559112547,
678
- "epoch": 0.8833922261484098,
679
- "grad_norm": 0.021934768185019493,
680
- "kl": 0.287030029296875,
681
- "learning_rate": 8.214716012124491e-07,
682
- "loss": 0.0115,
683
- "reward": 1.2411830931901933,
684
- "reward_std": 0.41262274123728276,
685
- "rewards/accuracy_reward": 0.34944198131561277,
686
- "rewards/format_reward": 0.8917411133646965,
687
- "step": 250
688
- },
689
- {
690
- "completion_length": 304.0524700164795,
691
- "epoch": 0.901060070671378,
692
- "grad_norm": 0.03234613686800003,
693
- "kl": 0.20001220703125,
694
- "learning_rate": 5.937087039615619e-07,
695
- "loss": 0.008,
696
- "reward": 1.2452009439468383,
697
- "reward_std": 0.4156901784241199,
698
- "rewards/accuracy_reward": 0.3550223371013999,
699
- "rewards/format_reward": 0.8901786126196385,
700
- "step": 255
701
- },
702
- {
703
- "completion_length": 312.4107292175293,
704
- "epoch": 0.9187279151943463,
705
- "grad_norm": 0.011318758130073547,
706
- "kl": 0.22039794921875,
707
- "learning_rate": 4.019085098303077e-07,
708
- "loss": 0.0088,
709
- "reward": 1.239843799173832,
710
- "reward_std": 0.43206911720335484,
711
- "rewards/accuracy_reward": 0.3598214453086257,
712
- "rewards/format_reward": 0.8800223603844642,
713
- "step": 260
714
- },
715
- {
716
- "completion_length": 319.8676481246948,
717
- "epoch": 0.9363957597173145,
718
- "grad_norm": 0.02429085038602352,
719
- "kl": 0.1993408203125,
720
- "learning_rate": 2.4680432094837394e-07,
721
- "loss": 0.008,
722
- "reward": 1.21506702080369,
723
- "reward_std": 0.45882711336016657,
724
- "rewards/accuracy_reward": 0.34754465958103536,
725
- "rewards/format_reward": 0.8675223633646965,
726
- "step": 265
727
- },
728
- {
729
- "completion_length": 319.6962184906006,
730
- "epoch": 0.9540636042402827,
731
- "grad_norm": 0.014112049713730812,
732
- "kl": 0.24310302734375,
733
- "learning_rate": 1.289891410535593e-07,
734
- "loss": 0.0097,
735
- "reward": 1.2215402342379094,
736
- "reward_std": 0.4551096182316542,
737
- "rewards/accuracy_reward": 0.3492187652736902,
738
- "rewards/format_reward": 0.872321467846632,
739
- "step": 270
740
- },
741
- {
742
- "completion_length": 329.8996807098389,
743
- "epoch": 0.9717314487632509,
744
- "grad_norm": 0.013921594247221947,
745
- "kl": 0.2304443359375,
746
- "learning_rate": 4.8913408283934874e-08,
747
- "loss": 0.0092,
748
- "reward": 1.200892908871174,
749
- "reward_std": 0.4598929390311241,
750
- "rewards/accuracy_reward": 0.34587055183947085,
751
- "rewards/format_reward": 0.8550223641097545,
752
- "step": 275
753
- },
754
- {
755
- "completion_length": 315.45526084899905,
756
- "epoch": 0.9893992932862191,
757
- "grad_norm": 0.013098319061100483,
758
- "kl": 0.19427490234375,
759
- "learning_rate": 6.883273035447335e-09,
760
- "loss": 0.0078,
761
- "reward": 1.2271205872297286,
762
- "reward_std": 0.4515534173697233,
763
- "rewards/accuracy_reward": 0.35345983896404504,
764
- "rewards/format_reward": 0.8736607514321804,
765
- "step": 280
766
- },
767
- {
768
- "completion_length": 325.00113519032794,
769
- "epoch": 1.0,
770
- "kl": 0.18748982747395834,
771
- "reward": 1.2046131504078705,
772
- "reward_std": 0.4611472859978676,
773
- "rewards/accuracy_reward": 0.3446800730501612,
774
- "rewards/format_reward": 0.8599330745637417,
775
- "step": 283,
776
  "total_flos": 0.0,
777
- "train_loss": 5.24096154554993,
778
- "train_runtime": 41343.1213,
779
- "train_samples_per_second": 1.752,
780
  "train_steps_per_second": 0.007
781
  }
782
  ],
783
  "logging_steps": 5,
784
- "max_steps": 283,
785
  "num_input_tokens_seen": 0,
786
  "num_train_epochs": 1,
787
  "save_steps": 500,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.10202898550724637,
5
  "eval_steps": 100,
6
+ "global_step": 33,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 303.61742544174194,
13
+ "epoch": 0.015458937198067632,
14
+ "grad_norm": 0.02257479541003704,
15
+ "kl": 0.9676864624023438,
16
+ "learning_rate": 1.9941379571543597e-05,
17
+ "loss": 0.0386,
18
+ "reward": 0.7926339689642191,
19
+ "reward_std": 0.3637773351743817,
20
+ "rewards/accuracy_reward": 0.11194196966243908,
21
+ "rewards/format_reward": 0.6806919939815999,
22
  "step": 5
23
  },
24
  {
25
+ "completion_length": 193.77601299285888,
26
+ "epoch": 0.030917874396135265,
27
+ "grad_norm": 0.02067318558692932,
28
+ "kl": 0.1245941162109375,
29
+ "learning_rate": 1.796093065705644e-05,
30
+ "loss": 0.005,
31
+ "reward": 1.0407366551458836,
32
+ "reward_std": 0.2885524293407798,
33
+ "rewards/accuracy_reward": 0.11953125573927537,
34
+ "rewards/format_reward": 0.9212053991854191,
35
  "step": 10
36
  },
37
  {
38
+ "completion_length": 223.56373710632323,
39
+ "epoch": 0.0463768115942029,
40
+ "grad_norm": 0.014934813603758812,
41
+ "kl": 0.16761474609375,
42
+ "learning_rate": 1.3701381553399147e-05,
43
+ "loss": 0.0067,
44
+ "reward": 1.0725446939468384,
45
+ "reward_std": 0.3380734449252486,
46
+ "rewards/accuracy_reward": 0.16651786545990035,
47
+ "rewards/format_reward": 0.906026828289032,
48
  "step": 15
49
  },
50
  {
51
+ "completion_length": 189.5631784439087,
52
+ "epoch": 0.06183574879227053,
53
+ "grad_norm": 0.015098211355507374,
54
+ "kl": 0.175286865234375,
55
+ "learning_rate": 8.382180034472353e-06,
56
+ "loss": 0.007,
57
+ "reward": 1.1564732655882835,
58
+ "reward_std": 0.2815748773515224,
59
+ "rewards/accuracy_reward": 0.19162947330623864,
60
+ "rewards/format_reward": 0.964843787997961,
61
  "step": 20
62
  },
63
  {
64
+ "completion_length": 190.79632415771485,
65
+ "epoch": 0.07729468599033816,
66
+ "grad_norm": 0.013767687603831291,
67
+ "kl": 0.16260986328125,
68
+ "learning_rate": 3.5261371521817247e-06,
69
+ "loss": 0.0065,
70
+ "reward": 1.1904018431901933,
71
+ "reward_std": 0.306734830327332,
72
+ "rewards/accuracy_reward": 0.22790179681032896,
73
+ "rewards/format_reward": 0.9625000387430191,
74
  "step": 25
75
  },
76
  {
77
+ "completion_length": 203.4788038253784,
78
+ "epoch": 0.0927536231884058,
79
+ "grad_norm": 0.014600388705730438,
80
+ "kl": 0.143023681640625,
81
+ "learning_rate": 5.234682881719766e-07,
82
+ "loss": 0.0057,
83
+ "reward": 1.206808091700077,
84
+ "reward_std": 0.2881101544946432,
85
+ "rewards/accuracy_reward": 0.23984376154839993,
86
+ "rewards/format_reward": 0.9669643275439739,
87
  "step": 30
88
  },
89
  {
90
+ "completion_length": 215.08613300323486,
91
+ "epoch": 0.10202898550724637,
92
+ "kl": 0.13948567708333334,
93
+ "reward": 1.2220982710520427,
94
+ "reward_std": 0.3094604279225071,
95
+ "rewards/accuracy_reward": 0.252976200543344,
96
+ "rewards/format_reward": 0.9691220708191395,
97
+ "step": 33,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  "total_flos": 0.0,
99
+ "train_loss": 0.011041131547906181,
100
+ "train_runtime": 4812.1769,
101
+ "train_samples_per_second": 1.505,
102
  "train_steps_per_second": 0.007
103
  }
104
  ],
105
  "logging_steps": 5,
106
+ "max_steps": 33,
107
  "num_input_tokens_seen": 0,
108
  "num_train_epochs": 1,
109
  "save_steps": 500,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4f843d1dd9765bff089632ddbfb52580ac47d7f8c13fae1f03493b64486769b
3
  size 7480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ddd9b822d423f5edd5932b081042f0d6a6fb9c8892b3e03c09aeadb53fda817
3
  size 7480