CUIGuy commited on
Commit
57df952
·
verified ·
1 Parent(s): a747321

Upload 10 files

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. rng_state.pth +1 -1
  3. tokenizer.json +14 -2
  4. trainer_state.json +1532 -416
  5. training_args.bin +1 -1
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49b8807bedd99b2e574379aa143ff6d032660157534a81dca84000c65ccef846
3
  size 445794776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd9caef85ec52964d77101c01089e033b9de019b050425b43da9b6a07ebbdb60
3
  size 445794776
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ea2550ae59da11bf890044f4115183f9d7415e09bfd06aa78aee47b54379296
3
  size 14180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ccd55bfa5beba6eae86ae11b5406b054abd0de01276723570e299708216fdb
3
  size 14180
tokenizer.json CHANGED
@@ -1,7 +1,19 @@
1
  {
2
  "version": "1.0",
3
- "truncation": null,
4
- "padding": null,
 
 
 
 
 
 
 
 
 
 
 
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
 
1
  {
2
  "version": "1.0",
3
+ "truncation": {
4
+ "direction": "Right",
5
+ "max_length": 32,
6
+ "strategy": "LongestFirst",
7
+ "stride": 0
8
+ },
9
+ "padding": {
10
+ "strategy": "BatchLongest",
11
+ "direction": "Left",
12
+ "pad_to_multiple_of": null,
13
+ "pad_id": 0,
14
+ "pad_type_id": 0,
15
+ "pad_token": "<pad>"
16
+ },
17
  "added_tokens": [
18
  {
19
  "id": 0,
trainer_state.json CHANGED
@@ -3,1178 +3,2294 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
- "global_step": 9295,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03,
13
  "learning_rate": 0.0003,
14
- "loss": 1.072,
15
  "step": 50
16
  },
17
  {
18
- "epoch": 0.05,
19
  "learning_rate": 0.0003,
20
- "loss": 0.2901,
21
  "step": 100
22
  },
23
  {
24
- "epoch": 0.08,
25
  "learning_rate": 0.0003,
26
- "loss": 0.2788,
27
  "step": 150
28
  },
29
  {
30
- "epoch": 0.11,
31
  "learning_rate": 0.0003,
32
- "loss": 0.2179,
33
  "step": 200
34
  },
35
  {
36
- "epoch": 0.13,
37
  "learning_rate": 0.0003,
38
- "loss": 0.2136,
39
  "step": 250
40
  },
41
  {
42
- "epoch": 0.16,
43
  "learning_rate": 0.0003,
44
- "loss": 0.1834,
45
  "step": 300
46
  },
47
  {
48
- "epoch": 0.19,
49
  "learning_rate": 0.0003,
50
- "loss": 0.1721,
51
  "step": 350
52
  },
53
  {
54
- "epoch": 0.22,
55
  "learning_rate": 0.0003,
56
- "loss": 0.1466,
57
  "step": 400
58
  },
59
  {
60
- "epoch": 0.24,
61
  "learning_rate": 0.0003,
62
- "loss": 0.1327,
63
  "step": 450
64
  },
65
  {
66
- "epoch": 0.27,
67
  "learning_rate": 0.0003,
68
- "loss": 0.1377,
69
  "step": 500
70
  },
71
  {
72
- "epoch": 0.3,
73
  "learning_rate": 0.0003,
74
- "loss": 0.1092,
75
  "step": 550
76
  },
77
  {
78
- "epoch": 0.32,
79
  "learning_rate": 0.0003,
80
- "loss": 0.1177,
81
  "step": 600
82
  },
83
  {
84
- "epoch": 0.35,
85
  "learning_rate": 0.0003,
86
- "loss": 0.1659,
87
  "step": 650
88
  },
89
  {
90
- "epoch": 0.38,
91
  "learning_rate": 0.0003,
92
- "loss": 0.1073,
93
  "step": 700
94
  },
95
  {
96
- "epoch": 0.4,
97
  "learning_rate": 0.0003,
98
- "loss": 0.1567,
99
  "step": 750
100
  },
101
  {
102
- "epoch": 0.43,
103
  "learning_rate": 0.0003,
104
- "loss": 0.1389,
105
  "step": 800
106
  },
107
  {
108
- "epoch": 0.46,
109
  "learning_rate": 0.0003,
110
- "loss": 0.1125,
111
  "step": 850
112
  },
113
  {
114
- "epoch": 0.48,
115
  "learning_rate": 0.0003,
116
- "loss": 0.1009,
117
  "step": 900
118
  },
119
  {
120
- "epoch": 0.51,
121
  "learning_rate": 0.0003,
122
- "loss": 0.0986,
123
  "step": 950
124
  },
125
  {
126
- "epoch": 0.54,
127
  "learning_rate": 0.0003,
128
- "loss": 0.104,
129
  "step": 1000
130
  },
131
  {
132
- "epoch": 0.56,
133
  "learning_rate": 0.0003,
134
- "loss": 0.1155,
135
  "step": 1050
136
  },
137
  {
138
- "epoch": 0.59,
139
  "learning_rate": 0.0003,
140
- "loss": 0.1213,
141
  "step": 1100
142
  },
143
  {
144
- "epoch": 0.62,
145
  "learning_rate": 0.0003,
146
- "loss": 0.1001,
147
  "step": 1150
148
  },
149
  {
150
- "epoch": 0.65,
151
  "learning_rate": 0.0003,
152
- "loss": 0.0985,
153
  "step": 1200
154
  },
155
  {
156
- "epoch": 0.67,
157
  "learning_rate": 0.0003,
158
- "loss": 0.0879,
159
  "step": 1250
160
  },
161
  {
162
- "epoch": 0.7,
163
  "learning_rate": 0.0003,
164
- "loss": 0.1006,
165
  "step": 1300
166
  },
167
  {
168
- "epoch": 0.73,
169
  "learning_rate": 0.0003,
170
- "loss": 0.0924,
171
  "step": 1350
172
  },
173
  {
174
- "epoch": 0.75,
175
  "learning_rate": 0.0003,
176
- "loss": 0.0787,
177
  "step": 1400
178
  },
179
  {
180
- "epoch": 0.78,
181
  "learning_rate": 0.0003,
182
- "loss": 0.0959,
183
  "step": 1450
184
  },
185
  {
186
- "epoch": 0.81,
187
  "learning_rate": 0.0003,
188
- "loss": 0.0634,
189
  "step": 1500
190
  },
191
  {
192
- "epoch": 0.83,
193
  "learning_rate": 0.0003,
194
- "loss": 0.082,
195
  "step": 1550
196
  },
197
  {
198
- "epoch": 0.86,
199
  "learning_rate": 0.0003,
200
- "loss": 0.0823,
201
  "step": 1600
202
  },
203
  {
204
- "epoch": 0.89,
205
  "learning_rate": 0.0003,
206
- "loss": 0.0662,
207
  "step": 1650
208
  },
209
  {
210
- "epoch": 0.91,
211
  "learning_rate": 0.0003,
212
- "loss": 0.0865,
213
  "step": 1700
214
  },
215
  {
216
- "epoch": 0.94,
217
  "learning_rate": 0.0003,
218
- "loss": 0.0746,
219
  "step": 1750
220
  },
221
  {
222
- "epoch": 0.97,
223
  "learning_rate": 0.0003,
224
- "loss": 0.0577,
225
  "step": 1800
226
  },
227
  {
228
- "epoch": 1.0,
229
  "learning_rate": 0.0003,
230
- "loss": 0.0666,
231
  "step": 1850
232
  },
233
  {
234
- "epoch": 1.0,
235
- "eval_f1": 25.2962,
236
- "eval_gen_len": 31.19437742578412,
237
- "eval_loss": 0.07428992539644241,
238
- "eval_runtime": 15.1167,
239
- "eval_samples_per_second": 630.629,
240
- "eval_steps_per_second": 19.713,
241
- "step": 1859
242
- },
243
- {
244
- "epoch": 1.02,
245
  "learning_rate": 0.0003,
246
- "loss": 0.0685,
247
  "step": 1900
248
  },
249
  {
250
- "epoch": 1.05,
251
  "learning_rate": 0.0003,
252
- "loss": 0.0339,
253
  "step": 1950
254
  },
255
  {
256
- "epoch": 1.08,
257
  "learning_rate": 0.0003,
258
- "loss": 0.0578,
259
  "step": 2000
260
  },
261
  {
262
- "epoch": 1.1,
263
  "learning_rate": 0.0003,
264
- "loss": 0.0694,
265
  "step": 2050
266
  },
267
  {
268
- "epoch": 1.13,
269
  "learning_rate": 0.0003,
270
- "loss": 0.0488,
271
  "step": 2100
272
  },
273
  {
274
- "epoch": 1.16,
275
  "learning_rate": 0.0003,
276
- "loss": 0.0601,
277
  "step": 2150
278
  },
279
  {
280
- "epoch": 1.18,
281
  "learning_rate": 0.0003,
282
- "loss": 0.0528,
283
  "step": 2200
284
  },
285
  {
286
- "epoch": 1.21,
287
  "learning_rate": 0.0003,
288
- "loss": 0.0365,
289
  "step": 2250
290
  },
291
  {
292
- "epoch": 1.24,
293
  "learning_rate": 0.0003,
294
- "loss": 0.0517,
295
  "step": 2300
296
  },
297
  {
298
- "epoch": 1.26,
299
  "learning_rate": 0.0003,
300
- "loss": 0.0458,
301
  "step": 2350
302
  },
303
  {
304
- "epoch": 1.29,
305
  "learning_rate": 0.0003,
306
- "loss": 0.0386,
307
  "step": 2400
308
  },
309
  {
310
- "epoch": 1.32,
311
  "learning_rate": 0.0003,
312
- "loss": 0.0435,
313
  "step": 2450
314
  },
315
  {
316
- "epoch": 1.34,
317
  "learning_rate": 0.0003,
318
- "loss": 0.0451,
319
  "step": 2500
320
  },
321
  {
322
- "epoch": 1.37,
323
  "learning_rate": 0.0003,
324
- "loss": 0.0545,
325
  "step": 2550
326
  },
327
  {
328
- "epoch": 1.4,
329
  "learning_rate": 0.0003,
330
- "loss": 0.045,
331
  "step": 2600
332
  },
333
  {
334
- "epoch": 1.43,
335
  "learning_rate": 0.0003,
336
- "loss": 0.0312,
337
  "step": 2650
338
  },
339
  {
340
- "epoch": 1.45,
341
  "learning_rate": 0.0003,
342
- "loss": 0.0636,
343
  "step": 2700
344
  },
345
  {
346
- "epoch": 1.48,
347
  "learning_rate": 0.0003,
348
- "loss": 0.0535,
349
  "step": 2750
350
  },
351
  {
352
- "epoch": 1.51,
353
  "learning_rate": 0.0003,
354
- "loss": 0.0587,
355
  "step": 2800
356
  },
357
  {
358
- "epoch": 1.53,
359
  "learning_rate": 0.0003,
360
- "loss": 0.0352,
361
  "step": 2850
362
  },
363
  {
364
- "epoch": 1.56,
365
  "learning_rate": 0.0003,
366
- "loss": 0.0361,
367
  "step": 2900
368
  },
369
  {
370
- "epoch": 1.59,
371
  "learning_rate": 0.0003,
372
- "loss": 0.0387,
373
  "step": 2950
374
  },
375
  {
376
- "epoch": 1.61,
377
  "learning_rate": 0.0003,
378
- "loss": 0.0345,
379
  "step": 3000
380
  },
381
  {
382
- "epoch": 1.64,
383
  "learning_rate": 0.0003,
384
- "loss": 0.053,
385
  "step": 3050
386
  },
387
  {
388
- "epoch": 1.67,
389
  "learning_rate": 0.0003,
390
- "loss": 0.0302,
391
  "step": 3100
392
  },
393
  {
394
- "epoch": 1.69,
395
  "learning_rate": 0.0003,
396
- "loss": 0.0368,
397
  "step": 3150
398
  },
399
  {
400
- "epoch": 1.72,
401
  "learning_rate": 0.0003,
402
- "loss": 0.0392,
403
  "step": 3200
404
  },
405
  {
406
- "epoch": 1.75,
407
  "learning_rate": 0.0003,
408
- "loss": 0.0319,
409
  "step": 3250
410
  },
411
  {
412
- "epoch": 1.78,
413
  "learning_rate": 0.0003,
414
- "loss": 0.0555,
415
  "step": 3300
416
  },
417
  {
418
- "epoch": 1.8,
419
  "learning_rate": 0.0003,
420
- "loss": 0.0426,
421
  "step": 3350
422
  },
423
  {
424
- "epoch": 1.83,
425
  "learning_rate": 0.0003,
426
- "loss": 0.048,
427
  "step": 3400
428
  },
429
  {
430
- "epoch": 1.86,
431
  "learning_rate": 0.0003,
432
- "loss": 0.0545,
433
  "step": 3450
434
  },
435
  {
436
- "epoch": 1.88,
437
  "learning_rate": 0.0003,
438
- "loss": 0.0462,
439
  "step": 3500
440
  },
441
  {
442
- "epoch": 1.91,
443
  "learning_rate": 0.0003,
444
- "loss": 0.0545,
445
  "step": 3550
446
  },
447
  {
448
- "epoch": 1.94,
449
  "learning_rate": 0.0003,
450
- "loss": 0.0576,
451
  "step": 3600
452
  },
453
  {
454
- "epoch": 1.96,
455
  "learning_rate": 0.0003,
456
- "loss": 0.0251,
457
  "step": 3650
458
  },
459
  {
460
- "epoch": 1.99,
461
  "learning_rate": 0.0003,
462
- "loss": 0.0297,
463
  "step": 3700
464
  },
465
  {
466
- "epoch": 2.0,
467
- "eval_f1": 25.2962,
468
- "eval_gen_len": 31.19437742578412,
469
- "eval_loss": 0.07492993772029877,
470
- "eval_runtime": 14.9487,
471
- "eval_samples_per_second": 637.715,
472
- "eval_steps_per_second": 19.935,
473
- "step": 3718
474
  },
475
  {
476
- "epoch": 2.02,
477
  "learning_rate": 0.0003,
478
- "loss": 0.0474,
479
  "step": 3750
480
  },
481
  {
482
- "epoch": 2.04,
483
  "learning_rate": 0.0003,
484
- "loss": 0.0303,
485
  "step": 3800
486
  },
487
  {
488
- "epoch": 2.07,
489
  "learning_rate": 0.0003,
490
- "loss": 0.0383,
491
  "step": 3850
492
  },
493
  {
494
- "epoch": 2.1,
495
  "learning_rate": 0.0003,
496
- "loss": 0.0249,
497
  "step": 3900
498
  },
499
  {
500
- "epoch": 2.12,
501
  "learning_rate": 0.0003,
502
- "loss": 0.0284,
503
  "step": 3950
504
  },
505
  {
506
- "epoch": 2.15,
507
  "learning_rate": 0.0003,
508
- "loss": 0.0237,
509
  "step": 4000
510
  },
511
  {
512
- "epoch": 2.18,
513
  "learning_rate": 0.0003,
514
- "loss": 0.019,
515
  "step": 4050
516
  },
517
  {
518
- "epoch": 2.21,
519
  "learning_rate": 0.0003,
520
- "loss": 0.0319,
521
  "step": 4100
522
  },
523
  {
524
- "epoch": 2.23,
525
  "learning_rate": 0.0003,
526
- "loss": 0.0216,
527
  "step": 4150
528
  },
529
  {
530
- "epoch": 2.26,
531
  "learning_rate": 0.0003,
532
- "loss": 0.0254,
533
  "step": 4200
534
  },
535
  {
536
- "epoch": 2.29,
537
  "learning_rate": 0.0003,
538
- "loss": 0.0243,
539
  "step": 4250
540
  },
541
  {
542
- "epoch": 2.31,
543
  "learning_rate": 0.0003,
544
- "loss": 0.024,
545
  "step": 4300
546
  },
547
  {
548
- "epoch": 2.34,
549
  "learning_rate": 0.0003,
550
- "loss": 0.0168,
551
  "step": 4350
552
  },
553
  {
554
- "epoch": 2.37,
555
  "learning_rate": 0.0003,
556
- "loss": 0.0222,
557
  "step": 4400
558
  },
559
  {
560
- "epoch": 2.39,
561
  "learning_rate": 0.0003,
562
- "loss": 0.0299,
563
  "step": 4450
564
  },
565
  {
566
- "epoch": 2.42,
567
  "learning_rate": 0.0003,
568
- "loss": 0.0272,
569
  "step": 4500
570
  },
571
  {
572
- "epoch": 2.45,
573
  "learning_rate": 0.0003,
574
- "loss": 0.0191,
575
  "step": 4550
576
  },
577
  {
578
- "epoch": 2.47,
579
  "learning_rate": 0.0003,
580
- "loss": 0.0259,
581
  "step": 4600
582
  },
583
  {
584
- "epoch": 2.5,
585
  "learning_rate": 0.0003,
586
- "loss": 0.0148,
587
  "step": 4650
588
  },
589
  {
590
- "epoch": 2.53,
591
  "learning_rate": 0.0003,
592
- "loss": 0.0258,
593
  "step": 4700
594
  },
595
  {
596
- "epoch": 2.56,
597
  "learning_rate": 0.0003,
598
- "loss": 0.0282,
599
  "step": 4750
600
  },
601
  {
602
- "epoch": 2.58,
603
  "learning_rate": 0.0003,
604
- "loss": 0.0224,
605
  "step": 4800
606
  },
607
  {
608
- "epoch": 2.61,
609
  "learning_rate": 0.0003,
610
- "loss": 0.0235,
611
  "step": 4850
612
  },
613
  {
614
- "epoch": 2.64,
615
  "learning_rate": 0.0003,
616
- "loss": 0.0261,
617
  "step": 4900
618
  },
619
  {
620
- "epoch": 2.66,
621
  "learning_rate": 0.0003,
622
- "loss": 0.0246,
623
  "step": 4950
624
  },
625
  {
626
- "epoch": 2.69,
627
  "learning_rate": 0.0003,
628
- "loss": 0.0115,
629
  "step": 5000
630
  },
631
  {
632
- "epoch": 2.72,
633
  "learning_rate": 0.0003,
634
- "loss": 0.0223,
635
  "step": 5050
636
  },
637
  {
638
- "epoch": 2.74,
639
  "learning_rate": 0.0003,
640
- "loss": 0.0235,
641
  "step": 5100
642
  },
643
  {
644
- "epoch": 2.77,
645
  "learning_rate": 0.0003,
646
- "loss": 0.0228,
647
  "step": 5150
648
  },
649
  {
650
- "epoch": 2.8,
651
  "learning_rate": 0.0003,
652
- "loss": 0.0128,
653
  "step": 5200
654
  },
655
  {
656
- "epoch": 2.82,
657
  "learning_rate": 0.0003,
658
- "loss": 0.0293,
659
  "step": 5250
660
  },
661
  {
662
- "epoch": 2.85,
663
  "learning_rate": 0.0003,
664
- "loss": 0.0239,
665
  "step": 5300
666
  },
667
  {
668
- "epoch": 2.88,
669
  "learning_rate": 0.0003,
670
- "loss": 0.0244,
671
  "step": 5350
672
  },
673
  {
674
- "epoch": 2.9,
675
  "learning_rate": 0.0003,
676
- "loss": 0.0324,
677
  "step": 5400
678
  },
679
  {
680
- "epoch": 2.93,
681
  "learning_rate": 0.0003,
682
- "loss": 0.019,
683
  "step": 5450
684
  },
685
  {
686
- "epoch": 2.96,
687
  "learning_rate": 0.0003,
688
- "loss": 0.0327,
689
  "step": 5500
690
  },
691
  {
692
- "epoch": 2.99,
693
  "learning_rate": 0.0003,
694
- "loss": 0.0225,
695
  "step": 5550
696
  },
697
  {
698
- "epoch": 3.0,
699
- "eval_f1": 25.2962,
700
- "eval_gen_len": 31.19437742578412,
701
- "eval_loss": 0.06490713357925415,
702
- "eval_runtime": 15.0374,
703
- "eval_samples_per_second": 633.953,
704
- "eval_steps_per_second": 19.817,
705
- "step": 5577
706
- },
707
- {
708
- "epoch": 3.01,
709
  "learning_rate": 0.0003,
710
- "loss": 0.0158,
711
  "step": 5600
712
  },
713
  {
714
- "epoch": 3.04,
715
  "learning_rate": 0.0003,
716
- "loss": 0.0092,
717
  "step": 5650
718
  },
719
  {
720
- "epoch": 3.07,
721
  "learning_rate": 0.0003,
722
- "loss": 0.0081,
723
  "step": 5700
724
  },
725
  {
726
- "epoch": 3.09,
727
  "learning_rate": 0.0003,
728
- "loss": 0.0155,
729
  "step": 5750
730
  },
731
  {
732
- "epoch": 3.12,
733
  "learning_rate": 0.0003,
734
- "loss": 0.0175,
735
  "step": 5800
736
  },
737
  {
738
- "epoch": 3.15,
739
  "learning_rate": 0.0003,
740
- "loss": 0.0142,
741
  "step": 5850
742
  },
743
  {
744
- "epoch": 3.17,
745
  "learning_rate": 0.0003,
746
- "loss": 0.0116,
747
  "step": 5900
748
  },
749
  {
750
- "epoch": 3.2,
751
  "learning_rate": 0.0003,
752
- "loss": 0.0123,
753
  "step": 5950
754
  },
755
  {
756
- "epoch": 3.23,
757
  "learning_rate": 0.0003,
758
- "loss": 0.0209,
759
  "step": 6000
760
  },
761
  {
762
- "epoch": 3.25,
763
  "learning_rate": 0.0003,
764
- "loss": 0.0188,
765
  "step": 6050
766
  },
767
  {
768
- "epoch": 3.28,
769
  "learning_rate": 0.0003,
770
- "loss": 0.0106,
771
  "step": 6100
772
  },
773
  {
774
- "epoch": 3.31,
775
  "learning_rate": 0.0003,
776
- "loss": 0.0125,
777
  "step": 6150
778
  },
779
  {
780
- "epoch": 3.34,
781
  "learning_rate": 0.0003,
782
- "loss": 0.0092,
783
  "step": 6200
784
  },
785
  {
786
- "epoch": 3.36,
787
  "learning_rate": 0.0003,
788
- "loss": 0.0148,
789
  "step": 6250
790
  },
791
  {
792
- "epoch": 3.39,
793
  "learning_rate": 0.0003,
794
- "loss": 0.0146,
795
  "step": 6300
796
  },
797
  {
798
- "epoch": 3.42,
799
  "learning_rate": 0.0003,
800
- "loss": 0.0352,
801
  "step": 6350
802
  },
803
  {
804
- "epoch": 3.44,
805
  "learning_rate": 0.0003,
806
- "loss": 0.0159,
807
  "step": 6400
808
  },
809
  {
810
- "epoch": 3.47,
811
  "learning_rate": 0.0003,
812
- "loss": 0.0106,
813
  "step": 6450
814
  },
815
  {
816
- "epoch": 3.5,
817
  "learning_rate": 0.0003,
818
- "loss": 0.0284,
819
  "step": 6500
820
  },
821
  {
822
- "epoch": 3.52,
823
  "learning_rate": 0.0003,
824
- "loss": 0.0133,
825
  "step": 6550
826
  },
827
  {
828
- "epoch": 3.55,
829
  "learning_rate": 0.0003,
830
- "loss": 0.0142,
831
  "step": 6600
832
  },
833
  {
834
- "epoch": 3.58,
835
  "learning_rate": 0.0003,
836
- "loss": 0.025,
837
  "step": 6650
838
  },
839
  {
840
- "epoch": 3.6,
841
  "learning_rate": 0.0003,
842
- "loss": 0.0084,
843
  "step": 6700
844
  },
845
  {
846
- "epoch": 3.63,
847
  "learning_rate": 0.0003,
848
- "loss": 0.0111,
849
  "step": 6750
850
  },
851
  {
852
- "epoch": 3.66,
853
  "learning_rate": 0.0003,
854
- "loss": 0.0046,
855
  "step": 6800
856
  },
857
  {
858
- "epoch": 3.68,
859
  "learning_rate": 0.0003,
860
- "loss": 0.0272,
861
  "step": 6850
862
  },
863
  {
864
- "epoch": 3.71,
865
  "learning_rate": 0.0003,
866
- "loss": 0.0164,
867
  "step": 6900
868
  },
869
  {
870
- "epoch": 3.74,
871
  "learning_rate": 0.0003,
872
- "loss": 0.0151,
873
  "step": 6950
874
  },
875
  {
876
- "epoch": 3.77,
877
  "learning_rate": 0.0003,
878
- "loss": 0.0167,
879
  "step": 7000
880
  },
881
  {
882
- "epoch": 3.79,
883
  "learning_rate": 0.0003,
884
- "loss": 0.0091,
885
  "step": 7050
886
  },
887
  {
888
- "epoch": 3.82,
889
  "learning_rate": 0.0003,
890
- "loss": 0.0194,
891
  "step": 7100
892
  },
893
  {
894
- "epoch": 3.85,
895
  "learning_rate": 0.0003,
896
- "loss": 0.0181,
897
  "step": 7150
898
  },
899
  {
900
- "epoch": 3.87,
901
  "learning_rate": 0.0003,
902
- "loss": 0.0191,
903
  "step": 7200
904
  },
905
  {
906
- "epoch": 3.9,
907
  "learning_rate": 0.0003,
908
- "loss": 0.0048,
909
  "step": 7250
910
  },
911
  {
912
- "epoch": 3.93,
913
  "learning_rate": 0.0003,
914
- "loss": 0.013,
915
  "step": 7300
916
  },
917
  {
918
- "epoch": 3.95,
919
  "learning_rate": 0.0003,
920
- "loss": 0.0151,
921
  "step": 7350
922
  },
923
  {
924
- "epoch": 3.98,
925
  "learning_rate": 0.0003,
926
- "loss": 0.0095,
927
  "step": 7400
928
  },
929
  {
930
- "epoch": 4.0,
931
- "eval_f1": 25.2962,
932
- "eval_gen_len": 31.19437742578412,
933
- "eval_loss": 0.06164472550153732,
934
- "eval_runtime": 14.9204,
935
- "eval_samples_per_second": 638.925,
936
- "eval_steps_per_second": 19.973,
937
- "step": 7436
938
  },
939
  {
940
- "epoch": 4.01,
941
  "learning_rate": 0.0003,
942
- "loss": 0.0141,
943
  "step": 7450
944
  },
945
  {
946
- "epoch": 4.03,
947
  "learning_rate": 0.0003,
948
- "loss": 0.0112,
949
  "step": 7500
950
  },
951
  {
952
- "epoch": 4.06,
953
  "learning_rate": 0.0003,
954
- "loss": 0.0053,
955
  "step": 7550
956
  },
957
  {
958
- "epoch": 4.09,
959
  "learning_rate": 0.0003,
960
- "loss": 0.0261,
961
  "step": 7600
962
  },
963
  {
964
- "epoch": 4.12,
965
  "learning_rate": 0.0003,
966
- "loss": 0.0094,
967
  "step": 7650
968
  },
969
  {
970
- "epoch": 4.14,
971
  "learning_rate": 0.0003,
972
- "loss": 0.0054,
973
  "step": 7700
974
  },
975
  {
976
- "epoch": 4.17,
977
  "learning_rate": 0.0003,
978
- "loss": 0.0087,
979
  "step": 7750
980
  },
981
  {
982
- "epoch": 4.2,
983
  "learning_rate": 0.0003,
984
- "loss": 0.0087,
985
  "step": 7800
986
  },
987
  {
988
- "epoch": 4.22,
989
  "learning_rate": 0.0003,
990
- "loss": 0.0114,
991
  "step": 7850
992
  },
993
  {
994
- "epoch": 4.25,
995
  "learning_rate": 0.0003,
996
- "loss": 0.0114,
997
  "step": 7900
998
  },
999
  {
1000
- "epoch": 4.28,
1001
  "learning_rate": 0.0003,
1002
- "loss": 0.0107,
1003
  "step": 7950
1004
  },
1005
  {
1006
- "epoch": 4.3,
1007
  "learning_rate": 0.0003,
1008
- "loss": 0.0079,
1009
  "step": 8000
1010
  },
1011
  {
1012
- "epoch": 4.33,
1013
  "learning_rate": 0.0003,
1014
- "loss": 0.009,
1015
  "step": 8050
1016
  },
1017
  {
1018
- "epoch": 4.36,
1019
  "learning_rate": 0.0003,
1020
- "loss": 0.0148,
1021
  "step": 8100
1022
  },
1023
  {
1024
- "epoch": 4.38,
1025
  "learning_rate": 0.0003,
1026
- "loss": 0.0077,
1027
  "step": 8150
1028
  },
1029
  {
1030
- "epoch": 4.41,
1031
  "learning_rate": 0.0003,
1032
- "loss": 0.0153,
1033
  "step": 8200
1034
  },
1035
  {
1036
- "epoch": 4.44,
1037
  "learning_rate": 0.0003,
1038
- "loss": 0.008,
1039
  "step": 8250
1040
  },
1041
  {
1042
- "epoch": 4.46,
1043
  "learning_rate": 0.0003,
1044
- "loss": 0.0167,
1045
  "step": 8300
1046
  },
1047
  {
1048
- "epoch": 4.49,
1049
  "learning_rate": 0.0003,
1050
- "loss": 0.0115,
1051
  "step": 8350
1052
  },
1053
  {
1054
- "epoch": 4.52,
1055
  "learning_rate": 0.0003,
1056
- "loss": 0.0171,
1057
  "step": 8400
1058
  },
1059
  {
1060
- "epoch": 4.55,
1061
  "learning_rate": 0.0003,
1062
- "loss": 0.0147,
1063
  "step": 8450
1064
  },
1065
  {
1066
- "epoch": 4.57,
1067
  "learning_rate": 0.0003,
1068
- "loss": 0.0088,
1069
  "step": 8500
1070
  },
1071
  {
1072
- "epoch": 4.6,
1073
  "learning_rate": 0.0003,
1074
- "loss": 0.0081,
1075
  "step": 8550
1076
  },
1077
  {
1078
- "epoch": 4.63,
1079
  "learning_rate": 0.0003,
1080
- "loss": 0.0102,
1081
  "step": 8600
1082
  },
1083
  {
1084
- "epoch": 4.65,
1085
  "learning_rate": 0.0003,
1086
- "loss": 0.0073,
1087
  "step": 8650
1088
  },
1089
  {
1090
- "epoch": 4.68,
1091
  "learning_rate": 0.0003,
1092
- "loss": 0.0063,
1093
  "step": 8700
1094
  },
1095
  {
1096
- "epoch": 4.71,
1097
  "learning_rate": 0.0003,
1098
- "loss": 0.0161,
1099
  "step": 8750
1100
  },
1101
  {
1102
- "epoch": 4.73,
1103
  "learning_rate": 0.0003,
1104
- "loss": 0.0056,
1105
  "step": 8800
1106
  },
1107
  {
1108
- "epoch": 4.76,
1109
  "learning_rate": 0.0003,
1110
- "loss": 0.0106,
1111
  "step": 8850
1112
  },
1113
  {
1114
- "epoch": 4.79,
1115
  "learning_rate": 0.0003,
1116
- "loss": 0.0083,
1117
  "step": 8900
1118
  },
1119
  {
1120
- "epoch": 4.81,
1121
  "learning_rate": 0.0003,
1122
- "loss": 0.0098,
1123
  "step": 8950
1124
  },
1125
  {
1126
- "epoch": 4.84,
1127
  "learning_rate": 0.0003,
1128
- "loss": 0.0094,
1129
  "step": 9000
1130
  },
1131
  {
1132
- "epoch": 4.87,
1133
  "learning_rate": 0.0003,
1134
- "loss": 0.0132,
1135
  "step": 9050
1136
  },
1137
  {
1138
- "epoch": 4.9,
1139
  "learning_rate": 0.0003,
1140
- "loss": 0.0084,
1141
  "step": 9100
1142
  },
1143
  {
1144
- "epoch": 4.92,
1145
  "learning_rate": 0.0003,
1146
- "loss": 0.0097,
1147
  "step": 9150
1148
  },
1149
  {
1150
- "epoch": 4.95,
1151
  "learning_rate": 0.0003,
1152
- "loss": 0.0129,
1153
  "step": 9200
1154
  },
1155
  {
1156
- "epoch": 4.98,
1157
  "learning_rate": 0.0003,
1158
- "loss": 0.0093,
1159
  "step": 9250
1160
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1161
  {
1162
  "epoch": 5.0,
1163
- "eval_f1": 25.2962,
1164
- "eval_gen_len": 31.19437742578412,
1165
- "eval_loss": 0.07671951502561569,
1166
- "eval_runtime": 15.0083,
1167
- "eval_samples_per_second": 635.18,
1168
- "eval_steps_per_second": 19.856,
1169
- "step": 9295
1170
  }
1171
  ],
1172
  "logging_steps": 50,
1173
- "max_steps": 9295,
1174
  "num_input_tokens_seen": 0,
1175
  "num_train_epochs": 5,
1176
  "save_steps": 250,
1177
- "total_flos": 2.570290415972352e+16,
1178
  "train_batch_size": 16,
1179
  "trial_name": null,
1180
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 18550,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.01,
13
  "learning_rate": 0.0003,
14
+ "loss": 1.0781,
15
  "step": 50
16
  },
17
  {
18
+ "epoch": 0.03,
19
  "learning_rate": 0.0003,
20
+ "loss": 0.2803,
21
  "step": 100
22
  },
23
  {
24
+ "epoch": 0.04,
25
  "learning_rate": 0.0003,
26
+ "loss": 0.2085,
27
  "step": 150
28
  },
29
  {
30
+ "epoch": 0.05,
31
  "learning_rate": 0.0003,
32
+ "loss": 0.1611,
33
  "step": 200
34
  },
35
  {
36
+ "epoch": 0.07,
37
  "learning_rate": 0.0003,
38
+ "loss": 0.1625,
39
  "step": 250
40
  },
41
  {
42
+ "epoch": 0.08,
43
  "learning_rate": 0.0003,
44
+ "loss": 0.1371,
45
  "step": 300
46
  },
47
  {
48
+ "epoch": 0.09,
49
  "learning_rate": 0.0003,
50
+ "loss": 0.1533,
51
  "step": 350
52
  },
53
  {
54
+ "epoch": 0.11,
55
  "learning_rate": 0.0003,
56
+ "loss": 0.1224,
57
  "step": 400
58
  },
59
  {
60
+ "epoch": 0.12,
61
  "learning_rate": 0.0003,
62
+ "loss": 0.1488,
63
  "step": 450
64
  },
65
  {
66
+ "epoch": 0.13,
67
  "learning_rate": 0.0003,
68
+ "loss": 0.0981,
69
  "step": 500
70
  },
71
  {
72
+ "epoch": 0.15,
73
  "learning_rate": 0.0003,
74
+ "loss": 0.0973,
75
  "step": 550
76
  },
77
  {
78
+ "epoch": 0.16,
79
  "learning_rate": 0.0003,
80
+ "loss": 0.0992,
81
  "step": 600
82
  },
83
  {
84
+ "epoch": 0.18,
85
  "learning_rate": 0.0003,
86
+ "loss": 0.0867,
87
  "step": 650
88
  },
89
  {
90
+ "epoch": 0.19,
91
  "learning_rate": 0.0003,
92
+ "loss": 0.0933,
93
  "step": 700
94
  },
95
  {
96
+ "epoch": 0.2,
97
  "learning_rate": 0.0003,
98
+ "loss": 0.1006,
99
  "step": 750
100
  },
101
  {
102
+ "epoch": 0.22,
103
  "learning_rate": 0.0003,
104
+ "loss": 0.0694,
105
  "step": 800
106
  },
107
  {
108
+ "epoch": 0.23,
109
  "learning_rate": 0.0003,
110
+ "loss": 0.0655,
111
  "step": 850
112
  },
113
  {
114
+ "epoch": 0.24,
115
  "learning_rate": 0.0003,
116
+ "loss": 0.0775,
117
  "step": 900
118
  },
119
  {
120
+ "epoch": 0.26,
121
  "learning_rate": 0.0003,
122
+ "loss": 0.0666,
123
  "step": 950
124
  },
125
  {
126
+ "epoch": 0.27,
127
  "learning_rate": 0.0003,
128
+ "loss": 0.0539,
129
  "step": 1000
130
  },
131
  {
132
+ "epoch": 0.28,
133
  "learning_rate": 0.0003,
134
+ "loss": 0.0726,
135
  "step": 1050
136
  },
137
  {
138
+ "epoch": 0.3,
139
  "learning_rate": 0.0003,
140
+ "loss": 0.0699,
141
  "step": 1100
142
  },
143
  {
144
+ "epoch": 0.31,
145
  "learning_rate": 0.0003,
146
+ "loss": 0.0786,
147
  "step": 1150
148
  },
149
  {
150
+ "epoch": 0.32,
151
  "learning_rate": 0.0003,
152
+ "loss": 0.0773,
153
  "step": 1200
154
  },
155
  {
156
+ "epoch": 0.34,
157
  "learning_rate": 0.0003,
158
+ "loss": 0.073,
159
  "step": 1250
160
  },
161
  {
162
+ "epoch": 0.35,
163
  "learning_rate": 0.0003,
164
+ "loss": 0.0543,
165
  "step": 1300
166
  },
167
  {
168
+ "epoch": 0.36,
169
  "learning_rate": 0.0003,
170
+ "loss": 0.0572,
171
  "step": 1350
172
  },
173
  {
174
+ "epoch": 0.38,
175
  "learning_rate": 0.0003,
176
+ "loss": 0.089,
177
  "step": 1400
178
  },
179
  {
180
+ "epoch": 0.39,
181
  "learning_rate": 0.0003,
182
+ "loss": 0.0516,
183
  "step": 1450
184
  },
185
  {
186
+ "epoch": 0.4,
187
  "learning_rate": 0.0003,
188
+ "loss": 0.0643,
189
  "step": 1500
190
  },
191
  {
192
+ "epoch": 0.42,
193
  "learning_rate": 0.0003,
194
+ "loss": 0.0364,
195
  "step": 1550
196
  },
197
  {
198
+ "epoch": 0.43,
199
  "learning_rate": 0.0003,
200
+ "loss": 0.0698,
201
  "step": 1600
202
  },
203
  {
204
+ "epoch": 0.44,
205
  "learning_rate": 0.0003,
206
+ "loss": 0.0431,
207
  "step": 1650
208
  },
209
  {
210
+ "epoch": 0.46,
211
  "learning_rate": 0.0003,
212
+ "loss": 0.0393,
213
  "step": 1700
214
  },
215
  {
216
+ "epoch": 0.47,
217
  "learning_rate": 0.0003,
218
+ "loss": 0.0682,
219
  "step": 1750
220
  },
221
  {
222
+ "epoch": 0.49,
223
  "learning_rate": 0.0003,
224
+ "loss": 0.0454,
225
  "step": 1800
226
  },
227
  {
228
+ "epoch": 0.5,
229
  "learning_rate": 0.0003,
230
+ "loss": 0.0534,
231
  "step": 1850
232
  },
233
  {
234
+ "epoch": 0.51,
 
 
 
 
 
 
 
 
 
 
235
  "learning_rate": 0.0003,
236
+ "loss": 0.0519,
237
  "step": 1900
238
  },
239
  {
240
+ "epoch": 0.53,
241
  "learning_rate": 0.0003,
242
+ "loss": 0.0675,
243
  "step": 1950
244
  },
245
  {
246
+ "epoch": 0.54,
247
  "learning_rate": 0.0003,
248
+ "loss": 0.046,
249
  "step": 2000
250
  },
251
  {
252
+ "epoch": 0.55,
253
  "learning_rate": 0.0003,
254
+ "loss": 0.0527,
255
  "step": 2050
256
  },
257
  {
258
+ "epoch": 0.57,
259
  "learning_rate": 0.0003,
260
+ "loss": 0.0588,
261
  "step": 2100
262
  },
263
  {
264
+ "epoch": 0.58,
265
  "learning_rate": 0.0003,
266
+ "loss": 0.0625,
267
  "step": 2150
268
  },
269
  {
270
+ "epoch": 0.59,
271
  "learning_rate": 0.0003,
272
+ "loss": 0.039,
273
  "step": 2200
274
  },
275
  {
276
+ "epoch": 0.61,
277
  "learning_rate": 0.0003,
278
+ "loss": 0.0394,
279
  "step": 2250
280
  },
281
  {
282
+ "epoch": 0.62,
283
  "learning_rate": 0.0003,
284
+ "loss": 0.041,
285
  "step": 2300
286
  },
287
  {
288
+ "epoch": 0.63,
289
  "learning_rate": 0.0003,
290
+ "loss": 0.0363,
291
  "step": 2350
292
  },
293
  {
294
+ "epoch": 0.65,
295
  "learning_rate": 0.0003,
296
+ "loss": 0.0524,
297
  "step": 2400
298
  },
299
  {
300
+ "epoch": 0.66,
301
  "learning_rate": 0.0003,
302
+ "loss": 0.0469,
303
  "step": 2450
304
  },
305
  {
306
+ "epoch": 0.67,
307
  "learning_rate": 0.0003,
308
+ "loss": 0.0427,
309
  "step": 2500
310
  },
311
  {
312
+ "epoch": 0.69,
313
  "learning_rate": 0.0003,
314
+ "loss": 0.0482,
315
  "step": 2550
316
  },
317
  {
318
+ "epoch": 0.7,
319
  "learning_rate": 0.0003,
320
+ "loss": 0.0319,
321
  "step": 2600
322
  },
323
  {
324
+ "epoch": 0.71,
325
  "learning_rate": 0.0003,
326
+ "loss": 0.0371,
327
  "step": 2650
328
  },
329
  {
330
+ "epoch": 0.73,
331
  "learning_rate": 0.0003,
332
+ "loss": 0.0446,
333
  "step": 2700
334
  },
335
  {
336
+ "epoch": 0.74,
337
  "learning_rate": 0.0003,
338
+ "loss": 0.0436,
339
  "step": 2750
340
  },
341
  {
342
+ "epoch": 0.75,
343
  "learning_rate": 0.0003,
344
+ "loss": 0.0308,
345
  "step": 2800
346
  },
347
  {
348
+ "epoch": 0.77,
349
  "learning_rate": 0.0003,
350
+ "loss": 0.0472,
351
  "step": 2850
352
  },
353
  {
354
+ "epoch": 0.78,
355
  "learning_rate": 0.0003,
356
+ "loss": 0.0219,
357
  "step": 2900
358
  },
359
  {
360
+ "epoch": 0.8,
361
  "learning_rate": 0.0003,
362
+ "loss": 0.0346,
363
  "step": 2950
364
  },
365
  {
366
+ "epoch": 0.81,
367
  "learning_rate": 0.0003,
368
+ "loss": 0.0351,
369
  "step": 3000
370
  },
371
  {
372
+ "epoch": 0.82,
373
  "learning_rate": 0.0003,
374
+ "loss": 0.0326,
375
  "step": 3050
376
  },
377
  {
378
+ "epoch": 0.84,
379
  "learning_rate": 0.0003,
380
+ "loss": 0.0405,
381
  "step": 3100
382
  },
383
  {
384
+ "epoch": 0.85,
385
  "learning_rate": 0.0003,
386
+ "loss": 0.0346,
387
  "step": 3150
388
  },
389
  {
390
+ "epoch": 0.86,
391
  "learning_rate": 0.0003,
392
+ "loss": 0.0329,
393
  "step": 3200
394
  },
395
  {
396
+ "epoch": 0.88,
397
  "learning_rate": 0.0003,
398
+ "loss": 0.0407,
399
  "step": 3250
400
  },
401
  {
402
+ "epoch": 0.89,
403
  "learning_rate": 0.0003,
404
+ "loss": 0.0446,
405
  "step": 3300
406
  },
407
  {
408
+ "epoch": 0.9,
409
  "learning_rate": 0.0003,
410
+ "loss": 0.0447,
411
  "step": 3350
412
  },
413
  {
414
+ "epoch": 0.92,
415
  "learning_rate": 0.0003,
416
+ "loss": 0.0393,
417
  "step": 3400
418
  },
419
  {
420
+ "epoch": 0.93,
421
  "learning_rate": 0.0003,
422
+ "loss": 0.0306,
423
  "step": 3450
424
  },
425
  {
426
+ "epoch": 0.94,
427
  "learning_rate": 0.0003,
428
+ "loss": 0.0389,
429
  "step": 3500
430
  },
431
  {
432
+ "epoch": 0.96,
433
  "learning_rate": 0.0003,
434
+ "loss": 0.0326,
435
  "step": 3550
436
  },
437
  {
438
+ "epoch": 0.97,
439
  "learning_rate": 0.0003,
440
+ "loss": 0.0343,
441
  "step": 3600
442
  },
443
  {
444
+ "epoch": 0.98,
445
  "learning_rate": 0.0003,
446
+ "loss": 0.0264,
447
  "step": 3650
448
  },
449
  {
450
+ "epoch": 1.0,
451
  "learning_rate": 0.0003,
452
+ "loss": 0.0313,
453
  "step": 3700
454
  },
455
  {
456
+ "epoch": 1.0,
457
+ "eval_f1": 22.2222,
458
+ "eval_gen_len": 29.002111729698598,
459
+ "eval_loss": 0.045024264603853226,
460
+ "eval_runtime": 25.7329,
461
+ "eval_samples_per_second": 809.701,
462
+ "eval_steps_per_second": 25.337,
463
+ "step": 3710
464
  },
465
  {
466
+ "epoch": 1.01,
467
  "learning_rate": 0.0003,
468
+ "loss": 0.033,
469
  "step": 3750
470
  },
471
  {
472
+ "epoch": 1.02,
473
  "learning_rate": 0.0003,
474
+ "loss": 0.0269,
475
  "step": 3800
476
  },
477
  {
478
+ "epoch": 1.04,
479
  "learning_rate": 0.0003,
480
+ "loss": 0.0199,
481
  "step": 3850
482
  },
483
  {
484
+ "epoch": 1.05,
485
  "learning_rate": 0.0003,
486
+ "loss": 0.0221,
487
  "step": 3900
488
  },
489
  {
490
+ "epoch": 1.06,
491
  "learning_rate": 0.0003,
492
+ "loss": 0.0138,
493
  "step": 3950
494
  },
495
  {
496
+ "epoch": 1.08,
497
  "learning_rate": 0.0003,
498
+ "loss": 0.023,
499
  "step": 4000
500
  },
501
  {
502
+ "epoch": 1.09,
503
  "learning_rate": 0.0003,
504
+ "loss": 0.0171,
505
  "step": 4050
506
  },
507
  {
508
+ "epoch": 1.11,
509
  "learning_rate": 0.0003,
510
+ "loss": 0.0178,
511
  "step": 4100
512
  },
513
  {
514
+ "epoch": 1.12,
515
  "learning_rate": 0.0003,
516
+ "loss": 0.0193,
517
  "step": 4150
518
  },
519
  {
520
+ "epoch": 1.13,
521
  "learning_rate": 0.0003,
522
+ "loss": 0.0129,
523
  "step": 4200
524
  },
525
  {
526
+ "epoch": 1.15,
527
  "learning_rate": 0.0003,
528
+ "loss": 0.0172,
529
  "step": 4250
530
  },
531
  {
532
+ "epoch": 1.16,
533
  "learning_rate": 0.0003,
534
+ "loss": 0.0265,
535
  "step": 4300
536
  },
537
  {
538
+ "epoch": 1.17,
539
  "learning_rate": 0.0003,
540
+ "loss": 0.0208,
541
  "step": 4350
542
  },
543
  {
544
+ "epoch": 1.19,
545
  "learning_rate": 0.0003,
546
+ "loss": 0.0179,
547
  "step": 4400
548
  },
549
  {
550
+ "epoch": 1.2,
551
  "learning_rate": 0.0003,
552
+ "loss": 0.0142,
553
  "step": 4450
554
  },
555
  {
556
+ "epoch": 1.21,
557
  "learning_rate": 0.0003,
558
+ "loss": 0.0232,
559
  "step": 4500
560
  },
561
  {
562
+ "epoch": 1.23,
563
  "learning_rate": 0.0003,
564
+ "loss": 0.0257,
565
  "step": 4550
566
  },
567
  {
568
+ "epoch": 1.24,
569
  "learning_rate": 0.0003,
570
+ "loss": 0.0159,
571
  "step": 4600
572
  },
573
  {
574
+ "epoch": 1.25,
575
  "learning_rate": 0.0003,
576
+ "loss": 0.017,
577
  "step": 4650
578
  },
579
  {
580
+ "epoch": 1.27,
581
  "learning_rate": 0.0003,
582
+ "loss": 0.0221,
583
  "step": 4700
584
  },
585
  {
586
+ "epoch": 1.28,
587
  "learning_rate": 0.0003,
588
+ "loss": 0.0244,
589
  "step": 4750
590
  },
591
  {
592
+ "epoch": 1.29,
593
  "learning_rate": 0.0003,
594
+ "loss": 0.0222,
595
  "step": 4800
596
  },
597
  {
598
+ "epoch": 1.31,
599
  "learning_rate": 0.0003,
600
+ "loss": 0.0129,
601
  "step": 4850
602
  },
603
  {
604
+ "epoch": 1.32,
605
  "learning_rate": 0.0003,
606
+ "loss": 0.0137,
607
  "step": 4900
608
  },
609
  {
610
+ "epoch": 1.33,
611
  "learning_rate": 0.0003,
612
+ "loss": 0.01,
613
  "step": 4950
614
  },
615
  {
616
+ "epoch": 1.35,
617
  "learning_rate": 0.0003,
618
+ "loss": 0.0293,
619
  "step": 5000
620
  },
621
  {
622
+ "epoch": 1.36,
623
  "learning_rate": 0.0003,
624
+ "loss": 0.0155,
625
  "step": 5050
626
  },
627
  {
628
+ "epoch": 1.37,
629
  "learning_rate": 0.0003,
630
+ "loss": 0.0103,
631
  "step": 5100
632
  },
633
  {
634
+ "epoch": 1.39,
635
  "learning_rate": 0.0003,
636
+ "loss": 0.0104,
637
  "step": 5150
638
  },
639
  {
640
+ "epoch": 1.4,
641
  "learning_rate": 0.0003,
642
+ "loss": 0.0316,
643
  "step": 5200
644
  },
645
  {
646
+ "epoch": 1.42,
647
  "learning_rate": 0.0003,
648
+ "loss": 0.0173,
649
  "step": 5250
650
  },
651
  {
652
+ "epoch": 1.43,
653
  "learning_rate": 0.0003,
654
+ "loss": 0.0224,
655
  "step": 5300
656
  },
657
  {
658
+ "epoch": 1.44,
659
  "learning_rate": 0.0003,
660
+ "loss": 0.0181,
661
  "step": 5350
662
  },
663
  {
664
+ "epoch": 1.46,
665
  "learning_rate": 0.0003,
666
+ "loss": 0.0144,
667
  "step": 5400
668
  },
669
  {
670
+ "epoch": 1.47,
671
  "learning_rate": 0.0003,
672
+ "loss": 0.0328,
673
  "step": 5450
674
  },
675
  {
676
+ "epoch": 1.48,
677
  "learning_rate": 0.0003,
678
+ "loss": 0.0182,
679
  "step": 5500
680
  },
681
  {
682
+ "epoch": 1.5,
683
  "learning_rate": 0.0003,
684
+ "loss": 0.0162,
685
  "step": 5550
686
  },
687
  {
688
+ "epoch": 1.51,
 
 
 
 
 
 
 
 
 
 
689
  "learning_rate": 0.0003,
690
+ "loss": 0.0274,
691
  "step": 5600
692
  },
693
  {
694
+ "epoch": 1.52,
695
  "learning_rate": 0.0003,
696
+ "loss": 0.0126,
697
  "step": 5650
698
  },
699
  {
700
+ "epoch": 1.54,
701
  "learning_rate": 0.0003,
702
+ "loss": 0.0239,
703
  "step": 5700
704
  },
705
  {
706
+ "epoch": 1.55,
707
  "learning_rate": 0.0003,
708
+ "loss": 0.0138,
709
  "step": 5750
710
  },
711
  {
712
+ "epoch": 1.56,
713
  "learning_rate": 0.0003,
714
+ "loss": 0.0085,
715
  "step": 5800
716
  },
717
  {
718
+ "epoch": 1.58,
719
  "learning_rate": 0.0003,
720
+ "loss": 0.0225,
721
  "step": 5850
722
  },
723
  {
724
+ "epoch": 1.59,
725
  "learning_rate": 0.0003,
726
+ "loss": 0.0138,
727
  "step": 5900
728
  },
729
  {
730
+ "epoch": 1.6,
731
  "learning_rate": 0.0003,
732
+ "loss": 0.0245,
733
  "step": 5950
734
  },
735
  {
736
+ "epoch": 1.62,
737
  "learning_rate": 0.0003,
738
+ "loss": 0.0146,
739
  "step": 6000
740
  },
741
  {
742
+ "epoch": 1.63,
743
  "learning_rate": 0.0003,
744
+ "loss": 0.0176,
745
  "step": 6050
746
  },
747
  {
748
+ "epoch": 1.64,
749
  "learning_rate": 0.0003,
750
+ "loss": 0.008,
751
  "step": 6100
752
  },
753
  {
754
+ "epoch": 1.66,
755
  "learning_rate": 0.0003,
756
+ "loss": 0.0169,
757
  "step": 6150
758
  },
759
  {
760
+ "epoch": 1.67,
761
  "learning_rate": 0.0003,
762
+ "loss": 0.0254,
763
  "step": 6200
764
  },
765
  {
766
+ "epoch": 1.68,
767
  "learning_rate": 0.0003,
768
+ "loss": 0.0057,
769
  "step": 6250
770
  },
771
  {
772
+ "epoch": 1.7,
773
  "learning_rate": 0.0003,
774
+ "loss": 0.0159,
775
  "step": 6300
776
  },
777
  {
778
+ "epoch": 1.71,
779
  "learning_rate": 0.0003,
780
+ "loss": 0.0265,
781
  "step": 6350
782
  },
783
  {
784
+ "epoch": 1.73,
785
  "learning_rate": 0.0003,
786
+ "loss": 0.0075,
787
  "step": 6400
788
  },
789
  {
790
+ "epoch": 1.74,
791
  "learning_rate": 0.0003,
792
+ "loss": 0.0132,
793
  "step": 6450
794
  },
795
  {
796
+ "epoch": 1.75,
797
  "learning_rate": 0.0003,
798
+ "loss": 0.0095,
799
  "step": 6500
800
  },
801
  {
802
+ "epoch": 1.77,
803
  "learning_rate": 0.0003,
804
+ "loss": 0.0217,
805
  "step": 6550
806
  },
807
  {
808
+ "epoch": 1.78,
809
  "learning_rate": 0.0003,
810
+ "loss": 0.0081,
811
  "step": 6600
812
  },
813
  {
814
+ "epoch": 1.79,
815
  "learning_rate": 0.0003,
816
+ "loss": 0.0095,
817
  "step": 6650
818
  },
819
  {
820
+ "epoch": 1.81,
821
  "learning_rate": 0.0003,
822
+ "loss": 0.0176,
823
  "step": 6700
824
  },
825
  {
826
+ "epoch": 1.82,
827
  "learning_rate": 0.0003,
828
+ "loss": 0.0122,
829
  "step": 6750
830
  },
831
  {
832
+ "epoch": 1.83,
833
  "learning_rate": 0.0003,
834
+ "loss": 0.0111,
835
  "step": 6800
836
  },
837
  {
838
+ "epoch": 1.85,
839
  "learning_rate": 0.0003,
840
+ "loss": 0.026,
841
  "step": 6850
842
  },
843
  {
844
+ "epoch": 1.86,
845
  "learning_rate": 0.0003,
846
+ "loss": 0.0101,
847
  "step": 6900
848
  },
849
  {
850
+ "epoch": 1.87,
851
  "learning_rate": 0.0003,
852
+ "loss": 0.0155,
853
  "step": 6950
854
  },
855
  {
856
+ "epoch": 1.89,
857
  "learning_rate": 0.0003,
858
+ "loss": 0.0146,
859
  "step": 7000
860
  },
861
  {
862
+ "epoch": 1.9,
863
  "learning_rate": 0.0003,
864
+ "loss": 0.018,
865
  "step": 7050
866
  },
867
  {
868
+ "epoch": 1.91,
869
  "learning_rate": 0.0003,
870
+ "loss": 0.0159,
871
  "step": 7100
872
  },
873
  {
874
+ "epoch": 1.93,
875
  "learning_rate": 0.0003,
876
+ "loss": 0.0178,
877
  "step": 7150
878
  },
879
  {
880
+ "epoch": 1.94,
881
  "learning_rate": 0.0003,
882
+ "loss": 0.0151,
883
  "step": 7200
884
  },
885
  {
886
+ "epoch": 1.95,
887
  "learning_rate": 0.0003,
888
+ "loss": 0.0185,
889
  "step": 7250
890
  },
891
  {
892
+ "epoch": 1.97,
893
  "learning_rate": 0.0003,
894
+ "loss": 0.0055,
895
  "step": 7300
896
  },
897
  {
898
+ "epoch": 1.98,
899
  "learning_rate": 0.0003,
900
+ "loss": 0.0046,
901
  "step": 7350
902
  },
903
  {
904
+ "epoch": 1.99,
905
  "learning_rate": 0.0003,
906
+ "loss": 0.0093,
907
  "step": 7400
908
  },
909
  {
910
+ "epoch": 2.0,
911
+ "eval_f1": 22.2222,
912
+ "eval_gen_len": 29.002111729698598,
913
+ "eval_loss": 0.06078604236245155,
914
+ "eval_runtime": 25.7417,
915
+ "eval_samples_per_second": 809.427,
916
+ "eval_steps_per_second": 25.329,
917
+ "step": 7420
918
  },
919
  {
920
+ "epoch": 2.01,
921
  "learning_rate": 0.0003,
922
+ "loss": 0.0159,
923
  "step": 7450
924
  },
925
  {
926
+ "epoch": 2.02,
927
  "learning_rate": 0.0003,
928
+ "loss": 0.0088,
929
  "step": 7500
930
  },
931
  {
932
+ "epoch": 2.04,
933
  "learning_rate": 0.0003,
934
+ "loss": 0.0055,
935
  "step": 7550
936
  },
937
  {
938
+ "epoch": 2.05,
939
  "learning_rate": 0.0003,
940
+ "loss": 0.0074,
941
  "step": 7600
942
  },
943
  {
944
+ "epoch": 2.06,
945
  "learning_rate": 0.0003,
946
+ "loss": 0.0086,
947
  "step": 7650
948
  },
949
  {
950
+ "epoch": 2.08,
951
  "learning_rate": 0.0003,
952
+ "loss": 0.0168,
953
  "step": 7700
954
  },
955
  {
956
+ "epoch": 2.09,
957
  "learning_rate": 0.0003,
958
+ "loss": 0.0072,
959
  "step": 7750
960
  },
961
  {
962
+ "epoch": 2.1,
963
  "learning_rate": 0.0003,
964
+ "loss": 0.0038,
965
  "step": 7800
966
  },
967
  {
968
+ "epoch": 2.12,
969
  "learning_rate": 0.0003,
970
+ "loss": 0.0119,
971
  "step": 7850
972
  },
973
  {
974
+ "epoch": 2.13,
975
  "learning_rate": 0.0003,
976
+ "loss": 0.012,
977
  "step": 7900
978
  },
979
  {
980
+ "epoch": 2.14,
981
  "learning_rate": 0.0003,
982
+ "loss": 0.0053,
983
  "step": 7950
984
  },
985
  {
986
+ "epoch": 2.16,
987
  "learning_rate": 0.0003,
988
+ "loss": 0.0093,
989
  "step": 8000
990
  },
991
  {
992
+ "epoch": 2.17,
993
  "learning_rate": 0.0003,
994
+ "loss": 0.0138,
995
  "step": 8050
996
  },
997
  {
998
+ "epoch": 2.18,
999
  "learning_rate": 0.0003,
1000
+ "loss": 0.0179,
1001
  "step": 8100
1002
  },
1003
  {
1004
+ "epoch": 2.2,
1005
  "learning_rate": 0.0003,
1006
+ "loss": 0.0055,
1007
  "step": 8150
1008
  },
1009
  {
1010
+ "epoch": 2.21,
1011
  "learning_rate": 0.0003,
1012
+ "loss": 0.0039,
1013
  "step": 8200
1014
  },
1015
  {
1016
+ "epoch": 2.22,
1017
  "learning_rate": 0.0003,
1018
+ "loss": 0.0066,
1019
  "step": 8250
1020
  },
1021
  {
1022
+ "epoch": 2.24,
1023
  "learning_rate": 0.0003,
1024
+ "loss": 0.0079,
1025
  "step": 8300
1026
  },
1027
  {
1028
+ "epoch": 2.25,
1029
  "learning_rate": 0.0003,
1030
+ "loss": 0.0042,
1031
  "step": 8350
1032
  },
1033
  {
1034
+ "epoch": 2.26,
1035
  "learning_rate": 0.0003,
1036
+ "loss": 0.004,
1037
  "step": 8400
1038
  },
1039
  {
1040
+ "epoch": 2.28,
1041
  "learning_rate": 0.0003,
1042
+ "loss": 0.0108,
1043
  "step": 8450
1044
  },
1045
  {
1046
+ "epoch": 2.29,
1047
  "learning_rate": 0.0003,
1048
+ "loss": 0.0077,
1049
  "step": 8500
1050
  },
1051
  {
1052
+ "epoch": 2.3,
1053
  "learning_rate": 0.0003,
1054
+ "loss": 0.0115,
1055
  "step": 8550
1056
  },
1057
  {
1058
+ "epoch": 2.32,
1059
  "learning_rate": 0.0003,
1060
+ "loss": 0.0067,
1061
  "step": 8600
1062
  },
1063
  {
1064
+ "epoch": 2.33,
1065
  "learning_rate": 0.0003,
1066
+ "loss": 0.0172,
1067
  "step": 8650
1068
  },
1069
  {
1070
+ "epoch": 2.35,
1071
  "learning_rate": 0.0003,
1072
+ "loss": 0.0039,
1073
  "step": 8700
1074
  },
1075
  {
1076
+ "epoch": 2.36,
1077
  "learning_rate": 0.0003,
1078
+ "loss": 0.0067,
1079
  "step": 8750
1080
  },
1081
  {
1082
+ "epoch": 2.37,
1083
  "learning_rate": 0.0003,
1084
+ "loss": 0.0051,
1085
  "step": 8800
1086
  },
1087
  {
1088
+ "epoch": 2.39,
1089
  "learning_rate": 0.0003,
1090
+ "loss": 0.0163,
1091
  "step": 8850
1092
  },
1093
  {
1094
+ "epoch": 2.4,
1095
  "learning_rate": 0.0003,
1096
+ "loss": 0.0077,
1097
  "step": 8900
1098
  },
1099
  {
1100
+ "epoch": 2.41,
1101
  "learning_rate": 0.0003,
1102
+ "loss": 0.0096,
1103
  "step": 8950
1104
  },
1105
  {
1106
+ "epoch": 2.43,
1107
  "learning_rate": 0.0003,
1108
+ "loss": 0.0044,
1109
  "step": 9000
1110
  },
1111
  {
1112
+ "epoch": 2.44,
1113
  "learning_rate": 0.0003,
1114
+ "loss": 0.0028,
1115
  "step": 9050
1116
  },
1117
  {
1118
+ "epoch": 2.45,
1119
  "learning_rate": 0.0003,
1120
+ "loss": 0.0094,
1121
  "step": 9100
1122
  },
1123
  {
1124
+ "epoch": 2.47,
1125
  "learning_rate": 0.0003,
1126
+ "loss": 0.0034,
1127
  "step": 9150
1128
  },
1129
  {
1130
+ "epoch": 2.48,
1131
  "learning_rate": 0.0003,
1132
+ "loss": 0.0097,
1133
  "step": 9200
1134
  },
1135
  {
1136
+ "epoch": 2.49,
1137
  "learning_rate": 0.0003,
1138
+ "loss": 0.0018,
1139
  "step": 9250
1140
  },
1141
+ {
1142
+ "epoch": 2.51,
1143
+ "learning_rate": 0.0003,
1144
+ "loss": 0.0036,
1145
+ "step": 9300
1146
+ },
1147
+ {
1148
+ "epoch": 2.52,
1149
+ "learning_rate": 0.0003,
1150
+ "loss": 0.0083,
1151
+ "step": 9350
1152
+ },
1153
+ {
1154
+ "epoch": 2.53,
1155
+ "learning_rate": 0.0003,
1156
+ "loss": 0.0044,
1157
+ "step": 9400
1158
+ },
1159
+ {
1160
+ "epoch": 2.55,
1161
+ "learning_rate": 0.0003,
1162
+ "loss": 0.0088,
1163
+ "step": 9450
1164
+ },
1165
+ {
1166
+ "epoch": 2.56,
1167
+ "learning_rate": 0.0003,
1168
+ "loss": 0.0071,
1169
+ "step": 9500
1170
+ },
1171
+ {
1172
+ "epoch": 2.57,
1173
+ "learning_rate": 0.0003,
1174
+ "loss": 0.0124,
1175
+ "step": 9550
1176
+ },
1177
+ {
1178
+ "epoch": 2.59,
1179
+ "learning_rate": 0.0003,
1180
+ "loss": 0.0044,
1181
+ "step": 9600
1182
+ },
1183
+ {
1184
+ "epoch": 2.6,
1185
+ "learning_rate": 0.0003,
1186
+ "loss": 0.0084,
1187
+ "step": 9650
1188
+ },
1189
+ {
1190
+ "epoch": 2.61,
1191
+ "learning_rate": 0.0003,
1192
+ "loss": 0.0088,
1193
+ "step": 9700
1194
+ },
1195
+ {
1196
+ "epoch": 2.63,
1197
+ "learning_rate": 0.0003,
1198
+ "loss": 0.011,
1199
+ "step": 9750
1200
+ },
1201
+ {
1202
+ "epoch": 2.64,
1203
+ "learning_rate": 0.0003,
1204
+ "loss": 0.0068,
1205
+ "step": 9800
1206
+ },
1207
+ {
1208
+ "epoch": 2.65,
1209
+ "learning_rate": 0.0003,
1210
+ "loss": 0.0084,
1211
+ "step": 9850
1212
+ },
1213
+ {
1214
+ "epoch": 2.67,
1215
+ "learning_rate": 0.0003,
1216
+ "loss": 0.0079,
1217
+ "step": 9900
1218
+ },
1219
+ {
1220
+ "epoch": 2.68,
1221
+ "learning_rate": 0.0003,
1222
+ "loss": 0.0104,
1223
+ "step": 9950
1224
+ },
1225
+ {
1226
+ "epoch": 2.7,
1227
+ "learning_rate": 0.0003,
1228
+ "loss": 0.0047,
1229
+ "step": 10000
1230
+ },
1231
+ {
1232
+ "epoch": 2.71,
1233
+ "learning_rate": 0.0003,
1234
+ "loss": 0.0132,
1235
+ "step": 10050
1236
+ },
1237
+ {
1238
+ "epoch": 2.72,
1239
+ "learning_rate": 0.0003,
1240
+ "loss": 0.0066,
1241
+ "step": 10100
1242
+ },
1243
+ {
1244
+ "epoch": 2.74,
1245
+ "learning_rate": 0.0003,
1246
+ "loss": 0.0108,
1247
+ "step": 10150
1248
+ },
1249
+ {
1250
+ "epoch": 2.75,
1251
+ "learning_rate": 0.0003,
1252
+ "loss": 0.0085,
1253
+ "step": 10200
1254
+ },
1255
+ {
1256
+ "epoch": 2.76,
1257
+ "learning_rate": 0.0003,
1258
+ "loss": 0.0164,
1259
+ "step": 10250
1260
+ },
1261
+ {
1262
+ "epoch": 2.78,
1263
+ "learning_rate": 0.0003,
1264
+ "loss": 0.0135,
1265
+ "step": 10300
1266
+ },
1267
+ {
1268
+ "epoch": 2.79,
1269
+ "learning_rate": 0.0003,
1270
+ "loss": 0.0106,
1271
+ "step": 10350
1272
+ },
1273
+ {
1274
+ "epoch": 2.8,
1275
+ "learning_rate": 0.0003,
1276
+ "loss": 0.0063,
1277
+ "step": 10400
1278
+ },
1279
+ {
1280
+ "epoch": 2.82,
1281
+ "learning_rate": 0.0003,
1282
+ "loss": 0.0035,
1283
+ "step": 10450
1284
+ },
1285
+ {
1286
+ "epoch": 2.83,
1287
+ "learning_rate": 0.0003,
1288
+ "loss": 0.0078,
1289
+ "step": 10500
1290
+ },
1291
+ {
1292
+ "epoch": 2.84,
1293
+ "learning_rate": 0.0003,
1294
+ "loss": 0.0188,
1295
+ "step": 10550
1296
+ },
1297
+ {
1298
+ "epoch": 2.86,
1299
+ "learning_rate": 0.0003,
1300
+ "loss": 0.0092,
1301
+ "step": 10600
1302
+ },
1303
+ {
1304
+ "epoch": 2.87,
1305
+ "learning_rate": 0.0003,
1306
+ "loss": 0.0079,
1307
+ "step": 10650
1308
+ },
1309
+ {
1310
+ "epoch": 2.88,
1311
+ "learning_rate": 0.0003,
1312
+ "loss": 0.0059,
1313
+ "step": 10700
1314
+ },
1315
+ {
1316
+ "epoch": 2.9,
1317
+ "learning_rate": 0.0003,
1318
+ "loss": 0.006,
1319
+ "step": 10750
1320
+ },
1321
+ {
1322
+ "epoch": 2.91,
1323
+ "learning_rate": 0.0003,
1324
+ "loss": 0.0061,
1325
+ "step": 10800
1326
+ },
1327
+ {
1328
+ "epoch": 2.92,
1329
+ "learning_rate": 0.0003,
1330
+ "loss": 0.0104,
1331
+ "step": 10850
1332
+ },
1333
+ {
1334
+ "epoch": 2.94,
1335
+ "learning_rate": 0.0003,
1336
+ "loss": 0.0084,
1337
+ "step": 10900
1338
+ },
1339
+ {
1340
+ "epoch": 2.95,
1341
+ "learning_rate": 0.0003,
1342
+ "loss": 0.0093,
1343
+ "step": 10950
1344
+ },
1345
+ {
1346
+ "epoch": 2.96,
1347
+ "learning_rate": 0.0003,
1348
+ "loss": 0.0053,
1349
+ "step": 11000
1350
+ },
1351
+ {
1352
+ "epoch": 2.98,
1353
+ "learning_rate": 0.0003,
1354
+ "loss": 0.0109,
1355
+ "step": 11050
1356
+ },
1357
+ {
1358
+ "epoch": 2.99,
1359
+ "learning_rate": 0.0003,
1360
+ "loss": 0.0084,
1361
+ "step": 11100
1362
+ },
1363
+ {
1364
+ "epoch": 3.0,
1365
+ "eval_f1": 22.2222,
1366
+ "eval_gen_len": 29.002111729698598,
1367
+ "eval_loss": 0.07568035274744034,
1368
+ "eval_runtime": 25.7985,
1369
+ "eval_samples_per_second": 807.643,
1370
+ "eval_steps_per_second": 25.273,
1371
+ "step": 11130
1372
+ },
1373
+ {
1374
+ "epoch": 3.01,
1375
+ "learning_rate": 0.0003,
1376
+ "loss": 0.0069,
1377
+ "step": 11150
1378
+ },
1379
+ {
1380
+ "epoch": 3.02,
1381
+ "learning_rate": 0.0003,
1382
+ "loss": 0.0035,
1383
+ "step": 11200
1384
+ },
1385
+ {
1386
+ "epoch": 3.03,
1387
+ "learning_rate": 0.0003,
1388
+ "loss": 0.0057,
1389
+ "step": 11250
1390
+ },
1391
+ {
1392
+ "epoch": 3.05,
1393
+ "learning_rate": 0.0003,
1394
+ "loss": 0.0041,
1395
+ "step": 11300
1396
+ },
1397
+ {
1398
+ "epoch": 3.06,
1399
+ "learning_rate": 0.0003,
1400
+ "loss": 0.0047,
1401
+ "step": 11350
1402
+ },
1403
+ {
1404
+ "epoch": 3.07,
1405
+ "learning_rate": 0.0003,
1406
+ "loss": 0.0025,
1407
+ "step": 11400
1408
+ },
1409
+ {
1410
+ "epoch": 3.09,
1411
+ "learning_rate": 0.0003,
1412
+ "loss": 0.006,
1413
+ "step": 11450
1414
+ },
1415
+ {
1416
+ "epoch": 3.1,
1417
+ "learning_rate": 0.0003,
1418
+ "loss": 0.0033,
1419
+ "step": 11500
1420
+ },
1421
+ {
1422
+ "epoch": 3.11,
1423
+ "learning_rate": 0.0003,
1424
+ "loss": 0.0046,
1425
+ "step": 11550
1426
+ },
1427
+ {
1428
+ "epoch": 3.13,
1429
+ "learning_rate": 0.0003,
1430
+ "loss": 0.0065,
1431
+ "step": 11600
1432
+ },
1433
+ {
1434
+ "epoch": 3.14,
1435
+ "learning_rate": 0.0003,
1436
+ "loss": 0.0058,
1437
+ "step": 11650
1438
+ },
1439
+ {
1440
+ "epoch": 3.15,
1441
+ "learning_rate": 0.0003,
1442
+ "loss": 0.0056,
1443
+ "step": 11700
1444
+ },
1445
+ {
1446
+ "epoch": 3.17,
1447
+ "learning_rate": 0.0003,
1448
+ "loss": 0.0094,
1449
+ "step": 11750
1450
+ },
1451
+ {
1452
+ "epoch": 3.18,
1453
+ "learning_rate": 0.0003,
1454
+ "loss": 0.0035,
1455
+ "step": 11800
1456
+ },
1457
+ {
1458
+ "epoch": 3.19,
1459
+ "learning_rate": 0.0003,
1460
+ "loss": 0.0016,
1461
+ "step": 11850
1462
+ },
1463
+ {
1464
+ "epoch": 3.21,
1465
+ "learning_rate": 0.0003,
1466
+ "loss": 0.0047,
1467
+ "step": 11900
1468
+ },
1469
+ {
1470
+ "epoch": 3.22,
1471
+ "learning_rate": 0.0003,
1472
+ "loss": 0.0054,
1473
+ "step": 11950
1474
+ },
1475
+ {
1476
+ "epoch": 3.23,
1477
+ "learning_rate": 0.0003,
1478
+ "loss": 0.0022,
1479
+ "step": 12000
1480
+ },
1481
+ {
1482
+ "epoch": 3.25,
1483
+ "learning_rate": 0.0003,
1484
+ "loss": 0.0052,
1485
+ "step": 12050
1486
+ },
1487
+ {
1488
+ "epoch": 3.26,
1489
+ "learning_rate": 0.0003,
1490
+ "loss": 0.0046,
1491
+ "step": 12100
1492
+ },
1493
+ {
1494
+ "epoch": 3.27,
1495
+ "learning_rate": 0.0003,
1496
+ "loss": 0.0091,
1497
+ "step": 12150
1498
+ },
1499
+ {
1500
+ "epoch": 3.29,
1501
+ "learning_rate": 0.0003,
1502
+ "loss": 0.0053,
1503
+ "step": 12200
1504
+ },
1505
+ {
1506
+ "epoch": 3.3,
1507
+ "learning_rate": 0.0003,
1508
+ "loss": 0.0081,
1509
+ "step": 12250
1510
+ },
1511
+ {
1512
+ "epoch": 3.32,
1513
+ "learning_rate": 0.0003,
1514
+ "loss": 0.0088,
1515
+ "step": 12300
1516
+ },
1517
+ {
1518
+ "epoch": 3.33,
1519
+ "learning_rate": 0.0003,
1520
+ "loss": 0.0086,
1521
+ "step": 12350
1522
+ },
1523
+ {
1524
+ "epoch": 3.34,
1525
+ "learning_rate": 0.0003,
1526
+ "loss": 0.0083,
1527
+ "step": 12400
1528
+ },
1529
+ {
1530
+ "epoch": 3.36,
1531
+ "learning_rate": 0.0003,
1532
+ "loss": 0.0032,
1533
+ "step": 12450
1534
+ },
1535
+ {
1536
+ "epoch": 3.37,
1537
+ "learning_rate": 0.0003,
1538
+ "loss": 0.0084,
1539
+ "step": 12500
1540
+ },
1541
+ {
1542
+ "epoch": 3.38,
1543
+ "learning_rate": 0.0003,
1544
+ "loss": 0.0067,
1545
+ "step": 12550
1546
+ },
1547
+ {
1548
+ "epoch": 3.4,
1549
+ "learning_rate": 0.0003,
1550
+ "loss": 0.0071,
1551
+ "step": 12600
1552
+ },
1553
+ {
1554
+ "epoch": 3.41,
1555
+ "learning_rate": 0.0003,
1556
+ "loss": 0.0062,
1557
+ "step": 12650
1558
+ },
1559
+ {
1560
+ "epoch": 3.42,
1561
+ "learning_rate": 0.0003,
1562
+ "loss": 0.0089,
1563
+ "step": 12700
1564
+ },
1565
+ {
1566
+ "epoch": 3.44,
1567
+ "learning_rate": 0.0003,
1568
+ "loss": 0.0043,
1569
+ "step": 12750
1570
+ },
1571
+ {
1572
+ "epoch": 3.45,
1573
+ "learning_rate": 0.0003,
1574
+ "loss": 0.0075,
1575
+ "step": 12800
1576
+ },
1577
+ {
1578
+ "epoch": 3.46,
1579
+ "learning_rate": 0.0003,
1580
+ "loss": 0.0055,
1581
+ "step": 12850
1582
+ },
1583
+ {
1584
+ "epoch": 3.48,
1585
+ "learning_rate": 0.0003,
1586
+ "loss": 0.0044,
1587
+ "step": 12900
1588
+ },
1589
+ {
1590
+ "epoch": 3.49,
1591
+ "learning_rate": 0.0003,
1592
+ "loss": 0.0063,
1593
+ "step": 12950
1594
+ },
1595
+ {
1596
+ "epoch": 3.5,
1597
+ "learning_rate": 0.0003,
1598
+ "loss": 0.0105,
1599
+ "step": 13000
1600
+ },
1601
+ {
1602
+ "epoch": 3.52,
1603
+ "learning_rate": 0.0003,
1604
+ "loss": 0.0088,
1605
+ "step": 13050
1606
+ },
1607
+ {
1608
+ "epoch": 3.53,
1609
+ "learning_rate": 0.0003,
1610
+ "loss": 0.0108,
1611
+ "step": 13100
1612
+ },
1613
+ {
1614
+ "epoch": 3.54,
1615
+ "learning_rate": 0.0003,
1616
+ "loss": 0.003,
1617
+ "step": 13150
1618
+ },
1619
+ {
1620
+ "epoch": 3.56,
1621
+ "learning_rate": 0.0003,
1622
+ "loss": 0.0064,
1623
+ "step": 13200
1624
+ },
1625
+ {
1626
+ "epoch": 3.57,
1627
+ "learning_rate": 0.0003,
1628
+ "loss": 0.0083,
1629
+ "step": 13250
1630
+ },
1631
+ {
1632
+ "epoch": 3.58,
1633
+ "learning_rate": 0.0003,
1634
+ "loss": 0.0024,
1635
+ "step": 13300
1636
+ },
1637
+ {
1638
+ "epoch": 3.6,
1639
+ "learning_rate": 0.0003,
1640
+ "loss": 0.0038,
1641
+ "step": 13350
1642
+ },
1643
+ {
1644
+ "epoch": 3.61,
1645
+ "learning_rate": 0.0003,
1646
+ "loss": 0.0122,
1647
+ "step": 13400
1648
+ },
1649
+ {
1650
+ "epoch": 3.63,
1651
+ "learning_rate": 0.0003,
1652
+ "loss": 0.0126,
1653
+ "step": 13450
1654
+ },
1655
+ {
1656
+ "epoch": 3.64,
1657
+ "learning_rate": 0.0003,
1658
+ "loss": 0.0035,
1659
+ "step": 13500
1660
+ },
1661
+ {
1662
+ "epoch": 3.65,
1663
+ "learning_rate": 0.0003,
1664
+ "loss": 0.0056,
1665
+ "step": 13550
1666
+ },
1667
+ {
1668
+ "epoch": 3.67,
1669
+ "learning_rate": 0.0003,
1670
+ "loss": 0.0034,
1671
+ "step": 13600
1672
+ },
1673
+ {
1674
+ "epoch": 3.68,
1675
+ "learning_rate": 0.0003,
1676
+ "loss": 0.003,
1677
+ "step": 13650
1678
+ },
1679
+ {
1680
+ "epoch": 3.69,
1681
+ "learning_rate": 0.0003,
1682
+ "loss": 0.0034,
1683
+ "step": 13700
1684
+ },
1685
+ {
1686
+ "epoch": 3.71,
1687
+ "learning_rate": 0.0003,
1688
+ "loss": 0.0089,
1689
+ "step": 13750
1690
+ },
1691
+ {
1692
+ "epoch": 3.72,
1693
+ "learning_rate": 0.0003,
1694
+ "loss": 0.0065,
1695
+ "step": 13800
1696
+ },
1697
+ {
1698
+ "epoch": 3.73,
1699
+ "learning_rate": 0.0003,
1700
+ "loss": 0.0038,
1701
+ "step": 13850
1702
+ },
1703
+ {
1704
+ "epoch": 3.75,
1705
+ "learning_rate": 0.0003,
1706
+ "loss": 0.0046,
1707
+ "step": 13900
1708
+ },
1709
+ {
1710
+ "epoch": 3.76,
1711
+ "learning_rate": 0.0003,
1712
+ "loss": 0.0041,
1713
+ "step": 13950
1714
+ },
1715
+ {
1716
+ "epoch": 3.77,
1717
+ "learning_rate": 0.0003,
1718
+ "loss": 0.0063,
1719
+ "step": 14000
1720
+ },
1721
+ {
1722
+ "epoch": 3.79,
1723
+ "learning_rate": 0.0003,
1724
+ "loss": 0.0072,
1725
+ "step": 14050
1726
+ },
1727
+ {
1728
+ "epoch": 3.8,
1729
+ "learning_rate": 0.0003,
1730
+ "loss": 0.0025,
1731
+ "step": 14100
1732
+ },
1733
+ {
1734
+ "epoch": 3.81,
1735
+ "learning_rate": 0.0003,
1736
+ "loss": 0.0037,
1737
+ "step": 14150
1738
+ },
1739
+ {
1740
+ "epoch": 3.83,
1741
+ "learning_rate": 0.0003,
1742
+ "loss": 0.0067,
1743
+ "step": 14200
1744
+ },
1745
+ {
1746
+ "epoch": 3.84,
1747
+ "learning_rate": 0.0003,
1748
+ "loss": 0.0016,
1749
+ "step": 14250
1750
+ },
1751
+ {
1752
+ "epoch": 3.85,
1753
+ "learning_rate": 0.0003,
1754
+ "loss": 0.0053,
1755
+ "step": 14300
1756
+ },
1757
+ {
1758
+ "epoch": 3.87,
1759
+ "learning_rate": 0.0003,
1760
+ "loss": 0.0084,
1761
+ "step": 14350
1762
+ },
1763
+ {
1764
+ "epoch": 3.88,
1765
+ "learning_rate": 0.0003,
1766
+ "loss": 0.0054,
1767
+ "step": 14400
1768
+ },
1769
+ {
1770
+ "epoch": 3.89,
1771
+ "learning_rate": 0.0003,
1772
+ "loss": 0.0045,
1773
+ "step": 14450
1774
+ },
1775
+ {
1776
+ "epoch": 3.91,
1777
+ "learning_rate": 0.0003,
1778
+ "loss": 0.0028,
1779
+ "step": 14500
1780
+ },
1781
+ {
1782
+ "epoch": 3.92,
1783
+ "learning_rate": 0.0003,
1784
+ "loss": 0.0081,
1785
+ "step": 14550
1786
+ },
1787
+ {
1788
+ "epoch": 3.94,
1789
+ "learning_rate": 0.0003,
1790
+ "loss": 0.0054,
1791
+ "step": 14600
1792
+ },
1793
+ {
1794
+ "epoch": 3.95,
1795
+ "learning_rate": 0.0003,
1796
+ "loss": 0.0038,
1797
+ "step": 14650
1798
+ },
1799
+ {
1800
+ "epoch": 3.96,
1801
+ "learning_rate": 0.0003,
1802
+ "loss": 0.0051,
1803
+ "step": 14700
1804
+ },
1805
+ {
1806
+ "epoch": 3.98,
1807
+ "learning_rate": 0.0003,
1808
+ "loss": 0.007,
1809
+ "step": 14750
1810
+ },
1811
+ {
1812
+ "epoch": 3.99,
1813
+ "learning_rate": 0.0003,
1814
+ "loss": 0.0046,
1815
+ "step": 14800
1816
+ },
1817
+ {
1818
+ "epoch": 4.0,
1819
+ "eval_f1": 22.2222,
1820
+ "eval_gen_len": 29.002111729698598,
1821
+ "eval_loss": 0.07932917773723602,
1822
+ "eval_runtime": 25.7288,
1823
+ "eval_samples_per_second": 809.832,
1824
+ "eval_steps_per_second": 25.341,
1825
+ "step": 14840
1826
+ },
1827
+ {
1828
+ "epoch": 4.0,
1829
+ "learning_rate": 0.0003,
1830
+ "loss": 0.007,
1831
+ "step": 14850
1832
+ },
1833
+ {
1834
+ "epoch": 4.02,
1835
+ "learning_rate": 0.0003,
1836
+ "loss": 0.0036,
1837
+ "step": 14900
1838
+ },
1839
+ {
1840
+ "epoch": 4.03,
1841
+ "learning_rate": 0.0003,
1842
+ "loss": 0.0087,
1843
+ "step": 14950
1844
+ },
1845
+ {
1846
+ "epoch": 4.04,
1847
+ "learning_rate": 0.0003,
1848
+ "loss": 0.0094,
1849
+ "step": 15000
1850
+ },
1851
+ {
1852
+ "epoch": 4.06,
1853
+ "learning_rate": 0.0003,
1854
+ "loss": 0.0023,
1855
+ "step": 15050
1856
+ },
1857
+ {
1858
+ "epoch": 4.07,
1859
+ "learning_rate": 0.0003,
1860
+ "loss": 0.0068,
1861
+ "step": 15100
1862
+ },
1863
+ {
1864
+ "epoch": 4.08,
1865
+ "learning_rate": 0.0003,
1866
+ "loss": 0.0022,
1867
+ "step": 15150
1868
+ },
1869
+ {
1870
+ "epoch": 4.1,
1871
+ "learning_rate": 0.0003,
1872
+ "loss": 0.01,
1873
+ "step": 15200
1874
+ },
1875
+ {
1876
+ "epoch": 4.11,
1877
+ "learning_rate": 0.0003,
1878
+ "loss": 0.0047,
1879
+ "step": 15250
1880
+ },
1881
+ {
1882
+ "epoch": 4.12,
1883
+ "learning_rate": 0.0003,
1884
+ "loss": 0.0086,
1885
+ "step": 15300
1886
+ },
1887
+ {
1888
+ "epoch": 4.14,
1889
+ "learning_rate": 0.0003,
1890
+ "loss": 0.0048,
1891
+ "step": 15350
1892
+ },
1893
+ {
1894
+ "epoch": 4.15,
1895
+ "learning_rate": 0.0003,
1896
+ "loss": 0.0025,
1897
+ "step": 15400
1898
+ },
1899
+ {
1900
+ "epoch": 4.16,
1901
+ "learning_rate": 0.0003,
1902
+ "loss": 0.0031,
1903
+ "step": 15450
1904
+ },
1905
+ {
1906
+ "epoch": 4.18,
1907
+ "learning_rate": 0.0003,
1908
+ "loss": 0.0132,
1909
+ "step": 15500
1910
+ },
1911
+ {
1912
+ "epoch": 4.19,
1913
+ "learning_rate": 0.0003,
1914
+ "loss": 0.0035,
1915
+ "step": 15550
1916
+ },
1917
+ {
1918
+ "epoch": 4.2,
1919
+ "learning_rate": 0.0003,
1920
+ "loss": 0.0022,
1921
+ "step": 15600
1922
+ },
1923
+ {
1924
+ "epoch": 4.22,
1925
+ "learning_rate": 0.0003,
1926
+ "loss": 0.0055,
1927
+ "step": 15650
1928
+ },
1929
+ {
1930
+ "epoch": 4.23,
1931
+ "learning_rate": 0.0003,
1932
+ "loss": 0.0028,
1933
+ "step": 15700
1934
+ },
1935
+ {
1936
+ "epoch": 4.25,
1937
+ "learning_rate": 0.0003,
1938
+ "loss": 0.0062,
1939
+ "step": 15750
1940
+ },
1941
+ {
1942
+ "epoch": 4.26,
1943
+ "learning_rate": 0.0003,
1944
+ "loss": 0.001,
1945
+ "step": 15800
1946
+ },
1947
+ {
1948
+ "epoch": 4.27,
1949
+ "learning_rate": 0.0003,
1950
+ "loss": 0.006,
1951
+ "step": 15850
1952
+ },
1953
+ {
1954
+ "epoch": 4.29,
1955
+ "learning_rate": 0.0003,
1956
+ "loss": 0.0026,
1957
+ "step": 15900
1958
+ },
1959
+ {
1960
+ "epoch": 4.3,
1961
+ "learning_rate": 0.0003,
1962
+ "loss": 0.0042,
1963
+ "step": 15950
1964
+ },
1965
+ {
1966
+ "epoch": 4.31,
1967
+ "learning_rate": 0.0003,
1968
+ "loss": 0.0012,
1969
+ "step": 16000
1970
+ },
1971
+ {
1972
+ "epoch": 4.33,
1973
+ "learning_rate": 0.0003,
1974
+ "loss": 0.0037,
1975
+ "step": 16050
1976
+ },
1977
+ {
1978
+ "epoch": 4.34,
1979
+ "learning_rate": 0.0003,
1980
+ "loss": 0.0037,
1981
+ "step": 16100
1982
+ },
1983
+ {
1984
+ "epoch": 4.35,
1985
+ "learning_rate": 0.0003,
1986
+ "loss": 0.003,
1987
+ "step": 16150
1988
+ },
1989
+ {
1990
+ "epoch": 4.37,
1991
+ "learning_rate": 0.0003,
1992
+ "loss": 0.0055,
1993
+ "step": 16200
1994
+ },
1995
+ {
1996
+ "epoch": 4.38,
1997
+ "learning_rate": 0.0003,
1998
+ "loss": 0.0028,
1999
+ "step": 16250
2000
+ },
2001
+ {
2002
+ "epoch": 4.39,
2003
+ "learning_rate": 0.0003,
2004
+ "loss": 0.0015,
2005
+ "step": 16300
2006
+ },
2007
+ {
2008
+ "epoch": 4.41,
2009
+ "learning_rate": 0.0003,
2010
+ "loss": 0.0034,
2011
+ "step": 16350
2012
+ },
2013
+ {
2014
+ "epoch": 4.42,
2015
+ "learning_rate": 0.0003,
2016
+ "loss": 0.0082,
2017
+ "step": 16400
2018
+ },
2019
+ {
2020
+ "epoch": 4.43,
2021
+ "learning_rate": 0.0003,
2022
+ "loss": 0.0039,
2023
+ "step": 16450
2024
+ },
2025
+ {
2026
+ "epoch": 4.45,
2027
+ "learning_rate": 0.0003,
2028
+ "loss": 0.0028,
2029
+ "step": 16500
2030
+ },
2031
+ {
2032
+ "epoch": 4.46,
2033
+ "learning_rate": 0.0003,
2034
+ "loss": 0.0032,
2035
+ "step": 16550
2036
+ },
2037
+ {
2038
+ "epoch": 4.47,
2039
+ "learning_rate": 0.0003,
2040
+ "loss": 0.0017,
2041
+ "step": 16600
2042
+ },
2043
+ {
2044
+ "epoch": 4.49,
2045
+ "learning_rate": 0.0003,
2046
+ "loss": 0.0028,
2047
+ "step": 16650
2048
+ },
2049
+ {
2050
+ "epoch": 4.5,
2051
+ "learning_rate": 0.0003,
2052
+ "loss": 0.0084,
2053
+ "step": 16700
2054
+ },
2055
+ {
2056
+ "epoch": 4.51,
2057
+ "learning_rate": 0.0003,
2058
+ "loss": 0.0042,
2059
+ "step": 16750
2060
+ },
2061
+ {
2062
+ "epoch": 4.53,
2063
+ "learning_rate": 0.0003,
2064
+ "loss": 0.014,
2065
+ "step": 16800
2066
+ },
2067
+ {
2068
+ "epoch": 4.54,
2069
+ "learning_rate": 0.0003,
2070
+ "loss": 0.0046,
2071
+ "step": 16850
2072
+ },
2073
+ {
2074
+ "epoch": 4.56,
2075
+ "learning_rate": 0.0003,
2076
+ "loss": 0.0043,
2077
+ "step": 16900
2078
+ },
2079
+ {
2080
+ "epoch": 4.57,
2081
+ "learning_rate": 0.0003,
2082
+ "loss": 0.0029,
2083
+ "step": 16950
2084
+ },
2085
+ {
2086
+ "epoch": 4.58,
2087
+ "learning_rate": 0.0003,
2088
+ "loss": 0.008,
2089
+ "step": 17000
2090
+ },
2091
+ {
2092
+ "epoch": 4.6,
2093
+ "learning_rate": 0.0003,
2094
+ "loss": 0.0014,
2095
+ "step": 17050
2096
+ },
2097
+ {
2098
+ "epoch": 4.61,
2099
+ "learning_rate": 0.0003,
2100
+ "loss": 0.004,
2101
+ "step": 17100
2102
+ },
2103
+ {
2104
+ "epoch": 4.62,
2105
+ "learning_rate": 0.0003,
2106
+ "loss": 0.0012,
2107
+ "step": 17150
2108
+ },
2109
+ {
2110
+ "epoch": 4.64,
2111
+ "learning_rate": 0.0003,
2112
+ "loss": 0.0023,
2113
+ "step": 17200
2114
+ },
2115
+ {
2116
+ "epoch": 4.65,
2117
+ "learning_rate": 0.0003,
2118
+ "loss": 0.0036,
2119
+ "step": 17250
2120
+ },
2121
+ {
2122
+ "epoch": 4.66,
2123
+ "learning_rate": 0.0003,
2124
+ "loss": 0.0082,
2125
+ "step": 17300
2126
+ },
2127
+ {
2128
+ "epoch": 4.68,
2129
+ "learning_rate": 0.0003,
2130
+ "loss": 0.0014,
2131
+ "step": 17350
2132
+ },
2133
+ {
2134
+ "epoch": 4.69,
2135
+ "learning_rate": 0.0003,
2136
+ "loss": 0.0114,
2137
+ "step": 17400
2138
+ },
2139
+ {
2140
+ "epoch": 4.7,
2141
+ "learning_rate": 0.0003,
2142
+ "loss": 0.0087,
2143
+ "step": 17450
2144
+ },
2145
+ {
2146
+ "epoch": 4.72,
2147
+ "learning_rate": 0.0003,
2148
+ "loss": 0.0029,
2149
+ "step": 17500
2150
+ },
2151
+ {
2152
+ "epoch": 4.73,
2153
+ "learning_rate": 0.0003,
2154
+ "loss": 0.0043,
2155
+ "step": 17550
2156
+ },
2157
+ {
2158
+ "epoch": 4.74,
2159
+ "learning_rate": 0.0003,
2160
+ "loss": 0.0017,
2161
+ "step": 17600
2162
+ },
2163
+ {
2164
+ "epoch": 4.76,
2165
+ "learning_rate": 0.0003,
2166
+ "loss": 0.0013,
2167
+ "step": 17650
2168
+ },
2169
+ {
2170
+ "epoch": 4.77,
2171
+ "learning_rate": 0.0003,
2172
+ "loss": 0.0058,
2173
+ "step": 17700
2174
+ },
2175
+ {
2176
+ "epoch": 4.78,
2177
+ "learning_rate": 0.0003,
2178
+ "loss": 0.0019,
2179
+ "step": 17750
2180
+ },
2181
+ {
2182
+ "epoch": 4.8,
2183
+ "learning_rate": 0.0003,
2184
+ "loss": 0.002,
2185
+ "step": 17800
2186
+ },
2187
+ {
2188
+ "epoch": 4.81,
2189
+ "learning_rate": 0.0003,
2190
+ "loss": 0.0043,
2191
+ "step": 17850
2192
+ },
2193
+ {
2194
+ "epoch": 4.82,
2195
+ "learning_rate": 0.0003,
2196
+ "loss": 0.0088,
2197
+ "step": 17900
2198
+ },
2199
+ {
2200
+ "epoch": 4.84,
2201
+ "learning_rate": 0.0003,
2202
+ "loss": 0.0042,
2203
+ "step": 17950
2204
+ },
2205
+ {
2206
+ "epoch": 4.85,
2207
+ "learning_rate": 0.0003,
2208
+ "loss": 0.0011,
2209
+ "step": 18000
2210
+ },
2211
+ {
2212
+ "epoch": 4.87,
2213
+ "learning_rate": 0.0003,
2214
+ "loss": 0.0078,
2215
+ "step": 18050
2216
+ },
2217
+ {
2218
+ "epoch": 4.88,
2219
+ "learning_rate": 0.0003,
2220
+ "loss": 0.0038,
2221
+ "step": 18100
2222
+ },
2223
+ {
2224
+ "epoch": 4.89,
2225
+ "learning_rate": 0.0003,
2226
+ "loss": 0.0041,
2227
+ "step": 18150
2228
+ },
2229
+ {
2230
+ "epoch": 4.91,
2231
+ "learning_rate": 0.0003,
2232
+ "loss": 0.0038,
2233
+ "step": 18200
2234
+ },
2235
+ {
2236
+ "epoch": 4.92,
2237
+ "learning_rate": 0.0003,
2238
+ "loss": 0.0023,
2239
+ "step": 18250
2240
+ },
2241
+ {
2242
+ "epoch": 4.93,
2243
+ "learning_rate": 0.0003,
2244
+ "loss": 0.0057,
2245
+ "step": 18300
2246
+ },
2247
+ {
2248
+ "epoch": 4.95,
2249
+ "learning_rate": 0.0003,
2250
+ "loss": 0.0016,
2251
+ "step": 18350
2252
+ },
2253
+ {
2254
+ "epoch": 4.96,
2255
+ "learning_rate": 0.0003,
2256
+ "loss": 0.0038,
2257
+ "step": 18400
2258
+ },
2259
+ {
2260
+ "epoch": 4.97,
2261
+ "learning_rate": 0.0003,
2262
+ "loss": 0.0028,
2263
+ "step": 18450
2264
+ },
2265
+ {
2266
+ "epoch": 4.99,
2267
+ "learning_rate": 0.0003,
2268
+ "loss": 0.0062,
2269
+ "step": 18500
2270
+ },
2271
+ {
2272
+ "epoch": 5.0,
2273
+ "learning_rate": 0.0003,
2274
+ "loss": 0.0006,
2275
+ "step": 18550
2276
+ },
2277
  {
2278
  "epoch": 5.0,
2279
+ "eval_f1": 22.2222,
2280
+ "eval_gen_len": 29.002111729698598,
2281
+ "eval_loss": 0.07783858478069305,
2282
+ "eval_runtime": 26.143,
2283
+ "eval_samples_per_second": 797.002,
2284
+ "eval_steps_per_second": 24.94,
2285
+ "step": 18550
2286
  }
2287
  ],
2288
  "logging_steps": 50,
2289
+ "max_steps": 18550,
2290
  "num_input_tokens_seen": 0,
2291
  "num_train_epochs": 5,
2292
  "save_steps": 250,
2293
+ "total_flos": 4.952289149190144e+16,
2294
  "train_batch_size": 16,
2295
  "trial_name": null,
2296
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef81e5dadc06d664ae506ceffc8549372fd7dc72551122e78c5b48a870640a4b
3
  size 6392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8de0a3cb935cd1993c01f2615f8441862dd8c80558d743da924534bed67f647e
3
  size 6392