evgmaslov commited on
Commit
22db74a
·
verified ·
1 Parent(s): 09d685f

Training in progress, step 200, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5ccc5f230db8fbb6c8874aa1e2dea60a382cb35e56f6f6fea075fff9d2349092
3
  size 268470320
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e000cfefe0bf68021926c7a726ea44f6f239a89d11f212894f3df5cf62087281
3
  size 268470320
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9bf20484fec9d82ceb75c3299919bd3b5f0df9cb071b00eeef1293b9b2012e7
3
  size 537086714
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d29aff00e13c30c6d77d2e80d0c7ab951ca92186da88481c9d12d83a4ad5a16a
3
  size 537086714
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b87cdb6a9ead0e67aaeb4a1b6260ad68c7c43537b6cabdd238d80e9961300a1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c473ea3ff1721307adcbd4476da1ce7d0c4c9e424349c0a43ee1e92df077688
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27c319142739f779869d5d897bd8b39e3b51026a651fe387362d0aa393419cac
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a090745596d6e223b0bc60dfcc2571a3fb6eeb72f08755a28bb1f86ee72f1c2
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.225806451612903,
5
  "eval_steps": 1,
6
- "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -114,6 +114,113 @@
114
  "loss": 1.0564,
115
  "mean_token_accuracy": 0.7449640461376735,
116
  "step": 100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  }
118
  ],
119
  "logging_steps": 10,
@@ -133,7 +240,7 @@
133
  "attributes": {}
134
  }
135
  },
136
- "total_flos": 3.079986034256118e+17,
137
  "train_batch_size": 16,
138
  "trial_name": null,
139
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 6.451612903225806,
5
  "eval_steps": 1,
6
+ "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
114
  "loss": 1.0564,
115
  "mean_token_accuracy": 0.7449640461376735,
116
  "step": 100
117
+ },
118
+ {
119
+ "epoch": 3.5483870967741935,
120
+ "grad_norm": 0.050828784704208374,
121
+ "learning_rate": 7.682741885881314e-06,
122
+ "loss": 1.0597,
123
+ "mean_token_accuracy": 0.7440615892410278,
124
+ "step": 110
125
+ },
126
+ {
127
+ "epoch": 3.870967741935484,
128
+ "grad_norm": 0.04859774559736252,
129
+ "learning_rate": 7.217431291229068e-06,
130
+ "loss": 1.029,
131
+ "mean_token_accuracy": 0.7496151030063629,
132
+ "step": 120
133
+ },
134
+ {
135
+ "epoch": 4.0,
136
+ "eval_loss": 1.4347164630889893,
137
+ "eval_mean_token_accuracy": 0.7422454118728637,
138
+ "eval_runtime": 0.3857,
139
+ "eval_samples_per_second": 12.964,
140
+ "eval_steps_per_second": 2.593,
141
+ "step": 124
142
+ },
143
+ {
144
+ "epoch": 4.193548387096774,
145
+ "grad_norm": 0.046365246176719666,
146
+ "learning_rate": 6.726825272106539e-06,
147
+ "loss": 0.9981,
148
+ "mean_token_accuracy": 0.7524743378162384,
149
+ "step": 130
150
+ },
151
+ {
152
+ "epoch": 4.516129032258064,
153
+ "grad_norm": 0.046276628971099854,
154
+ "learning_rate": 6.216520433716544e-06,
155
+ "loss": 1.012,
156
+ "mean_token_accuracy": 0.750678151845932,
157
+ "step": 140
158
+ },
159
+ {
160
+ "epoch": 4.838709677419355,
161
+ "grad_norm": 0.04600201174616814,
162
+ "learning_rate": 5.69233809622687e-06,
163
+ "loss": 1.0037,
164
+ "mean_token_accuracy": 0.7513074278831482,
165
+ "step": 150
166
+ },
167
+ {
168
+ "epoch": 5.0,
169
+ "eval_loss": 1.3713712692260742,
170
+ "eval_mean_token_accuracy": 0.750474234422048,
171
+ "eval_runtime": 0.3857,
172
+ "eval_samples_per_second": 12.963,
173
+ "eval_steps_per_second": 2.593,
174
+ "step": 155
175
+ },
176
+ {
177
+ "epoch": 5.161290322580645,
178
+ "grad_norm": 0.0451810322701931,
179
+ "learning_rate": 5.160257887858278e-06,
180
+ "loss": 0.9743,
181
+ "mean_token_accuracy": 0.7553885579109192,
182
+ "step": 160
183
+ },
184
+ {
185
+ "epoch": 5.483870967741936,
186
+ "grad_norm": 0.044044021517038345,
187
+ "learning_rate": 4.626349532067879e-06,
188
+ "loss": 0.9805,
189
+ "mean_token_accuracy": 0.7564027309417725,
190
+ "step": 170
191
+ },
192
+ {
193
+ "epoch": 5.806451612903226,
194
+ "grad_norm": 0.0453786626458168,
195
+ "learning_rate": 4.096703606968007e-06,
196
+ "loss": 0.971,
197
+ "mean_token_accuracy": 0.7588037550449371,
198
+ "step": 180
199
+ },
200
+ {
201
+ "epoch": 6.0,
202
+ "eval_loss": 1.3281458616256714,
203
+ "eval_mean_token_accuracy": 0.7527484212602887,
204
+ "eval_runtime": 0.3849,
205
+ "eval_samples_per_second": 12.991,
206
+ "eval_steps_per_second": 2.598,
207
+ "step": 186
208
+ },
209
+ {
210
+ "epoch": 6.129032258064516,
211
+ "grad_norm": 0.04323386028409004,
212
+ "learning_rate": 3.5773620668448384e-06,
213
+ "loss": 0.9539,
214
+ "mean_token_accuracy": 0.764067068696022,
215
+ "step": 190
216
+ },
217
+ {
218
+ "epoch": 6.451612903225806,
219
+ "grad_norm": 0.04353416711091995,
220
+ "learning_rate": 3.074249318355046e-06,
221
+ "loss": 0.9733,
222
+ "mean_token_accuracy": 0.756243884563446,
223
+ "step": 200
224
  }
225
  ],
226
  "logging_steps": 10,
 
240
  "attributes": {}
241
  }
242
  },
243
+ "total_flos": 6.159972068512236e+17,
244
  "train_batch_size": 16,
245
  "trial_name": null,
246
  "trial_params": null