{
  "best_metric": 78.05994502395909,
  "best_model_checkpoint": "/root/turkic_qa/ru_kaz_models/ru_kaz_xlm_roberta_base_squad_model/checkpoint-5520",
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 5520,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "step": 552,
      "train_exact_match": 56.34365634365634,
      "train_f1": 75.2715442869406,
      "train_runtime": 11.3355,
      "train_samples_per_second": 89.63,
      "train_steps_per_second": 3.264
    },
    {
      "epoch": 1.0,
      "grad_norm": 50.90898132324219,
      "learning_rate": 5e-06,
      "loss": 1.5648,
      "step": 552
    },
    {
      "epoch": 1.0,
      "eval_exact_match": 55.0625,
      "eval_f1": 73.73760370127802,
      "eval_runtime": 36.5495,
      "eval_samples_per_second": 89.769,
      "eval_steps_per_second": 3.229,
      "step": 552
    },
    {
      "epoch": 2.0,
      "step": 1104,
      "train_exact_match": 66.93306693306694,
      "train_f1": 82.49314886706897,
      "train_runtime": 11.3179,
      "train_samples_per_second": 90.83,
      "train_steps_per_second": 3.269
    },
    {
      "epoch": 2.0,
      "grad_norm": 55.97765350341797,
      "learning_rate": 1e-05,
      "loss": 1.2158,
      "step": 1104
    },
    {
      "epoch": 2.0,
      "eval_exact_match": 58.84375,
      "eval_f1": 76.49071665255673,
      "eval_runtime": 36.2092,
      "eval_samples_per_second": 90.612,
      "eval_steps_per_second": 3.259,
      "step": 1104
    },
    {
      "epoch": 3.0,
      "step": 1656,
      "train_exact_match": 67.53246753246754,
      "train_f1": 85.19091955779871,
      "train_runtime": 11.374,
      "train_samples_per_second": 89.415,
      "train_steps_per_second": 3.253
    },
    {
      "epoch": 3.0,
      "grad_norm": 57.926448822021484,
      "learning_rate": 8.750000000000001e-06,
      "loss": 1.028,
      "step": 1656
    },
    {
      "epoch": 3.0,
      "eval_exact_match": 60.0,
      "eval_f1": 77.43391533868831,
      "eval_runtime": 36.533,
      "eval_samples_per_second": 89.809,
      "eval_steps_per_second": 3.23,
      "step": 1656
    },
    {
      "epoch": 4.0,
      "step": 2208,
      "train_exact_match": 74.22577422577423,
      "train_f1": 89.15395273332221,
      "train_runtime": 11.3619,
      "train_samples_per_second": 89.246,
      "train_steps_per_second": 3.256
    },
    {
      "epoch": 4.0,
      "grad_norm": 38.46842956542969,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.8582,
      "step": 2208
    },
    {
      "epoch": 4.0,
      "eval_exact_match": 60.625,
      "eval_f1": 77.72152196665945,
      "eval_runtime": 36.465,
      "eval_samples_per_second": 89.977,
      "eval_steps_per_second": 3.236,
      "step": 2208
    },
    {
      "epoch": 5.0,
      "step": 2760,
      "train_exact_match": 77.12287712287713,
      "train_f1": 90.14502448136521,
      "train_runtime": 11.5016,
      "train_samples_per_second": 88.857,
      "train_steps_per_second": 3.217
    },
    {
      "epoch": 5.0,
      "grad_norm": 14.753046989440918,
      "learning_rate": 6.25e-06,
      "loss": 0.7281,
      "step": 2760
    },
    {
      "epoch": 5.0,
      "eval_exact_match": 60.8125,
      "eval_f1": 77.62510042626374,
      "eval_runtime": 36.4524,
      "eval_samples_per_second": 90.008,
      "eval_steps_per_second": 3.237,
      "step": 2760
    },
    {
      "epoch": 6.0,
      "step": 3312,
      "train_exact_match": 80.21978021978022,
      "train_f1": 92.23780908752109,
      "train_runtime": 11.7336,
      "train_samples_per_second": 87.101,
      "train_steps_per_second": 3.153
    },
    {
      "epoch": 6.0,
      "grad_norm": 23.36362075805664,
      "learning_rate": 5e-06,
      "loss": 0.6422,
      "step": 3312
    },
    {
      "epoch": 6.0,
      "eval_exact_match": 61.28125,
      "eval_f1": 77.91507488796293,
      "eval_runtime": 37.4038,
      "eval_samples_per_second": 87.718,
      "eval_steps_per_second": 3.155,
      "step": 3312
    },
    {
      "epoch": 7.0,
      "step": 3864,
      "train_exact_match": 81.81818181818181,
      "train_f1": 93.62881852209286,
      "train_runtime": 11.6143,
      "train_samples_per_second": 88.512,
      "train_steps_per_second": 3.186
    },
    {
      "epoch": 7.0,
      "grad_norm": 21.251754760742188,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.5646,
      "step": 3864
    },
    {
      "epoch": 7.0,
      "eval_exact_match": 61.1875,
      "eval_f1": 77.88113274187097,
      "eval_runtime": 36.8966,
      "eval_samples_per_second": 88.924,
      "eval_steps_per_second": 3.198,
      "step": 3864
    },
    {
      "epoch": 8.0,
      "step": 4416,
      "train_exact_match": 84.31568431568432,
      "train_f1": 93.90553661277687,
      "train_runtime": 11.4894,
      "train_samples_per_second": 89.126,
      "train_steps_per_second": 3.22
    },
    {
      "epoch": 8.0,
      "grad_norm": 31.64153289794922,
      "learning_rate": 2.5e-06,
      "loss": 0.5156,
      "step": 4416
    },
    {
      "epoch": 8.0,
      "eval_exact_match": 60.71875,
      "eval_f1": 77.56837719428701,
      "eval_runtime": 36.6849,
      "eval_samples_per_second": 89.437,
      "eval_steps_per_second": 3.217,
      "step": 4416
    },
    {
      "epoch": 9.0,
      "step": 4968,
      "train_exact_match": 85.11488511488511,
      "train_f1": 94.79917066124965,
      "train_runtime": 11.5536,
      "train_samples_per_second": 89.496,
      "train_steps_per_second": 3.202
    },
    {
      "epoch": 9.0,
      "grad_norm": 46.01136779785156,
      "learning_rate": 1.25e-06,
      "loss": 0.4658,
      "step": 4968
    },
    {
      "epoch": 9.0,
      "eval_exact_match": 61.125,
      "eval_f1": 77.88301584766332,
      "eval_runtime": 36.5869,
      "eval_samples_per_second": 89.677,
      "eval_steps_per_second": 3.225,
      "step": 4968
    },
    {
      "epoch": 10.0,
      "step": 5520,
      "train_exact_match": 84.51548451548452,
      "train_f1": 94.34363228489812,
      "train_runtime": 11.395,
      "train_samples_per_second": 90.479,
      "train_steps_per_second": 3.247
    },
    {
      "epoch": 10.0,
      "grad_norm": 84.91465759277344,
      "learning_rate": 0.0,
      "loss": 0.4488,
      "step": 5520
    },
    {
      "epoch": 10.0,
      "eval_exact_match": 61.0625,
      "eval_f1": 78.05994502395909,
      "eval_runtime": 36.4836,
      "eval_samples_per_second": 89.931,
      "eval_steps_per_second": 3.234,
      "step": 5520
    },
    {
      "epoch": 10.0,
      "step": 5520,
      "total_flos": 3.025228525300224e+16,
      "train_loss": 0.8031932664954144,
      "train_runtime": 3531.0877,
      "train_samples_per_second": 43.717,
      "train_steps_per_second": 1.563
    }
  ],
  "logging_steps": 500,
  "max_steps": 5520,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "total_flos": 3.025228525300224e+16,
  "train_batch_size": 28,
  "trial_name": null,
  "trial_params": null
}