|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.964394532950291, |
|
"global_step": 100000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00010077565027123787, |
|
"loss": 0.4732, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00010075885246660077, |
|
"loss": 0.4445, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00010073086073069567, |
|
"loss": 0.4329, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0001006916819700645, |
|
"loss": 0.4256, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.0001006414377109213, |
|
"loss": 0.421, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 0.00010057993897463803, |
|
"loss": 0.4167, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 0.00010050729045131372, |
|
"loss": 0.4132, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 0.00010042351006588448, |
|
"loss": 0.4107, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 0.00010032861848990674, |
|
"loss": 0.4091, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 0.0001002228621438404, |
|
"loss": 0.4067, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 0.3877949118614197, |
|
"eval_runtime": 108.0468, |
|
"eval_samples_per_second": 46.276, |
|
"eval_steps_per_second": 0.731, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.00010010584325708389, |
|
"loss": 0.4038, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.997779155931062e-05, |
|
"loss": 0.402, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 9.983873864536092e-05, |
|
"loss": 0.401, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 9.968871882446063e-05, |
|
"loss": 0.4001, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 9.952776911175577e-05, |
|
"loss": 0.3987, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 9.935592921917959e-05, |
|
"loss": 0.3974, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 9.917361771757108e-05, |
|
"loss": 0.3954, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 9.898014889719868e-05, |
|
"loss": 0.3945, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 9.877592501404869e-05, |
|
"loss": 0.394, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 9.856099645730841e-05, |
|
"loss": 0.3926, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 0.37431180477142334, |
|
"eval_runtime": 106.4392, |
|
"eval_samples_per_second": 46.975, |
|
"eval_steps_per_second": 0.742, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 9.833541625738316e-05, |
|
"loss": 0.3929, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 9.809924007281187e-05, |
|
"loss": 0.3906, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 9.78525261765341e-05, |
|
"loss": 0.3899, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 9.759533544151208e-05, |
|
"loss": 0.3892, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 9.732773132571125e-05, |
|
"loss": 0.3889, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"learning_rate": 9.705034604088048e-05, |
|
"loss": 0.3865, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"learning_rate": 9.676213628592508e-05, |
|
"loss": 0.3865, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"learning_rate": 9.64637187296151e-05, |
|
"loss": 0.3867, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"learning_rate": 9.615516700201724e-05, |
|
"loss": 0.3858, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"learning_rate": 9.583720443927501e-05, |
|
"loss": 0.3853, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"eval_loss": 0.36506548523902893, |
|
"eval_runtime": 96.1419, |
|
"eval_samples_per_second": 52.006, |
|
"eval_steps_per_second": 0.822, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 9.550863512110018e-05, |
|
"loss": 0.3844, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 9.517016728422667e-05, |
|
"loss": 0.3834, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 9.482188444052858e-05, |
|
"loss": 0.3826, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 9.446387252358614e-05, |
|
"loss": 0.3824, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 9.40962198674828e-05, |
|
"loss": 0.3825, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 9.371901718501017e-05, |
|
"loss": 0.3804, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 9.333314023958391e-05, |
|
"loss": 0.3803, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 9.293713767144707e-05, |
|
"loss": 0.381, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 9.253187106312908e-05, |
|
"loss": 0.3793, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 9.211744040810141e-05, |
|
"loss": 0.3793, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 0.35968872904777527, |
|
"eval_runtime": 95.0235, |
|
"eval_samples_per_second": 52.619, |
|
"eval_steps_per_second": 0.831, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 9.169394796092694e-05, |
|
"loss": 0.4465, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 9.126149821203012e-05, |
|
"loss": 0.444, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 9.082019786191564e-05, |
|
"loss": 0.4435, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 9.037106452976014e-05, |
|
"loss": 0.4422, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 8.991240893598493e-05, |
|
"loss": 0.4446, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 8.944523560858406e-05, |
|
"loss": 0.4418, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"learning_rate": 8.89696598155856e-05, |
|
"loss": 0.4412, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"learning_rate": 8.848579889820028e-05, |
|
"loss": 0.4414, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"learning_rate": 8.799377224186928e-05, |
|
"loss": 0.4407, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"learning_rate": 8.749470933536528e-05, |
|
"loss": 0.4402, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"eval_loss": 0.42174017429351807, |
|
"eval_runtime": 91.5901, |
|
"eval_samples_per_second": 54.591, |
|
"eval_steps_per_second": 0.863, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"learning_rate": 8.698673310407346e-05, |
|
"loss": 0.4396, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"learning_rate": 8.647096100588967e-05, |
|
"loss": 0.4381, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"learning_rate": 8.594752029986149e-05, |
|
"loss": 0.4382, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"learning_rate": 8.54165401371507e-05, |
|
"loss": 0.4386, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"learning_rate": 8.487815152916725e-05, |
|
"loss": 0.4376, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"learning_rate": 8.433248731524403e-05, |
|
"loss": 0.4361, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"learning_rate": 8.377968212986092e-05, |
|
"loss": 0.4369, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"learning_rate": 8.321987236942559e-05, |
|
"loss": 0.4367, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"learning_rate": 8.265433627110443e-05, |
|
"loss": 0.4345, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 8.20809467414187e-05, |
|
"loss": 0.4349, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"eval_loss": 0.41873496770858765, |
|
"eval_runtime": 67.332, |
|
"eval_samples_per_second": 74.259, |
|
"eval_steps_per_second": 1.173, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"learning_rate": 8.150097177421913e-05, |
|
"loss": 0.4351, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"learning_rate": 8.091455446965055e-05, |
|
"loss": 0.4346, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 8.032183951740807e-05, |
|
"loss": 0.4348, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"learning_rate": 7.972417693488813e-05, |
|
"loss": 0.4339, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"learning_rate": 7.911931879453039e-05, |
|
"loss": 0.4321, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"learning_rate": 7.850860595403895e-05, |
|
"loss": 0.4327, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 7.789218909766887e-05, |
|
"loss": 0.432, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"learning_rate": 7.727022031705582e-05, |
|
"loss": 0.432, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"learning_rate": 7.664537309168187e-05, |
|
"loss": 0.4318, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"learning_rate": 7.601278284364346e-05, |
|
"loss": 0.4311, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"eval_loss": 0.4138348698616028, |
|
"eval_runtime": 71.9462, |
|
"eval_samples_per_second": 69.496, |
|
"eval_steps_per_second": 1.098, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"learning_rate": 7.53751043867331e-05, |
|
"loss": 0.4304, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"learning_rate": 7.473249505856825e-05, |
|
"loss": 0.43, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"learning_rate": 7.408511341338508e-05, |
|
"loss": 0.4295, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"learning_rate": 7.343311918291752e-05, |
|
"loss": 0.43, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 7.277667323698597e-05, |
|
"loss": 0.43, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"learning_rate": 7.211593754380497e-05, |
|
"loss": 0.429, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"learning_rate": 7.145107513002001e-05, |
|
"loss": 0.4289, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"learning_rate": 7.078225004048314e-05, |
|
"loss": 0.4275, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"learning_rate": 7.011097622316453e-05, |
|
"loss": 0.428, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"learning_rate": 6.943472888404397e-05, |
|
"loss": 0.4275, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"eval_loss": 0.4106511175632477, |
|
"eval_runtime": 94.0729, |
|
"eval_samples_per_second": 53.15, |
|
"eval_steps_per_second": 0.84, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"learning_rate": 6.875501637243646e-05, |
|
"loss": 0.4269, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"learning_rate": 6.807200639723347e-05, |
|
"loss": 0.4256, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"learning_rate": 6.738586748092632e-05, |
|
"loss": 0.4273, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"learning_rate": 6.669676891802565e-05, |
|
"loss": 0.4255, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"learning_rate": 6.600626718034563e-05, |
|
"loss": 0.4255, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"learning_rate": 6.531176515384238e-05, |
|
"loss": 0.4265, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"learning_rate": 6.461621146309398e-05, |
|
"loss": 0.4257, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"learning_rate": 6.39169899926586e-05, |
|
"loss": 0.4242, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"learning_rate": 6.321566476924404e-05, |
|
"loss": 0.4236, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"learning_rate": 6.2512408834355e-05, |
|
"loss": 0.4245, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"eval_loss": 0.40728819370269775, |
|
"eval_runtime": 61.7603, |
|
"eval_samples_per_second": 80.958, |
|
"eval_steps_per_second": 1.279, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"learning_rate": 6.180739570587037e-05, |
|
"loss": 0.4251, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"learning_rate": 6.110079933523028e-05, |
|
"loss": 0.4228, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"learning_rate": 6.039279406451614e-05, |
|
"loss": 0.4228, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"learning_rate": 5.968355458343437e-05, |
|
"loss": 0.422, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"learning_rate": 5.89732558862144e-05, |
|
"loss": 0.4224, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"learning_rate": 5.826207322843129e-05, |
|
"loss": 0.4221, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"learning_rate": 5.755160645640033e-05, |
|
"loss": 0.4215, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"learning_rate": 5.6839183363622005e-05, |
|
"loss": 0.4212, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"learning_rate": 5.612640286074268e-05, |
|
"loss": 0.4204, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"learning_rate": 5.5413440815685914e-05, |
|
"loss": 0.4213, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"eval_loss": 0.4042368531227112, |
|
"eval_runtime": 65.4775, |
|
"eval_samples_per_second": 76.362, |
|
"eval_steps_per_second": 1.207, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"learning_rate": 5.4701898965206674e-05, |
|
"loss": 0.4199, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"learning_rate": 5.398910105922212e-05, |
|
"loss": 0.4199, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"learning_rate": 5.327807340052843e-05, |
|
"loss": 0.4194, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"learning_rate": 5.25661416730201e-05, |
|
"loss": 0.4189, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"learning_rate": 5.185490684450904e-05, |
|
"loss": 0.4188, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"learning_rate": 5.114454440154674e-05, |
|
"loss": 0.4199, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"learning_rate": 5.043522961543667e-05, |
|
"loss": 0.4186, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"learning_rate": 4.972713749898855e-05, |
|
"loss": 0.418, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 4.902185464226454e-05, |
|
"loss": 0.419, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"learning_rate": 4.831672833649923e-05, |
|
"loss": 0.418, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"eval_loss": 0.40235766768455505, |
|
"eval_runtime": 50.0847, |
|
"eval_samples_per_second": 99.831, |
|
"eval_steps_per_second": 1.577, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"learning_rate": 4.761334740888664e-05, |
|
"loss": 0.4173, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"learning_rate": 4.691188540814576e-05, |
|
"loss": 0.4176, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"learning_rate": 4.621391194692349e-05, |
|
"loss": 0.4166, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"learning_rate": 4.551680180854116e-05, |
|
"loss": 0.4182, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"learning_rate": 4.482212788828615e-05, |
|
"loss": 0.4169, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"learning_rate": 4.413006158655363e-05, |
|
"loss": 0.4158, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"learning_rate": 4.3440773660347955e-05, |
|
"loss": 0.4151, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 4.275443418115087e-05, |
|
"loss": 0.4146, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"learning_rate": 4.207121249295896e-05, |
|
"loss": 0.4152, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"learning_rate": 4.139127717050051e-05, |
|
"loss": 0.4143, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_loss": 0.39910149574279785, |
|
"eval_runtime": 73.8959, |
|
"eval_samples_per_second": 67.663, |
|
"eval_steps_per_second": 1.069, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"learning_rate": 4.0714795977642255e-05, |
|
"loss": 0.4153, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"learning_rate": 4.004327782200046e-05, |
|
"loss": 0.4146, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"learning_rate": 3.937419699055001e-05, |
|
"loss": 0.4138, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"learning_rate": 3.8709067973051936e-05, |
|
"loss": 0.4137, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"learning_rate": 3.804805488013876e-05, |
|
"loss": 0.4131, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"learning_rate": 3.739132080690021e-05, |
|
"loss": 0.4141, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"learning_rate": 3.673902779264194e-05, |
|
"loss": 0.4123, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"learning_rate": 3.609133678090475e-05, |
|
"loss": 0.4126, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"learning_rate": 3.544840757975416e-05, |
|
"loss": 0.4124, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"learning_rate": 3.481039882235021e-05, |
|
"loss": 0.4124, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"eval_loss": 0.3964463174343109, |
|
"eval_runtime": 74.8784, |
|
"eval_samples_per_second": 66.775, |
|
"eval_steps_per_second": 1.055, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"learning_rate": 3.4177467927807086e-05, |
|
"loss": 0.412, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"learning_rate": 3.3549771062352226e-05, |
|
"loss": 0.4128, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"learning_rate": 3.2928702236310036e-05, |
|
"loss": 0.411, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"learning_rate": 3.231192548659413e-05, |
|
"loss": 0.4108, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"learning_rate": 3.170084306064917e-05, |
|
"loss": 0.4106, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 5.18, |
|
"learning_rate": 3.1096810275663516e-05, |
|
"loss": 0.4112, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"learning_rate": 3.0497555244481445e-05, |
|
"loss": 0.4106, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 5.27, |
|
"learning_rate": 2.9904442205778553e-05, |
|
"loss": 0.4106, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 5.31, |
|
"learning_rate": 2.9317617501322188e-05, |
|
"loss": 0.4104, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"learning_rate": 2.873722592132734e-05, |
|
"loss": 0.4096, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 5.35, |
|
"eval_loss": 0.3943169414997101, |
|
"eval_runtime": 90.0414, |
|
"eval_samples_per_second": 55.53, |
|
"eval_steps_per_second": 0.877, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"learning_rate": 2.816341066873177e-05, |
|
"loss": 0.4098, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"learning_rate": 2.75974407207991e-05, |
|
"loss": 0.4089, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"learning_rate": 2.7037187352256104e-05, |
|
"loss": 0.4089, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"learning_rate": 2.6483929770191072e-05, |
|
"loss": 0.4098, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"learning_rate": 2.5937804482633846e-05, |
|
"loss": 0.4098, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 5.61, |
|
"learning_rate": 2.5400016613135313e-05, |
|
"loss": 0.4082, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 5.66, |
|
"learning_rate": 2.486854343481112e-05, |
|
"loss": 0.4086, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 2.4344601123401418e-05, |
|
"loss": 0.408, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"learning_rate": 2.3828318953831648e-05, |
|
"loss": 0.409, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"learning_rate": 2.3320833444621267e-05, |
|
"loss": 0.4079, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"eval_loss": 0.39236390590667725, |
|
"eval_runtime": 57.2971, |
|
"eval_samples_per_second": 87.264, |
|
"eval_steps_per_second": 1.379, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 5.83, |
|
"learning_rate": 2.2820235842075488e-05, |
|
"loss": 0.4076, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"learning_rate": 2.232767449569324e-05, |
|
"loss": 0.408, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"learning_rate": 2.1843270937616385e-05, |
|
"loss": 0.4075, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"learning_rate": 2.136808860049628e-05, |
|
"loss": 0.4075, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"learning_rate": 2.090034022927794e-05, |
|
"loss": 0.4083, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"learning_rate": 2.044110181973758e-05, |
|
"loss": 0.4072, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 6.09, |
|
"learning_rate": 1.9990486682081012e-05, |
|
"loss": 0.4066, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 6.14, |
|
"learning_rate": 1.9549480970403115e-05, |
|
"loss": 0.4061, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"learning_rate": 1.911642597471847e-05, |
|
"loss": 0.4062, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"learning_rate": 1.869232109486083e-05, |
|
"loss": 0.4071, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 6.22, |
|
"eval_loss": 0.39048197865486145, |
|
"eval_runtime": 72.8664, |
|
"eval_samples_per_second": 68.619, |
|
"eval_steps_per_second": 1.084, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"learning_rate": 1.8277270972363276e-05, |
|
"loss": 0.4063, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 6.31, |
|
"learning_rate": 1.787137801463301e-05, |
|
"loss": 0.4054, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 6.35, |
|
"learning_rate": 1.7474742369683822e-05, |
|
"loss": 0.4058, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 1.7087461901426146e-05, |
|
"loss": 0.4046, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 6.44, |
|
"learning_rate": 1.670963216552051e-05, |
|
"loss": 0.4057, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 6.48, |
|
"learning_rate": 1.634134638580067e-05, |
|
"loss": 0.4044, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 6.53, |
|
"learning_rate": 1.5982695431271973e-05, |
|
"loss": 0.4059, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"learning_rate": 1.563514402048906e-05, |
|
"loss": 0.4044, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"learning_rate": 1.52959863865947e-05, |
|
"loss": 0.4042, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"learning_rate": 1.496672150482924e-05, |
|
"loss": 0.4046, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 6.66, |
|
"eval_loss": 0.38925543427467346, |
|
"eval_runtime": 77.3103, |
|
"eval_samples_per_second": 64.674, |
|
"eval_steps_per_second": 1.022, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 6.7, |
|
"learning_rate": 1.4647430616375366e-05, |
|
"loss": 0.4042, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"learning_rate": 1.433819250148215e-05, |
|
"loss": 0.4049, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 6.79, |
|
"learning_rate": 1.4039083460027203e-05, |
|
"loss": 0.4041, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 6.83, |
|
"learning_rate": 1.3750177292690856e-05, |
|
"loss": 0.4038, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"learning_rate": 1.347154528274688e-05, |
|
"loss": 0.4047, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"learning_rate": 1.3203782390038048e-05, |
|
"loss": 0.4036, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"learning_rate": 1.2945881505195709e-05, |
|
"loss": 0.4044, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"learning_rate": 1.2698453225696373e-05, |
|
"loss": 0.4045, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"learning_rate": 1.2461558600765676e-05, |
|
"loss": 0.4031, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"learning_rate": 1.2235256080607583e-05, |
|
"loss": 0.4026, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 7.09, |
|
"eval_loss": 0.3881285488605499, |
|
"eval_runtime": 133.2371, |
|
"eval_samples_per_second": 37.527, |
|
"eval_steps_per_second": 0.593, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 7.14, |
|
"learning_rate": 1.20196015019827e-05, |
|
"loss": 0.4036, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 7.18, |
|
"learning_rate": 1.1815047267485115e-05, |
|
"loss": 0.4037, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 7.22, |
|
"learning_rate": 1.162082400805151e-05, |
|
"loss": 0.4048, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"learning_rate": 1.1437400292071077e-05, |
|
"loss": 0.4032, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"learning_rate": 1.126482137660111e-05, |
|
"loss": 0.4032, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 7.36, |
|
"learning_rate": 1.1103129842906643e-05, |
|
"loss": 0.4029, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"learning_rate": 1.0952365585954172e-05, |
|
"loss": 0.4023, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"learning_rate": 1.0812565804568168e-05, |
|
"loss": 0.4027, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 7.49, |
|
"learning_rate": 1.0683764992252818e-05, |
|
"loss": 0.4022, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"learning_rate": 1.0566219440353348e-05, |
|
"loss": 0.4027, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 7.53, |
|
"eval_loss": 0.38710081577301025, |
|
"eval_runtime": 65.4468, |
|
"eval_samples_per_second": 76.398, |
|
"eval_steps_per_second": 1.207, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 7.57, |
|
"learning_rate": 1.0459487036725839e-05, |
|
"loss": 0.4031, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"learning_rate": 1.0363840719071717e-05, |
|
"loss": 0.4023, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"learning_rate": 1.0279462058166865e-05, |
|
"loss": 0.4025, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 7.7, |
|
"learning_rate": 1.0206033689125313e-05, |
|
"loss": 0.4028, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"learning_rate": 1.0143753941878168e-05, |
|
"loss": 0.4023, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"learning_rate": 1.0092638183021144e-05, |
|
"loss": 0.4019, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"learning_rate": 1.0052699024602892e-05, |
|
"loss": 0.4014, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"learning_rate": 1.0023946321013112e-05, |
|
"loss": 0.4018, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"learning_rate": 1.0006411110358469e-05, |
|
"loss": 0.4024, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"learning_rate": 1.0000027439232365e-05, |
|
"loss": 0.4024, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 7.96, |
|
"eval_loss": 0.38628044724464417, |
|
"eval_runtime": 91.7648, |
|
"eval_samples_per_second": 54.487, |
|
"eval_steps_per_second": 0.861, |
|
"step": 100000 |
|
} |
|
], |
|
"max_steps": 100000, |
|
"num_train_epochs": 9, |
|
"total_flos": 4.7098350596970145e+21, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|