{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.964394532950291, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 0.00010077565027123787, "loss": 0.4732, "step": 500 }, { "epoch": 0.09, "learning_rate": 0.00010075885246660077, "loss": 0.4445, "step": 1000 }, { "epoch": 0.13, "learning_rate": 0.00010073086073069567, "loss": 0.4329, "step": 1500 }, { "epoch": 0.17, "learning_rate": 0.0001006916819700645, "loss": 0.4256, "step": 2000 }, { "epoch": 0.22, "learning_rate": 0.0001006414377109213, "loss": 0.421, "step": 2500 }, { "epoch": 0.26, "learning_rate": 0.00010057993897463803, "loss": 0.4167, "step": 3000 }, { "epoch": 0.3, "learning_rate": 0.00010050729045131372, "loss": 0.4132, "step": 3500 }, { "epoch": 0.35, "learning_rate": 0.00010042351006588448, "loss": 0.4107, "step": 4000 }, { "epoch": 0.39, "learning_rate": 0.00010032861848990674, "loss": 0.4091, "step": 4500 }, { "epoch": 0.44, "learning_rate": 0.0001002228621438404, "loss": 0.4067, "step": 5000 }, { "epoch": 0.44, "eval_loss": 0.3877949118614197, "eval_runtime": 108.0468, "eval_samples_per_second": 46.276, "eval_steps_per_second": 0.731, "step": 5000 }, { "epoch": 0.48, "learning_rate": 0.00010010584325708389, "loss": 0.4038, "step": 5500 }, { "epoch": 0.52, "learning_rate": 9.997779155931062e-05, "loss": 0.402, "step": 6000 }, { "epoch": 0.57, "learning_rate": 9.983873864536092e-05, "loss": 0.401, "step": 6500 }, { "epoch": 0.61, "learning_rate": 9.968871882446063e-05, "loss": 0.4001, "step": 7000 }, { "epoch": 0.65, "learning_rate": 9.952776911175577e-05, "loss": 0.3987, "step": 7500 }, { "epoch": 0.7, "learning_rate": 9.935592921917959e-05, "loss": 0.3974, "step": 8000 }, { "epoch": 0.74, "learning_rate": 9.917361771757108e-05, "loss": 0.3954, "step": 8500 }, { "epoch": 0.78, "learning_rate": 9.898014889719868e-05, "loss": 0.3945, "step": 9000 }, { "epoch": 0.83, "learning_rate": 9.877592501404869e-05, "loss": 0.394, "step": 9500 }, { "epoch": 0.87, "learning_rate": 9.856099645730841e-05, "loss": 0.3926, "step": 10000 }, { "epoch": 0.87, "eval_loss": 0.37431180477142334, "eval_runtime": 106.4392, "eval_samples_per_second": 46.975, "eval_steps_per_second": 0.742, "step": 10000 }, { "epoch": 0.04, "learning_rate": 9.833541625738316e-05, "loss": 0.3929, "step": 10500 }, { "epoch": 0.09, "learning_rate": 9.809924007281187e-05, "loss": 0.3906, "step": 11000 }, { "epoch": 0.13, "learning_rate": 9.78525261765341e-05, "loss": 0.3899, "step": 11500 }, { "epoch": 0.17, "learning_rate": 9.759533544151208e-05, "loss": 0.3892, "step": 12000 }, { "epoch": 0.22, "learning_rate": 9.732773132571125e-05, "loss": 0.3889, "step": 12500 }, { "epoch": 0.26, "learning_rate": 9.705034604088048e-05, "loss": 0.3865, "step": 13000 }, { "epoch": 0.3, "learning_rate": 9.676213628592508e-05, "loss": 0.3865, "step": 13500 }, { "epoch": 0.35, "learning_rate": 9.64637187296151e-05, "loss": 0.3867, "step": 14000 }, { "epoch": 0.39, "learning_rate": 9.615516700201724e-05, "loss": 0.3858, "step": 14500 }, { "epoch": 0.44, "learning_rate": 9.583720443927501e-05, "loss": 0.3853, "step": 15000 }, { "epoch": 0.44, "eval_loss": 0.36506548523902893, "eval_runtime": 96.1419, "eval_samples_per_second": 52.006, "eval_steps_per_second": 0.822, "step": 15000 }, { "epoch": 0.48, "learning_rate": 9.550863512110018e-05, "loss": 0.3844, "step": 15500 }, { "epoch": 0.52, "learning_rate": 9.517016728422667e-05, "loss": 0.3834, "step": 16000 }, { "epoch": 0.57, "learning_rate": 9.482188444052858e-05, "loss": 0.3826, "step": 16500 }, { "epoch": 0.61, "learning_rate": 9.446387252358614e-05, "loss": 0.3824, "step": 17000 }, { "epoch": 0.65, "learning_rate": 9.40962198674828e-05, "loss": 0.3825, "step": 17500 }, { "epoch": 0.7, "learning_rate": 9.371901718501017e-05, "loss": 0.3804, "step": 18000 }, { "epoch": 0.74, "learning_rate": 9.333314023958391e-05, "loss": 0.3803, "step": 18500 }, { "epoch": 0.78, "learning_rate": 9.293713767144707e-05, "loss": 0.381, "step": 19000 }, { "epoch": 0.83, "learning_rate": 9.253187106312908e-05, "loss": 0.3793, "step": 19500 }, { "epoch": 0.87, "learning_rate": 9.211744040810141e-05, "loss": 0.3793, "step": 20000 }, { "epoch": 0.87, "eval_loss": 0.35968872904777527, "eval_runtime": 95.0235, "eval_samples_per_second": 52.619, "eval_steps_per_second": 0.831, "step": 20000 }, { "epoch": 1.04, "learning_rate": 9.169394796092694e-05, "loss": 0.4465, "step": 20500 }, { "epoch": 1.09, "learning_rate": 9.126149821203012e-05, "loss": 0.444, "step": 21000 }, { "epoch": 1.13, "learning_rate": 9.082019786191564e-05, "loss": 0.4435, "step": 21500 }, { "epoch": 1.17, "learning_rate": 9.037106452976014e-05, "loss": 0.4422, "step": 22000 }, { "epoch": 1.22, "learning_rate": 8.991240893598493e-05, "loss": 0.4446, "step": 22500 }, { "epoch": 1.26, "learning_rate": 8.944523560858406e-05, "loss": 0.4418, "step": 23000 }, { "epoch": 1.3, "learning_rate": 8.89696598155856e-05, "loss": 0.4412, "step": 23500 }, { "epoch": 1.35, "learning_rate": 8.848579889820028e-05, "loss": 0.4414, "step": 24000 }, { "epoch": 1.39, "learning_rate": 8.799377224186928e-05, "loss": 0.4407, "step": 24500 }, { "epoch": 1.44, "learning_rate": 8.749470933536528e-05, "loss": 0.4402, "step": 25000 }, { "epoch": 1.44, "eval_loss": 0.42174017429351807, "eval_runtime": 91.5901, "eval_samples_per_second": 54.591, "eval_steps_per_second": 0.863, "step": 25000 }, { "epoch": 1.48, "learning_rate": 8.698673310407346e-05, "loss": 0.4396, "step": 25500 }, { "epoch": 1.52, "learning_rate": 8.647096100588967e-05, "loss": 0.4381, "step": 26000 }, { "epoch": 1.57, "learning_rate": 8.594752029986149e-05, "loss": 0.4382, "step": 26500 }, { "epoch": 1.61, "learning_rate": 8.54165401371507e-05, "loss": 0.4386, "step": 27000 }, { "epoch": 1.65, "learning_rate": 8.487815152916725e-05, "loss": 0.4376, "step": 27500 }, { "epoch": 1.7, "learning_rate": 8.433248731524403e-05, "loss": 0.4361, "step": 28000 }, { "epoch": 1.74, "learning_rate": 8.377968212986092e-05, "loss": 0.4369, "step": 28500 }, { "epoch": 1.78, "learning_rate": 8.321987236942559e-05, "loss": 0.4367, "step": 29000 }, { "epoch": 1.83, "learning_rate": 8.265433627110443e-05, "loss": 0.4345, "step": 29500 }, { "epoch": 1.87, "learning_rate": 8.20809467414187e-05, "loss": 0.4349, "step": 30000 }, { "epoch": 1.87, "eval_loss": 0.41873496770858765, "eval_runtime": 67.332, "eval_samples_per_second": 74.259, "eval_steps_per_second": 1.173, "step": 30000 }, { "epoch": 1.91, "learning_rate": 8.150097177421913e-05, "loss": 0.4351, "step": 30500 }, { "epoch": 1.96, "learning_rate": 8.091455446965055e-05, "loss": 0.4346, "step": 31000 }, { "epoch": 2.0, "learning_rate": 8.032183951740807e-05, "loss": 0.4348, "step": 31500 }, { "epoch": 2.04, "learning_rate": 7.972417693488813e-05, "loss": 0.4339, "step": 32000 }, { "epoch": 2.09, "learning_rate": 7.911931879453039e-05, "loss": 0.4321, "step": 32500 }, { "epoch": 2.13, "learning_rate": 7.850860595403895e-05, "loss": 0.4327, "step": 33000 }, { "epoch": 2.18, "learning_rate": 7.789218909766887e-05, "loss": 0.432, "step": 33500 }, { "epoch": 2.22, "learning_rate": 7.727022031705582e-05, "loss": 0.432, "step": 34000 }, { "epoch": 2.26, "learning_rate": 7.664537309168187e-05, "loss": 0.4318, "step": 34500 }, { "epoch": 2.31, "learning_rate": 7.601278284364346e-05, "loss": 0.4311, "step": 35000 }, { "epoch": 2.31, "eval_loss": 0.4138348698616028, "eval_runtime": 71.9462, "eval_samples_per_second": 69.496, "eval_steps_per_second": 1.098, "step": 35000 }, { "epoch": 2.35, "learning_rate": 7.53751043867331e-05, "loss": 0.4304, "step": 35500 }, { "epoch": 2.39, "learning_rate": 7.473249505856825e-05, "loss": 0.43, "step": 36000 }, { "epoch": 2.44, "learning_rate": 7.408511341338508e-05, "loss": 0.4295, "step": 36500 }, { "epoch": 2.48, "learning_rate": 7.343311918291752e-05, "loss": 0.43, "step": 37000 }, { "epoch": 2.52, "learning_rate": 7.277667323698597e-05, "loss": 0.43, "step": 37500 }, { "epoch": 2.57, "learning_rate": 7.211593754380497e-05, "loss": 0.429, "step": 38000 }, { "epoch": 2.61, "learning_rate": 7.145107513002001e-05, "loss": 0.4289, "step": 38500 }, { "epoch": 2.65, "learning_rate": 7.078225004048314e-05, "loss": 0.4275, "step": 39000 }, { "epoch": 2.7, "learning_rate": 7.011097622316453e-05, "loss": 0.428, "step": 39500 }, { "epoch": 2.74, "learning_rate": 6.943472888404397e-05, "loss": 0.4275, "step": 40000 }, { "epoch": 2.74, "eval_loss": 0.4106511175632477, "eval_runtime": 94.0729, "eval_samples_per_second": 53.15, "eval_steps_per_second": 0.84, "step": 40000 }, { "epoch": 2.78, "learning_rate": 6.875501637243646e-05, "loss": 0.4269, "step": 40500 }, { "epoch": 2.83, "learning_rate": 6.807200639723347e-05, "loss": 0.4256, "step": 41000 }, { "epoch": 2.87, "learning_rate": 6.738586748092632e-05, "loss": 0.4273, "step": 41500 }, { "epoch": 2.92, "learning_rate": 6.669676891802565e-05, "loss": 0.4255, "step": 42000 }, { "epoch": 2.96, "learning_rate": 6.600626718034563e-05, "loss": 0.4255, "step": 42500 }, { "epoch": 3.0, "learning_rate": 6.531176515384238e-05, "loss": 0.4265, "step": 43000 }, { "epoch": 3.05, "learning_rate": 6.461621146309398e-05, "loss": 0.4257, "step": 43500 }, { "epoch": 3.09, "learning_rate": 6.39169899926586e-05, "loss": 0.4242, "step": 44000 }, { "epoch": 3.13, "learning_rate": 6.321566476924404e-05, "loss": 0.4236, "step": 44500 }, { "epoch": 3.18, "learning_rate": 6.2512408834355e-05, "loss": 0.4245, "step": 45000 }, { "epoch": 3.18, "eval_loss": 0.40728819370269775, "eval_runtime": 61.7603, "eval_samples_per_second": 80.958, "eval_steps_per_second": 1.279, "step": 45000 }, { "epoch": 3.22, "learning_rate": 6.180739570587037e-05, "loss": 0.4251, "step": 45500 }, { "epoch": 3.26, "learning_rate": 6.110079933523028e-05, "loss": 0.4228, "step": 46000 }, { "epoch": 3.31, "learning_rate": 6.039279406451614e-05, "loss": 0.4228, "step": 46500 }, { "epoch": 3.35, "learning_rate": 5.968355458343437e-05, "loss": 0.422, "step": 47000 }, { "epoch": 3.39, "learning_rate": 5.89732558862144e-05, "loss": 0.4224, "step": 47500 }, { "epoch": 3.44, "learning_rate": 5.826207322843129e-05, "loss": 0.4221, "step": 48000 }, { "epoch": 3.48, "learning_rate": 5.755160645640033e-05, "loss": 0.4215, "step": 48500 }, { "epoch": 3.52, "learning_rate": 5.6839183363622005e-05, "loss": 0.4212, "step": 49000 }, { "epoch": 3.57, "learning_rate": 5.612640286074268e-05, "loss": 0.4204, "step": 49500 }, { "epoch": 3.61, "learning_rate": 5.5413440815685914e-05, "loss": 0.4213, "step": 50000 }, { "epoch": 3.61, "eval_loss": 0.4042368531227112, "eval_runtime": 65.4775, "eval_samples_per_second": 76.362, "eval_steps_per_second": 1.207, "step": 50000 }, { "epoch": 3.66, "learning_rate": 5.4701898965206674e-05, "loss": 0.4199, "step": 50500 }, { "epoch": 3.7, "learning_rate": 5.398910105922212e-05, "loss": 0.4199, "step": 51000 }, { "epoch": 3.74, "learning_rate": 5.327807340052843e-05, "loss": 0.4194, "step": 51500 }, { "epoch": 3.79, "learning_rate": 5.25661416730201e-05, "loss": 0.4189, "step": 52000 }, { "epoch": 3.83, "learning_rate": 5.185490684450904e-05, "loss": 0.4188, "step": 52500 }, { "epoch": 3.87, "learning_rate": 5.114454440154674e-05, "loss": 0.4199, "step": 53000 }, { "epoch": 3.92, "learning_rate": 5.043522961543667e-05, "loss": 0.4186, "step": 53500 }, { "epoch": 3.96, "learning_rate": 4.972713749898855e-05, "loss": 0.418, "step": 54000 }, { "epoch": 4.0, "learning_rate": 4.902185464226454e-05, "loss": 0.419, "step": 54500 }, { "epoch": 4.05, "learning_rate": 4.831672833649923e-05, "loss": 0.418, "step": 55000 }, { "epoch": 4.05, "eval_loss": 0.40235766768455505, "eval_runtime": 50.0847, "eval_samples_per_second": 99.831, "eval_steps_per_second": 1.577, "step": 55000 }, { "epoch": 4.09, "learning_rate": 4.761334740888664e-05, "loss": 0.4173, "step": 55500 }, { "epoch": 4.13, "learning_rate": 4.691188540814576e-05, "loss": 0.4176, "step": 56000 }, { "epoch": 4.18, "learning_rate": 4.621391194692349e-05, "loss": 0.4166, "step": 56500 }, { "epoch": 4.22, "learning_rate": 4.551680180854116e-05, "loss": 0.4182, "step": 57000 }, { "epoch": 4.26, "learning_rate": 4.482212788828615e-05, "loss": 0.4169, "step": 57500 }, { "epoch": 4.31, "learning_rate": 4.413006158655363e-05, "loss": 0.4158, "step": 58000 }, { "epoch": 4.35, "learning_rate": 4.3440773660347955e-05, "loss": 0.4151, "step": 58500 }, { "epoch": 4.4, "learning_rate": 4.275443418115087e-05, "loss": 0.4146, "step": 59000 }, { "epoch": 4.44, "learning_rate": 4.207121249295896e-05, "loss": 0.4152, "step": 59500 }, { "epoch": 4.48, "learning_rate": 4.139127717050051e-05, "loss": 0.4143, "step": 60000 }, { "epoch": 4.48, "eval_loss": 0.39910149574279785, "eval_runtime": 73.8959, "eval_samples_per_second": 67.663, "eval_steps_per_second": 1.069, "step": 60000 }, { "epoch": 4.53, "learning_rate": 4.0714795977642255e-05, "loss": 0.4153, "step": 60500 }, { "epoch": 4.57, "learning_rate": 4.004327782200046e-05, "loss": 0.4146, "step": 61000 }, { "epoch": 4.61, "learning_rate": 3.937419699055001e-05, "loss": 0.4138, "step": 61500 }, { "epoch": 4.66, "learning_rate": 3.8709067973051936e-05, "loss": 0.4137, "step": 62000 }, { "epoch": 4.7, "learning_rate": 3.804805488013876e-05, "loss": 0.4131, "step": 62500 }, { "epoch": 4.74, "learning_rate": 3.739132080690021e-05, "loss": 0.4141, "step": 63000 }, { "epoch": 4.79, "learning_rate": 3.673902779264194e-05, "loss": 0.4123, "step": 63500 }, { "epoch": 4.83, "learning_rate": 3.609133678090475e-05, "loss": 0.4126, "step": 64000 }, { "epoch": 4.87, "learning_rate": 3.544840757975416e-05, "loss": 0.4124, "step": 64500 }, { "epoch": 4.92, "learning_rate": 3.481039882235021e-05, "loss": 0.4124, "step": 65000 }, { "epoch": 4.92, "eval_loss": 0.3964463174343109, "eval_runtime": 74.8784, "eval_samples_per_second": 66.775, "eval_steps_per_second": 1.055, "step": 65000 }, { "epoch": 4.96, "learning_rate": 3.4177467927807086e-05, "loss": 0.412, "step": 65500 }, { "epoch": 5.0, "learning_rate": 3.3549771062352226e-05, "loss": 0.4128, "step": 66000 }, { "epoch": 5.05, "learning_rate": 3.2928702236310036e-05, "loss": 0.411, "step": 66500 }, { "epoch": 5.09, "learning_rate": 3.231192548659413e-05, "loss": 0.4108, "step": 67000 }, { "epoch": 5.14, "learning_rate": 3.170084306064917e-05, "loss": 0.4106, "step": 67500 }, { "epoch": 5.18, "learning_rate": 3.1096810275663516e-05, "loss": 0.4112, "step": 68000 }, { "epoch": 5.22, "learning_rate": 3.0497555244481445e-05, "loss": 0.4106, "step": 68500 }, { "epoch": 5.27, "learning_rate": 2.9904442205778553e-05, "loss": 0.4106, "step": 69000 }, { "epoch": 5.31, "learning_rate": 2.9317617501322188e-05, "loss": 0.4104, "step": 69500 }, { "epoch": 5.35, "learning_rate": 2.873722592132734e-05, "loss": 0.4096, "step": 70000 }, { "epoch": 5.35, "eval_loss": 0.3943169414997101, "eval_runtime": 90.0414, "eval_samples_per_second": 55.53, "eval_steps_per_second": 0.877, "step": 70000 }, { "epoch": 5.4, "learning_rate": 2.816341066873177e-05, "loss": 0.4098, "step": 70500 }, { "epoch": 5.44, "learning_rate": 2.75974407207991e-05, "loss": 0.4089, "step": 71000 }, { "epoch": 5.48, "learning_rate": 2.7037187352256104e-05, "loss": 0.4089, "step": 71500 }, { "epoch": 5.53, "learning_rate": 2.6483929770191072e-05, "loss": 0.4098, "step": 72000 }, { "epoch": 5.57, "learning_rate": 2.5937804482633846e-05, "loss": 0.4098, "step": 72500 }, { "epoch": 5.61, "learning_rate": 2.5400016613135313e-05, "loss": 0.4082, "step": 73000 }, { "epoch": 5.66, "learning_rate": 2.486854343481112e-05, "loss": 0.4086, "step": 73500 }, { "epoch": 5.7, "learning_rate": 2.4344601123401418e-05, "loss": 0.408, "step": 74000 }, { "epoch": 5.74, "learning_rate": 2.3828318953831648e-05, "loss": 0.409, "step": 74500 }, { "epoch": 5.79, "learning_rate": 2.3320833444621267e-05, "loss": 0.4079, "step": 75000 }, { "epoch": 5.79, "eval_loss": 0.39236390590667725, "eval_runtime": 57.2971, "eval_samples_per_second": 87.264, "eval_steps_per_second": 1.379, "step": 75000 }, { "epoch": 5.83, "learning_rate": 2.2820235842075488e-05, "loss": 0.4076, "step": 75500 }, { "epoch": 5.88, "learning_rate": 2.232767449569324e-05, "loss": 0.408, "step": 76000 }, { "epoch": 5.92, "learning_rate": 2.1843270937616385e-05, "loss": 0.4075, "step": 76500 }, { "epoch": 5.96, "learning_rate": 2.136808860049628e-05, "loss": 0.4075, "step": 77000 }, { "epoch": 6.01, "learning_rate": 2.090034022927794e-05, "loss": 0.4083, "step": 77500 }, { "epoch": 6.05, "learning_rate": 2.044110181973758e-05, "loss": 0.4072, "step": 78000 }, { "epoch": 6.09, "learning_rate": 1.9990486682081012e-05, "loss": 0.4066, "step": 78500 }, { "epoch": 6.14, "learning_rate": 1.9549480970403115e-05, "loss": 0.4061, "step": 79000 }, { "epoch": 6.18, "learning_rate": 1.911642597471847e-05, "loss": 0.4062, "step": 79500 }, { "epoch": 6.22, "learning_rate": 1.869232109486083e-05, "loss": 0.4071, "step": 80000 }, { "epoch": 6.22, "eval_loss": 0.39048197865486145, "eval_runtime": 72.8664, "eval_samples_per_second": 68.619, "eval_steps_per_second": 1.084, "step": 80000 }, { "epoch": 6.27, "learning_rate": 1.8277270972363276e-05, "loss": 0.4063, "step": 80500 }, { "epoch": 6.31, "learning_rate": 1.787137801463301e-05, "loss": 0.4054, "step": 81000 }, { "epoch": 6.35, "learning_rate": 1.7474742369683822e-05, "loss": 0.4058, "step": 81500 }, { "epoch": 6.4, "learning_rate": 1.7087461901426146e-05, "loss": 0.4046, "step": 82000 }, { "epoch": 6.44, "learning_rate": 1.670963216552051e-05, "loss": 0.4057, "step": 82500 }, { "epoch": 6.48, "learning_rate": 1.634134638580067e-05, "loss": 0.4044, "step": 83000 }, { "epoch": 6.53, "learning_rate": 1.5982695431271973e-05, "loss": 0.4059, "step": 83500 }, { "epoch": 6.57, "learning_rate": 1.563514402048906e-05, "loss": 0.4044, "step": 84000 }, { "epoch": 6.62, "learning_rate": 1.52959863865947e-05, "loss": 0.4042, "step": 84500 }, { "epoch": 6.66, "learning_rate": 1.496672150482924e-05, "loss": 0.4046, "step": 85000 }, { "epoch": 6.66, "eval_loss": 0.38925543427467346, "eval_runtime": 77.3103, "eval_samples_per_second": 64.674, "eval_steps_per_second": 1.022, "step": 85000 }, { "epoch": 6.7, "learning_rate": 1.4647430616375366e-05, "loss": 0.4042, "step": 85500 }, { "epoch": 6.75, "learning_rate": 1.433819250148215e-05, "loss": 0.4049, "step": 86000 }, { "epoch": 6.79, "learning_rate": 1.4039083460027203e-05, "loss": 0.4041, "step": 86500 }, { "epoch": 6.83, "learning_rate": 1.3750177292690856e-05, "loss": 0.4038, "step": 87000 }, { "epoch": 6.88, "learning_rate": 1.347154528274688e-05, "loss": 0.4047, "step": 87500 }, { "epoch": 6.92, "learning_rate": 1.3203782390038048e-05, "loss": 0.4036, "step": 88000 }, { "epoch": 6.96, "learning_rate": 1.2945881505195709e-05, "loss": 0.4044, "step": 88500 }, { "epoch": 7.01, "learning_rate": 1.2698453225696373e-05, "loss": 0.4045, "step": 89000 }, { "epoch": 7.05, "learning_rate": 1.2461558600765676e-05, "loss": 0.4031, "step": 89500 }, { "epoch": 7.09, "learning_rate": 1.2235256080607583e-05, "loss": 0.4026, "step": 90000 }, { "epoch": 7.09, "eval_loss": 0.3881285488605499, "eval_runtime": 133.2371, "eval_samples_per_second": 37.527, "eval_steps_per_second": 0.593, "step": 90000 }, { "epoch": 7.14, "learning_rate": 1.20196015019827e-05, "loss": 0.4036, "step": 90500 }, { "epoch": 7.18, "learning_rate": 1.1815047267485115e-05, "loss": 0.4037, "step": 91000 }, { "epoch": 7.22, "learning_rate": 1.162082400805151e-05, "loss": 0.4048, "step": 91500 }, { "epoch": 7.27, "learning_rate": 1.1437400292071077e-05, "loss": 0.4032, "step": 92000 }, { "epoch": 7.31, "learning_rate": 1.126482137660111e-05, "loss": 0.4032, "step": 92500 }, { "epoch": 7.36, "learning_rate": 1.1103129842906643e-05, "loss": 0.4029, "step": 93000 }, { "epoch": 7.4, "learning_rate": 1.0952365585954172e-05, "loss": 0.4023, "step": 93500 }, { "epoch": 7.44, "learning_rate": 1.0812565804568168e-05, "loss": 0.4027, "step": 94000 }, { "epoch": 7.49, "learning_rate": 1.0683764992252818e-05, "loss": 0.4022, "step": 94500 }, { "epoch": 7.53, "learning_rate": 1.0566219440353348e-05, "loss": 0.4027, "step": 95000 }, { "epoch": 7.53, "eval_loss": 0.38710081577301025, "eval_runtime": 65.4468, "eval_samples_per_second": 76.398, "eval_steps_per_second": 1.207, "step": 95000 }, { "epoch": 7.57, "learning_rate": 1.0459487036725839e-05, "loss": 0.4031, "step": 95500 }, { "epoch": 7.62, "learning_rate": 1.0363840719071717e-05, "loss": 0.4023, "step": 96000 }, { "epoch": 7.66, "learning_rate": 1.0279462058166865e-05, "loss": 0.4025, "step": 96500 }, { "epoch": 7.7, "learning_rate": 1.0206033689125313e-05, "loss": 0.4028, "step": 97000 }, { "epoch": 7.75, "learning_rate": 1.0143753941878168e-05, "loss": 0.4023, "step": 97500 }, { "epoch": 7.79, "learning_rate": 1.0092638183021144e-05, "loss": 0.4019, "step": 98000 }, { "epoch": 7.83, "learning_rate": 1.0052699024602892e-05, "loss": 0.4014, "step": 98500 }, { "epoch": 7.88, "learning_rate": 1.0023946321013112e-05, "loss": 0.4018, "step": 99000 }, { "epoch": 7.92, "learning_rate": 1.0006411110358469e-05, "loss": 0.4024, "step": 99500 }, { "epoch": 7.96, "learning_rate": 1.0000027439232365e-05, "loss": 0.4024, "step": 100000 }, { "epoch": 7.96, "eval_loss": 0.38628044724464417, "eval_runtime": 91.7648, "eval_samples_per_second": 54.487, "eval_steps_per_second": 0.861, "step": 100000 } ], "max_steps": 100000, "num_train_epochs": 9, "total_flos": 4.7098350596970145e+21, "trial_name": null, "trial_params": null }