{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5779092702169626, "eval_steps": 500, "global_step": 3200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004930966469428008, "grad_norm": 43895.33150575355, "learning_rate": 1.6666666666666667e-06, "loss": 0.6914, "step": 10 }, { "epoch": 0.009861932938856016, "grad_norm": 34666.69271793893, "learning_rate": 3.3333333333333333e-06, "loss": 0.653, "step": 20 }, { "epoch": 0.014792899408284023, "grad_norm": 29566.888507247426, "learning_rate": 5e-06, "loss": 0.6223, "step": 30 }, { "epoch": 0.01972386587771203, "grad_norm": 17779.597295776977, "learning_rate": 6.666666666666667e-06, "loss": 0.5303, "step": 40 }, { "epoch": 0.02465483234714004, "grad_norm": 10344.985645229286, "learning_rate": 8.166666666666668e-06, "loss": 0.4624, "step": 50 }, { "epoch": 0.029585798816568046, "grad_norm": 7057.267176464272, "learning_rate": 9.833333333333333e-06, "loss": 0.4089, "step": 60 }, { "epoch": 0.03451676528599606, "grad_norm": 7072.531088655603, "learning_rate": 1.1500000000000002e-05, "loss": 0.3719, "step": 70 }, { "epoch": 0.03944773175542406, "grad_norm": 5647.309093718884, "learning_rate": 1.3166666666666665e-05, "loss": 0.2918, "step": 80 }, { "epoch": 0.04437869822485207, "grad_norm": 16472.366193112633, "learning_rate": 1.4833333333333336e-05, "loss": 0.2643, "step": 90 }, { "epoch": 0.04930966469428008, "grad_norm": 17463.691247843337, "learning_rate": 1.65e-05, "loss": 0.2145, "step": 100 }, { "epoch": 0.054240631163708086, "grad_norm": 34631.59066517159, "learning_rate": 1.8166666666666667e-05, "loss": 0.2055, "step": 110 }, { "epoch": 0.05917159763313609, "grad_norm": 13642.827272966553, "learning_rate": 1.9833333333333335e-05, "loss": 0.1955, "step": 120 }, { "epoch": 0.0641025641025641, "grad_norm": 12345.246210586487, "learning_rate": 2.15e-05, "loss": 0.1747, "step": 130 }, { "epoch": 0.06903353057199212, "grad_norm": 18095.64544303408, "learning_rate": 2.3166666666666666e-05, "loss": 0.16, "step": 140 }, { "epoch": 0.07396449704142012, "grad_norm": 10606.944894737599, "learning_rate": 2.4833333333333335e-05, "loss": 0.1483, "step": 150 }, { "epoch": 0.07889546351084813, "grad_norm": 19168.698651708208, "learning_rate": 2.6500000000000004e-05, "loss": 0.1438, "step": 160 }, { "epoch": 0.08382642998027613, "grad_norm": 14566.588619165435, "learning_rate": 2.816666666666667e-05, "loss": 0.126, "step": 170 }, { "epoch": 0.08875739644970414, "grad_norm": 14365.429057288891, "learning_rate": 2.9833333333333335e-05, "loss": 0.1203, "step": 180 }, { "epoch": 0.09368836291913216, "grad_norm": 13554.579447552034, "learning_rate": 3.15e-05, "loss": 0.1375, "step": 190 }, { "epoch": 0.09861932938856016, "grad_norm": 14373.095421655004, "learning_rate": 3.316666666666667e-05, "loss": 0.1104, "step": 200 }, { "epoch": 0.10355029585798817, "grad_norm": 8761.746857790404, "learning_rate": 3.483333333333334e-05, "loss": 0.1109, "step": 210 }, { "epoch": 0.10848126232741617, "grad_norm": 14477.860062868407, "learning_rate": 3.65e-05, "loss": 0.1126, "step": 220 }, { "epoch": 0.11341222879684418, "grad_norm": 12729.454976549467, "learning_rate": 3.816666666666667e-05, "loss": 0.112, "step": 230 }, { "epoch": 0.11834319526627218, "grad_norm": 10014.094467299577, "learning_rate": 3.983333333333333e-05, "loss": 0.1014, "step": 240 }, { "epoch": 0.1232741617357002, "grad_norm": 11426.459469144413, "learning_rate": 4.15e-05, "loss": 0.1117, "step": 250 }, { "epoch": 0.1282051282051282, "grad_norm": 12138.764352272434, "learning_rate": 4.316666666666667e-05, "loss": 0.1085, "step": 260 }, { "epoch": 0.13313609467455623, "grad_norm": 17461.02585760642, "learning_rate": 4.483333333333333e-05, "loss": 0.1053, "step": 270 }, { "epoch": 0.13806706114398423, "grad_norm": 11649.5053972261, "learning_rate": 4.6500000000000005e-05, "loss": 0.0993, "step": 280 }, { "epoch": 0.14299802761341224, "grad_norm": 8499.819762794974, "learning_rate": 4.8166666666666674e-05, "loss": 0.0982, "step": 290 }, { "epoch": 0.14792899408284024, "grad_norm": 15086.851493933385, "learning_rate": 4.9833333333333336e-05, "loss": 0.0916, "step": 300 }, { "epoch": 0.15285996055226825, "grad_norm": 14996.807660298908, "learning_rate": 4.9999291660339096e-05, "loss": 0.0984, "step": 310 }, { "epoch": 0.15779092702169625, "grad_norm": 12750.902085734953, "learning_rate": 4.999684313032836e-05, "loss": 0.1033, "step": 320 }, { "epoch": 0.16272189349112426, "grad_norm": 16534.053102612197, "learning_rate": 4.999264583629061e-05, "loss": 0.1017, "step": 330 }, { "epoch": 0.16765285996055226, "grad_norm": 9247.551459710836, "learning_rate": 4.998670007186599e-05, "loss": 0.0999, "step": 340 }, { "epoch": 0.17258382642998027, "grad_norm": 9434.010811950557, "learning_rate": 4.997900625301659e-05, "loss": 0.116, "step": 350 }, { "epoch": 0.17751479289940827, "grad_norm": 14049.300053739333, "learning_rate": 4.996956491799731e-05, "loss": 0.0832, "step": 360 }, { "epoch": 0.18244575936883628, "grad_norm": 13846.231545081138, "learning_rate": 4.995837672731827e-05, "loss": 0.0853, "step": 370 }, { "epoch": 0.1873767258382643, "grad_norm": 8305.576921563004, "learning_rate": 4.994544246369849e-05, "loss": 0.0868, "step": 380 }, { "epoch": 0.19230769230769232, "grad_norm": 7614.640372335387, "learning_rate": 4.993076303201124e-05, "loss": 0.0813, "step": 390 }, { "epoch": 0.19723865877712032, "grad_norm": 8342.321978921696, "learning_rate": 4.9914339459220685e-05, "loss": 0.0827, "step": 400 }, { "epoch": 0.20216962524654833, "grad_norm": 9845.654879184014, "learning_rate": 4.989617289431004e-05, "loss": 0.0722, "step": 410 }, { "epoch": 0.20710059171597633, "grad_norm": 10443.866333882295, "learning_rate": 4.9876264608201203e-05, "loss": 0.0765, "step": 420 }, { "epoch": 0.21203155818540434, "grad_norm": 8376.00382043848, "learning_rate": 4.985461599366583e-05, "loss": 0.0867, "step": 430 }, { "epoch": 0.21696252465483234, "grad_norm": 14326.32150972468, "learning_rate": 4.9831228565227906e-05, "loss": 0.0714, "step": 440 }, { "epoch": 0.22189349112426035, "grad_norm": 11864.219148346847, "learning_rate": 4.980610395905776e-05, "loss": 0.0763, "step": 450 }, { "epoch": 0.22682445759368836, "grad_norm": 8294.590526361142, "learning_rate": 4.977924393285767e-05, "loss": 0.0816, "step": 460 }, { "epoch": 0.23175542406311636, "grad_norm": 9196.279247608785, "learning_rate": 4.9750650365738806e-05, "loss": 0.0717, "step": 470 }, { "epoch": 0.23668639053254437, "grad_norm": 10312.943323804315, "learning_rate": 4.972032525808983e-05, "loss": 0.0838, "step": 480 }, { "epoch": 0.2416173570019724, "grad_norm": 7124.782101931259, "learning_rate": 4.968827073143694e-05, "loss": 0.078, "step": 490 }, { "epoch": 0.2465483234714004, "grad_norm": 8737.514520731855, "learning_rate": 4.965448902829543e-05, "loss": 0.0686, "step": 500 }, { "epoch": 0.2514792899408284, "grad_norm": 6490.680703901556, "learning_rate": 4.961898251201282e-05, "loss": 0.0714, "step": 510 }, { "epoch": 0.2564102564102564, "grad_norm": 6539.6247598772825, "learning_rate": 4.9581753666603517e-05, "loss": 0.0789, "step": 520 }, { "epoch": 0.2613412228796844, "grad_norm": 9437.219505765455, "learning_rate": 4.954280509657502e-05, "loss": 0.0677, "step": 530 }, { "epoch": 0.26627218934911245, "grad_norm": 13868.67549551867, "learning_rate": 4.9502139526745736e-05, "loss": 0.0838, "step": 540 }, { "epoch": 0.27120315581854043, "grad_norm": 9426.942240196448, "learning_rate": 4.945975980205435e-05, "loss": 0.0693, "step": 550 }, { "epoch": 0.27613412228796846, "grad_norm": 9122.218589794918, "learning_rate": 4.9415668887360756e-05, "loss": 0.0751, "step": 560 }, { "epoch": 0.28106508875739644, "grad_norm": 12629.583049333021, "learning_rate": 4.936986986723867e-05, "loss": 0.0795, "step": 570 }, { "epoch": 0.2859960552268245, "grad_norm": 9029.990033217091, "learning_rate": 4.932236594575986e-05, "loss": 0.0756, "step": 580 }, { "epoch": 0.29092702169625245, "grad_norm": 22628.703895716168, "learning_rate": 4.927316044626993e-05, "loss": 0.0724, "step": 590 }, { "epoch": 0.2958579881656805, "grad_norm": 7334.351232385861, "learning_rate": 4.9222256811155875e-05, "loss": 0.0598, "step": 600 }, { "epoch": 0.30078895463510846, "grad_norm": 11844.172913293693, "learning_rate": 4.916965860160521e-05, "loss": 0.0824, "step": 610 }, { "epoch": 0.3057199211045365, "grad_norm": 10243.98516203533, "learning_rate": 4.911536949735686e-05, "loss": 0.0675, "step": 620 }, { "epoch": 0.3106508875739645, "grad_norm": 8889.587616982017, "learning_rate": 4.9059393296443724e-05, "loss": 0.074, "step": 630 }, { "epoch": 0.3155818540433925, "grad_norm": 10217.20353129955, "learning_rate": 4.9001733914926974e-05, "loss": 0.0722, "step": 640 }, { "epoch": 0.32051282051282054, "grad_norm": 9370.911588527553, "learning_rate": 4.8942395386622064e-05, "loss": 0.0688, "step": 650 }, { "epoch": 0.3254437869822485, "grad_norm": 11153.991931142858, "learning_rate": 4.8881381862816536e-05, "loss": 0.0794, "step": 660 }, { "epoch": 0.33037475345167655, "grad_norm": 7372.450338930741, "learning_rate": 4.881869761197963e-05, "loss": 0.0721, "step": 670 }, { "epoch": 0.33530571992110453, "grad_norm": 8280.749724511665, "learning_rate": 4.875434701946361e-05, "loss": 0.0703, "step": 680 }, { "epoch": 0.34023668639053256, "grad_norm": 8886.890119721296, "learning_rate": 4.868833458719702e-05, "loss": 0.0673, "step": 690 }, { "epoch": 0.34516765285996054, "grad_norm": 9734.842987947983, "learning_rate": 4.8620664933369676e-05, "loss": 0.0723, "step": 700 }, { "epoch": 0.35009861932938857, "grad_norm": 9477.64358899405, "learning_rate": 4.855134279210964e-05, "loss": 0.066, "step": 710 }, { "epoch": 0.35502958579881655, "grad_norm": 7460.336989707636, "learning_rate": 4.8480373013152e-05, "loss": 0.0684, "step": 720 }, { "epoch": 0.3599605522682446, "grad_norm": 7986.478072342025, "learning_rate": 4.840776056149957e-05, "loss": 0.0634, "step": 730 }, { "epoch": 0.36489151873767256, "grad_norm": 8724.282434676217, "learning_rate": 4.833351051707553e-05, "loss": 0.066, "step": 740 }, { "epoch": 0.3698224852071006, "grad_norm": 11793.592158456218, "learning_rate": 4.82576280743681e-05, "loss": 0.0601, "step": 750 }, { "epoch": 0.3747534516765286, "grad_norm": 8347.344487919496, "learning_rate": 4.818011854206706e-05, "loss": 0.0591, "step": 760 }, { "epoch": 0.3796844181459566, "grad_norm": 10782.888295813882, "learning_rate": 4.8100987342692424e-05, "loss": 0.0658, "step": 770 }, { "epoch": 0.38461538461538464, "grad_norm": 10922.65315754373, "learning_rate": 4.802024001221504e-05, "loss": 0.0644, "step": 780 }, { "epoch": 0.3895463510848126, "grad_norm": 10552.341920161609, "learning_rate": 4.793788219966931e-05, "loss": 0.0755, "step": 790 }, { "epoch": 0.39447731755424065, "grad_norm": 7972.205215622588, "learning_rate": 4.785391966675799e-05, "loss": 0.0689, "step": 800 }, { "epoch": 0.3994082840236686, "grad_norm": 10807.511832054592, "learning_rate": 4.776835828744909e-05, "loss": 0.0636, "step": 810 }, { "epoch": 0.40433925049309666, "grad_norm": 9430.68353832319, "learning_rate": 4.7681204047564963e-05, "loss": 0.0557, "step": 820 }, { "epoch": 0.40927021696252464, "grad_norm": 10209.031687677338, "learning_rate": 4.7592463044363534e-05, "loss": 0.057, "step": 830 }, { "epoch": 0.41420118343195267, "grad_norm": 8397.82590912672, "learning_rate": 4.750214148611169e-05, "loss": 0.0647, "step": 840 }, { "epoch": 0.41913214990138065, "grad_norm": 9899.747875577437, "learning_rate": 4.7410245691651054e-05, "loss": 0.0699, "step": 850 }, { "epoch": 0.4240631163708087, "grad_norm": 8939.42772217551, "learning_rate": 4.731678208995579e-05, "loss": 0.0665, "step": 860 }, { "epoch": 0.4289940828402367, "grad_norm": 7923.368728009571, "learning_rate": 4.722175721968297e-05, "loss": 0.0643, "step": 870 }, { "epoch": 0.4339250493096647, "grad_norm": 14368.711006906638, "learning_rate": 4.7125177728715026e-05, "loss": 0.0767, "step": 880 }, { "epoch": 0.4388560157790927, "grad_norm": 10018.222197575775, "learning_rate": 4.702705037369475e-05, "loss": 0.0689, "step": 890 }, { "epoch": 0.4437869822485207, "grad_norm": 8922.631450418648, "learning_rate": 4.692738201955255e-05, "loss": 0.0701, "step": 900 }, { "epoch": 0.44871794871794873, "grad_norm": 11155.967013217634, "learning_rate": 4.6826179639026184e-05, "loss": 0.0664, "step": 910 }, { "epoch": 0.4536489151873767, "grad_norm": 7031.218671041315, "learning_rate": 4.6723450312173e-05, "loss": 0.0627, "step": 920 }, { "epoch": 0.45857988165680474, "grad_norm": 12984.67527510796, "learning_rate": 4.6619201225874575e-05, "loss": 0.0638, "step": 930 }, { "epoch": 0.4635108481262327, "grad_norm": 7228.3278840960165, "learning_rate": 4.651343967333393e-05, "loss": 0.0545, "step": 940 }, { "epoch": 0.46844181459566075, "grad_norm": 7148.969156458853, "learning_rate": 4.6406173053565316e-05, "loss": 0.0549, "step": 950 }, { "epoch": 0.47337278106508873, "grad_norm": 7407.089036861917, "learning_rate": 4.6297408870876576e-05, "loss": 0.0565, "step": 960 }, { "epoch": 0.47830374753451677, "grad_norm": 8808.120344318644, "learning_rate": 4.618715473434414e-05, "loss": 0.0601, "step": 970 }, { "epoch": 0.4832347140039448, "grad_norm": 9377.625712300529, "learning_rate": 4.607541835728074e-05, "loss": 0.0724, "step": 980 }, { "epoch": 0.4881656804733728, "grad_norm": 7881.055259291106, "learning_rate": 4.596220755669571e-05, "loss": 0.0567, "step": 990 }, { "epoch": 0.4930966469428008, "grad_norm": 12093.790142052243, "learning_rate": 4.584753025274821e-05, "loss": 0.0618, "step": 1000 }, { "epoch": 0.4980276134122288, "grad_norm": 13753.036610145411, "learning_rate": 4.573139446819305e-05, "loss": 0.057, "step": 1010 }, { "epoch": 0.5029585798816568, "grad_norm": 7077.985024002241, "learning_rate": 4.561380832781947e-05, "loss": 0.0548, "step": 1020 }, { "epoch": 0.5078895463510849, "grad_norm": 7930.713460969322, "learning_rate": 4.549478005788276e-05, "loss": 0.0677, "step": 1030 }, { "epoch": 0.5128205128205128, "grad_norm": 8364.417493167113, "learning_rate": 4.537431798552867e-05, "loss": 0.0602, "step": 1040 }, { "epoch": 0.5177514792899408, "grad_norm": 19419.409671769117, "learning_rate": 4.5252430538210944e-05, "loss": 0.0689, "step": 1050 }, { "epoch": 0.5226824457593688, "grad_norm": 27615.07646196186, "learning_rate": 4.512912624310166e-05, "loss": 0.0586, "step": 1060 }, { "epoch": 0.5276134122287969, "grad_norm": 18720.689730883314, "learning_rate": 4.5004413726494746e-05, "loss": 0.0661, "step": 1070 }, { "epoch": 0.5325443786982249, "grad_norm": 17784.57848811717, "learning_rate": 4.487830171320242e-05, "loss": 0.0537, "step": 1080 }, { "epoch": 0.5374753451676528, "grad_norm": 23348.821983132253, "learning_rate": 4.4750799025944866e-05, "loss": 0.0604, "step": 1090 }, { "epoch": 0.5424063116370809, "grad_norm": 16098.754734450737, "learning_rate": 4.462191458473297e-05, "loss": 0.0566, "step": 1100 }, { "epoch": 0.5473372781065089, "grad_norm": 22919.740312664977, "learning_rate": 4.4491657406244295e-05, "loss": 0.0552, "step": 1110 }, { "epoch": 0.5522682445759369, "grad_norm": 15431.450482699285, "learning_rate": 4.436003660319225e-05, "loss": 0.0565, "step": 1120 }, { "epoch": 0.5571992110453649, "grad_norm": 13429.063407401129, "learning_rate": 4.4227061383688614e-05, "loss": 0.0572, "step": 1130 }, { "epoch": 0.5621301775147929, "grad_norm": 18778.278515348524, "learning_rate": 4.4092741050599334e-05, "loss": 0.0576, "step": 1140 }, { "epoch": 0.5670611439842209, "grad_norm": 12919.829101036901, "learning_rate": 4.395708500089366e-05, "loss": 0.0584, "step": 1150 }, { "epoch": 0.571992110453649, "grad_norm": 13210.966353753232, "learning_rate": 4.382010272498679e-05, "loss": 0.0501, "step": 1160 }, { "epoch": 0.5769230769230769, "grad_norm": 19208.029154496824, "learning_rate": 4.3681803806075896e-05, "loss": 0.0621, "step": 1170 }, { "epoch": 0.5818540433925049, "grad_norm": 20638.19061836575, "learning_rate": 4.3542197919469704e-05, "loss": 0.0635, "step": 1180 }, { "epoch": 0.5867850098619329, "grad_norm": 28701.657652477148, "learning_rate": 4.340129483191159e-05, "loss": 0.0599, "step": 1190 }, { "epoch": 0.591715976331361, "grad_norm": 18200.344612122048, "learning_rate": 4.3259104400896355e-05, "loss": 0.0647, "step": 1200 }, { "epoch": 0.596646942800789, "grad_norm": 27827.53629051627, "learning_rate": 4.3115636573980556e-05, "loss": 0.057, "step": 1210 }, { "epoch": 0.6015779092702169, "grad_norm": 20089.312581569335, "learning_rate": 4.2970901388086585e-05, "loss": 0.063, "step": 1220 }, { "epoch": 0.606508875739645, "grad_norm": 17405.30034213716, "learning_rate": 4.282490896880051e-05, "loss": 0.0546, "step": 1230 }, { "epoch": 0.611439842209073, "grad_norm": 14523.301553021613, "learning_rate": 4.267766952966369e-05, "loss": 0.0649, "step": 1240 }, { "epoch": 0.616370808678501, "grad_norm": 18351.71141882958, "learning_rate": 4.252919337145823e-05, "loss": 0.0646, "step": 1250 }, { "epoch": 0.621301775147929, "grad_norm": 18125.006813791824, "learning_rate": 4.2379490881486374e-05, "loss": 0.0596, "step": 1260 }, { "epoch": 0.626232741617357, "grad_norm": 10411.466371265866, "learning_rate": 4.2228572532843756e-05, "loss": 0.0568, "step": 1270 }, { "epoch": 0.631163708086785, "grad_norm": 16474.133907431977, "learning_rate": 4.207644888368678e-05, "loss": 0.0548, "step": 1280 }, { "epoch": 0.636094674556213, "grad_norm": 18319.25325989026, "learning_rate": 4.192313057649394e-05, "loss": 0.0504, "step": 1290 }, { "epoch": 0.6410256410256411, "grad_norm": 18510.18919406282, "learning_rate": 4.176862833732127e-05, "loss": 0.057, "step": 1300 }, { "epoch": 0.645956607495069, "grad_norm": 27594.67426877875, "learning_rate": 4.161295297505198e-05, "loss": 0.061, "step": 1310 }, { "epoch": 0.650887573964497, "grad_norm": 16779.24432148242, "learning_rate": 4.145611538064026e-05, "loss": 0.0525, "step": 1320 }, { "epoch": 0.6558185404339251, "grad_norm": 24001.978585108354, "learning_rate": 4.1298126526349365e-05, "loss": 0.0542, "step": 1330 }, { "epoch": 0.6607495069033531, "grad_norm": 17789.623942062408, "learning_rate": 4.113899746498397e-05, "loss": 0.0564, "step": 1340 }, { "epoch": 0.665680473372781, "grad_norm": 21222.44019899691, "learning_rate": 4.097873932911699e-05, "loss": 0.0566, "step": 1350 }, { "epoch": 0.6706114398422091, "grad_norm": 23036.33790340817, "learning_rate": 4.081736333031067e-05, "loss": 0.0652, "step": 1360 }, { "epoch": 0.6755424063116371, "grad_norm": 18358.58033727009, "learning_rate": 4.065488075833228e-05, "loss": 0.0541, "step": 1370 }, { "epoch": 0.6804733727810651, "grad_norm": 23994.764762339306, "learning_rate": 4.049130298036429e-05, "loss": 0.0547, "step": 1380 }, { "epoch": 0.685404339250493, "grad_norm": 30128.253849169552, "learning_rate": 4.0326641440209114e-05, "loss": 0.0618, "step": 1390 }, { "epoch": 0.6903353057199211, "grad_norm": 14950.507683687534, "learning_rate": 4.016090765748851e-05, "loss": 0.0497, "step": 1400 }, { "epoch": 0.6952662721893491, "grad_norm": 19541.710876993344, "learning_rate": 3.999411322683768e-05, "loss": 0.0578, "step": 1410 }, { "epoch": 0.7001972386587771, "grad_norm": 17140.760309857902, "learning_rate": 3.982626981709412e-05, "loss": 0.051, "step": 1420 }, { "epoch": 0.7051282051282052, "grad_norm": 13185.760198031814, "learning_rate": 3.965738917048125e-05, "loss": 0.0481, "step": 1430 }, { "epoch": 0.7100591715976331, "grad_norm": 16177.203219345425, "learning_rate": 3.948748310178697e-05, "loss": 0.0531, "step": 1440 }, { "epoch": 0.7149901380670611, "grad_norm": 19925.021856951626, "learning_rate": 3.93165634975371e-05, "loss": 0.0676, "step": 1450 }, { "epoch": 0.7199211045364892, "grad_norm": 14203.66487917819, "learning_rate": 3.914464231516374e-05, "loss": 0.0541, "step": 1460 }, { "epoch": 0.7248520710059172, "grad_norm": 14143.869343287924, "learning_rate": 3.897173158216887e-05, "loss": 0.0479, "step": 1470 }, { "epoch": 0.7297830374753451, "grad_norm": 16587.271264436473, "learning_rate": 3.879784339528277e-05, "loss": 0.053, "step": 1480 }, { "epoch": 0.7347140039447732, "grad_norm": 12806.198499164379, "learning_rate": 3.862298991961784e-05, "loss": 0.0543, "step": 1490 }, { "epoch": 0.7396449704142012, "grad_norm": 26141.707365816794, "learning_rate": 3.844718338781748e-05, "loss": 0.042, "step": 1500 }, { "epoch": 0.7445759368836292, "grad_norm": 15241.946332407813, "learning_rate": 3.82704360992003e-05, "loss": 0.0541, "step": 1510 }, { "epoch": 0.7495069033530573, "grad_norm": 19593.61610321076, "learning_rate": 3.809276041889975e-05, "loss": 0.0489, "step": 1520 }, { "epoch": 0.7544378698224852, "grad_norm": 14828.533036008654, "learning_rate": 3.791416877699894e-05, "loss": 0.0537, "step": 1530 }, { "epoch": 0.7593688362919132, "grad_norm": 19848.77547860321, "learning_rate": 3.773467366766113e-05, "loss": 0.047, "step": 1540 }, { "epoch": 0.7642998027613412, "grad_norm": 17099.498004327495, "learning_rate": 3.755428764825561e-05, "loss": 0.049, "step": 1550 }, { "epoch": 0.7692307692307693, "grad_norm": 16774.193512655085, "learning_rate": 3.7373023338479196e-05, "loss": 0.0544, "step": 1560 }, { "epoch": 0.7741617357001972, "grad_norm": 17768.81042726271, "learning_rate": 3.719089341947337e-05, "loss": 0.0457, "step": 1570 }, { "epoch": 0.7790927021696252, "grad_norm": 30253.611751326483, "learning_rate": 3.7007910632937105e-05, "loss": 0.048, "step": 1580 }, { "epoch": 0.7840236686390533, "grad_norm": 24805.826412357237, "learning_rate": 3.6824087780235485e-05, "loss": 0.0579, "step": 1590 }, { "epoch": 0.7889546351084813, "grad_norm": 16483.00263908248, "learning_rate": 3.66394377215041e-05, "loss": 0.0498, "step": 1600 }, { "epoch": 0.7938856015779092, "grad_norm": 35264.37023399113, "learning_rate": 3.6453973374749405e-05, "loss": 0.0533, "step": 1610 }, { "epoch": 0.7988165680473372, "grad_norm": 14936.754132006057, "learning_rate": 3.626770771494489e-05, "loss": 0.0566, "step": 1620 }, { "epoch": 0.8037475345167653, "grad_norm": 12871.965817232425, "learning_rate": 3.608065377312348e-05, "loss": 0.046, "step": 1630 }, { "epoch": 0.8086785009861933, "grad_norm": 23510.821338268895, "learning_rate": 3.589282463546582e-05, "loss": 0.057, "step": 1640 }, { "epoch": 0.8136094674556213, "grad_norm": 14633.87467487678, "learning_rate": 3.5704233442384756e-05, "loss": 0.0506, "step": 1650 }, { "epoch": 0.8185404339250493, "grad_norm": 14611.25484001973, "learning_rate": 3.5514893387606114e-05, "loss": 0.0528, "step": 1660 }, { "epoch": 0.8234714003944773, "grad_norm": 17865.870031991166, "learning_rate": 3.53248177172456e-05, "loss": 0.0472, "step": 1670 }, { "epoch": 0.8284023668639053, "grad_norm": 19515.118242019442, "learning_rate": 3.5134019728882156e-05, "loss": 0.0474, "step": 1680 }, { "epoch": 0.8333333333333334, "grad_norm": 11861.742199188111, "learning_rate": 3.4942512770627654e-05, "loss": 0.0531, "step": 1690 }, { "epoch": 0.8382642998027613, "grad_norm": 10874.551209130426, "learning_rate": 3.475031024019307e-05, "loss": 0.0512, "step": 1700 }, { "epoch": 0.8431952662721893, "grad_norm": 14928.882074689987, "learning_rate": 3.455742558395118e-05, "loss": 0.0502, "step": 1710 }, { "epoch": 0.8481262327416174, "grad_norm": 15317.730380183613, "learning_rate": 3.436387229599587e-05, "loss": 0.0509, "step": 1720 }, { "epoch": 0.8530571992110454, "grad_norm": 15591.351961905035, "learning_rate": 3.416966391719811e-05, "loss": 0.0514, "step": 1730 }, { "epoch": 0.8579881656804734, "grad_norm": 18949.96612134175, "learning_rate": 3.397481403425863e-05, "loss": 0.0535, "step": 1740 }, { "epoch": 0.8629191321499013, "grad_norm": 12076.941997045444, "learning_rate": 3.3779336278757386e-05, "loss": 0.0498, "step": 1750 }, { "epoch": 0.8678500986193294, "grad_norm": 24920.90399644443, "learning_rate": 3.3583244326199924e-05, "loss": 0.0431, "step": 1760 }, { "epoch": 0.8727810650887574, "grad_norm": 15472.821074387179, "learning_rate": 3.338655189506063e-05, "loss": 0.0488, "step": 1770 }, { "epoch": 0.8777120315581854, "grad_norm": 14609.320312731868, "learning_rate": 3.318927274582307e-05, "loss": 0.0466, "step": 1780 }, { "epoch": 0.8826429980276134, "grad_norm": 17558.566684100384, "learning_rate": 3.299142068001716e-05, "loss": 0.0496, "step": 1790 }, { "epoch": 0.8875739644970414, "grad_norm": 15509.33654286991, "learning_rate": 3.279300953925378e-05, "loss": 0.0406, "step": 1800 }, { "epoch": 0.8925049309664694, "grad_norm": 12294.958316318116, "learning_rate": 3.259405320425634e-05, "loss": 0.0511, "step": 1810 }, { "epoch": 0.8974358974358975, "grad_norm": 15796.216762250384, "learning_rate": 3.2394565593889705e-05, "loss": 0.0519, "step": 1820 }, { "epoch": 0.9023668639053254, "grad_norm": 14265.63535213206, "learning_rate": 3.2194560664186434e-05, "loss": 0.0435, "step": 1830 }, { "epoch": 0.9072978303747534, "grad_norm": 18098.215602649892, "learning_rate": 3.199405240737045e-05, "loss": 0.0473, "step": 1840 }, { "epoch": 0.9122287968441815, "grad_norm": 25492.097599059987, "learning_rate": 3.179305485087813e-05, "loss": 0.0471, "step": 1850 }, { "epoch": 0.9171597633136095, "grad_norm": 19625.7488010012, "learning_rate": 3.159158205637696e-05, "loss": 0.0521, "step": 1860 }, { "epoch": 0.9220907297830375, "grad_norm": 16158.12761429987, "learning_rate": 3.1389648118781795e-05, "loss": 0.043, "step": 1870 }, { "epoch": 0.9270216962524654, "grad_norm": 16575.068506645755, "learning_rate": 3.1187267165268774e-05, "loss": 0.0491, "step": 1880 }, { "epoch": 0.9319526627218935, "grad_norm": 16422.20642910081, "learning_rate": 3.0984453354287e-05, "loss": 0.0516, "step": 1890 }, { "epoch": 0.9368836291913215, "grad_norm": 14754.34851154059, "learning_rate": 3.078122087456802e-05, "loss": 0.0497, "step": 1900 }, { "epoch": 0.9418145956607495, "grad_norm": 15679.373456870017, "learning_rate": 3.057758394413321e-05, "loss": 0.0488, "step": 1910 }, { "epoch": 0.9467455621301775, "grad_norm": 15054.075860045345, "learning_rate": 3.037355680929906e-05, "loss": 0.0468, "step": 1920 }, { "epoch": 0.9516765285996055, "grad_norm": 22642.915006685867, "learning_rate": 3.016915374368052e-05, "loss": 0.0458, "step": 1930 }, { "epoch": 0.9566074950690335, "grad_norm": 15260.94151748181, "learning_rate": 2.9964389047192428e-05, "loss": 0.0401, "step": 1940 }, { "epoch": 0.9615384615384616, "grad_norm": 21582.143730408246, "learning_rate": 2.9759277045049117e-05, "loss": 0.0467, "step": 1950 }, { "epoch": 0.9664694280078896, "grad_norm": 21636.31317946752, "learning_rate": 2.95538320867622e-05, "loss": 0.0478, "step": 1960 }, { "epoch": 0.9714003944773175, "grad_norm": 19466.86292138515, "learning_rate": 2.9348068545136703e-05, "loss": 0.0392, "step": 1970 }, { "epoch": 0.9763313609467456, "grad_norm": 20193.997524016882, "learning_rate": 2.9142000815265552e-05, "loss": 0.0492, "step": 1980 }, { "epoch": 0.9812623274161736, "grad_norm": 16234.605015213643, "learning_rate": 2.89356433135225e-05, "loss": 0.0478, "step": 1990 }, { "epoch": 0.9861932938856016, "grad_norm": 19339.389028612048, "learning_rate": 2.8729010476553546e-05, "loss": 0.0473, "step": 2000 }, { "epoch": 0.9911242603550295, "grad_norm": 32683.024584637205, "learning_rate": 2.852211676026701e-05, "loss": 0.0446, "step": 2010 }, { "epoch": 0.9960552268244576, "grad_norm": 16478.83393932957, "learning_rate": 2.8314976638822142e-05, "loss": 0.0434, "step": 2020 }, { "epoch": 1.0009861932938855, "grad_norm": 12970.191054876563, "learning_rate": 2.8107604603616555e-05, "loss": 0.0406, "step": 2030 }, { "epoch": 1.0059171597633136, "grad_norm": 16996.303598135684, "learning_rate": 2.790001516227241e-05, "loss": 0.0372, "step": 2040 }, { "epoch": 1.0108481262327416, "grad_norm": 33541.365982917276, "learning_rate": 2.769222283762148e-05, "loss": 0.0384, "step": 2050 }, { "epoch": 1.0157790927021697, "grad_norm": 37506.9998800224, "learning_rate": 2.7484242166689123e-05, "loss": 0.0441, "step": 2060 }, { "epoch": 1.0207100591715976, "grad_norm": 37564.614838967806, "learning_rate": 2.7276087699677277e-05, "loss": 0.0351, "step": 2070 }, { "epoch": 1.0256410256410255, "grad_norm": 51239.6546436449, "learning_rate": 2.706777399894656e-05, "loss": 0.0381, "step": 2080 }, { "epoch": 1.0305719921104537, "grad_norm": 25657.361984428564, "learning_rate": 2.68593156379975e-05, "loss": 0.0404, "step": 2090 }, { "epoch": 1.0355029585798816, "grad_norm": 40016.91642293294, "learning_rate": 2.6650727200450936e-05, "loss": 0.0352, "step": 2100 }, { "epoch": 1.0404339250493098, "grad_norm": 39623.3902638328, "learning_rate": 2.6442023279027806e-05, "loss": 0.0397, "step": 2110 }, { "epoch": 1.0453648915187377, "grad_norm": 42575.39270517654, "learning_rate": 2.623321847452823e-05, "loss": 0.0358, "step": 2120 }, { "epoch": 1.0502958579881656, "grad_norm": 29283.62491222697, "learning_rate": 2.6024327394810044e-05, "loss": 0.0372, "step": 2130 }, { "epoch": 1.0552268244575937, "grad_norm": 36733.54347187322, "learning_rate": 2.5815364653766837e-05, "loss": 0.0359, "step": 2140 }, { "epoch": 1.0601577909270217, "grad_norm": 45356.02240055889, "learning_rate": 2.5606344870305597e-05, "loss": 0.0343, "step": 2150 }, { "epoch": 1.0650887573964498, "grad_norm": 53383.08586059821, "learning_rate": 2.5397282667323956e-05, "loss": 0.0374, "step": 2160 }, { "epoch": 1.0700197238658777, "grad_norm": 44916.75500300528, "learning_rate": 2.518819267068718e-05, "loss": 0.0368, "step": 2170 }, { "epoch": 1.0749506903353057, "grad_norm": 45634.131787511855, "learning_rate": 2.4979089508205002e-05, "loss": 0.033, "step": 2180 }, { "epoch": 1.0798816568047338, "grad_norm": 33894.735874468766, "learning_rate": 2.4769987808608175e-05, "loss": 0.0438, "step": 2190 }, { "epoch": 1.0848126232741617, "grad_norm": 29514.47861643502, "learning_rate": 2.456090220052514e-05, "loss": 0.0335, "step": 2200 }, { "epoch": 1.0897435897435896, "grad_norm": 45448.43583667099, "learning_rate": 2.4351847311458587e-05, "loss": 0.0371, "step": 2210 }, { "epoch": 1.0946745562130178, "grad_norm": 33717.85046529509, "learning_rate": 2.4142837766762108e-05, "loss": 0.0295, "step": 2220 }, { "epoch": 1.0996055226824457, "grad_norm": 43915.722924711146, "learning_rate": 2.393388818861705e-05, "loss": 0.0385, "step": 2230 }, { "epoch": 1.1045364891518739, "grad_norm": 42817.53958367996, "learning_rate": 2.3725013195009522e-05, "loss": 0.0383, "step": 2240 }, { "epoch": 1.1094674556213018, "grad_norm": 55755.649758567066, "learning_rate": 2.3516227398707754e-05, "loss": 0.034, "step": 2250 }, { "epoch": 1.1143984220907297, "grad_norm": 33225.09894642904, "learning_rate": 2.330754540623975e-05, "loss": 0.0323, "step": 2260 }, { "epoch": 1.1193293885601578, "grad_norm": 53806.90810667344, "learning_rate": 2.309898181687152e-05, "loss": 0.0349, "step": 2270 }, { "epoch": 1.1242603550295858, "grad_norm": 47541.57086172059, "learning_rate": 2.289055122158562e-05, "loss": 0.0394, "step": 2280 }, { "epoch": 1.1291913214990137, "grad_norm": 38933.466118495024, "learning_rate": 2.2682268202060442e-05, "loss": 0.0331, "step": 2290 }, { "epoch": 1.1341222879684418, "grad_norm": 32132.282583097018, "learning_rate": 2.2474147329650052e-05, "loss": 0.0331, "step": 2300 }, { "epoch": 1.1390532544378698, "grad_norm": 35663.32166245876, "learning_rate": 2.2266203164364818e-05, "loss": 0.0329, "step": 2310 }, { "epoch": 1.143984220907298, "grad_norm": 60547.0083819176, "learning_rate": 2.205845025385278e-05, "loss": 0.0417, "step": 2320 }, { "epoch": 1.1489151873767258, "grad_norm": 30475.69208402001, "learning_rate": 2.185090313238189e-05, "loss": 0.0316, "step": 2330 }, { "epoch": 1.1538461538461537, "grad_norm": 28629.068304784214, "learning_rate": 2.1643576319823254e-05, "loss": 0.0312, "step": 2340 }, { "epoch": 1.1587771203155819, "grad_norm": 35177.58013280618, "learning_rate": 2.1436484320635276e-05, "loss": 0.033, "step": 2350 }, { "epoch": 1.1637080867850098, "grad_norm": 38736.13052435671, "learning_rate": 2.1229641622848964e-05, "loss": 0.0355, "step": 2360 }, { "epoch": 1.168639053254438, "grad_norm": 41191.78947314622, "learning_rate": 2.1023062697054368e-05, "loss": 0.0446, "step": 2370 }, { "epoch": 1.1735700197238659, "grad_norm": 39650.51646574102, "learning_rate": 2.0816761995388197e-05, "loss": 0.0346, "step": 2380 }, { "epoch": 1.1785009861932938, "grad_norm": 44066.93091196618, "learning_rate": 2.0610753950522752e-05, "loss": 0.0387, "step": 2390 }, { "epoch": 1.183431952662722, "grad_norm": 35579.78032534771, "learning_rate": 2.0405052974656253e-05, "loss": 0.0376, "step": 2400 }, { "epoch": 1.1883629191321499, "grad_norm": 48319.300657190804, "learning_rate": 2.0199673458504574e-05, "loss": 0.0346, "step": 2410 }, { "epoch": 1.193293885601578, "grad_norm": 50381.03262141418, "learning_rate": 1.9994629770294448e-05, "loss": 0.0339, "step": 2420 }, { "epoch": 1.198224852071006, "grad_norm": 34522.5130023873, "learning_rate": 1.9789936254758288e-05, "loss": 0.0301, "step": 2430 }, { "epoch": 1.2031558185404339, "grad_norm": 51816.098193515114, "learning_rate": 1.9585607232130633e-05, "loss": 0.0402, "step": 2440 }, { "epoch": 1.208086785009862, "grad_norm": 50369.39261098947, "learning_rate": 1.9381656997146343e-05, "loss": 0.0406, "step": 2450 }, { "epoch": 1.21301775147929, "grad_norm": 55031.280704704666, "learning_rate": 1.9178099818040482e-05, "loss": 0.0326, "step": 2460 }, { "epoch": 1.217948717948718, "grad_norm": 39373.461112785095, "learning_rate": 1.8974949935550202e-05, "loss": 0.0295, "step": 2470 }, { "epoch": 1.222879684418146, "grad_norm": 51596.797419994975, "learning_rate": 1.877222156191841e-05, "loss": 0.0314, "step": 2480 }, { "epoch": 1.227810650887574, "grad_norm": 46556.9511029234, "learning_rate": 1.8569928879899506e-05, "loss": 0.0369, "step": 2490 }, { "epoch": 1.232741617357002, "grad_norm": 60444.887161777384, "learning_rate": 1.836808604176719e-05, "loss": 0.0367, "step": 2500 }, { "epoch": 1.23767258382643, "grad_norm": 40896.774640550815, "learning_rate": 1.8166707168324338e-05, "loss": 0.0319, "step": 2510 }, { "epoch": 1.242603550295858, "grad_norm": 53234.142427581195, "learning_rate": 1.796580634791514e-05, "loss": 0.0335, "step": 2520 }, { "epoch": 1.247534516765286, "grad_norm": 28195.286982047193, "learning_rate": 1.7765397635439468e-05, "loss": 0.0278, "step": 2530 }, { "epoch": 1.252465483234714, "grad_norm": 17427.74615376297, "learning_rate": 1.7585462135142083e-05, "loss": 0.0344, "step": 2540 }, { "epoch": 1.2573964497041419, "grad_norm": 21108.922094697304, "learning_rate": 1.738602702489652e-05, "loss": 0.0349, "step": 2550 }, { "epoch": 1.26232741617357, "grad_norm": 24377.500446108086, "learning_rate": 1.718712458359528e-05, "loss": 0.034, "step": 2560 }, { "epoch": 1.267258382642998, "grad_norm": 20456.646059410617, "learning_rate": 1.6988768726332854e-05, "loss": 0.0311, "step": 2570 }, { "epoch": 1.272189349112426, "grad_norm": 15780.460829773001, "learning_rate": 1.6790973329965044e-05, "loss": 0.0275, "step": 2580 }, { "epoch": 1.277120315581854, "grad_norm": 25885.175371242898, "learning_rate": 1.659375223213816e-05, "loss": 0.0349, "step": 2590 }, { "epoch": 1.282051282051282, "grad_norm": 17902.541495553083, "learning_rate": 1.6397119230320918e-05, "loss": 0.0309, "step": 2600 }, { "epoch": 1.28698224852071, "grad_norm": 18151.209326102766, "learning_rate": 1.62010880808392e-05, "loss": 0.0341, "step": 2610 }, { "epoch": 1.291913214990138, "grad_norm": 10715.816721090372, "learning_rate": 1.602518596574137e-05, "loss": 0.0322, "step": 2620 }, { "epoch": 1.2968441814595661, "grad_norm": 11396.753923815324, "learning_rate": 1.583033608280189e-05, "loss": 0.0297, "step": 2630 }, { "epoch": 1.301775147928994, "grad_norm": 10740.794756441443, "learning_rate": 1.5636127704004134e-05, "loss": 0.0348, "step": 2640 }, { "epoch": 1.306706114398422, "grad_norm": 12196.05509990833, "learning_rate": 1.5442574416048827e-05, "loss": 0.0341, "step": 2650 }, { "epoch": 1.3116370808678501, "grad_norm": 13820.070043237842, "learning_rate": 1.5249689759806935e-05, "loss": 0.0344, "step": 2660 }, { "epoch": 1.316568047337278, "grad_norm": 9524.579570773714, "learning_rate": 1.5057487229372347e-05, "loss": 0.0314, "step": 2670 }, { "epoch": 1.3214990138067062, "grad_norm": 14665.234536140226, "learning_rate": 1.4865980271117852e-05, "loss": 0.0308, "step": 2680 }, { "epoch": 1.3264299802761341, "grad_norm": 10442.700417037731, "learning_rate": 1.4675182282754408e-05, "loss": 0.0357, "step": 2690 }, { "epoch": 1.331360946745562, "grad_norm": 9396.478914997893, "learning_rate": 1.4485106612393899e-05, "loss": 0.0303, "step": 2700 }, { "epoch": 1.3362919132149902, "grad_norm": 17759.793692495416, "learning_rate": 1.4295766557615248e-05, "loss": 0.0352, "step": 2710 }, { "epoch": 1.3412228796844181, "grad_norm": 11063.842732070987, "learning_rate": 1.4107175364534184e-05, "loss": 0.0304, "step": 2720 }, { "epoch": 1.3461538461538463, "grad_norm": 9136.923333376504, "learning_rate": 1.391934622687652e-05, "loss": 0.0325, "step": 2730 }, { "epoch": 1.3510848126232742, "grad_norm": 10481.712073893272, "learning_rate": 1.3732292285055107e-05, "loss": 0.0361, "step": 2740 }, { "epoch": 1.356015779092702, "grad_norm": 10502.118643397627, "learning_rate": 1.3546026625250612e-05, "loss": 0.0355, "step": 2750 }, { "epoch": 1.3609467455621302, "grad_norm": 12646.443294460305, "learning_rate": 1.3360562278495896e-05, "loss": 0.0333, "step": 2760 }, { "epoch": 1.3658777120315582, "grad_norm": 11539.639509100793, "learning_rate": 1.3175912219764514e-05, "loss": 0.0303, "step": 2770 }, { "epoch": 1.3708086785009863, "grad_norm": 14185.68179538791, "learning_rate": 1.29920893670629e-05, "loss": 0.0301, "step": 2780 }, { "epoch": 1.3757396449704142, "grad_norm": 17136.45564286851, "learning_rate": 1.2809106580526636e-05, "loss": 0.0361, "step": 2790 }, { "epoch": 1.3806706114398422, "grad_norm": 9303.49181759193, "learning_rate": 1.2626976661520811e-05, "loss": 0.0329, "step": 2800 }, { "epoch": 1.3856015779092703, "grad_norm": 13639.397054122297, "learning_rate": 1.2445712351744402e-05, "loss": 0.0299, "step": 2810 }, { "epoch": 1.3905325443786982, "grad_norm": 11963.693743990607, "learning_rate": 1.2265326332338875e-05, "loss": 0.0365, "step": 2820 }, { "epoch": 1.3954635108481264, "grad_norm": 11382.881181844956, "learning_rate": 1.2085831223001071e-05, "loss": 0.0349, "step": 2830 }, { "epoch": 1.4003944773175543, "grad_norm": 10434.131684045395, "learning_rate": 1.1907239581100255e-05, "loss": 0.0332, "step": 2840 }, { "epoch": 1.4053254437869822, "grad_norm": 13172.61401544887, "learning_rate": 1.1729563900799694e-05, "loss": 0.0302, "step": 2850 }, { "epoch": 1.4102564102564101, "grad_norm": 8347.449430814182, "learning_rate": 1.155281661218253e-05, "loss": 0.0288, "step": 2860 }, { "epoch": 1.4151873767258383, "grad_norm": 10608.303163088807, "learning_rate": 1.1377010080382155e-05, "loss": 0.0332, "step": 2870 }, { "epoch": 1.4201183431952662, "grad_norm": 13408.207635623787, "learning_rate": 1.1202156604717235e-05, "loss": 0.0342, "step": 2880 }, { "epoch": 1.4250493096646943, "grad_norm": 10305.117951775224, "learning_rate": 1.1028268417831134e-05, "loss": 0.0311, "step": 2890 }, { "epoch": 1.4299802761341223, "grad_norm": 11212.751312679686, "learning_rate": 1.0855357684836257e-05, "loss": 0.0277, "step": 2900 }, { "epoch": 1.4349112426035502, "grad_norm": 10448.69408108018, "learning_rate": 1.0683436502462916e-05, "loss": 0.0311, "step": 2910 }, { "epoch": 1.4398422090729783, "grad_norm": 12742.063255218913, "learning_rate": 1.051251689821303e-05, "loss": 0.0286, "step": 2920 }, { "epoch": 1.4447731755424063, "grad_norm": 8262.845030617482, "learning_rate": 1.0342610829518756e-05, "loss": 0.0272, "step": 2930 }, { "epoch": 1.4497041420118344, "grad_norm": 6957.433434823505, "learning_rate": 1.017373018290588e-05, "loss": 0.0326, "step": 2940 }, { "epoch": 1.4546351084812623, "grad_norm": 11696.316685179143, "learning_rate": 1.000588677316232e-05, "loss": 0.029, "step": 2950 }, { "epoch": 1.4595660749506902, "grad_norm": 9799.848978428188, "learning_rate": 9.839092342511496e-06, "loss": 0.0332, "step": 2960 }, { "epoch": 1.4644970414201184, "grad_norm": 14969.193431845284, "learning_rate": 9.67335855979089e-06, "loss": 0.0299, "step": 2970 }, { "epoch": 1.4694280078895463, "grad_norm": 10413.710577887212, "learning_rate": 9.508697019635717e-06, "loss": 0.03, "step": 2980 }, { "epoch": 1.4743589743589745, "grad_norm": 9404.458091777537, "learning_rate": 9.345119241667727e-06, "loss": 0.0289, "step": 2990 }, { "epoch": 1.4792899408284024, "grad_norm": 14476.315553344366, "learning_rate": 9.182636669689335e-06, "loss": 0.0312, "step": 3000 }, { "epoch": 1.4842209072978303, "grad_norm": 9446.193307359319, "learning_rate": 9.021260670883013e-06, "loss": 0.0282, "step": 3010 }, { "epoch": 1.4891518737672584, "grad_norm": 7762.654700551867, "learning_rate": 8.861002535016033e-06, "loss": 0.0273, "step": 3020 }, { "epoch": 1.4940828402366864, "grad_norm": 9771.784279239897, "learning_rate": 8.701873473650643e-06, "loss": 0.0298, "step": 3030 }, { "epoch": 1.4990138067061145, "grad_norm": 12271.321363243651, "learning_rate": 8.543884619359748e-06, "loss": 0.0266, "step": 3040 }, { "epoch": 1.5039447731755424, "grad_norm": 8355.98803254289, "learning_rate": 8.387047024948025e-06, "loss": 0.0335, "step": 3050 }, { "epoch": 1.5088757396449703, "grad_norm": 8749.858513141799, "learning_rate": 8.231371662678741e-06, "loss": 0.0279, "step": 3060 }, { "epoch": 1.5138067061143983, "grad_norm": 12467.332673832041, "learning_rate": 8.076869423506068e-06, "loss": 0.0296, "step": 3070 }, { "epoch": 1.5187376725838264, "grad_norm": 12654.59600303384, "learning_rate": 7.923551116313227e-06, "loss": 0.0251, "step": 3080 }, { "epoch": 1.5236686390532546, "grad_norm": 15393.669867838533, "learning_rate": 7.771427467156255e-06, "loss": 0.0313, "step": 3090 }, { "epoch": 1.5285996055226825, "grad_norm": 12831.222545026643, "learning_rate": 7.620509118513636e-06, "loss": 0.0297, "step": 3100 }, { "epoch": 1.5335305719921104, "grad_norm": 14548.99089284202, "learning_rate": 7.470806628541774e-06, "loss": 0.0311, "step": 3110 }, { "epoch": 1.5384615384615383, "grad_norm": 10693.692720477806, "learning_rate": 7.3223304703363135e-06, "loss": 0.0305, "step": 3120 }, { "epoch": 1.5433925049309665, "grad_norm": 7706.081234972806, "learning_rate": 7.175091031199491e-06, "loss": 0.0304, "step": 3130 }, { "epoch": 1.5483234714003946, "grad_norm": 11757.682084492675, "learning_rate": 7.02909861191342e-06, "loss": 0.0291, "step": 3140 }, { "epoch": 1.5532544378698225, "grad_norm": 8024.787598435239, "learning_rate": 6.884363426019444e-06, "loss": 0.0257, "step": 3150 }, { "epoch": 1.5581854043392505, "grad_norm": 10546.368095225958, "learning_rate": 6.740895599103645e-06, "loss": 0.0326, "step": 3160 }, { "epoch": 1.5631163708086784, "grad_norm": 8094.14306767554, "learning_rate": 6.598705168088415e-06, "loss": 0.0298, "step": 3170 }, { "epoch": 1.5680473372781065, "grad_norm": 12800.953714469872, "learning_rate": 6.457802080530304e-06, "loss": 0.0277, "step": 3180 }, { "epoch": 1.5729783037475347, "grad_norm": 11911.56480064647, "learning_rate": 6.318196193924109e-06, "loss": 0.0263, "step": 3190 }, { "epoch": 1.5779092702169626, "grad_norm": 8510.830746760272, "learning_rate": 6.179897275013211e-06, "loss": 0.0279, "step": 3200 } ], "logging_steps": 10, "max_steps": 4056, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 400, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4351113300371046e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }