model-qwen2-5-32b-instruct-.../trainer_state.json
2025-12-16 21:45:36 +08:00

2274 lines
55 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.5779092702169626,
"eval_steps": 500,
"global_step": 3200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004930966469428008,
"grad_norm": 43895.33150575355,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.6914,
"step": 10
},
{
"epoch": 0.009861932938856016,
"grad_norm": 34666.69271793893,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.653,
"step": 20
},
{
"epoch": 0.014792899408284023,
"grad_norm": 29566.888507247426,
"learning_rate": 5e-06,
"loss": 0.6223,
"step": 30
},
{
"epoch": 0.01972386587771203,
"grad_norm": 17779.597295776977,
"learning_rate": 6.666666666666667e-06,
"loss": 0.5303,
"step": 40
},
{
"epoch": 0.02465483234714004,
"grad_norm": 10344.985645229286,
"learning_rate": 8.166666666666668e-06,
"loss": 0.4624,
"step": 50
},
{
"epoch": 0.029585798816568046,
"grad_norm": 7057.267176464272,
"learning_rate": 9.833333333333333e-06,
"loss": 0.4089,
"step": 60
},
{
"epoch": 0.03451676528599606,
"grad_norm": 7072.531088655603,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.3719,
"step": 70
},
{
"epoch": 0.03944773175542406,
"grad_norm": 5647.309093718884,
"learning_rate": 1.3166666666666665e-05,
"loss": 0.2918,
"step": 80
},
{
"epoch": 0.04437869822485207,
"grad_norm": 16472.366193112633,
"learning_rate": 1.4833333333333336e-05,
"loss": 0.2643,
"step": 90
},
{
"epoch": 0.04930966469428008,
"grad_norm": 17463.691247843337,
"learning_rate": 1.65e-05,
"loss": 0.2145,
"step": 100
},
{
"epoch": 0.054240631163708086,
"grad_norm": 34631.59066517159,
"learning_rate": 1.8166666666666667e-05,
"loss": 0.2055,
"step": 110
},
{
"epoch": 0.05917159763313609,
"grad_norm": 13642.827272966553,
"learning_rate": 1.9833333333333335e-05,
"loss": 0.1955,
"step": 120
},
{
"epoch": 0.0641025641025641,
"grad_norm": 12345.246210586487,
"learning_rate": 2.15e-05,
"loss": 0.1747,
"step": 130
},
{
"epoch": 0.06903353057199212,
"grad_norm": 18095.64544303408,
"learning_rate": 2.3166666666666666e-05,
"loss": 0.16,
"step": 140
},
{
"epoch": 0.07396449704142012,
"grad_norm": 10606.944894737599,
"learning_rate": 2.4833333333333335e-05,
"loss": 0.1483,
"step": 150
},
{
"epoch": 0.07889546351084813,
"grad_norm": 19168.698651708208,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.1438,
"step": 160
},
{
"epoch": 0.08382642998027613,
"grad_norm": 14566.588619165435,
"learning_rate": 2.816666666666667e-05,
"loss": 0.126,
"step": 170
},
{
"epoch": 0.08875739644970414,
"grad_norm": 14365.429057288891,
"learning_rate": 2.9833333333333335e-05,
"loss": 0.1203,
"step": 180
},
{
"epoch": 0.09368836291913216,
"grad_norm": 13554.579447552034,
"learning_rate": 3.15e-05,
"loss": 0.1375,
"step": 190
},
{
"epoch": 0.09861932938856016,
"grad_norm": 14373.095421655004,
"learning_rate": 3.316666666666667e-05,
"loss": 0.1104,
"step": 200
},
{
"epoch": 0.10355029585798817,
"grad_norm": 8761.746857790404,
"learning_rate": 3.483333333333334e-05,
"loss": 0.1109,
"step": 210
},
{
"epoch": 0.10848126232741617,
"grad_norm": 14477.860062868407,
"learning_rate": 3.65e-05,
"loss": 0.1126,
"step": 220
},
{
"epoch": 0.11341222879684418,
"grad_norm": 12729.454976549467,
"learning_rate": 3.816666666666667e-05,
"loss": 0.112,
"step": 230
},
{
"epoch": 0.11834319526627218,
"grad_norm": 10014.094467299577,
"learning_rate": 3.983333333333333e-05,
"loss": 0.1014,
"step": 240
},
{
"epoch": 0.1232741617357002,
"grad_norm": 11426.459469144413,
"learning_rate": 4.15e-05,
"loss": 0.1117,
"step": 250
},
{
"epoch": 0.1282051282051282,
"grad_norm": 12138.764352272434,
"learning_rate": 4.316666666666667e-05,
"loss": 0.1085,
"step": 260
},
{
"epoch": 0.13313609467455623,
"grad_norm": 17461.02585760642,
"learning_rate": 4.483333333333333e-05,
"loss": 0.1053,
"step": 270
},
{
"epoch": 0.13806706114398423,
"grad_norm": 11649.5053972261,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.0993,
"step": 280
},
{
"epoch": 0.14299802761341224,
"grad_norm": 8499.819762794974,
"learning_rate": 4.8166666666666674e-05,
"loss": 0.0982,
"step": 290
},
{
"epoch": 0.14792899408284024,
"grad_norm": 15086.851493933385,
"learning_rate": 4.9833333333333336e-05,
"loss": 0.0916,
"step": 300
},
{
"epoch": 0.15285996055226825,
"grad_norm": 14996.807660298908,
"learning_rate": 4.9999291660339096e-05,
"loss": 0.0984,
"step": 310
},
{
"epoch": 0.15779092702169625,
"grad_norm": 12750.902085734953,
"learning_rate": 4.999684313032836e-05,
"loss": 0.1033,
"step": 320
},
{
"epoch": 0.16272189349112426,
"grad_norm": 16534.053102612197,
"learning_rate": 4.999264583629061e-05,
"loss": 0.1017,
"step": 330
},
{
"epoch": 0.16765285996055226,
"grad_norm": 9247.551459710836,
"learning_rate": 4.998670007186599e-05,
"loss": 0.0999,
"step": 340
},
{
"epoch": 0.17258382642998027,
"grad_norm": 9434.010811950557,
"learning_rate": 4.997900625301659e-05,
"loss": 0.116,
"step": 350
},
{
"epoch": 0.17751479289940827,
"grad_norm": 14049.300053739333,
"learning_rate": 4.996956491799731e-05,
"loss": 0.0832,
"step": 360
},
{
"epoch": 0.18244575936883628,
"grad_norm": 13846.231545081138,
"learning_rate": 4.995837672731827e-05,
"loss": 0.0853,
"step": 370
},
{
"epoch": 0.1873767258382643,
"grad_norm": 8305.576921563004,
"learning_rate": 4.994544246369849e-05,
"loss": 0.0868,
"step": 380
},
{
"epoch": 0.19230769230769232,
"grad_norm": 7614.640372335387,
"learning_rate": 4.993076303201124e-05,
"loss": 0.0813,
"step": 390
},
{
"epoch": 0.19723865877712032,
"grad_norm": 8342.321978921696,
"learning_rate": 4.9914339459220685e-05,
"loss": 0.0827,
"step": 400
},
{
"epoch": 0.20216962524654833,
"grad_norm": 9845.654879184014,
"learning_rate": 4.989617289431004e-05,
"loss": 0.0722,
"step": 410
},
{
"epoch": 0.20710059171597633,
"grad_norm": 10443.866333882295,
"learning_rate": 4.9876264608201203e-05,
"loss": 0.0765,
"step": 420
},
{
"epoch": 0.21203155818540434,
"grad_norm": 8376.00382043848,
"learning_rate": 4.985461599366583e-05,
"loss": 0.0867,
"step": 430
},
{
"epoch": 0.21696252465483234,
"grad_norm": 14326.32150972468,
"learning_rate": 4.9831228565227906e-05,
"loss": 0.0714,
"step": 440
},
{
"epoch": 0.22189349112426035,
"grad_norm": 11864.219148346847,
"learning_rate": 4.980610395905776e-05,
"loss": 0.0763,
"step": 450
},
{
"epoch": 0.22682445759368836,
"grad_norm": 8294.590526361142,
"learning_rate": 4.977924393285767e-05,
"loss": 0.0816,
"step": 460
},
{
"epoch": 0.23175542406311636,
"grad_norm": 9196.279247608785,
"learning_rate": 4.9750650365738806e-05,
"loss": 0.0717,
"step": 470
},
{
"epoch": 0.23668639053254437,
"grad_norm": 10312.943323804315,
"learning_rate": 4.972032525808983e-05,
"loss": 0.0838,
"step": 480
},
{
"epoch": 0.2416173570019724,
"grad_norm": 7124.782101931259,
"learning_rate": 4.968827073143694e-05,
"loss": 0.078,
"step": 490
},
{
"epoch": 0.2465483234714004,
"grad_norm": 8737.514520731855,
"learning_rate": 4.965448902829543e-05,
"loss": 0.0686,
"step": 500
},
{
"epoch": 0.2514792899408284,
"grad_norm": 6490.680703901556,
"learning_rate": 4.961898251201282e-05,
"loss": 0.0714,
"step": 510
},
{
"epoch": 0.2564102564102564,
"grad_norm": 6539.6247598772825,
"learning_rate": 4.9581753666603517e-05,
"loss": 0.0789,
"step": 520
},
{
"epoch": 0.2613412228796844,
"grad_norm": 9437.219505765455,
"learning_rate": 4.954280509657502e-05,
"loss": 0.0677,
"step": 530
},
{
"epoch": 0.26627218934911245,
"grad_norm": 13868.67549551867,
"learning_rate": 4.9502139526745736e-05,
"loss": 0.0838,
"step": 540
},
{
"epoch": 0.27120315581854043,
"grad_norm": 9426.942240196448,
"learning_rate": 4.945975980205435e-05,
"loss": 0.0693,
"step": 550
},
{
"epoch": 0.27613412228796846,
"grad_norm": 9122.218589794918,
"learning_rate": 4.9415668887360756e-05,
"loss": 0.0751,
"step": 560
},
{
"epoch": 0.28106508875739644,
"grad_norm": 12629.583049333021,
"learning_rate": 4.936986986723867e-05,
"loss": 0.0795,
"step": 570
},
{
"epoch": 0.2859960552268245,
"grad_norm": 9029.990033217091,
"learning_rate": 4.932236594575986e-05,
"loss": 0.0756,
"step": 580
},
{
"epoch": 0.29092702169625245,
"grad_norm": 22628.703895716168,
"learning_rate": 4.927316044626993e-05,
"loss": 0.0724,
"step": 590
},
{
"epoch": 0.2958579881656805,
"grad_norm": 7334.351232385861,
"learning_rate": 4.9222256811155875e-05,
"loss": 0.0598,
"step": 600
},
{
"epoch": 0.30078895463510846,
"grad_norm": 11844.172913293693,
"learning_rate": 4.916965860160521e-05,
"loss": 0.0824,
"step": 610
},
{
"epoch": 0.3057199211045365,
"grad_norm": 10243.98516203533,
"learning_rate": 4.911536949735686e-05,
"loss": 0.0675,
"step": 620
},
{
"epoch": 0.3106508875739645,
"grad_norm": 8889.587616982017,
"learning_rate": 4.9059393296443724e-05,
"loss": 0.074,
"step": 630
},
{
"epoch": 0.3155818540433925,
"grad_norm": 10217.20353129955,
"learning_rate": 4.9001733914926974e-05,
"loss": 0.0722,
"step": 640
},
{
"epoch": 0.32051282051282054,
"grad_norm": 9370.911588527553,
"learning_rate": 4.8942395386622064e-05,
"loss": 0.0688,
"step": 650
},
{
"epoch": 0.3254437869822485,
"grad_norm": 11153.991931142858,
"learning_rate": 4.8881381862816536e-05,
"loss": 0.0794,
"step": 660
},
{
"epoch": 0.33037475345167655,
"grad_norm": 7372.450338930741,
"learning_rate": 4.881869761197963e-05,
"loss": 0.0721,
"step": 670
},
{
"epoch": 0.33530571992110453,
"grad_norm": 8280.749724511665,
"learning_rate": 4.875434701946361e-05,
"loss": 0.0703,
"step": 680
},
{
"epoch": 0.34023668639053256,
"grad_norm": 8886.890119721296,
"learning_rate": 4.868833458719702e-05,
"loss": 0.0673,
"step": 690
},
{
"epoch": 0.34516765285996054,
"grad_norm": 9734.842987947983,
"learning_rate": 4.8620664933369676e-05,
"loss": 0.0723,
"step": 700
},
{
"epoch": 0.35009861932938857,
"grad_norm": 9477.64358899405,
"learning_rate": 4.855134279210964e-05,
"loss": 0.066,
"step": 710
},
{
"epoch": 0.35502958579881655,
"grad_norm": 7460.336989707636,
"learning_rate": 4.8480373013152e-05,
"loss": 0.0684,
"step": 720
},
{
"epoch": 0.3599605522682446,
"grad_norm": 7986.478072342025,
"learning_rate": 4.840776056149957e-05,
"loss": 0.0634,
"step": 730
},
{
"epoch": 0.36489151873767256,
"grad_norm": 8724.282434676217,
"learning_rate": 4.833351051707553e-05,
"loss": 0.066,
"step": 740
},
{
"epoch": 0.3698224852071006,
"grad_norm": 11793.592158456218,
"learning_rate": 4.82576280743681e-05,
"loss": 0.0601,
"step": 750
},
{
"epoch": 0.3747534516765286,
"grad_norm": 8347.344487919496,
"learning_rate": 4.818011854206706e-05,
"loss": 0.0591,
"step": 760
},
{
"epoch": 0.3796844181459566,
"grad_norm": 10782.888295813882,
"learning_rate": 4.8100987342692424e-05,
"loss": 0.0658,
"step": 770
},
{
"epoch": 0.38461538461538464,
"grad_norm": 10922.65315754373,
"learning_rate": 4.802024001221504e-05,
"loss": 0.0644,
"step": 780
},
{
"epoch": 0.3895463510848126,
"grad_norm": 10552.341920161609,
"learning_rate": 4.793788219966931e-05,
"loss": 0.0755,
"step": 790
},
{
"epoch": 0.39447731755424065,
"grad_norm": 7972.205215622588,
"learning_rate": 4.785391966675799e-05,
"loss": 0.0689,
"step": 800
},
{
"epoch": 0.3994082840236686,
"grad_norm": 10807.511832054592,
"learning_rate": 4.776835828744909e-05,
"loss": 0.0636,
"step": 810
},
{
"epoch": 0.40433925049309666,
"grad_norm": 9430.68353832319,
"learning_rate": 4.7681204047564963e-05,
"loss": 0.0557,
"step": 820
},
{
"epoch": 0.40927021696252464,
"grad_norm": 10209.031687677338,
"learning_rate": 4.7592463044363534e-05,
"loss": 0.057,
"step": 830
},
{
"epoch": 0.41420118343195267,
"grad_norm": 8397.82590912672,
"learning_rate": 4.750214148611169e-05,
"loss": 0.0647,
"step": 840
},
{
"epoch": 0.41913214990138065,
"grad_norm": 9899.747875577437,
"learning_rate": 4.7410245691651054e-05,
"loss": 0.0699,
"step": 850
},
{
"epoch": 0.4240631163708087,
"grad_norm": 8939.42772217551,
"learning_rate": 4.731678208995579e-05,
"loss": 0.0665,
"step": 860
},
{
"epoch": 0.4289940828402367,
"grad_norm": 7923.368728009571,
"learning_rate": 4.722175721968297e-05,
"loss": 0.0643,
"step": 870
},
{
"epoch": 0.4339250493096647,
"grad_norm": 14368.711006906638,
"learning_rate": 4.7125177728715026e-05,
"loss": 0.0767,
"step": 880
},
{
"epoch": 0.4388560157790927,
"grad_norm": 10018.222197575775,
"learning_rate": 4.702705037369475e-05,
"loss": 0.0689,
"step": 890
},
{
"epoch": 0.4437869822485207,
"grad_norm": 8922.631450418648,
"learning_rate": 4.692738201955255e-05,
"loss": 0.0701,
"step": 900
},
{
"epoch": 0.44871794871794873,
"grad_norm": 11155.967013217634,
"learning_rate": 4.6826179639026184e-05,
"loss": 0.0664,
"step": 910
},
{
"epoch": 0.4536489151873767,
"grad_norm": 7031.218671041315,
"learning_rate": 4.6723450312173e-05,
"loss": 0.0627,
"step": 920
},
{
"epoch": 0.45857988165680474,
"grad_norm": 12984.67527510796,
"learning_rate": 4.6619201225874575e-05,
"loss": 0.0638,
"step": 930
},
{
"epoch": 0.4635108481262327,
"grad_norm": 7228.3278840960165,
"learning_rate": 4.651343967333393e-05,
"loss": 0.0545,
"step": 940
},
{
"epoch": 0.46844181459566075,
"grad_norm": 7148.969156458853,
"learning_rate": 4.6406173053565316e-05,
"loss": 0.0549,
"step": 950
},
{
"epoch": 0.47337278106508873,
"grad_norm": 7407.089036861917,
"learning_rate": 4.6297408870876576e-05,
"loss": 0.0565,
"step": 960
},
{
"epoch": 0.47830374753451677,
"grad_norm": 8808.120344318644,
"learning_rate": 4.618715473434414e-05,
"loss": 0.0601,
"step": 970
},
{
"epoch": 0.4832347140039448,
"grad_norm": 9377.625712300529,
"learning_rate": 4.607541835728074e-05,
"loss": 0.0724,
"step": 980
},
{
"epoch": 0.4881656804733728,
"grad_norm": 7881.055259291106,
"learning_rate": 4.596220755669571e-05,
"loss": 0.0567,
"step": 990
},
{
"epoch": 0.4930966469428008,
"grad_norm": 12093.790142052243,
"learning_rate": 4.584753025274821e-05,
"loss": 0.0618,
"step": 1000
},
{
"epoch": 0.4980276134122288,
"grad_norm": 13753.036610145411,
"learning_rate": 4.573139446819305e-05,
"loss": 0.057,
"step": 1010
},
{
"epoch": 0.5029585798816568,
"grad_norm": 7077.985024002241,
"learning_rate": 4.561380832781947e-05,
"loss": 0.0548,
"step": 1020
},
{
"epoch": 0.5078895463510849,
"grad_norm": 7930.713460969322,
"learning_rate": 4.549478005788276e-05,
"loss": 0.0677,
"step": 1030
},
{
"epoch": 0.5128205128205128,
"grad_norm": 8364.417493167113,
"learning_rate": 4.537431798552867e-05,
"loss": 0.0602,
"step": 1040
},
{
"epoch": 0.5177514792899408,
"grad_norm": 19419.409671769117,
"learning_rate": 4.5252430538210944e-05,
"loss": 0.0689,
"step": 1050
},
{
"epoch": 0.5226824457593688,
"grad_norm": 27615.07646196186,
"learning_rate": 4.512912624310166e-05,
"loss": 0.0586,
"step": 1060
},
{
"epoch": 0.5276134122287969,
"grad_norm": 18720.689730883314,
"learning_rate": 4.5004413726494746e-05,
"loss": 0.0661,
"step": 1070
},
{
"epoch": 0.5325443786982249,
"grad_norm": 17784.57848811717,
"learning_rate": 4.487830171320242e-05,
"loss": 0.0537,
"step": 1080
},
{
"epoch": 0.5374753451676528,
"grad_norm": 23348.821983132253,
"learning_rate": 4.4750799025944866e-05,
"loss": 0.0604,
"step": 1090
},
{
"epoch": 0.5424063116370809,
"grad_norm": 16098.754734450737,
"learning_rate": 4.462191458473297e-05,
"loss": 0.0566,
"step": 1100
},
{
"epoch": 0.5473372781065089,
"grad_norm": 22919.740312664977,
"learning_rate": 4.4491657406244295e-05,
"loss": 0.0552,
"step": 1110
},
{
"epoch": 0.5522682445759369,
"grad_norm": 15431.450482699285,
"learning_rate": 4.436003660319225e-05,
"loss": 0.0565,
"step": 1120
},
{
"epoch": 0.5571992110453649,
"grad_norm": 13429.063407401129,
"learning_rate": 4.4227061383688614e-05,
"loss": 0.0572,
"step": 1130
},
{
"epoch": 0.5621301775147929,
"grad_norm": 18778.278515348524,
"learning_rate": 4.4092741050599334e-05,
"loss": 0.0576,
"step": 1140
},
{
"epoch": 0.5670611439842209,
"grad_norm": 12919.829101036901,
"learning_rate": 4.395708500089366e-05,
"loss": 0.0584,
"step": 1150
},
{
"epoch": 0.571992110453649,
"grad_norm": 13210.966353753232,
"learning_rate": 4.382010272498679e-05,
"loss": 0.0501,
"step": 1160
},
{
"epoch": 0.5769230769230769,
"grad_norm": 19208.029154496824,
"learning_rate": 4.3681803806075896e-05,
"loss": 0.0621,
"step": 1170
},
{
"epoch": 0.5818540433925049,
"grad_norm": 20638.19061836575,
"learning_rate": 4.3542197919469704e-05,
"loss": 0.0635,
"step": 1180
},
{
"epoch": 0.5867850098619329,
"grad_norm": 28701.657652477148,
"learning_rate": 4.340129483191159e-05,
"loss": 0.0599,
"step": 1190
},
{
"epoch": 0.591715976331361,
"grad_norm": 18200.344612122048,
"learning_rate": 4.3259104400896355e-05,
"loss": 0.0647,
"step": 1200
},
{
"epoch": 0.596646942800789,
"grad_norm": 27827.53629051627,
"learning_rate": 4.3115636573980556e-05,
"loss": 0.057,
"step": 1210
},
{
"epoch": 0.6015779092702169,
"grad_norm": 20089.312581569335,
"learning_rate": 4.2970901388086585e-05,
"loss": 0.063,
"step": 1220
},
{
"epoch": 0.606508875739645,
"grad_norm": 17405.30034213716,
"learning_rate": 4.282490896880051e-05,
"loss": 0.0546,
"step": 1230
},
{
"epoch": 0.611439842209073,
"grad_norm": 14523.301553021613,
"learning_rate": 4.267766952966369e-05,
"loss": 0.0649,
"step": 1240
},
{
"epoch": 0.616370808678501,
"grad_norm": 18351.71141882958,
"learning_rate": 4.252919337145823e-05,
"loss": 0.0646,
"step": 1250
},
{
"epoch": 0.621301775147929,
"grad_norm": 18125.006813791824,
"learning_rate": 4.2379490881486374e-05,
"loss": 0.0596,
"step": 1260
},
{
"epoch": 0.626232741617357,
"grad_norm": 10411.466371265866,
"learning_rate": 4.2228572532843756e-05,
"loss": 0.0568,
"step": 1270
},
{
"epoch": 0.631163708086785,
"grad_norm": 16474.133907431977,
"learning_rate": 4.207644888368678e-05,
"loss": 0.0548,
"step": 1280
},
{
"epoch": 0.636094674556213,
"grad_norm": 18319.25325989026,
"learning_rate": 4.192313057649394e-05,
"loss": 0.0504,
"step": 1290
},
{
"epoch": 0.6410256410256411,
"grad_norm": 18510.18919406282,
"learning_rate": 4.176862833732127e-05,
"loss": 0.057,
"step": 1300
},
{
"epoch": 0.645956607495069,
"grad_norm": 27594.67426877875,
"learning_rate": 4.161295297505198e-05,
"loss": 0.061,
"step": 1310
},
{
"epoch": 0.650887573964497,
"grad_norm": 16779.24432148242,
"learning_rate": 4.145611538064026e-05,
"loss": 0.0525,
"step": 1320
},
{
"epoch": 0.6558185404339251,
"grad_norm": 24001.978585108354,
"learning_rate": 4.1298126526349365e-05,
"loss": 0.0542,
"step": 1330
},
{
"epoch": 0.6607495069033531,
"grad_norm": 17789.623942062408,
"learning_rate": 4.113899746498397e-05,
"loss": 0.0564,
"step": 1340
},
{
"epoch": 0.665680473372781,
"grad_norm": 21222.44019899691,
"learning_rate": 4.097873932911699e-05,
"loss": 0.0566,
"step": 1350
},
{
"epoch": 0.6706114398422091,
"grad_norm": 23036.33790340817,
"learning_rate": 4.081736333031067e-05,
"loss": 0.0652,
"step": 1360
},
{
"epoch": 0.6755424063116371,
"grad_norm": 18358.58033727009,
"learning_rate": 4.065488075833228e-05,
"loss": 0.0541,
"step": 1370
},
{
"epoch": 0.6804733727810651,
"grad_norm": 23994.764762339306,
"learning_rate": 4.049130298036429e-05,
"loss": 0.0547,
"step": 1380
},
{
"epoch": 0.685404339250493,
"grad_norm": 30128.253849169552,
"learning_rate": 4.0326641440209114e-05,
"loss": 0.0618,
"step": 1390
},
{
"epoch": 0.6903353057199211,
"grad_norm": 14950.507683687534,
"learning_rate": 4.016090765748851e-05,
"loss": 0.0497,
"step": 1400
},
{
"epoch": 0.6952662721893491,
"grad_norm": 19541.710876993344,
"learning_rate": 3.999411322683768e-05,
"loss": 0.0578,
"step": 1410
},
{
"epoch": 0.7001972386587771,
"grad_norm": 17140.760309857902,
"learning_rate": 3.982626981709412e-05,
"loss": 0.051,
"step": 1420
},
{
"epoch": 0.7051282051282052,
"grad_norm": 13185.760198031814,
"learning_rate": 3.965738917048125e-05,
"loss": 0.0481,
"step": 1430
},
{
"epoch": 0.7100591715976331,
"grad_norm": 16177.203219345425,
"learning_rate": 3.948748310178697e-05,
"loss": 0.0531,
"step": 1440
},
{
"epoch": 0.7149901380670611,
"grad_norm": 19925.021856951626,
"learning_rate": 3.93165634975371e-05,
"loss": 0.0676,
"step": 1450
},
{
"epoch": 0.7199211045364892,
"grad_norm": 14203.66487917819,
"learning_rate": 3.914464231516374e-05,
"loss": 0.0541,
"step": 1460
},
{
"epoch": 0.7248520710059172,
"grad_norm": 14143.869343287924,
"learning_rate": 3.897173158216887e-05,
"loss": 0.0479,
"step": 1470
},
{
"epoch": 0.7297830374753451,
"grad_norm": 16587.271264436473,
"learning_rate": 3.879784339528277e-05,
"loss": 0.053,
"step": 1480
},
{
"epoch": 0.7347140039447732,
"grad_norm": 12806.198499164379,
"learning_rate": 3.862298991961784e-05,
"loss": 0.0543,
"step": 1490
},
{
"epoch": 0.7396449704142012,
"grad_norm": 26141.707365816794,
"learning_rate": 3.844718338781748e-05,
"loss": 0.042,
"step": 1500
},
{
"epoch": 0.7445759368836292,
"grad_norm": 15241.946332407813,
"learning_rate": 3.82704360992003e-05,
"loss": 0.0541,
"step": 1510
},
{
"epoch": 0.7495069033530573,
"grad_norm": 19593.61610321076,
"learning_rate": 3.809276041889975e-05,
"loss": 0.0489,
"step": 1520
},
{
"epoch": 0.7544378698224852,
"grad_norm": 14828.533036008654,
"learning_rate": 3.791416877699894e-05,
"loss": 0.0537,
"step": 1530
},
{
"epoch": 0.7593688362919132,
"grad_norm": 19848.77547860321,
"learning_rate": 3.773467366766113e-05,
"loss": 0.047,
"step": 1540
},
{
"epoch": 0.7642998027613412,
"grad_norm": 17099.498004327495,
"learning_rate": 3.755428764825561e-05,
"loss": 0.049,
"step": 1550
},
{
"epoch": 0.7692307692307693,
"grad_norm": 16774.193512655085,
"learning_rate": 3.7373023338479196e-05,
"loss": 0.0544,
"step": 1560
},
{
"epoch": 0.7741617357001972,
"grad_norm": 17768.81042726271,
"learning_rate": 3.719089341947337e-05,
"loss": 0.0457,
"step": 1570
},
{
"epoch": 0.7790927021696252,
"grad_norm": 30253.611751326483,
"learning_rate": 3.7007910632937105e-05,
"loss": 0.048,
"step": 1580
},
{
"epoch": 0.7840236686390533,
"grad_norm": 24805.826412357237,
"learning_rate": 3.6824087780235485e-05,
"loss": 0.0579,
"step": 1590
},
{
"epoch": 0.7889546351084813,
"grad_norm": 16483.00263908248,
"learning_rate": 3.66394377215041e-05,
"loss": 0.0498,
"step": 1600
},
{
"epoch": 0.7938856015779092,
"grad_norm": 35264.37023399113,
"learning_rate": 3.6453973374749405e-05,
"loss": 0.0533,
"step": 1610
},
{
"epoch": 0.7988165680473372,
"grad_norm": 14936.754132006057,
"learning_rate": 3.626770771494489e-05,
"loss": 0.0566,
"step": 1620
},
{
"epoch": 0.8037475345167653,
"grad_norm": 12871.965817232425,
"learning_rate": 3.608065377312348e-05,
"loss": 0.046,
"step": 1630
},
{
"epoch": 0.8086785009861933,
"grad_norm": 23510.821338268895,
"learning_rate": 3.589282463546582e-05,
"loss": 0.057,
"step": 1640
},
{
"epoch": 0.8136094674556213,
"grad_norm": 14633.87467487678,
"learning_rate": 3.5704233442384756e-05,
"loss": 0.0506,
"step": 1650
},
{
"epoch": 0.8185404339250493,
"grad_norm": 14611.25484001973,
"learning_rate": 3.5514893387606114e-05,
"loss": 0.0528,
"step": 1660
},
{
"epoch": 0.8234714003944773,
"grad_norm": 17865.870031991166,
"learning_rate": 3.53248177172456e-05,
"loss": 0.0472,
"step": 1670
},
{
"epoch": 0.8284023668639053,
"grad_norm": 19515.118242019442,
"learning_rate": 3.5134019728882156e-05,
"loss": 0.0474,
"step": 1680
},
{
"epoch": 0.8333333333333334,
"grad_norm": 11861.742199188111,
"learning_rate": 3.4942512770627654e-05,
"loss": 0.0531,
"step": 1690
},
{
"epoch": 0.8382642998027613,
"grad_norm": 10874.551209130426,
"learning_rate": 3.475031024019307e-05,
"loss": 0.0512,
"step": 1700
},
{
"epoch": 0.8431952662721893,
"grad_norm": 14928.882074689987,
"learning_rate": 3.455742558395118e-05,
"loss": 0.0502,
"step": 1710
},
{
"epoch": 0.8481262327416174,
"grad_norm": 15317.730380183613,
"learning_rate": 3.436387229599587e-05,
"loss": 0.0509,
"step": 1720
},
{
"epoch": 0.8530571992110454,
"grad_norm": 15591.351961905035,
"learning_rate": 3.416966391719811e-05,
"loss": 0.0514,
"step": 1730
},
{
"epoch": 0.8579881656804734,
"grad_norm": 18949.96612134175,
"learning_rate": 3.397481403425863e-05,
"loss": 0.0535,
"step": 1740
},
{
"epoch": 0.8629191321499013,
"grad_norm": 12076.941997045444,
"learning_rate": 3.3779336278757386e-05,
"loss": 0.0498,
"step": 1750
},
{
"epoch": 0.8678500986193294,
"grad_norm": 24920.90399644443,
"learning_rate": 3.3583244326199924e-05,
"loss": 0.0431,
"step": 1760
},
{
"epoch": 0.8727810650887574,
"grad_norm": 15472.821074387179,
"learning_rate": 3.338655189506063e-05,
"loss": 0.0488,
"step": 1770
},
{
"epoch": 0.8777120315581854,
"grad_norm": 14609.320312731868,
"learning_rate": 3.318927274582307e-05,
"loss": 0.0466,
"step": 1780
},
{
"epoch": 0.8826429980276134,
"grad_norm": 17558.566684100384,
"learning_rate": 3.299142068001716e-05,
"loss": 0.0496,
"step": 1790
},
{
"epoch": 0.8875739644970414,
"grad_norm": 15509.33654286991,
"learning_rate": 3.279300953925378e-05,
"loss": 0.0406,
"step": 1800
},
{
"epoch": 0.8925049309664694,
"grad_norm": 12294.958316318116,
"learning_rate": 3.259405320425634e-05,
"loss": 0.0511,
"step": 1810
},
{
"epoch": 0.8974358974358975,
"grad_norm": 15796.216762250384,
"learning_rate": 3.2394565593889705e-05,
"loss": 0.0519,
"step": 1820
},
{
"epoch": 0.9023668639053254,
"grad_norm": 14265.63535213206,
"learning_rate": 3.2194560664186434e-05,
"loss": 0.0435,
"step": 1830
},
{
"epoch": 0.9072978303747534,
"grad_norm": 18098.215602649892,
"learning_rate": 3.199405240737045e-05,
"loss": 0.0473,
"step": 1840
},
{
"epoch": 0.9122287968441815,
"grad_norm": 25492.097599059987,
"learning_rate": 3.179305485087813e-05,
"loss": 0.0471,
"step": 1850
},
{
"epoch": 0.9171597633136095,
"grad_norm": 19625.7488010012,
"learning_rate": 3.159158205637696e-05,
"loss": 0.0521,
"step": 1860
},
{
"epoch": 0.9220907297830375,
"grad_norm": 16158.12761429987,
"learning_rate": 3.1389648118781795e-05,
"loss": 0.043,
"step": 1870
},
{
"epoch": 0.9270216962524654,
"grad_norm": 16575.068506645755,
"learning_rate": 3.1187267165268774e-05,
"loss": 0.0491,
"step": 1880
},
{
"epoch": 0.9319526627218935,
"grad_norm": 16422.20642910081,
"learning_rate": 3.0984453354287e-05,
"loss": 0.0516,
"step": 1890
},
{
"epoch": 0.9368836291913215,
"grad_norm": 14754.34851154059,
"learning_rate": 3.078122087456802e-05,
"loss": 0.0497,
"step": 1900
},
{
"epoch": 0.9418145956607495,
"grad_norm": 15679.373456870017,
"learning_rate": 3.057758394413321e-05,
"loss": 0.0488,
"step": 1910
},
{
"epoch": 0.9467455621301775,
"grad_norm": 15054.075860045345,
"learning_rate": 3.037355680929906e-05,
"loss": 0.0468,
"step": 1920
},
{
"epoch": 0.9516765285996055,
"grad_norm": 22642.915006685867,
"learning_rate": 3.016915374368052e-05,
"loss": 0.0458,
"step": 1930
},
{
"epoch": 0.9566074950690335,
"grad_norm": 15260.94151748181,
"learning_rate": 2.9964389047192428e-05,
"loss": 0.0401,
"step": 1940
},
{
"epoch": 0.9615384615384616,
"grad_norm": 21582.143730408246,
"learning_rate": 2.9759277045049117e-05,
"loss": 0.0467,
"step": 1950
},
{
"epoch": 0.9664694280078896,
"grad_norm": 21636.31317946752,
"learning_rate": 2.95538320867622e-05,
"loss": 0.0478,
"step": 1960
},
{
"epoch": 0.9714003944773175,
"grad_norm": 19466.86292138515,
"learning_rate": 2.9348068545136703e-05,
"loss": 0.0392,
"step": 1970
},
{
"epoch": 0.9763313609467456,
"grad_norm": 20193.997524016882,
"learning_rate": 2.9142000815265552e-05,
"loss": 0.0492,
"step": 1980
},
{
"epoch": 0.9812623274161736,
"grad_norm": 16234.605015213643,
"learning_rate": 2.89356433135225e-05,
"loss": 0.0478,
"step": 1990
},
{
"epoch": 0.9861932938856016,
"grad_norm": 19339.389028612048,
"learning_rate": 2.8729010476553546e-05,
"loss": 0.0473,
"step": 2000
},
{
"epoch": 0.9911242603550295,
"grad_norm": 32683.024584637205,
"learning_rate": 2.852211676026701e-05,
"loss": 0.0446,
"step": 2010
},
{
"epoch": 0.9960552268244576,
"grad_norm": 16478.83393932957,
"learning_rate": 2.8314976638822142e-05,
"loss": 0.0434,
"step": 2020
},
{
"epoch": 1.0009861932938855,
"grad_norm": 12970.191054876563,
"learning_rate": 2.8107604603616555e-05,
"loss": 0.0406,
"step": 2030
},
{
"epoch": 1.0059171597633136,
"grad_norm": 16996.303598135684,
"learning_rate": 2.790001516227241e-05,
"loss": 0.0372,
"step": 2040
},
{
"epoch": 1.0108481262327416,
"grad_norm": 33541.365982917276,
"learning_rate": 2.769222283762148e-05,
"loss": 0.0384,
"step": 2050
},
{
"epoch": 1.0157790927021697,
"grad_norm": 37506.9998800224,
"learning_rate": 2.7484242166689123e-05,
"loss": 0.0441,
"step": 2060
},
{
"epoch": 1.0207100591715976,
"grad_norm": 37564.614838967806,
"learning_rate": 2.7276087699677277e-05,
"loss": 0.0351,
"step": 2070
},
{
"epoch": 1.0256410256410255,
"grad_norm": 51239.6546436449,
"learning_rate": 2.706777399894656e-05,
"loss": 0.0381,
"step": 2080
},
{
"epoch": 1.0305719921104537,
"grad_norm": 25657.361984428564,
"learning_rate": 2.68593156379975e-05,
"loss": 0.0404,
"step": 2090
},
{
"epoch": 1.0355029585798816,
"grad_norm": 40016.91642293294,
"learning_rate": 2.6650727200450936e-05,
"loss": 0.0352,
"step": 2100
},
{
"epoch": 1.0404339250493098,
"grad_norm": 39623.3902638328,
"learning_rate": 2.6442023279027806e-05,
"loss": 0.0397,
"step": 2110
},
{
"epoch": 1.0453648915187377,
"grad_norm": 42575.39270517654,
"learning_rate": 2.623321847452823e-05,
"loss": 0.0358,
"step": 2120
},
{
"epoch": 1.0502958579881656,
"grad_norm": 29283.62491222697,
"learning_rate": 2.6024327394810044e-05,
"loss": 0.0372,
"step": 2130
},
{
"epoch": 1.0552268244575937,
"grad_norm": 36733.54347187322,
"learning_rate": 2.5815364653766837e-05,
"loss": 0.0359,
"step": 2140
},
{
"epoch": 1.0601577909270217,
"grad_norm": 45356.02240055889,
"learning_rate": 2.5606344870305597e-05,
"loss": 0.0343,
"step": 2150
},
{
"epoch": 1.0650887573964498,
"grad_norm": 53383.08586059821,
"learning_rate": 2.5397282667323956e-05,
"loss": 0.0374,
"step": 2160
},
{
"epoch": 1.0700197238658777,
"grad_norm": 44916.75500300528,
"learning_rate": 2.518819267068718e-05,
"loss": 0.0368,
"step": 2170
},
{
"epoch": 1.0749506903353057,
"grad_norm": 45634.131787511855,
"learning_rate": 2.4979089508205002e-05,
"loss": 0.033,
"step": 2180
},
{
"epoch": 1.0798816568047338,
"grad_norm": 33894.735874468766,
"learning_rate": 2.4769987808608175e-05,
"loss": 0.0438,
"step": 2190
},
{
"epoch": 1.0848126232741617,
"grad_norm": 29514.47861643502,
"learning_rate": 2.456090220052514e-05,
"loss": 0.0335,
"step": 2200
},
{
"epoch": 1.0897435897435896,
"grad_norm": 45448.43583667099,
"learning_rate": 2.4351847311458587e-05,
"loss": 0.0371,
"step": 2210
},
{
"epoch": 1.0946745562130178,
"grad_norm": 33717.85046529509,
"learning_rate": 2.4142837766762108e-05,
"loss": 0.0295,
"step": 2220
},
{
"epoch": 1.0996055226824457,
"grad_norm": 43915.722924711146,
"learning_rate": 2.393388818861705e-05,
"loss": 0.0385,
"step": 2230
},
{
"epoch": 1.1045364891518739,
"grad_norm": 42817.53958367996,
"learning_rate": 2.3725013195009522e-05,
"loss": 0.0383,
"step": 2240
},
{
"epoch": 1.1094674556213018,
"grad_norm": 55755.649758567066,
"learning_rate": 2.3516227398707754e-05,
"loss": 0.034,
"step": 2250
},
{
"epoch": 1.1143984220907297,
"grad_norm": 33225.09894642904,
"learning_rate": 2.330754540623975e-05,
"loss": 0.0323,
"step": 2260
},
{
"epoch": 1.1193293885601578,
"grad_norm": 53806.90810667344,
"learning_rate": 2.309898181687152e-05,
"loss": 0.0349,
"step": 2270
},
{
"epoch": 1.1242603550295858,
"grad_norm": 47541.57086172059,
"learning_rate": 2.289055122158562e-05,
"loss": 0.0394,
"step": 2280
},
{
"epoch": 1.1291913214990137,
"grad_norm": 38933.466118495024,
"learning_rate": 2.2682268202060442e-05,
"loss": 0.0331,
"step": 2290
},
{
"epoch": 1.1341222879684418,
"grad_norm": 32132.282583097018,
"learning_rate": 2.2474147329650052e-05,
"loss": 0.0331,
"step": 2300
},
{
"epoch": 1.1390532544378698,
"grad_norm": 35663.32166245876,
"learning_rate": 2.2266203164364818e-05,
"loss": 0.0329,
"step": 2310
},
{
"epoch": 1.143984220907298,
"grad_norm": 60547.0083819176,
"learning_rate": 2.205845025385278e-05,
"loss": 0.0417,
"step": 2320
},
{
"epoch": 1.1489151873767258,
"grad_norm": 30475.69208402001,
"learning_rate": 2.185090313238189e-05,
"loss": 0.0316,
"step": 2330
},
{
"epoch": 1.1538461538461537,
"grad_norm": 28629.068304784214,
"learning_rate": 2.1643576319823254e-05,
"loss": 0.0312,
"step": 2340
},
{
"epoch": 1.1587771203155819,
"grad_norm": 35177.58013280618,
"learning_rate": 2.1436484320635276e-05,
"loss": 0.033,
"step": 2350
},
{
"epoch": 1.1637080867850098,
"grad_norm": 38736.13052435671,
"learning_rate": 2.1229641622848964e-05,
"loss": 0.0355,
"step": 2360
},
{
"epoch": 1.168639053254438,
"grad_norm": 41191.78947314622,
"learning_rate": 2.1023062697054368e-05,
"loss": 0.0446,
"step": 2370
},
{
"epoch": 1.1735700197238659,
"grad_norm": 39650.51646574102,
"learning_rate": 2.0816761995388197e-05,
"loss": 0.0346,
"step": 2380
},
{
"epoch": 1.1785009861932938,
"grad_norm": 44066.93091196618,
"learning_rate": 2.0610753950522752e-05,
"loss": 0.0387,
"step": 2390
},
{
"epoch": 1.183431952662722,
"grad_norm": 35579.78032534771,
"learning_rate": 2.0405052974656253e-05,
"loss": 0.0376,
"step": 2400
},
{
"epoch": 1.1883629191321499,
"grad_norm": 48319.300657190804,
"learning_rate": 2.0199673458504574e-05,
"loss": 0.0346,
"step": 2410
},
{
"epoch": 1.193293885601578,
"grad_norm": 50381.03262141418,
"learning_rate": 1.9994629770294448e-05,
"loss": 0.0339,
"step": 2420
},
{
"epoch": 1.198224852071006,
"grad_norm": 34522.5130023873,
"learning_rate": 1.9789936254758288e-05,
"loss": 0.0301,
"step": 2430
},
{
"epoch": 1.2031558185404339,
"grad_norm": 51816.098193515114,
"learning_rate": 1.9585607232130633e-05,
"loss": 0.0402,
"step": 2440
},
{
"epoch": 1.208086785009862,
"grad_norm": 50369.39261098947,
"learning_rate": 1.9381656997146343e-05,
"loss": 0.0406,
"step": 2450
},
{
"epoch": 1.21301775147929,
"grad_norm": 55031.280704704666,
"learning_rate": 1.9178099818040482e-05,
"loss": 0.0326,
"step": 2460
},
{
"epoch": 1.217948717948718,
"grad_norm": 39373.461112785095,
"learning_rate": 1.8974949935550202e-05,
"loss": 0.0295,
"step": 2470
},
{
"epoch": 1.222879684418146,
"grad_norm": 51596.797419994975,
"learning_rate": 1.877222156191841e-05,
"loss": 0.0314,
"step": 2480
},
{
"epoch": 1.227810650887574,
"grad_norm": 46556.9511029234,
"learning_rate": 1.8569928879899506e-05,
"loss": 0.0369,
"step": 2490
},
{
"epoch": 1.232741617357002,
"grad_norm": 60444.887161777384,
"learning_rate": 1.836808604176719e-05,
"loss": 0.0367,
"step": 2500
},
{
"epoch": 1.23767258382643,
"grad_norm": 40896.774640550815,
"learning_rate": 1.8166707168324338e-05,
"loss": 0.0319,
"step": 2510
},
{
"epoch": 1.242603550295858,
"grad_norm": 53234.142427581195,
"learning_rate": 1.796580634791514e-05,
"loss": 0.0335,
"step": 2520
},
{
"epoch": 1.247534516765286,
"grad_norm": 28195.286982047193,
"learning_rate": 1.7765397635439468e-05,
"loss": 0.0278,
"step": 2530
},
{
"epoch": 1.252465483234714,
"grad_norm": 17427.74615376297,
"learning_rate": 1.7585462135142083e-05,
"loss": 0.0344,
"step": 2540
},
{
"epoch": 1.2573964497041419,
"grad_norm": 21108.922094697304,
"learning_rate": 1.738602702489652e-05,
"loss": 0.0349,
"step": 2550
},
{
"epoch": 1.26232741617357,
"grad_norm": 24377.500446108086,
"learning_rate": 1.718712458359528e-05,
"loss": 0.034,
"step": 2560
},
{
"epoch": 1.267258382642998,
"grad_norm": 20456.646059410617,
"learning_rate": 1.6988768726332854e-05,
"loss": 0.0311,
"step": 2570
},
{
"epoch": 1.272189349112426,
"grad_norm": 15780.460829773001,
"learning_rate": 1.6790973329965044e-05,
"loss": 0.0275,
"step": 2580
},
{
"epoch": 1.277120315581854,
"grad_norm": 25885.175371242898,
"learning_rate": 1.659375223213816e-05,
"loss": 0.0349,
"step": 2590
},
{
"epoch": 1.282051282051282,
"grad_norm": 17902.541495553083,
"learning_rate": 1.6397119230320918e-05,
"loss": 0.0309,
"step": 2600
},
{
"epoch": 1.28698224852071,
"grad_norm": 18151.209326102766,
"learning_rate": 1.62010880808392e-05,
"loss": 0.0341,
"step": 2610
},
{
"epoch": 1.291913214990138,
"grad_norm": 10715.816721090372,
"learning_rate": 1.602518596574137e-05,
"loss": 0.0322,
"step": 2620
},
{
"epoch": 1.2968441814595661,
"grad_norm": 11396.753923815324,
"learning_rate": 1.583033608280189e-05,
"loss": 0.0297,
"step": 2630
},
{
"epoch": 1.301775147928994,
"grad_norm": 10740.794756441443,
"learning_rate": 1.5636127704004134e-05,
"loss": 0.0348,
"step": 2640
},
{
"epoch": 1.306706114398422,
"grad_norm": 12196.05509990833,
"learning_rate": 1.5442574416048827e-05,
"loss": 0.0341,
"step": 2650
},
{
"epoch": 1.3116370808678501,
"grad_norm": 13820.070043237842,
"learning_rate": 1.5249689759806935e-05,
"loss": 0.0344,
"step": 2660
},
{
"epoch": 1.316568047337278,
"grad_norm": 9524.579570773714,
"learning_rate": 1.5057487229372347e-05,
"loss": 0.0314,
"step": 2670
},
{
"epoch": 1.3214990138067062,
"grad_norm": 14665.234536140226,
"learning_rate": 1.4865980271117852e-05,
"loss": 0.0308,
"step": 2680
},
{
"epoch": 1.3264299802761341,
"grad_norm": 10442.700417037731,
"learning_rate": 1.4675182282754408e-05,
"loss": 0.0357,
"step": 2690
},
{
"epoch": 1.331360946745562,
"grad_norm": 9396.478914997893,
"learning_rate": 1.4485106612393899e-05,
"loss": 0.0303,
"step": 2700
},
{
"epoch": 1.3362919132149902,
"grad_norm": 17759.793692495416,
"learning_rate": 1.4295766557615248e-05,
"loss": 0.0352,
"step": 2710
},
{
"epoch": 1.3412228796844181,
"grad_norm": 11063.842732070987,
"learning_rate": 1.4107175364534184e-05,
"loss": 0.0304,
"step": 2720
},
{
"epoch": 1.3461538461538463,
"grad_norm": 9136.923333376504,
"learning_rate": 1.391934622687652e-05,
"loss": 0.0325,
"step": 2730
},
{
"epoch": 1.3510848126232742,
"grad_norm": 10481.712073893272,
"learning_rate": 1.3732292285055107e-05,
"loss": 0.0361,
"step": 2740
},
{
"epoch": 1.356015779092702,
"grad_norm": 10502.118643397627,
"learning_rate": 1.3546026625250612e-05,
"loss": 0.0355,
"step": 2750
},
{
"epoch": 1.3609467455621302,
"grad_norm": 12646.443294460305,
"learning_rate": 1.3360562278495896e-05,
"loss": 0.0333,
"step": 2760
},
{
"epoch": 1.3658777120315582,
"grad_norm": 11539.639509100793,
"learning_rate": 1.3175912219764514e-05,
"loss": 0.0303,
"step": 2770
},
{
"epoch": 1.3708086785009863,
"grad_norm": 14185.68179538791,
"learning_rate": 1.29920893670629e-05,
"loss": 0.0301,
"step": 2780
},
{
"epoch": 1.3757396449704142,
"grad_norm": 17136.45564286851,
"learning_rate": 1.2809106580526636e-05,
"loss": 0.0361,
"step": 2790
},
{
"epoch": 1.3806706114398422,
"grad_norm": 9303.49181759193,
"learning_rate": 1.2626976661520811e-05,
"loss": 0.0329,
"step": 2800
},
{
"epoch": 1.3856015779092703,
"grad_norm": 13639.397054122297,
"learning_rate": 1.2445712351744402e-05,
"loss": 0.0299,
"step": 2810
},
{
"epoch": 1.3905325443786982,
"grad_norm": 11963.693743990607,
"learning_rate": 1.2265326332338875e-05,
"loss": 0.0365,
"step": 2820
},
{
"epoch": 1.3954635108481264,
"grad_norm": 11382.881181844956,
"learning_rate": 1.2085831223001071e-05,
"loss": 0.0349,
"step": 2830
},
{
"epoch": 1.4003944773175543,
"grad_norm": 10434.131684045395,
"learning_rate": 1.1907239581100255e-05,
"loss": 0.0332,
"step": 2840
},
{
"epoch": 1.4053254437869822,
"grad_norm": 13172.61401544887,
"learning_rate": 1.1729563900799694e-05,
"loss": 0.0302,
"step": 2850
},
{
"epoch": 1.4102564102564101,
"grad_norm": 8347.449430814182,
"learning_rate": 1.155281661218253e-05,
"loss": 0.0288,
"step": 2860
},
{
"epoch": 1.4151873767258383,
"grad_norm": 10608.303163088807,
"learning_rate": 1.1377010080382155e-05,
"loss": 0.0332,
"step": 2870
},
{
"epoch": 1.4201183431952662,
"grad_norm": 13408.207635623787,
"learning_rate": 1.1202156604717235e-05,
"loss": 0.0342,
"step": 2880
},
{
"epoch": 1.4250493096646943,
"grad_norm": 10305.117951775224,
"learning_rate": 1.1028268417831134e-05,
"loss": 0.0311,
"step": 2890
},
{
"epoch": 1.4299802761341223,
"grad_norm": 11212.751312679686,
"learning_rate": 1.0855357684836257e-05,
"loss": 0.0277,
"step": 2900
},
{
"epoch": 1.4349112426035502,
"grad_norm": 10448.69408108018,
"learning_rate": 1.0683436502462916e-05,
"loss": 0.0311,
"step": 2910
},
{
"epoch": 1.4398422090729783,
"grad_norm": 12742.063255218913,
"learning_rate": 1.051251689821303e-05,
"loss": 0.0286,
"step": 2920
},
{
"epoch": 1.4447731755424063,
"grad_norm": 8262.845030617482,
"learning_rate": 1.0342610829518756e-05,
"loss": 0.0272,
"step": 2930
},
{
"epoch": 1.4497041420118344,
"grad_norm": 6957.433434823505,
"learning_rate": 1.017373018290588e-05,
"loss": 0.0326,
"step": 2940
},
{
"epoch": 1.4546351084812623,
"grad_norm": 11696.316685179143,
"learning_rate": 1.000588677316232e-05,
"loss": 0.029,
"step": 2950
},
{
"epoch": 1.4595660749506902,
"grad_norm": 9799.848978428188,
"learning_rate": 9.839092342511496e-06,
"loss": 0.0332,
"step": 2960
},
{
"epoch": 1.4644970414201184,
"grad_norm": 14969.193431845284,
"learning_rate": 9.67335855979089e-06,
"loss": 0.0299,
"step": 2970
},
{
"epoch": 1.4694280078895463,
"grad_norm": 10413.710577887212,
"learning_rate": 9.508697019635717e-06,
"loss": 0.03,
"step": 2980
},
{
"epoch": 1.4743589743589745,
"grad_norm": 9404.458091777537,
"learning_rate": 9.345119241667727e-06,
"loss": 0.0289,
"step": 2990
},
{
"epoch": 1.4792899408284024,
"grad_norm": 14476.315553344366,
"learning_rate": 9.182636669689335e-06,
"loss": 0.0312,
"step": 3000
},
{
"epoch": 1.4842209072978303,
"grad_norm": 9446.193307359319,
"learning_rate": 9.021260670883013e-06,
"loss": 0.0282,
"step": 3010
},
{
"epoch": 1.4891518737672584,
"grad_norm": 7762.654700551867,
"learning_rate": 8.861002535016033e-06,
"loss": 0.0273,
"step": 3020
},
{
"epoch": 1.4940828402366864,
"grad_norm": 9771.784279239897,
"learning_rate": 8.701873473650643e-06,
"loss": 0.0298,
"step": 3030
},
{
"epoch": 1.4990138067061145,
"grad_norm": 12271.321363243651,
"learning_rate": 8.543884619359748e-06,
"loss": 0.0266,
"step": 3040
},
{
"epoch": 1.5039447731755424,
"grad_norm": 8355.98803254289,
"learning_rate": 8.387047024948025e-06,
"loss": 0.0335,
"step": 3050
},
{
"epoch": 1.5088757396449703,
"grad_norm": 8749.858513141799,
"learning_rate": 8.231371662678741e-06,
"loss": 0.0279,
"step": 3060
},
{
"epoch": 1.5138067061143983,
"grad_norm": 12467.332673832041,
"learning_rate": 8.076869423506068e-06,
"loss": 0.0296,
"step": 3070
},
{
"epoch": 1.5187376725838264,
"grad_norm": 12654.59600303384,
"learning_rate": 7.923551116313227e-06,
"loss": 0.0251,
"step": 3080
},
{
"epoch": 1.5236686390532546,
"grad_norm": 15393.669867838533,
"learning_rate": 7.771427467156255e-06,
"loss": 0.0313,
"step": 3090
},
{
"epoch": 1.5285996055226825,
"grad_norm": 12831.222545026643,
"learning_rate": 7.620509118513636e-06,
"loss": 0.0297,
"step": 3100
},
{
"epoch": 1.5335305719921104,
"grad_norm": 14548.99089284202,
"learning_rate": 7.470806628541774e-06,
"loss": 0.0311,
"step": 3110
},
{
"epoch": 1.5384615384615383,
"grad_norm": 10693.692720477806,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.0305,
"step": 3120
},
{
"epoch": 1.5433925049309665,
"grad_norm": 7706.081234972806,
"learning_rate": 7.175091031199491e-06,
"loss": 0.0304,
"step": 3130
},
{
"epoch": 1.5483234714003946,
"grad_norm": 11757.682084492675,
"learning_rate": 7.02909861191342e-06,
"loss": 0.0291,
"step": 3140
},
{
"epoch": 1.5532544378698225,
"grad_norm": 8024.787598435239,
"learning_rate": 6.884363426019444e-06,
"loss": 0.0257,
"step": 3150
},
{
"epoch": 1.5581854043392505,
"grad_norm": 10546.368095225958,
"learning_rate": 6.740895599103645e-06,
"loss": 0.0326,
"step": 3160
},
{
"epoch": 1.5631163708086784,
"grad_norm": 8094.14306767554,
"learning_rate": 6.598705168088415e-06,
"loss": 0.0298,
"step": 3170
},
{
"epoch": 1.5680473372781065,
"grad_norm": 12800.953714469872,
"learning_rate": 6.457802080530304e-06,
"loss": 0.0277,
"step": 3180
},
{
"epoch": 1.5729783037475347,
"grad_norm": 11911.56480064647,
"learning_rate": 6.318196193924109e-06,
"loss": 0.0263,
"step": 3190
},
{
"epoch": 1.5779092702169626,
"grad_norm": 8510.830746760272,
"learning_rate": 6.179897275013211e-06,
"loss": 0.0279,
"step": 3200
}
],
"logging_steps": 10,
"max_steps": 4056,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4351113300371046e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}