{ "best_global_step": 4611, "best_metric": 82.1472249933553, "best_model_checkpoint": "./finetuning-checkpoints/checkpoint-4611", "epoch": 5.0, "eval_steps": 500, "global_step": 7685, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006506180871828237, "grad_norm": 130.89578247070312, "learning_rate": 5.844155844155844e-06, "loss": 10.8594, "step": 10 }, { "epoch": 0.013012361743656473, "grad_norm": 24.240734100341797, "learning_rate": 1.2337662337662339e-05, "loss": 6.5781, "step": 20 }, { "epoch": 0.01951854261548471, "grad_norm": 21.042543411254883, "learning_rate": 1.8831168831168833e-05, "loss": 4.2508, "step": 30 }, { "epoch": 0.026024723487312947, "grad_norm": 16.95259666442871, "learning_rate": 2.5324675324675325e-05, "loss": 3.393, "step": 40 }, { "epoch": 0.03253090435914118, "grad_norm": 24.48785972595215, "learning_rate": 3.181818181818182e-05, "loss": 2.9828, "step": 50 }, { "epoch": 0.03903708523096942, "grad_norm": 15.361346244812012, "learning_rate": 3.831168831168831e-05, "loss": 2.7164, "step": 60 }, { "epoch": 0.04554326610279766, "grad_norm": 51.514888763427734, "learning_rate": 4.4805194805194805e-05, "loss": 2.6777, "step": 70 }, { "epoch": 0.05204944697462589, "grad_norm": 28.284515380859375, "learning_rate": 4.99999914743261e-05, "loss": 2.5809, "step": 80 }, { "epoch": 0.05855562784645413, "grad_norm": 23.2271728515625, "learning_rate": 4.9999693076350204e-05, "loss": 2.5285, "step": 90 }, { "epoch": 0.06506180871828236, "grad_norm": 34.038822174072266, "learning_rate": 4.9998968400494294e-05, "loss": 2.4488, "step": 100 }, { "epoch": 0.07156798959011061, "grad_norm": 31.4317626953125, "learning_rate": 4.999781745911506e-05, "loss": 2.4762, "step": 110 }, { "epoch": 0.07807417046193885, "grad_norm": 27.888914108276367, "learning_rate": 4.999624027183758e-05, "loss": 2.3512, "step": 120 }, { "epoch": 0.08458035133376708, "grad_norm": 11.051114082336426, "learning_rate": 4.999423686555498e-05, "loss": 2.3824, "step": 130 }, { "epoch": 0.09108653220559532, "grad_norm": 5.426783084869385, "learning_rate": 4.999180727442799e-05, "loss": 2.2887, "step": 140 }, { "epoch": 0.09759271307742355, "grad_norm": 15.684212684631348, "learning_rate": 4.9988951539884365e-05, "loss": 2.2617, "step": 150 }, { "epoch": 0.10409889394925179, "grad_norm": 5.376956939697266, "learning_rate": 4.9985669710618156e-05, "loss": 2.2561, "step": 160 }, { "epoch": 0.11060507482108002, "grad_norm": 8.866439819335938, "learning_rate": 4.99819618425889e-05, "loss": 2.2877, "step": 170 }, { "epoch": 0.11711125569290826, "grad_norm": 18.468015670776367, "learning_rate": 4.997782799902065e-05, "loss": 2.2482, "step": 180 }, { "epoch": 0.1236174365647365, "grad_norm": 14.588068008422852, "learning_rate": 4.997326825040094e-05, "loss": 2.1953, "step": 190 }, { "epoch": 0.13012361743656473, "grad_norm": 4.425182819366455, "learning_rate": 4.9968282674479486e-05, "loss": 2.1434, "step": 200 }, { "epoch": 0.13662979830839297, "grad_norm": 11.427102088928223, "learning_rate": 4.9962871356266994e-05, "loss": 2.1264, "step": 210 }, { "epoch": 0.14313597918022122, "grad_norm": 11.059713363647461, "learning_rate": 4.995703438803359e-05, "loss": 2.1518, "step": 220 }, { "epoch": 0.14964216005204944, "grad_norm": 5.673813819885254, "learning_rate": 4.995077186930731e-05, "loss": 2.1496, "step": 230 }, { "epoch": 0.1561483409238777, "grad_norm": 12.602893829345703, "learning_rate": 4.9944083906872405e-05, "loss": 2.1037, "step": 240 }, { "epoch": 0.16265452179570591, "grad_norm": 11.657296180725098, "learning_rate": 4.9936970614767485e-05, "loss": 2.1143, "step": 250 }, { "epoch": 0.16916070266753416, "grad_norm": 8.69975471496582, "learning_rate": 4.9929432114283614e-05, "loss": 2.1437, "step": 260 }, { "epoch": 0.17566688353936238, "grad_norm": 29.609445571899414, "learning_rate": 4.992146853396219e-05, "loss": 2.083, "step": 270 }, { "epoch": 0.18217306441119063, "grad_norm": 8.23004150390625, "learning_rate": 4.9913080009592824e-05, "loss": 2.0965, "step": 280 }, { "epoch": 0.18867924528301888, "grad_norm": 7.247291564941406, "learning_rate": 4.9904266684210964e-05, "loss": 2.082, "step": 290 }, { "epoch": 0.1951854261548471, "grad_norm": 15.842830657958984, "learning_rate": 4.9895028708095474e-05, "loss": 2.0455, "step": 300 }, { "epoch": 0.20169160702667535, "grad_norm": 3.956104278564453, "learning_rate": 4.988536623876609e-05, "loss": 2.0539, "step": 310 }, { "epoch": 0.20819778789850357, "grad_norm": 10.965349197387695, "learning_rate": 4.98752794409807e-05, "loss": 2.0277, "step": 320 }, { "epoch": 0.21470396877033182, "grad_norm": 13.925650596618652, "learning_rate": 4.9864768486732585e-05, "loss": 2.0898, "step": 330 }, { "epoch": 0.22121014964216004, "grad_norm": 5.380825996398926, "learning_rate": 4.985383355524743e-05, "loss": 2.0656, "step": 340 }, { "epoch": 0.2277163305139883, "grad_norm": 12.201285362243652, "learning_rate": 4.984247483298029e-05, "loss": 2.0297, "step": 350 }, { "epoch": 0.2342225113858165, "grad_norm": 12.665562629699707, "learning_rate": 4.983069251361244e-05, "loss": 2.0973, "step": 360 }, { "epoch": 0.24072869225764476, "grad_norm": 6.231887340545654, "learning_rate": 4.981848679804803e-05, "loss": 2.0199, "step": 370 }, { "epoch": 0.247234873129473, "grad_norm": 9.651846885681152, "learning_rate": 4.980585789441066e-05, "loss": 2.034, "step": 380 }, { "epoch": 0.25374105400130126, "grad_norm": 10.33559799194336, "learning_rate": 4.9792806018039876e-05, "loss": 2.0164, "step": 390 }, { "epoch": 0.26024723487312945, "grad_norm": 10.100518226623535, "learning_rate": 4.977933139148746e-05, "loss": 1.9846, "step": 400 }, { "epoch": 0.2667534157449577, "grad_norm": 8.689348220825195, "learning_rate": 4.976543424451365e-05, "loss": 1.9846, "step": 410 }, { "epoch": 0.27325959661678595, "grad_norm": 13.723487854003906, "learning_rate": 4.9751114814083186e-05, "loss": 1.9838, "step": 420 }, { "epoch": 0.2797657774886142, "grad_norm": 3.7204697132110596, "learning_rate": 4.973637334436135e-05, "loss": 1.9898, "step": 430 }, { "epoch": 0.28627195836044245, "grad_norm": 6.423130512237549, "learning_rate": 4.972121008670971e-05, "loss": 1.9945, "step": 440 }, { "epoch": 0.29277813923227064, "grad_norm": 10.954133987426758, "learning_rate": 4.970562529968189e-05, "loss": 2.0, "step": 450 }, { "epoch": 0.2992843201040989, "grad_norm": 7.982641220092773, "learning_rate": 4.9689619249019174e-05, "loss": 1.9662, "step": 460 }, { "epoch": 0.30579050097592714, "grad_norm": 3.9185597896575928, "learning_rate": 4.9673192207645894e-05, "loss": 1.9352, "step": 470 }, { "epoch": 0.3122966818477554, "grad_norm": 6.356838226318359, "learning_rate": 4.9656344455664885e-05, "loss": 1.9787, "step": 480 }, { "epoch": 0.3188028627195836, "grad_norm": 6.755145072937012, "learning_rate": 4.963907628035264e-05, "loss": 1.9861, "step": 490 }, { "epoch": 0.32530904359141183, "grad_norm": 6.672500133514404, "learning_rate": 4.9621387976154396e-05, "loss": 2.0008, "step": 500 }, { "epoch": 0.3318152244632401, "grad_norm": 6.26496696472168, "learning_rate": 4.960327984467919e-05, "loss": 1.9238, "step": 510 }, { "epoch": 0.3383214053350683, "grad_norm": 6.744718551635742, "learning_rate": 4.958475219469464e-05, "loss": 1.9773, "step": 520 }, { "epoch": 0.3448275862068966, "grad_norm": 7.031189441680908, "learning_rate": 4.9565805342121716e-05, "loss": 1.908, "step": 530 }, { "epoch": 0.35133376707872477, "grad_norm": 5.89207649230957, "learning_rate": 4.954643961002936e-05, "loss": 1.9055, "step": 540 }, { "epoch": 0.357839947950553, "grad_norm": 3.371752977371216, "learning_rate": 4.952665532862895e-05, "loss": 1.9406, "step": 550 }, { "epoch": 0.36434612882238127, "grad_norm": 8.62122917175293, "learning_rate": 4.950645283526868e-05, "loss": 1.9441, "step": 560 }, { "epoch": 0.3708523096942095, "grad_norm": 12.293152809143066, "learning_rate": 4.948583247442783e-05, "loss": 1.9652, "step": 570 }, { "epoch": 0.37735849056603776, "grad_norm": 8.459797859191895, "learning_rate": 4.9464794597710864e-05, "loss": 1.9238, "step": 580 }, { "epoch": 0.38386467143786596, "grad_norm": 7.8875532150268555, "learning_rate": 4.944333956384144e-05, "loss": 1.934, "step": 590 }, { "epoch": 0.3903708523096942, "grad_norm": 6.423868656158447, "learning_rate": 4.942146773865631e-05, "loss": 1.9344, "step": 600 }, { "epoch": 0.39687703318152245, "grad_norm": 5.624599933624268, "learning_rate": 4.939917949509907e-05, "loss": 1.9207, "step": 610 }, { "epoch": 0.4033832140533507, "grad_norm": 6.100623607635498, "learning_rate": 4.937647521321378e-05, "loss": 1.9016, "step": 620 }, { "epoch": 0.4098893949251789, "grad_norm": 9.013191223144531, "learning_rate": 4.935335528013853e-05, "loss": 1.9025, "step": 630 }, { "epoch": 0.41639557579700714, "grad_norm": 5.427655220031738, "learning_rate": 4.932982009009879e-05, "loss": 1.9023, "step": 640 }, { "epoch": 0.4229017566688354, "grad_norm": 7.220954895019531, "learning_rate": 4.9305870044400725e-05, "loss": 1.8982, "step": 650 }, { "epoch": 0.42940793754066364, "grad_norm": 8.385273933410645, "learning_rate": 4.928150555142436e-05, "loss": 1.9178, "step": 660 }, { "epoch": 0.4359141184124919, "grad_norm": 9.131636619567871, "learning_rate": 4.925672702661653e-05, "loss": 1.8633, "step": 670 }, { "epoch": 0.4424202992843201, "grad_norm": 6.151471138000488, "learning_rate": 4.923153489248395e-05, "loss": 1.9086, "step": 680 }, { "epoch": 0.44892648015614833, "grad_norm": 4.592082500457764, "learning_rate": 4.920592957858584e-05, "loss": 1.9025, "step": 690 }, { "epoch": 0.4554326610279766, "grad_norm": 6.865637302398682, "learning_rate": 4.9179911521526734e-05, "loss": 1.957, "step": 700 }, { "epoch": 0.46193884189980483, "grad_norm": 10.764424324035645, "learning_rate": 4.9153481164948964e-05, "loss": 1.8707, "step": 710 }, { "epoch": 0.468445022771633, "grad_norm": 3.9533846378326416, "learning_rate": 4.912663895952511e-05, "loss": 1.8988, "step": 720 }, { "epoch": 0.4749512036434613, "grad_norm": 6.9716596603393555, "learning_rate": 4.909938536295034e-05, "loss": 1.8994, "step": 730 }, { "epoch": 0.4814573845152895, "grad_norm": 4.04404878616333, "learning_rate": 4.907172083993457e-05, "loss": 1.8664, "step": 740 }, { "epoch": 0.48796356538711777, "grad_norm": 8.119486808776855, "learning_rate": 4.904364586219454e-05, "loss": 1.8926, "step": 750 }, { "epoch": 0.494469746258946, "grad_norm": 4.090360641479492, "learning_rate": 4.9015160908445846e-05, "loss": 1.8967, "step": 760 }, { "epoch": 0.5009759271307742, "grad_norm": 5.823906421661377, "learning_rate": 4.8986266464394645e-05, "loss": 1.866, "step": 770 }, { "epoch": 0.5074821080026025, "grad_norm": 4.35285758972168, "learning_rate": 4.89569630227295e-05, "loss": 1.8674, "step": 780 }, { "epoch": 0.5139882888744307, "grad_norm": 4.863727569580078, "learning_rate": 4.892725108311289e-05, "loss": 1.8963, "step": 790 }, { "epoch": 0.5204944697462589, "grad_norm": 8.635047912597656, "learning_rate": 4.889713115217276e-05, "loss": 1.8752, "step": 800 }, { "epoch": 0.5270006506180872, "grad_norm": 9.313777923583984, "learning_rate": 4.886660374349381e-05, "loss": 1.9324, "step": 810 }, { "epoch": 0.5335068314899154, "grad_norm": 5.126181125640869, "learning_rate": 4.883566937760879e-05, "loss": 1.8846, "step": 820 }, { "epoch": 0.5400130123617437, "grad_norm": 7.131484508514404, "learning_rate": 4.880432858198962e-05, "loss": 1.8596, "step": 830 }, { "epoch": 0.5465191932335719, "grad_norm": 9.836341857910156, "learning_rate": 4.8772581891038385e-05, "loss": 1.8695, "step": 840 }, { "epoch": 0.5530253741054001, "grad_norm": 7.066926956176758, "learning_rate": 4.87404298460782e-05, "loss": 1.8617, "step": 850 }, { "epoch": 0.5595315549772284, "grad_norm": 3.1584465503692627, "learning_rate": 4.870787299534404e-05, "loss": 1.8541, "step": 860 }, { "epoch": 0.5660377358490566, "grad_norm": 5.483874320983887, "learning_rate": 4.8674911893973305e-05, "loss": 1.8607, "step": 870 }, { "epoch": 0.5725439167208849, "grad_norm": 3.9347193241119385, "learning_rate": 4.8641547103996456e-05, "loss": 1.8705, "step": 880 }, { "epoch": 0.5790500975927131, "grad_norm": 6.541131496429443, "learning_rate": 4.8607779194327344e-05, "loss": 1.8842, "step": 890 }, { "epoch": 0.5855562784645413, "grad_norm": 6.1325907707214355, "learning_rate": 4.857360874075355e-05, "loss": 1.8531, "step": 900 }, { "epoch": 0.5920624593363696, "grad_norm": 8.212583541870117, "learning_rate": 4.853903632592657e-05, "loss": 1.8717, "step": 910 }, { "epoch": 0.5985686402081978, "grad_norm": 3.124750852584839, "learning_rate": 4.850406253935188e-05, "loss": 1.8598, "step": 920 }, { "epoch": 0.605074821080026, "grad_norm": 7.354849338531494, "learning_rate": 4.846868797737886e-05, "loss": 1.8506, "step": 930 }, { "epoch": 0.6115810019518543, "grad_norm": 7.021043300628662, "learning_rate": 4.843291324319064e-05, "loss": 1.8625, "step": 940 }, { "epoch": 0.6180871828236825, "grad_norm": 4.686529636383057, "learning_rate": 4.839673894679383e-05, "loss": 1.8504, "step": 950 }, { "epoch": 0.6245933636955108, "grad_norm": 4.174448013305664, "learning_rate": 4.836016570500809e-05, "loss": 1.873, "step": 960 }, { "epoch": 0.631099544567339, "grad_norm": 7.377628326416016, "learning_rate": 4.832319414145565e-05, "loss": 1.865, "step": 970 }, { "epoch": 0.6376057254391672, "grad_norm": 5.196572303771973, "learning_rate": 4.828582488655062e-05, "loss": 1.8705, "step": 980 }, { "epoch": 0.6441119063109955, "grad_norm": 5.850039005279541, "learning_rate": 4.824805857748831e-05, "loss": 1.8428, "step": 990 }, { "epoch": 0.6506180871828237, "grad_norm": 4.637782573699951, "learning_rate": 4.82098958582343e-05, "loss": 1.8398, "step": 1000 }, { "epoch": 0.657124268054652, "grad_norm": 5.280829429626465, "learning_rate": 4.817133737951352e-05, "loss": 1.8832, "step": 1010 }, { "epoch": 0.6636304489264802, "grad_norm": 7.612039089202881, "learning_rate": 4.8132383798799077e-05, "loss": 1.8705, "step": 1020 }, { "epoch": 0.6701366297983083, "grad_norm": 3.3436310291290283, "learning_rate": 4.8093035780301135e-05, "loss": 1.8344, "step": 1030 }, { "epoch": 0.6766428106701367, "grad_norm": 4.327233791351318, "learning_rate": 4.805329399495552e-05, "loss": 1.8125, "step": 1040 }, { "epoch": 0.6831489915419648, "grad_norm": 3.7132182121276855, "learning_rate": 4.8013159120412324e-05, "loss": 1.8346, "step": 1050 }, { "epoch": 0.6896551724137931, "grad_norm": 5.213608264923096, "learning_rate": 4.79726318410243e-05, "loss": 1.798, "step": 1060 }, { "epoch": 0.6961613532856213, "grad_norm": 5.125476360321045, "learning_rate": 4.793171284783525e-05, "loss": 1.8701, "step": 1070 }, { "epoch": 0.7026675341574495, "grad_norm": 4.42897891998291, "learning_rate": 4.789040283856822e-05, "loss": 1.8686, "step": 1080 }, { "epoch": 0.7091737150292778, "grad_norm": 7.08945894241333, "learning_rate": 4.784870251761357e-05, "loss": 1.8535, "step": 1090 }, { "epoch": 0.715679895901106, "grad_norm": 7.498415946960449, "learning_rate": 4.7806612596017e-05, "loss": 1.8254, "step": 1100 }, { "epoch": 0.7221860767729343, "grad_norm": 3.11222767829895, "learning_rate": 4.776413379146743e-05, "loss": 1.8557, "step": 1110 }, { "epoch": 0.7286922576447625, "grad_norm": 3.0804524421691895, "learning_rate": 4.7721266828284754e-05, "loss": 1.8451, "step": 1120 }, { "epoch": 0.7351984385165907, "grad_norm": 2.847862482070923, "learning_rate": 4.767801243740746e-05, "loss": 1.8332, "step": 1130 }, { "epoch": 0.741704619388419, "grad_norm": 3.423305034637451, "learning_rate": 4.763437135638021e-05, "loss": 1.8188, "step": 1140 }, { "epoch": 0.7482108002602472, "grad_norm": 6.292115688323975, "learning_rate": 4.759034432934123e-05, "loss": 1.8355, "step": 1150 }, { "epoch": 0.7547169811320755, "grad_norm": 4.049074649810791, "learning_rate": 4.754593210700966e-05, "loss": 1.8484, "step": 1160 }, { "epoch": 0.7612231620039037, "grad_norm": 4.450780868530273, "learning_rate": 4.750113544667271e-05, "loss": 1.8451, "step": 1170 }, { "epoch": 0.7677293428757319, "grad_norm": 4.9657416343688965, "learning_rate": 4.745595511217277e-05, "loss": 1.8164, "step": 1180 }, { "epoch": 0.7742355237475602, "grad_norm": 3.7400104999542236, "learning_rate": 4.7410391873894386e-05, "loss": 1.7971, "step": 1190 }, { "epoch": 0.7807417046193884, "grad_norm": 5.6448140144348145, "learning_rate": 4.736444650875114e-05, "loss": 1.7822, "step": 1200 }, { "epoch": 0.7872478854912166, "grad_norm": 3.0754597187042236, "learning_rate": 4.731811980017234e-05, "loss": 1.8191, "step": 1210 }, { "epoch": 0.7937540663630449, "grad_norm": 2.9193413257598877, "learning_rate": 4.727141253808974e-05, "loss": 1.8377, "step": 1220 }, { "epoch": 0.8002602472348731, "grad_norm": 5.102992534637451, "learning_rate": 4.722432551892402e-05, "loss": 1.8322, "step": 1230 }, { "epoch": 0.8067664281067014, "grad_norm": 2.7815585136413574, "learning_rate": 4.717685954557123e-05, "loss": 1.8139, "step": 1240 }, { "epoch": 0.8132726089785296, "grad_norm": 3.520860195159912, "learning_rate": 4.712901542738908e-05, "loss": 1.8143, "step": 1250 }, { "epoch": 0.8197787898503578, "grad_norm": 5.187878131866455, "learning_rate": 4.7080793980183165e-05, "loss": 1.7781, "step": 1260 }, { "epoch": 0.8262849707221861, "grad_norm": 6.9749650955200195, "learning_rate": 4.703219602619302e-05, "loss": 1.8066, "step": 1270 }, { "epoch": 0.8327911515940143, "grad_norm": 4.195006370544434, "learning_rate": 4.698322239407814e-05, "loss": 1.7664, "step": 1280 }, { "epoch": 0.8392973324658426, "grad_norm": 3.3593590259552, "learning_rate": 4.6933873918903816e-05, "loss": 1.7678, "step": 1290 }, { "epoch": 0.8458035133376708, "grad_norm": 6.621061325073242, "learning_rate": 4.688415144212692e-05, "loss": 1.8158, "step": 1300 }, { "epoch": 0.852309694209499, "grad_norm": 4.431589603424072, "learning_rate": 4.683405581158153e-05, "loss": 1.8379, "step": 1310 }, { "epoch": 0.8588158750813273, "grad_norm": 8.838325500488281, "learning_rate": 4.67835878814645e-05, "loss": 1.8088, "step": 1320 }, { "epoch": 0.8653220559531555, "grad_norm": 6.058382511138916, "learning_rate": 4.67327485123209e-05, "loss": 1.8102, "step": 1330 }, { "epoch": 0.8718282368249838, "grad_norm": 8.134848594665527, "learning_rate": 4.6681538571029295e-05, "loss": 1.7666, "step": 1340 }, { "epoch": 0.878334417696812, "grad_norm": 5.910958290100098, "learning_rate": 4.662995893078702e-05, "loss": 1.8109, "step": 1350 }, { "epoch": 0.8848405985686402, "grad_norm": 3.413054943084717, "learning_rate": 4.657801047109527e-05, "loss": 1.8023, "step": 1360 }, { "epoch": 0.8913467794404685, "grad_norm": 4.527835845947266, "learning_rate": 4.6525694077744076e-05, "loss": 1.7961, "step": 1370 }, { "epoch": 0.8978529603122967, "grad_norm": 13.099793434143066, "learning_rate": 4.647301064279725e-05, "loss": 1.8154, "step": 1380 }, { "epoch": 0.904359141184125, "grad_norm": 14.050834655761719, "learning_rate": 4.6419961064577134e-05, "loss": 1.8125, "step": 1390 }, { "epoch": 0.9108653220559532, "grad_norm": 3.7129900455474854, "learning_rate": 4.63665462476493e-05, "loss": 1.8012, "step": 1400 }, { "epoch": 0.9173715029277814, "grad_norm": 6.35868501663208, "learning_rate": 4.631276710280713e-05, "loss": 1.7811, "step": 1410 }, { "epoch": 0.9238776837996097, "grad_norm": 3.5830605030059814, "learning_rate": 4.625862454705629e-05, "loss": 1.8213, "step": 1420 }, { "epoch": 0.9303838646714379, "grad_norm": 3.2499704360961914, "learning_rate": 4.620411950359903e-05, "loss": 1.7914, "step": 1430 }, { "epoch": 0.936890045543266, "grad_norm": 5.130862712860107, "learning_rate": 4.614925290181858e-05, "loss": 1.7969, "step": 1440 }, { "epoch": 0.9433962264150944, "grad_norm": 8.873045921325684, "learning_rate": 4.6094025677263155e-05, "loss": 1.8152, "step": 1450 }, { "epoch": 0.9499024072869225, "grad_norm": 4.36439323425293, "learning_rate": 4.6038438771630074e-05, "loss": 1.8301, "step": 1460 }, { "epoch": 0.9564085881587508, "grad_norm": 4.794332981109619, "learning_rate": 4.5982493132749724e-05, "loss": 1.8375, "step": 1470 }, { "epoch": 0.962914769030579, "grad_norm": 2.4946184158325195, "learning_rate": 4.592618971456933e-05, "loss": 1.8033, "step": 1480 }, { "epoch": 0.9694209499024072, "grad_norm": 4.2077107429504395, "learning_rate": 4.586952947713677e-05, "loss": 1.7855, "step": 1490 }, { "epoch": 0.9759271307742355, "grad_norm": 5.372648239135742, "learning_rate": 4.581251338658412e-05, "loss": 1.7668, "step": 1500 }, { "epoch": 0.9824333116460637, "grad_norm": 6.311350345611572, "learning_rate": 4.5755142415111264e-05, "loss": 1.7953, "step": 1510 }, { "epoch": 0.988939492517892, "grad_norm": 5.414618968963623, "learning_rate": 4.5697417540969234e-05, "loss": 1.7961, "step": 1520 }, { "epoch": 0.9954456733897202, "grad_norm": 5.223058700561523, "learning_rate": 4.563933974844361e-05, "loss": 1.8064, "step": 1530 }, { "epoch": 1.0, "eval_f1": 80.96034986917698, "eval_loss": 0.4447461664676666, "eval_precision": 80.97098119347288, "eval_recall": 81.09598790143225, "eval_runtime": 62.6152, "eval_samples_per_second": 6283.375, "eval_steps_per_second": 6.149, "step": 1537 }, { "epoch": 1.0019518542615484, "grad_norm": 6.164201259613037, "learning_rate": 4.5580910027837673e-05, "loss": 1.7648, "step": 1540 }, { "epoch": 1.0084580351333767, "grad_norm": 5.059112548828125, "learning_rate": 4.5522129375455555e-05, "loss": 1.7076, "step": 1550 }, { "epoch": 1.014964216005205, "grad_norm": 4.295845031738281, "learning_rate": 4.546299879358523e-05, "loss": 1.7203, "step": 1560 }, { "epoch": 1.0214703968770331, "grad_norm": 3.7177891731262207, "learning_rate": 4.540351929048146e-05, "loss": 1.6992, "step": 1570 }, { "epoch": 1.0279765777488614, "grad_norm": 2.9181087017059326, "learning_rate": 4.534369188034853e-05, "loss": 1.6977, "step": 1580 }, { "epoch": 1.0344827586206897, "grad_norm": 5.58280086517334, "learning_rate": 4.528351758332303e-05, "loss": 1.7008, "step": 1590 }, { "epoch": 1.0409889394925178, "grad_norm": 8.704566955566406, "learning_rate": 4.5222997425456446e-05, "loss": 1.7199, "step": 1600 }, { "epoch": 1.047495120364346, "grad_norm": 3.961500883102417, "learning_rate": 4.5162132438697615e-05, "loss": 1.7008, "step": 1610 }, { "epoch": 1.0540013012361744, "grad_norm": 4.934179782867432, "learning_rate": 4.510092366087518e-05, "loss": 1.677, "step": 1620 }, { "epoch": 1.0605074821080025, "grad_norm": 5.9107136726379395, "learning_rate": 4.5039372135679883e-05, "loss": 1.6697, "step": 1630 }, { "epoch": 1.0670136629798308, "grad_norm": 3.819345474243164, "learning_rate": 4.497747891264675e-05, "loss": 1.6973, "step": 1640 }, { "epoch": 1.073519843851659, "grad_norm": 5.10596227645874, "learning_rate": 4.491524504713722e-05, "loss": 1.6963, "step": 1650 }, { "epoch": 1.0800260247234874, "grad_norm": 5.221753120422363, "learning_rate": 4.485267160032112e-05, "loss": 1.6895, "step": 1660 }, { "epoch": 1.0865322055953155, "grad_norm": 8.133532524108887, "learning_rate": 4.478975963915861e-05, "loss": 1.6979, "step": 1670 }, { "epoch": 1.0930383864671438, "grad_norm": 7.811677932739258, "learning_rate": 4.472651023638196e-05, "loss": 1.6857, "step": 1680 }, { "epoch": 1.099544567338972, "grad_norm": 4.768795490264893, "learning_rate": 4.4662924470477255e-05, "loss": 1.6865, "step": 1690 }, { "epoch": 1.1060507482108002, "grad_norm": 4.132567882537842, "learning_rate": 4.4599003425666026e-05, "loss": 1.6779, "step": 1700 }, { "epoch": 1.1125569290826285, "grad_norm": 4.644925117492676, "learning_rate": 4.453474819188675e-05, "loss": 1.7004, "step": 1710 }, { "epoch": 1.1190631099544568, "grad_norm": 4.388784885406494, "learning_rate": 4.447015986477628e-05, "loss": 1.6475, "step": 1720 }, { "epoch": 1.1255692908262849, "grad_norm": 5.948523044586182, "learning_rate": 4.440523954565114e-05, "loss": 1.6945, "step": 1730 }, { "epoch": 1.1320754716981132, "grad_norm": 3.153449058532715, "learning_rate": 4.433998834148877e-05, "loss": 1.7299, "step": 1740 }, { "epoch": 1.1385816525699415, "grad_norm": 5.864811420440674, "learning_rate": 4.427440736490861e-05, "loss": 1.6486, "step": 1750 }, { "epoch": 1.1450878334417696, "grad_norm": 5.873137950897217, "learning_rate": 4.4208497734153177e-05, "loss": 1.7176, "step": 1760 }, { "epoch": 1.1515940143135979, "grad_norm": 7.453052520751953, "learning_rate": 4.4142260573068993e-05, "loss": 1.7025, "step": 1770 }, { "epoch": 1.1581001951854262, "grad_norm": 5.205383777618408, "learning_rate": 4.407569701108737e-05, "loss": 1.6881, "step": 1780 }, { "epoch": 1.1646063760572545, "grad_norm": 3.868424415588379, "learning_rate": 4.400880818320521e-05, "loss": 1.6613, "step": 1790 }, { "epoch": 1.1711125569290826, "grad_norm": 4.129473686218262, "learning_rate": 4.3941595229965636e-05, "loss": 1.698, "step": 1800 }, { "epoch": 1.1776187378009109, "grad_norm": 4.3387131690979, "learning_rate": 4.3874059297438515e-05, "loss": 1.7031, "step": 1810 }, { "epoch": 1.1841249186727392, "grad_norm": 5.17094612121582, "learning_rate": 4.380620153720095e-05, "loss": 1.6945, "step": 1820 }, { "epoch": 1.1906310995445673, "grad_norm": 4.9148125648498535, "learning_rate": 4.373802310631765e-05, "loss": 1.6643, "step": 1830 }, { "epoch": 1.1971372804163956, "grad_norm": 7.63484525680542, "learning_rate": 4.366952516732114e-05, "loss": 1.7283, "step": 1840 }, { "epoch": 1.2036434612882239, "grad_norm": 3.8116321563720703, "learning_rate": 4.360070888819203e-05, "loss": 1.7043, "step": 1850 }, { "epoch": 1.2101496421600522, "grad_norm": 5.3097310066223145, "learning_rate": 4.353157544233902e-05, "loss": 1.651, "step": 1860 }, { "epoch": 1.2166558230318802, "grad_norm": 9.860427856445312, "learning_rate": 4.3462126008578936e-05, "loss": 1.6811, "step": 1870 }, { "epoch": 1.2231620039037086, "grad_norm": 5.98110818862915, "learning_rate": 4.3392361771116604e-05, "loss": 1.6617, "step": 1880 }, { "epoch": 1.2296681847755369, "grad_norm": 6.695423126220703, "learning_rate": 4.332228391952469e-05, "loss": 1.6918, "step": 1890 }, { "epoch": 1.236174365647365, "grad_norm": 4.08838415145874, "learning_rate": 4.325189364872337e-05, "loss": 1.7023, "step": 1900 }, { "epoch": 1.2426805465191932, "grad_norm": 3.411262035369873, "learning_rate": 4.318119215896001e-05, "loss": 1.6916, "step": 1910 }, { "epoch": 1.2491867273910215, "grad_norm": 2.878885507583618, "learning_rate": 4.311018065578864e-05, "loss": 1.6568, "step": 1920 }, { "epoch": 1.2556929082628496, "grad_norm": 4.060927391052246, "learning_rate": 4.303886035004947e-05, "loss": 1.6834, "step": 1930 }, { "epoch": 1.262199089134678, "grad_norm": 5.755529403686523, "learning_rate": 4.2967232457848154e-05, "loss": 1.6707, "step": 1940 }, { "epoch": 1.2687052700065062, "grad_norm": 4.134897232055664, "learning_rate": 4.289529820053515e-05, "loss": 1.6885, "step": 1950 }, { "epoch": 1.2752114508783343, "grad_norm": 7.08973503112793, "learning_rate": 4.2823058804684815e-05, "loss": 1.6812, "step": 1960 }, { "epoch": 1.2817176317501626, "grad_norm": 4.418806552886963, "learning_rate": 4.275051550207453e-05, "loss": 1.709, "step": 1970 }, { "epoch": 1.288223812621991, "grad_norm": 2.787440061569214, "learning_rate": 4.267766952966369e-05, "loss": 1.6502, "step": 1980 }, { "epoch": 1.294729993493819, "grad_norm": 3.611393690109253, "learning_rate": 4.2604522129572624e-05, "loss": 1.6379, "step": 1990 }, { "epoch": 1.3012361743656473, "grad_norm": 4.657389163970947, "learning_rate": 4.253107454906137e-05, "loss": 1.6859, "step": 2000 }, { "epoch": 1.3077423552374756, "grad_norm": 3.5194790363311768, "learning_rate": 4.2457328040508484e-05, "loss": 1.677, "step": 2010 }, { "epoch": 1.3142485361093037, "grad_norm": 7.83802604675293, "learning_rate": 4.238328386138959e-05, "loss": 1.7084, "step": 2020 }, { "epoch": 1.320754716981132, "grad_norm": 5.4843645095825195, "learning_rate": 4.230894327425604e-05, "loss": 1.626, "step": 2030 }, { "epoch": 1.3272608978529603, "grad_norm": 4.370174884796143, "learning_rate": 4.2234307546713305e-05, "loss": 1.7027, "step": 2040 }, { "epoch": 1.3337670787247886, "grad_norm": 3.42929744720459, "learning_rate": 4.2159377951399385e-05, "loss": 1.6664, "step": 2050 }, { "epoch": 1.340273259596617, "grad_norm": 3.7664260864257812, "learning_rate": 4.208415576596315e-05, "loss": 1.7207, "step": 2060 }, { "epoch": 1.346779440468445, "grad_norm": 3.5000877380371094, "learning_rate": 4.200864227304247e-05, "loss": 1.6795, "step": 2070 }, { "epoch": 1.3532856213402733, "grad_norm": 5.146000385284424, "learning_rate": 4.1932838760242445e-05, "loss": 1.6805, "step": 2080 }, { "epoch": 1.3597918022121016, "grad_norm": 3.095343828201294, "learning_rate": 4.1856746520113345e-05, "loss": 1.6818, "step": 2090 }, { "epoch": 1.3662979830839297, "grad_norm": 3.989588499069214, "learning_rate": 4.178036685012868e-05, "loss": 1.6689, "step": 2100 }, { "epoch": 1.372804163955758, "grad_norm": 3.1599042415618896, "learning_rate": 4.1703701052662974e-05, "loss": 1.6629, "step": 2110 }, { "epoch": 1.3793103448275863, "grad_norm": 7.862871170043945, "learning_rate": 4.162675043496963e-05, "loss": 1.6811, "step": 2120 }, { "epoch": 1.3858165256994144, "grad_norm": 3.747095823287964, "learning_rate": 4.1549516309158586e-05, "loss": 1.6742, "step": 2130 }, { "epoch": 1.3923227065712427, "grad_norm": 3.156120777130127, "learning_rate": 4.147199999217402e-05, "loss": 1.6693, "step": 2140 }, { "epoch": 1.398828887443071, "grad_norm": 7.266796588897705, "learning_rate": 4.139420280577177e-05, "loss": 1.6863, "step": 2150 }, { "epoch": 1.405335068314899, "grad_norm": 7.066165447235107, "learning_rate": 4.1316126076496935e-05, "loss": 1.733, "step": 2160 }, { "epoch": 1.4118412491867274, "grad_norm": 13.163073539733887, "learning_rate": 4.1237771135661164e-05, "loss": 1.6719, "step": 2170 }, { "epoch": 1.4183474300585557, "grad_norm": 4.427972316741943, "learning_rate": 4.115913931931997e-05, "loss": 1.6537, "step": 2180 }, { "epoch": 1.4248536109303838, "grad_norm": 3.6664226055145264, "learning_rate": 4.108023196824998e-05, "loss": 1.6535, "step": 2190 }, { "epoch": 1.431359791802212, "grad_norm": 4.697982311248779, "learning_rate": 4.1001050427926045e-05, "loss": 1.6848, "step": 2200 }, { "epoch": 1.4378659726740404, "grad_norm": 5.903241157531738, "learning_rate": 4.0921596048498315e-05, "loss": 1.6867, "step": 2210 }, { "epoch": 1.4443721535458685, "grad_norm": 10.139890670776367, "learning_rate": 4.084187018476918e-05, "loss": 1.6553, "step": 2220 }, { "epoch": 1.4508783344176968, "grad_norm": 4.825778007507324, "learning_rate": 4.076187419617024e-05, "loss": 1.6639, "step": 2230 }, { "epoch": 1.457384515289525, "grad_norm": 3.74001145362854, "learning_rate": 4.068160944673903e-05, "loss": 1.6441, "step": 2240 }, { "epoch": 1.4638906961613534, "grad_norm": 2.7896409034729004, "learning_rate": 4.060107730509587e-05, "loss": 1.7281, "step": 2250 }, { "epoch": 1.4703968770331814, "grad_norm": 3.233732223510742, "learning_rate": 4.052027914442043e-05, "loss": 1.6775, "step": 2260 }, { "epoch": 1.4769030579050098, "grad_norm": 3.576988458633423, "learning_rate": 4.043921634242836e-05, "loss": 1.667, "step": 2270 }, { "epoch": 1.483409238776838, "grad_norm": 3.626955986022949, "learning_rate": 4.035789028134782e-05, "loss": 1.7066, "step": 2280 }, { "epoch": 1.4899154196486664, "grad_norm": 3.231436252593994, "learning_rate": 4.0276302347895864e-05, "loss": 1.6834, "step": 2290 }, { "epoch": 1.4964216005204944, "grad_norm": 5.224520683288574, "learning_rate": 4.019445393325483e-05, "loss": 1.6967, "step": 2300 }, { "epoch": 1.5029277813923227, "grad_norm": 3.10335636138916, "learning_rate": 4.01123464330486e-05, "loss": 1.6611, "step": 2310 }, { "epoch": 1.509433962264151, "grad_norm": 3.9993865489959717, "learning_rate": 4.002998124731879e-05, "loss": 1.674, "step": 2320 }, { "epoch": 1.5159401431359791, "grad_norm": 6.949474334716797, "learning_rate": 3.994735978050094e-05, "loss": 1.6773, "step": 2330 }, { "epoch": 1.5224463240078074, "grad_norm": 4.110148906707764, "learning_rate": 3.986448344140047e-05, "loss": 1.6912, "step": 2340 }, { "epoch": 1.5289525048796357, "grad_norm": 4.1665940284729, "learning_rate": 3.978135364316874e-05, "loss": 1.6877, "step": 2350 }, { "epoch": 1.5354586857514638, "grad_norm": 6.90950345993042, "learning_rate": 3.9697971803278924e-05, "loss": 1.6631, "step": 2360 }, { "epoch": 1.5419648666232921, "grad_norm": 2.8868823051452637, "learning_rate": 3.9614339343501836e-05, "loss": 1.6775, "step": 2370 }, { "epoch": 1.5484710474951204, "grad_norm": 7.176961421966553, "learning_rate": 3.9530457689881684e-05, "loss": 1.6844, "step": 2380 }, { "epoch": 1.5549772283669485, "grad_norm": 5.691576957702637, "learning_rate": 3.944632827271176e-05, "loss": 1.6561, "step": 2390 }, { "epoch": 1.5614834092387768, "grad_norm": 5.8782639503479, "learning_rate": 3.9361952526510085e-05, "loss": 1.6689, "step": 2400 }, { "epoch": 1.5679895901106051, "grad_norm": 3.185760974884033, "learning_rate": 3.927733188999486e-05, "loss": 1.6902, "step": 2410 }, { "epoch": 1.5744957709824332, "grad_norm": 5.211911201477051, "learning_rate": 3.9192467806060044e-05, "loss": 1.6604, "step": 2420 }, { "epoch": 1.5810019518542615, "grad_norm": 5.558224201202393, "learning_rate": 3.910736172175066e-05, "loss": 1.7166, "step": 2430 }, { "epoch": 1.5875081327260898, "grad_norm": 3.257280111312866, "learning_rate": 3.9022015088238174e-05, "loss": 1.6596, "step": 2440 }, { "epoch": 1.594014313597918, "grad_norm": 4.716050624847412, "learning_rate": 3.8936429360795745e-05, "loss": 1.7066, "step": 2450 }, { "epoch": 1.6005204944697464, "grad_norm": 3.6234216690063477, "learning_rate": 3.885060599877337e-05, "loss": 1.6408, "step": 2460 }, { "epoch": 1.6070266753415745, "grad_norm": 3.169839382171631, "learning_rate": 3.876454646557305e-05, "loss": 1.6531, "step": 2470 }, { "epoch": 1.6135328562134026, "grad_norm": 3.7503509521484375, "learning_rate": 3.867825222862383e-05, "loss": 1.6855, "step": 2480 }, { "epoch": 1.6200390370852311, "grad_norm": 3.5376431941986084, "learning_rate": 3.8591724759356734e-05, "loss": 1.6342, "step": 2490 }, { "epoch": 1.6265452179570592, "grad_norm": 4.007684707641602, "learning_rate": 3.8504965533179724e-05, "loss": 1.6523, "step": 2500 }, { "epoch": 1.6330513988288873, "grad_norm": 4.970841884613037, "learning_rate": 3.841797602945254e-05, "loss": 1.6336, "step": 2510 }, { "epoch": 1.6395575797007158, "grad_norm": 3.411363363265991, "learning_rate": 3.833075773146142e-05, "loss": 1.6645, "step": 2520 }, { "epoch": 1.6460637605725439, "grad_norm": 3.1285226345062256, "learning_rate": 3.824331212639388e-05, "loss": 1.6484, "step": 2530 }, { "epoch": 1.6525699414443722, "grad_norm": 3.881978988647461, "learning_rate": 3.81556407053133e-05, "loss": 1.6676, "step": 2540 }, { "epoch": 1.6590761223162005, "grad_norm": 5.4114298820495605, "learning_rate": 3.806774496313355e-05, "loss": 1.7105, "step": 2550 }, { "epoch": 1.6655823031880286, "grad_norm": 3.7841079235076904, "learning_rate": 3.797962639859344e-05, "loss": 1.6922, "step": 2560 }, { "epoch": 1.6720884840598569, "grad_norm": 5.9798970222473145, "learning_rate": 3.7891286514231225e-05, "loss": 1.6666, "step": 2570 }, { "epoch": 1.6785946649316852, "grad_norm": 3.702932596206665, "learning_rate": 3.780272681635894e-05, "loss": 1.6414, "step": 2580 }, { "epoch": 1.6851008458035133, "grad_norm": 3.5056796073913574, "learning_rate": 3.771394881503673e-05, "loss": 1.6676, "step": 2590 }, { "epoch": 1.6916070266753416, "grad_norm": 4.812647342681885, "learning_rate": 3.76249540240471e-05, "loss": 1.6828, "step": 2600 }, { "epoch": 1.6981132075471699, "grad_norm": 4.714016437530518, "learning_rate": 3.753574396086913e-05, "loss": 1.6623, "step": 2610 }, { "epoch": 1.704619388418998, "grad_norm": 5.55129861831665, "learning_rate": 3.7446320146652556e-05, "loss": 1.6787, "step": 2620 }, { "epoch": 1.7111255692908263, "grad_norm": 4.482264995574951, "learning_rate": 3.735668410619183e-05, "loss": 1.6529, "step": 2630 }, { "epoch": 1.7176317501626546, "grad_norm": 3.083521604537964, "learning_rate": 3.726683736790022e-05, "loss": 1.6227, "step": 2640 }, { "epoch": 1.7241379310344827, "grad_norm": 4.615660667419434, "learning_rate": 3.717678146378357e-05, "loss": 1.6262, "step": 2650 }, { "epoch": 1.730644111906311, "grad_norm": 4.197783470153809, "learning_rate": 3.7086517929414346e-05, "loss": 1.6621, "step": 2660 }, { "epoch": 1.7371502927781393, "grad_norm": 6.159745216369629, "learning_rate": 3.699604830390537e-05, "loss": 1.6895, "step": 2670 }, { "epoch": 1.7436564736499673, "grad_norm": 3.7764313220977783, "learning_rate": 3.690537412988359e-05, "loss": 1.6574, "step": 2680 }, { "epoch": 1.7501626545217959, "grad_norm": 2.963048219680786, "learning_rate": 3.681449695346376e-05, "loss": 1.6832, "step": 2690 }, { "epoch": 1.756668835393624, "grad_norm": 5.0992889404296875, "learning_rate": 3.6723418324222126e-05, "loss": 1.6559, "step": 2700 }, { "epoch": 1.763175016265452, "grad_norm": 5.12439489364624, "learning_rate": 3.663213979516994e-05, "loss": 1.6666, "step": 2710 }, { "epoch": 1.7696811971372806, "grad_norm": 3.483926296234131, "learning_rate": 3.6540662922727034e-05, "loss": 1.6705, "step": 2720 }, { "epoch": 1.7761873780091086, "grad_norm": 2.750868320465088, "learning_rate": 3.644898926669524e-05, "loss": 1.6418, "step": 2730 }, { "epoch": 1.7826935588809367, "grad_norm": 4.758782386779785, "learning_rate": 3.6357120390231825e-05, "loss": 1.6965, "step": 2740 }, { "epoch": 1.7891997397527653, "grad_norm": 4.477340221405029, "learning_rate": 3.626505785982281e-05, "loss": 1.684, "step": 2750 }, { "epoch": 1.7957059206245933, "grad_norm": 3.9034929275512695, "learning_rate": 3.6172803245256284e-05, "loss": 1.6412, "step": 2760 }, { "epoch": 1.8022121014964216, "grad_norm": 3.6904385089874268, "learning_rate": 3.608035811959561e-05, "loss": 1.6445, "step": 2770 }, { "epoch": 1.80871828236825, "grad_norm": 5.436261177062988, "learning_rate": 3.598772405915264e-05, "loss": 1.6543, "step": 2780 }, { "epoch": 1.815224463240078, "grad_norm": 3.0844461917877197, "learning_rate": 3.58949026434608e-05, "loss": 1.6402, "step": 2790 }, { "epoch": 1.8217306441119063, "grad_norm": 3.3180079460144043, "learning_rate": 3.580189545524818e-05, "loss": 1.6717, "step": 2800 }, { "epoch": 1.8282368249837346, "grad_norm": 4.296079158782959, "learning_rate": 3.57087040804105e-05, "loss": 1.6764, "step": 2810 }, { "epoch": 1.8347430058555627, "grad_norm": 4.349729537963867, "learning_rate": 3.561533010798418e-05, "loss": 1.6463, "step": 2820 }, { "epoch": 1.841249186727391, "grad_norm": 6.426949977874756, "learning_rate": 3.5521775130119095e-05, "loss": 1.6711, "step": 2830 }, { "epoch": 1.8477553675992193, "grad_norm": 3.799238920211792, "learning_rate": 3.542804074205155e-05, "loss": 1.6523, "step": 2840 }, { "epoch": 1.8542615484710474, "grad_norm": 4.285486221313477, "learning_rate": 3.5334128542077004e-05, "loss": 1.6656, "step": 2850 }, { "epoch": 1.8607677293428757, "grad_norm": 2.7036943435668945, "learning_rate": 3.5240040131522876e-05, "loss": 1.6406, "step": 2860 }, { "epoch": 1.867273910214704, "grad_norm": 2.7466037273406982, "learning_rate": 3.514577711472117e-05, "loss": 1.6457, "step": 2870 }, { "epoch": 1.873780091086532, "grad_norm": 3.1877686977386475, "learning_rate": 3.505134109898118e-05, "loss": 1.651, "step": 2880 }, { "epoch": 1.8802862719583604, "grad_norm": 2.4399566650390625, "learning_rate": 3.495673369456207e-05, "loss": 1.6484, "step": 2890 }, { "epoch": 1.8867924528301887, "grad_norm": 3.6766226291656494, "learning_rate": 3.4861956514645386e-05, "loss": 1.6428, "step": 2900 }, { "epoch": 1.8932986337020168, "grad_norm": 3.6114513874053955, "learning_rate": 3.4767011175307595e-05, "loss": 1.6258, "step": 2910 }, { "epoch": 1.8998048145738453, "grad_norm": 3.948526382446289, "learning_rate": 3.4671899295492485e-05, "loss": 1.6223, "step": 2920 }, { "epoch": 1.9063109954456734, "grad_norm": 4.993218898773193, "learning_rate": 3.4576622496983575e-05, "loss": 1.6834, "step": 2930 }, { "epoch": 1.9128171763175015, "grad_norm": 5.977007865905762, "learning_rate": 3.4481182404376485e-05, "loss": 1.6598, "step": 2940 }, { "epoch": 1.91932335718933, "grad_norm": 5.328046798706055, "learning_rate": 3.4385580645051216e-05, "loss": 1.6121, "step": 2950 }, { "epoch": 1.925829538061158, "grad_norm": 5.055603981018066, "learning_rate": 3.4289818849144384e-05, "loss": 1.6826, "step": 2960 }, { "epoch": 1.9323357189329864, "grad_norm": 4.111296653747559, "learning_rate": 3.419389864952145e-05, "loss": 1.6652, "step": 2970 }, { "epoch": 1.9388418998048147, "grad_norm": 3.1650102138519287, "learning_rate": 3.409782168174887e-05, "loss": 1.6451, "step": 2980 }, { "epoch": 1.9453480806766428, "grad_norm": 5.943548679351807, "learning_rate": 3.40015895840662e-05, "loss": 1.6848, "step": 2990 }, { "epoch": 1.951854261548471, "grad_norm": 6.0067973136901855, "learning_rate": 3.390520399735818e-05, "loss": 1.6225, "step": 3000 }, { "epoch": 1.9583604424202994, "grad_norm": 3.805189847946167, "learning_rate": 3.38086665651267e-05, "loss": 1.6635, "step": 3010 }, { "epoch": 1.9648666232921275, "grad_norm": 3.8608787059783936, "learning_rate": 3.371197893346288e-05, "loss": 1.6689, "step": 3020 }, { "epoch": 1.9713728041639558, "grad_norm": 3.455205202102661, "learning_rate": 3.3615142751018894e-05, "loss": 1.6416, "step": 3030 }, { "epoch": 1.977878985035784, "grad_norm": 7.355952739715576, "learning_rate": 3.35181596689799e-05, "loss": 1.6621, "step": 3040 }, { "epoch": 1.9843851659076122, "grad_norm": 3.4102537631988525, "learning_rate": 3.342103134103593e-05, "loss": 1.6115, "step": 3050 }, { "epoch": 1.9908913467794405, "grad_norm": 3.903759479522705, "learning_rate": 3.3323759423353615e-05, "loss": 1.6104, "step": 3060 }, { "epoch": 1.9973975276512688, "grad_norm": 3.1395809650421143, "learning_rate": 3.3226345574548e-05, "loss": 1.6408, "step": 3070 }, { "epoch": 2.0, "eval_f1": 81.72768271208038, "eval_loss": 0.43087127804756165, "eval_precision": 81.8109069486882, "eval_recall": 81.67651581582727, "eval_runtime": 62.0838, "eval_samples_per_second": 6337.159, "eval_steps_per_second": 6.201, "step": 3074 }, { "epoch": 2.003903708523097, "grad_norm": 6.669315338134766, "learning_rate": 3.312879145565422e-05, "loss": 1.5785, "step": 3080 }, { "epoch": 2.0104098893949254, "grad_norm": 8.681927680969238, "learning_rate": 3.303109873009922e-05, "loss": 1.5193, "step": 3090 }, { "epoch": 2.0169160702667535, "grad_norm": 3.3505547046661377, "learning_rate": 3.293326906367338e-05, "loss": 1.509, "step": 3100 }, { "epoch": 2.0234222511385815, "grad_norm": 3.3314144611358643, "learning_rate": 3.283530412450207e-05, "loss": 1.5211, "step": 3110 }, { "epoch": 2.02992843201041, "grad_norm": 4.8515305519104, "learning_rate": 3.2737205583017286e-05, "loss": 1.4908, "step": 3120 }, { "epoch": 2.036434612882238, "grad_norm": 3.4298112392425537, "learning_rate": 3.2638975111929084e-05, "loss": 1.4859, "step": 3130 }, { "epoch": 2.0429407937540662, "grad_norm": 5.121671199798584, "learning_rate": 3.254061438619711e-05, "loss": 1.5346, "step": 3140 }, { "epoch": 2.0494469746258948, "grad_norm": 6.640491008758545, "learning_rate": 3.244212508300201e-05, "loss": 1.5188, "step": 3150 }, { "epoch": 2.055953155497723, "grad_norm": 3.9364047050476074, "learning_rate": 3.2343508881716874e-05, "loss": 1.4916, "step": 3160 }, { "epoch": 2.062459336369551, "grad_norm": 3.2305216789245605, "learning_rate": 3.2244767463878525e-05, "loss": 1.5078, "step": 3170 }, { "epoch": 2.0689655172413794, "grad_norm": 3.122135639190674, "learning_rate": 3.214590251315896e-05, "loss": 1.5176, "step": 3180 }, { "epoch": 2.0754716981132075, "grad_norm": 4.0217671394348145, "learning_rate": 3.204691571533652e-05, "loss": 1.5182, "step": 3190 }, { "epoch": 2.0819778789850356, "grad_norm": 5.475218772888184, "learning_rate": 3.194780875826723e-05, "loss": 1.5355, "step": 3200 }, { "epoch": 2.088484059856864, "grad_norm": 2.9265801906585693, "learning_rate": 3.1848583331855954e-05, "loss": 1.5098, "step": 3210 }, { "epoch": 2.094990240728692, "grad_norm": 6.830881595611572, "learning_rate": 3.174924112802767e-05, "loss": 1.5133, "step": 3220 }, { "epoch": 2.1014964216005203, "grad_norm": 3.977867841720581, "learning_rate": 3.164978384069852e-05, "loss": 1.5266, "step": 3230 }, { "epoch": 2.108002602472349, "grad_norm": 2.9714772701263428, "learning_rate": 3.155021316574699e-05, "loss": 1.5027, "step": 3240 }, { "epoch": 2.114508783344177, "grad_norm": 6.47925329208374, "learning_rate": 3.1450530800984965e-05, "loss": 1.5373, "step": 3250 }, { "epoch": 2.121014964216005, "grad_norm": 3.452409267425537, "learning_rate": 3.1350738446128826e-05, "loss": 1.5199, "step": 3260 }, { "epoch": 2.1275211450878335, "grad_norm": 4.454134464263916, "learning_rate": 3.125083780277038e-05, "loss": 1.5096, "step": 3270 }, { "epoch": 2.1340273259596616, "grad_norm": 3.5372655391693115, "learning_rate": 3.115083057434791e-05, "loss": 1.4986, "step": 3280 }, { "epoch": 2.14053350683149, "grad_norm": 3.796682119369507, "learning_rate": 3.105071846611714e-05, "loss": 1.5287, "step": 3290 }, { "epoch": 2.147039687703318, "grad_norm": 3.666710376739502, "learning_rate": 3.0950503185122116e-05, "loss": 1.5221, "step": 3300 }, { "epoch": 2.1535458685751463, "grad_norm": 3.271162271499634, "learning_rate": 3.085018644016611e-05, "loss": 1.4699, "step": 3310 }, { "epoch": 2.160052049446975, "grad_norm": 3.440091609954834, "learning_rate": 3.074976994178251e-05, "loss": 1.51, "step": 3320 }, { "epoch": 2.166558230318803, "grad_norm": 3.9851841926574707, "learning_rate": 3.06492554022056e-05, "loss": 1.5172, "step": 3330 }, { "epoch": 2.173064411190631, "grad_norm": 4.255520820617676, "learning_rate": 3.054864453534144e-05, "loss": 1.5109, "step": 3340 }, { "epoch": 2.1795705920624595, "grad_norm": 4.557705402374268, "learning_rate": 3.044793905673855e-05, "loss": 1.527, "step": 3350 }, { "epoch": 2.1860767729342876, "grad_norm": 3.902419090270996, "learning_rate": 3.034714068355874e-05, "loss": 1.5225, "step": 3360 }, { "epoch": 2.1925829538061157, "grad_norm": 3.707981586456299, "learning_rate": 3.0246251134547777e-05, "loss": 1.5174, "step": 3370 }, { "epoch": 2.199089134677944, "grad_norm": 5.099106311798096, "learning_rate": 3.0145272130006107e-05, "loss": 1.5367, "step": 3380 }, { "epoch": 2.2055953155497723, "grad_norm": 3.369356393814087, "learning_rate": 3.0044205391759518e-05, "loss": 1.5361, "step": 3390 }, { "epoch": 2.2121014964216004, "grad_norm": 4.735031604766846, "learning_rate": 2.9943052643129755e-05, "loss": 1.5469, "step": 3400 }, { "epoch": 2.218607677293429, "grad_norm": 3.2020480632781982, "learning_rate": 2.9841815608905156e-05, "loss": 1.548, "step": 3410 }, { "epoch": 2.225113858165257, "grad_norm": 4.269221782684326, "learning_rate": 2.974049601531126e-05, "loss": 1.502, "step": 3420 }, { "epoch": 2.231620039037085, "grad_norm": 4.453713417053223, "learning_rate": 2.963909558998133e-05, "loss": 1.5117, "step": 3430 }, { "epoch": 2.2381262199089136, "grad_norm": 3.5041463375091553, "learning_rate": 2.9537616061926938e-05, "loss": 1.5266, "step": 3440 }, { "epoch": 2.2446324007807417, "grad_norm": 5.326657772064209, "learning_rate": 2.9436059161508423e-05, "loss": 1.5207, "step": 3450 }, { "epoch": 2.2511385816525697, "grad_norm": 4.812317848205566, "learning_rate": 2.933442662040549e-05, "loss": 1.5332, "step": 3460 }, { "epoch": 2.2576447625243983, "grad_norm": 3.8038558959960938, "learning_rate": 2.9232720171587564e-05, "loss": 1.5262, "step": 3470 }, { "epoch": 2.2641509433962264, "grad_norm": 4.666142463684082, "learning_rate": 2.9130941549284307e-05, "loss": 1.5412, "step": 3480 }, { "epoch": 2.2706571242680544, "grad_norm": 3.583735942840576, "learning_rate": 2.9029092488956045e-05, "loss": 1.5129, "step": 3490 }, { "epoch": 2.277163305139883, "grad_norm": 3.494488000869751, "learning_rate": 2.8927174727264154e-05, "loss": 1.5258, "step": 3500 }, { "epoch": 2.283669486011711, "grad_norm": 3.5402581691741943, "learning_rate": 2.8825190002041474e-05, "loss": 1.5053, "step": 3510 }, { "epoch": 2.290175666883539, "grad_norm": 3.7277910709381104, "learning_rate": 2.8723140052262647e-05, "loss": 1.492, "step": 3520 }, { "epoch": 2.2966818477553677, "grad_norm": 4.2001261711120605, "learning_rate": 2.8621026618014483e-05, "loss": 1.5207, "step": 3530 }, { "epoch": 2.3031880286271957, "grad_norm": 5.695222854614258, "learning_rate": 2.85188514404663e-05, "loss": 1.5078, "step": 3540 }, { "epoch": 2.3096942094990243, "grad_norm": 4.224460124969482, "learning_rate": 2.84166162618402e-05, "loss": 1.4881, "step": 3550 }, { "epoch": 2.3162003903708523, "grad_norm": 4.437689781188965, "learning_rate": 2.8314322825381394e-05, "loss": 1.5117, "step": 3560 }, { "epoch": 2.3227065712426804, "grad_norm": 4.180202484130859, "learning_rate": 2.821197287532847e-05, "loss": 1.4871, "step": 3570 }, { "epoch": 2.329212752114509, "grad_norm": 4.899669170379639, "learning_rate": 2.8109568156883633e-05, "loss": 1.4857, "step": 3580 }, { "epoch": 2.335718932986337, "grad_norm": 4.5778703689575195, "learning_rate": 2.800711041618298e-05, "loss": 1.5283, "step": 3590 }, { "epoch": 2.342225113858165, "grad_norm": 3.73079514503479, "learning_rate": 2.7904601400266707e-05, "loss": 1.4885, "step": 3600 }, { "epoch": 2.3487312947299936, "grad_norm": 4.455453395843506, "learning_rate": 2.7802042857049292e-05, "loss": 1.5039, "step": 3610 }, { "epoch": 2.3552374756018217, "grad_norm": 4.652501583099365, "learning_rate": 2.769943653528976e-05, "loss": 1.5125, "step": 3620 }, { "epoch": 2.36174365647365, "grad_norm": 3.7688090801239014, "learning_rate": 2.7596784184561787e-05, "loss": 1.51, "step": 3630 }, { "epoch": 2.3682498373454783, "grad_norm": 4.762564659118652, "learning_rate": 2.749408755522393e-05, "loss": 1.5162, "step": 3640 }, { "epoch": 2.3747560182173064, "grad_norm": 3.9891295433044434, "learning_rate": 2.7391348398389734e-05, "loss": 1.5541, "step": 3650 }, { "epoch": 2.3812621990891345, "grad_norm": 3.2875916957855225, "learning_rate": 2.7288568465897917e-05, "loss": 1.527, "step": 3660 }, { "epoch": 2.387768379960963, "grad_norm": 3.542940378189087, "learning_rate": 2.7185749510282467e-05, "loss": 1.4936, "step": 3670 }, { "epoch": 2.394274560832791, "grad_norm": 4.812227725982666, "learning_rate": 2.7082893284742748e-05, "loss": 1.4744, "step": 3680 }, { "epoch": 2.4007807417046196, "grad_norm": 4.9299397468566895, "learning_rate": 2.6980001543113652e-05, "loss": 1.4936, "step": 3690 }, { "epoch": 2.4072869225764477, "grad_norm": 4.630545139312744, "learning_rate": 2.6877076039835663e-05, "loss": 1.4936, "step": 3700 }, { "epoch": 2.413793103448276, "grad_norm": 3.2208354473114014, "learning_rate": 2.6774118529924934e-05, "loss": 1.5063, "step": 3710 }, { "epoch": 2.4202992843201043, "grad_norm": 3.9948575496673584, "learning_rate": 2.6671130768943375e-05, "loss": 1.4686, "step": 3720 }, { "epoch": 2.4268054651919324, "grad_norm": 3.723567008972168, "learning_rate": 2.6568114512968732e-05, "loss": 1.4824, "step": 3730 }, { "epoch": 2.4333116460637605, "grad_norm": 4.118974208831787, "learning_rate": 2.646507151856462e-05, "loss": 1.4996, "step": 3740 }, { "epoch": 2.439817826935589, "grad_norm": 5.077260494232178, "learning_rate": 2.6362003542750568e-05, "loss": 1.527, "step": 3750 }, { "epoch": 2.446324007807417, "grad_norm": 3.2592742443084717, "learning_rate": 2.625891234297209e-05, "loss": 1.4836, "step": 3760 }, { "epoch": 2.452830188679245, "grad_norm": 4.358582496643066, "learning_rate": 2.615579967707068e-05, "loss": 1.4787, "step": 3770 }, { "epoch": 2.4593363695510737, "grad_norm": 4.804825305938721, "learning_rate": 2.6052667303253887e-05, "loss": 1.5195, "step": 3780 }, { "epoch": 2.465842550422902, "grad_norm": 4.6675615310668945, "learning_rate": 2.594951698006526e-05, "loss": 1.5086, "step": 3790 }, { "epoch": 2.47234873129473, "grad_norm": 3.5179455280303955, "learning_rate": 2.5846350466354457e-05, "loss": 1.4803, "step": 3800 }, { "epoch": 2.4788549121665584, "grad_norm": 5.697339057922363, "learning_rate": 2.574316952124718e-05, "loss": 1.4914, "step": 3810 }, { "epoch": 2.4853610930383865, "grad_norm": 5.036843776702881, "learning_rate": 2.563997590411521e-05, "loss": 1.4898, "step": 3820 }, { "epoch": 2.4918672739102146, "grad_norm": 4.910745620727539, "learning_rate": 2.5536771374546402e-05, "loss": 1.4871, "step": 3830 }, { "epoch": 2.498373454782043, "grad_norm": 3.5659191608428955, "learning_rate": 2.5433557692314687e-05, "loss": 1.516, "step": 3840 }, { "epoch": 2.504879635653871, "grad_norm": 4.358558177947998, "learning_rate": 2.5330336617350035e-05, "loss": 1.4904, "step": 3850 }, { "epoch": 2.5113858165256993, "grad_norm": 4.307555198669434, "learning_rate": 2.5227109909708536e-05, "loss": 1.5258, "step": 3860 }, { "epoch": 2.517891997397528, "grad_norm": 4.036217212677002, "learning_rate": 2.5123879329542255e-05, "loss": 1.473, "step": 3870 }, { "epoch": 2.524398178269356, "grad_norm": 3.8947434425354004, "learning_rate": 2.5020646637069324e-05, "loss": 1.4896, "step": 3880 }, { "epoch": 2.530904359141184, "grad_norm": 3.8258981704711914, "learning_rate": 2.4917413592543872e-05, "loss": 1.5125, "step": 3890 }, { "epoch": 2.5374105400130125, "grad_norm": 3.9123308658599854, "learning_rate": 2.4814181956226067e-05, "loss": 1.5279, "step": 3900 }, { "epoch": 2.5439167208848406, "grad_norm": 3.9720828533172607, "learning_rate": 2.471095348835203e-05, "loss": 1.509, "step": 3910 }, { "epoch": 2.5504229017566686, "grad_norm": 4.662309646606445, "learning_rate": 2.460772994910387e-05, "loss": 1.5332, "step": 3920 }, { "epoch": 2.556929082628497, "grad_norm": 3.919618844985962, "learning_rate": 2.450451309857965e-05, "loss": 1.5123, "step": 3930 }, { "epoch": 2.5634352635003252, "grad_norm": 4.287420749664307, "learning_rate": 2.4401304696763397e-05, "loss": 1.5002, "step": 3940 }, { "epoch": 2.5699414443721533, "grad_norm": 4.4911651611328125, "learning_rate": 2.4298106503495046e-05, "loss": 1.5234, "step": 3950 }, { "epoch": 2.576447625243982, "grad_norm": 3.418739080429077, "learning_rate": 2.4194920278440508e-05, "loss": 1.5094, "step": 3960 }, { "epoch": 2.58295380611581, "grad_norm": 3.337728500366211, "learning_rate": 2.409174778106158e-05, "loss": 1.4844, "step": 3970 }, { "epoch": 2.589459986987638, "grad_norm": 5.230040073394775, "learning_rate": 2.3988590770585993e-05, "loss": 1.4836, "step": 3980 }, { "epoch": 2.5959661678594665, "grad_norm": 4.868875503540039, "learning_rate": 2.388545100597743e-05, "loss": 1.5365, "step": 3990 }, { "epoch": 2.6024723487312946, "grad_norm": 3.672023057937622, "learning_rate": 2.3782330245905475e-05, "loss": 1.4836, "step": 4000 }, { "epoch": 2.6089785296031227, "grad_norm": 3.8064827919006348, "learning_rate": 2.367923024871567e-05, "loss": 1.4977, "step": 4010 }, { "epoch": 2.6154847104749512, "grad_norm": 8.262436866760254, "learning_rate": 2.357615277239954e-05, "loss": 1.4871, "step": 4020 }, { "epoch": 2.6219908913467793, "grad_norm": 6.529214382171631, "learning_rate": 2.3473099574564584e-05, "loss": 1.517, "step": 4030 }, { "epoch": 2.6284970722186074, "grad_norm": 6.7583794593811035, "learning_rate": 2.3370072412404318e-05, "loss": 1.5076, "step": 4040 }, { "epoch": 2.635003253090436, "grad_norm": 3.288546562194824, "learning_rate": 2.3267073042668318e-05, "loss": 1.5016, "step": 4050 }, { "epoch": 2.641509433962264, "grad_norm": 5.317094326019287, "learning_rate": 2.316410322163227e-05, "loss": 1.4941, "step": 4060 }, { "epoch": 2.6480156148340925, "grad_norm": 4.624538898468018, "learning_rate": 2.3061164705068016e-05, "loss": 1.5166, "step": 4070 }, { "epoch": 2.6545217957059206, "grad_norm": 4.275417804718018, "learning_rate": 2.2958259248213594e-05, "loss": 1.5248, "step": 4080 }, { "epoch": 2.6610279765777487, "grad_norm": 4.061877727508545, "learning_rate": 2.2855388605743356e-05, "loss": 1.5195, "step": 4090 }, { "epoch": 2.6675341574495772, "grad_norm": 3.7903144359588623, "learning_rate": 2.2752554531738008e-05, "loss": 1.4623, "step": 4100 }, { "epoch": 2.6740403383214053, "grad_norm": 5.4264726638793945, "learning_rate": 2.264975877965473e-05, "loss": 1.5225, "step": 4110 }, { "epoch": 2.680546519193234, "grad_norm": 3.509178400039673, "learning_rate": 2.254700310229724e-05, "loss": 1.5186, "step": 4120 }, { "epoch": 2.687052700065062, "grad_norm": 4.230285167694092, "learning_rate": 2.2444289251785942e-05, "loss": 1.491, "step": 4130 }, { "epoch": 2.69355888093689, "grad_norm": 3.772911548614502, "learning_rate": 2.2341618979528042e-05, "loss": 1.4805, "step": 4140 }, { "epoch": 2.7000650618087185, "grad_norm": 4.489816188812256, "learning_rate": 2.2238994036187672e-05, "loss": 1.4754, "step": 4150 }, { "epoch": 2.7065712426805466, "grad_norm": 3.7173023223876953, "learning_rate": 2.2136416171656042e-05, "loss": 1.5074, "step": 4160 }, { "epoch": 2.7130774235523747, "grad_norm": 6.166986465454102, "learning_rate": 2.2033887135021606e-05, "loss": 1.4842, "step": 4170 }, { "epoch": 2.719583604424203, "grad_norm": 4.796930313110352, "learning_rate": 2.1931408674540254e-05, "loss": 1.5213, "step": 4180 }, { "epoch": 2.7260897852960313, "grad_norm": 4.019711017608643, "learning_rate": 2.182898253760547e-05, "loss": 1.5006, "step": 4190 }, { "epoch": 2.7325959661678594, "grad_norm": 4.862495422363281, "learning_rate": 2.1726610470718554e-05, "loss": 1.5072, "step": 4200 }, { "epoch": 2.739102147039688, "grad_norm": 4.607982158660889, "learning_rate": 2.1624294219458836e-05, "loss": 1.5172, "step": 4210 }, { "epoch": 2.745608327911516, "grad_norm": 3.7435410022735596, "learning_rate": 2.1522035528453936e-05, "loss": 1.5271, "step": 4220 }, { "epoch": 2.752114508783344, "grad_norm": 3.2756705284118652, "learning_rate": 2.141983614134996e-05, "loss": 1.5412, "step": 4230 }, { "epoch": 2.7586206896551726, "grad_norm": 4.252240180969238, "learning_rate": 2.131769780078185e-05, "loss": 1.5197, "step": 4240 }, { "epoch": 2.7651268705270007, "grad_norm": 3.387568712234497, "learning_rate": 2.1215622248343593e-05, "loss": 1.465, "step": 4250 }, { "epoch": 2.7716330513988288, "grad_norm": 5.154775142669678, "learning_rate": 2.1113611224558545e-05, "loss": 1.4838, "step": 4260 }, { "epoch": 2.7781392322706573, "grad_norm": 4.66436767578125, "learning_rate": 2.1011666468849797e-05, "loss": 1.5135, "step": 4270 }, { "epoch": 2.7846454131424854, "grad_norm": 4.574682235717773, "learning_rate": 2.0909789719510457e-05, "loss": 1.5162, "step": 4280 }, { "epoch": 2.7911515940143135, "grad_norm": 4.471498012542725, "learning_rate": 2.0807982713674036e-05, "loss": 1.5004, "step": 4290 }, { "epoch": 2.797657774886142, "grad_norm": 3.5990407466888428, "learning_rate": 2.0706247187284836e-05, "loss": 1.534, "step": 4300 }, { "epoch": 2.80416395575797, "grad_norm": 3.9830987453460693, "learning_rate": 2.0604584875068337e-05, "loss": 1.4711, "step": 4310 }, { "epoch": 2.810670136629798, "grad_norm": 3.6713786125183105, "learning_rate": 2.0502997510501616e-05, "loss": 1.5188, "step": 4320 }, { "epoch": 2.8171763175016267, "grad_norm": 4.233299732208252, "learning_rate": 2.0401486825783784e-05, "loss": 1.4926, "step": 4330 }, { "epoch": 2.8236824983734548, "grad_norm": 6.869638919830322, "learning_rate": 2.0300054551806488e-05, "loss": 1.4729, "step": 4340 }, { "epoch": 2.830188679245283, "grad_norm": 3.3226113319396973, "learning_rate": 2.0198702418124342e-05, "loss": 1.4775, "step": 4350 }, { "epoch": 2.8366948601171114, "grad_norm": 4.803750991821289, "learning_rate": 2.0097432152925462e-05, "loss": 1.442, "step": 4360 }, { "epoch": 2.8432010409889394, "grad_norm": 4.162081241607666, "learning_rate": 1.9996245483002025e-05, "loss": 1.5377, "step": 4370 }, { "epoch": 2.8497072218607675, "grad_norm": 3.2803311347961426, "learning_rate": 1.989514413372076e-05, "loss": 1.5059, "step": 4380 }, { "epoch": 2.856213402732596, "grad_norm": 4.516105651855469, "learning_rate": 1.97941298289936e-05, "loss": 1.5123, "step": 4390 }, { "epoch": 2.862719583604424, "grad_norm": 5.965749740600586, "learning_rate": 1.969320429124823e-05, "loss": 1.4957, "step": 4400 }, { "epoch": 2.869225764476252, "grad_norm": 4.8274312019348145, "learning_rate": 1.9592369241398746e-05, "loss": 1.468, "step": 4410 }, { "epoch": 2.8757319453480807, "grad_norm": 3.6499929428100586, "learning_rate": 1.9491626398816315e-05, "loss": 1.4908, "step": 4420 }, { "epoch": 2.882238126219909, "grad_norm": 3.5763795375823975, "learning_rate": 1.9390977481299836e-05, "loss": 1.499, "step": 4430 }, { "epoch": 2.888744307091737, "grad_norm": 3.3365864753723145, "learning_rate": 1.929042420504667e-05, "loss": 1.507, "step": 4440 }, { "epoch": 2.8952504879635654, "grad_norm": 3.638169288635254, "learning_rate": 1.9189968284623355e-05, "loss": 1.4926, "step": 4450 }, { "epoch": 2.9017566688353935, "grad_norm": 4.30372428894043, "learning_rate": 1.9089611432936406e-05, "loss": 1.5332, "step": 4460 }, { "epoch": 2.9082628497072216, "grad_norm": 4.822491645812988, "learning_rate": 1.8989355361203057e-05, "loss": 1.476, "step": 4470 }, { "epoch": 2.91476903057905, "grad_norm": 4.0753679275512695, "learning_rate": 1.888920177892213e-05, "loss": 1.4863, "step": 4480 }, { "epoch": 2.921275211450878, "grad_norm": 4.983773708343506, "learning_rate": 1.878915239384486e-05, "loss": 1.4715, "step": 4490 }, { "epoch": 2.9277813923227067, "grad_norm": 5.453441143035889, "learning_rate": 1.868920891194577e-05, "loss": 1.5053, "step": 4500 }, { "epoch": 2.934287573194535, "grad_norm": 4.935688495635986, "learning_rate": 1.8589373037393633e-05, "loss": 1.4625, "step": 4510 }, { "epoch": 2.940793754066363, "grad_norm": 5.212314128875732, "learning_rate": 1.848964647252233e-05, "loss": 1.5268, "step": 4520 }, { "epoch": 2.9472999349381914, "grad_norm": 4.662916660308838, "learning_rate": 1.8390030917801883e-05, "loss": 1.5225, "step": 4530 }, { "epoch": 2.9538061158100195, "grad_norm": 7.3380842208862305, "learning_rate": 1.829052807180944e-05, "loss": 1.5066, "step": 4540 }, { "epoch": 2.960312296681848, "grad_norm": 3.6364431381225586, "learning_rate": 1.8191139631200332e-05, "loss": 1.5207, "step": 4550 }, { "epoch": 2.966818477553676, "grad_norm": 5.3654069900512695, "learning_rate": 1.809186729067911e-05, "loss": 1.5121, "step": 4560 }, { "epoch": 2.973324658425504, "grad_norm": 3.771247148513794, "learning_rate": 1.7992712742970665e-05, "loss": 1.4914, "step": 4570 }, { "epoch": 2.9798308392973327, "grad_norm": 3.9818434715270996, "learning_rate": 1.7893677678791382e-05, "loss": 1.5033, "step": 4580 }, { "epoch": 2.986337020169161, "grad_norm": 4.603394031524658, "learning_rate": 1.779476378682027e-05, "loss": 1.5021, "step": 4590 }, { "epoch": 2.992843201040989, "grad_norm": 3.6947720050811768, "learning_rate": 1.7695972753670202e-05, "loss": 1.4512, "step": 4600 }, { "epoch": 2.9993493819128174, "grad_norm": 3.0982110500335693, "learning_rate": 1.759730626385915e-05, "loss": 1.4703, "step": 4610 }, { "epoch": 3.0, "eval_f1": 82.1472249933553, "eval_loss": 0.4251977801322937, "eval_precision": 82.11507813117342, "eval_recall": 82.18714654263093, "eval_runtime": 62.6137, "eval_samples_per_second": 6283.532, "eval_steps_per_second": 6.149, "step": 4611 }, { "epoch": 3.0058555627846455, "grad_norm": 3.175236225128174, "learning_rate": 1.7498765999781455e-05, "loss": 1.374, "step": 4620 }, { "epoch": 3.0123617436564736, "grad_norm": 4.318149089813232, "learning_rate": 1.7400353641679125e-05, "loss": 1.302, "step": 4630 }, { "epoch": 3.018867924528302, "grad_norm": 5.663356304168701, "learning_rate": 1.7302070867613242e-05, "loss": 1.3352, "step": 4640 }, { "epoch": 3.02537410540013, "grad_norm": 4.74486780166626, "learning_rate": 1.720391935343527e-05, "loss": 1.3344, "step": 4650 }, { "epoch": 3.0318802862719583, "grad_norm": 3.9575154781341553, "learning_rate": 1.710590077275852e-05, "loss": 1.3396, "step": 4660 }, { "epoch": 3.038386467143787, "grad_norm": 4.727320194244385, "learning_rate": 1.7008016796929645e-05, "loss": 1.3465, "step": 4670 }, { "epoch": 3.044892648015615, "grad_norm": 5.0931315422058105, "learning_rate": 1.691026909500007e-05, "loss": 1.3449, "step": 4680 }, { "epoch": 3.051398828887443, "grad_norm": 5.281672954559326, "learning_rate": 1.6812659333697578e-05, "loss": 1.3467, "step": 4690 }, { "epoch": 3.0579050097592715, "grad_norm": 4.07818603515625, "learning_rate": 1.67151891773979e-05, "loss": 1.3104, "step": 4700 }, { "epoch": 3.0644111906310996, "grad_norm": 4.469542980194092, "learning_rate": 1.6617860288096298e-05, "loss": 1.3426, "step": 4710 }, { "epoch": 3.0709173715029277, "grad_norm": 4.697359561920166, "learning_rate": 1.6520674325379255e-05, "loss": 1.334, "step": 4720 }, { "epoch": 3.077423552374756, "grad_norm": 4.5642781257629395, "learning_rate": 1.6423632946396143e-05, "loss": 1.3387, "step": 4730 }, { "epoch": 3.0839297332465843, "grad_norm": 3.9987831115722656, "learning_rate": 1.6326737805831037e-05, "loss": 1.3326, "step": 4740 }, { "epoch": 3.0904359141184123, "grad_norm": 4.102402210235596, "learning_rate": 1.622999055587441e-05, "loss": 1.332, "step": 4750 }, { "epoch": 3.096942094990241, "grad_norm": 4.410552501678467, "learning_rate": 1.613339284619502e-05, "loss": 1.3408, "step": 4760 }, { "epoch": 3.103448275862069, "grad_norm": 5.863821029663086, "learning_rate": 1.6036946323911753e-05, "loss": 1.3135, "step": 4770 }, { "epoch": 3.109954456733897, "grad_norm": 4.710544109344482, "learning_rate": 1.5940652633565578e-05, "loss": 1.3459, "step": 4780 }, { "epoch": 3.1164606376057256, "grad_norm": 4.711605072021484, "learning_rate": 1.584451341709146e-05, "loss": 1.3236, "step": 4790 }, { "epoch": 3.1229668184775536, "grad_norm": 6.081175804138184, "learning_rate": 1.5748530313790378e-05, "loss": 1.3318, "step": 4800 }, { "epoch": 3.1294729993493817, "grad_norm": 4.5566511154174805, "learning_rate": 1.5652704960301372e-05, "loss": 1.3248, "step": 4810 }, { "epoch": 3.1359791802212102, "grad_norm": 3.8354947566986084, "learning_rate": 1.5557038990573642e-05, "loss": 1.3082, "step": 4820 }, { "epoch": 3.1424853610930383, "grad_norm": 4.998276233673096, "learning_rate": 1.546153403583869e-05, "loss": 1.3119, "step": 4830 }, { "epoch": 3.1489915419648664, "grad_norm": 5.049701690673828, "learning_rate": 1.5366191724582494e-05, "loss": 1.2971, "step": 4840 }, { "epoch": 3.155497722836695, "grad_norm": 5.712673664093018, "learning_rate": 1.5271013682517738e-05, "loss": 1.3045, "step": 4850 }, { "epoch": 3.162003903708523, "grad_norm": 5.350017547607422, "learning_rate": 1.5176001532556117e-05, "loss": 1.3508, "step": 4860 }, { "epoch": 3.168510084580351, "grad_norm": 4.5841193199157715, "learning_rate": 1.508115689478063e-05, "loss": 1.3271, "step": 4870 }, { "epoch": 3.1750162654521796, "grad_norm": 4.288856506347656, "learning_rate": 1.4986481386417978e-05, "loss": 1.31, "step": 4880 }, { "epoch": 3.1815224463240077, "grad_norm": 5.793276786804199, "learning_rate": 1.4891976621810971e-05, "loss": 1.3146, "step": 4890 }, { "epoch": 3.1880286271958362, "grad_norm": 4.976964950561523, "learning_rate": 1.4797644212391037e-05, "loss": 1.3174, "step": 4900 }, { "epoch": 3.1945348080676643, "grad_norm": 4.657573223114014, "learning_rate": 1.4703485766650682e-05, "loss": 1.3182, "step": 4910 }, { "epoch": 3.2010409889394924, "grad_norm": 5.415560245513916, "learning_rate": 1.4609502890116145e-05, "loss": 1.3242, "step": 4920 }, { "epoch": 3.207547169811321, "grad_norm": 4.287566661834717, "learning_rate": 1.4515697185319946e-05, "loss": 1.3299, "step": 4930 }, { "epoch": 3.214053350683149, "grad_norm": 5.623661518096924, "learning_rate": 1.4422070251773594e-05, "loss": 1.3457, "step": 4940 }, { "epoch": 3.220559531554977, "grad_norm": 4.72862434387207, "learning_rate": 1.4328623685940335e-05, "loss": 1.3307, "step": 4950 }, { "epoch": 3.2270657124268056, "grad_norm": 6.381460666656494, "learning_rate": 1.4235359081207871e-05, "loss": 1.3285, "step": 4960 }, { "epoch": 3.2335718932986337, "grad_norm": 4.664497375488281, "learning_rate": 1.4142278027861253e-05, "loss": 1.3281, "step": 4970 }, { "epoch": 3.240078074170462, "grad_norm": 4.98822021484375, "learning_rate": 1.404938211305574e-05, "loss": 1.3006, "step": 4980 }, { "epoch": 3.2465842550422903, "grad_norm": 4.5542144775390625, "learning_rate": 1.3956672920789705e-05, "loss": 1.3504, "step": 4990 }, { "epoch": 3.2530904359141184, "grad_norm": 4.417718410491943, "learning_rate": 1.386415203187768e-05, "loss": 1.3555, "step": 5000 }, { "epoch": 3.2595966167859465, "grad_norm": 4.833828926086426, "learning_rate": 1.3771821023923383e-05, "loss": 1.368, "step": 5010 }, { "epoch": 3.266102797657775, "grad_norm": 4.329287528991699, "learning_rate": 1.3679681471292776e-05, "loss": 1.3246, "step": 5020 }, { "epoch": 3.272608978529603, "grad_norm": 4.183626651763916, "learning_rate": 1.3587734945087277e-05, "loss": 1.3373, "step": 5030 }, { "epoch": 3.279115159401431, "grad_norm": 5.649768829345703, "learning_rate": 1.3495983013116953e-05, "loss": 1.3266, "step": 5040 }, { "epoch": 3.2856213402732597, "grad_norm": 4.677701473236084, "learning_rate": 1.3404427239873763e-05, "loss": 1.3582, "step": 5050 }, { "epoch": 3.2921275211450878, "grad_norm": 4.981357097625732, "learning_rate": 1.3313069186504929e-05, "loss": 1.3174, "step": 5060 }, { "epoch": 3.2986337020169163, "grad_norm": 4.904416084289551, "learning_rate": 1.3221910410786248e-05, "loss": 1.3045, "step": 5070 }, { "epoch": 3.3051398828887444, "grad_norm": 4.641432762145996, "learning_rate": 1.3130952467095593e-05, "loss": 1.3479, "step": 5080 }, { "epoch": 3.3116460637605725, "grad_norm": 4.611999034881592, "learning_rate": 1.3040196906386392e-05, "loss": 1.3223, "step": 5090 }, { "epoch": 3.318152244632401, "grad_norm": 4.533626079559326, "learning_rate": 1.2949645276161149e-05, "loss": 1.3355, "step": 5100 }, { "epoch": 3.324658425504229, "grad_norm": 5.721342086791992, "learning_rate": 1.2859299120445107e-05, "loss": 1.3191, "step": 5110 }, { "epoch": 3.331164606376057, "grad_norm": 5.074528694152832, "learning_rate": 1.2769159979759899e-05, "loss": 1.3184, "step": 5120 }, { "epoch": 3.3376707872478857, "grad_norm": 5.300868511199951, "learning_rate": 1.2679229391097241e-05, "loss": 1.3559, "step": 5130 }, { "epoch": 3.3441769681197138, "grad_norm": 6.650349140167236, "learning_rate": 1.258950888789281e-05, "loss": 1.3004, "step": 5140 }, { "epoch": 3.350683148991542, "grad_norm": 6.611538887023926, "learning_rate": 1.2500000000000006e-05, "loss": 1.3281, "step": 5150 }, { "epoch": 3.3571893298633704, "grad_norm": 4.342381000518799, "learning_rate": 1.2410704253663932e-05, "loss": 1.3486, "step": 5160 }, { "epoch": 3.3636955107351985, "grad_norm": 4.60611629486084, "learning_rate": 1.232162317149535e-05, "loss": 1.3316, "step": 5170 }, { "epoch": 3.3702016916070265, "grad_norm": 5.181060791015625, "learning_rate": 1.2232758272444672e-05, "loss": 1.325, "step": 5180 }, { "epoch": 3.376707872478855, "grad_norm": 12.275197982788086, "learning_rate": 1.2144111071776174e-05, "loss": 1.2998, "step": 5190 }, { "epoch": 3.383214053350683, "grad_norm": 7.4621262550354, "learning_rate": 1.205568308104201e-05, "loss": 1.3068, "step": 5200 }, { "epoch": 3.3897202342225112, "grad_norm": 4.856866359710693, "learning_rate": 1.196747580805656e-05, "loss": 1.3195, "step": 5210 }, { "epoch": 3.3962264150943398, "grad_norm": 4.4897332191467285, "learning_rate": 1.1879490756870674e-05, "loss": 1.3107, "step": 5220 }, { "epoch": 3.402732595966168, "grad_norm": 5.8517537117004395, "learning_rate": 1.1791729427745992e-05, "loss": 1.3383, "step": 5230 }, { "epoch": 3.409238776837996, "grad_norm": 5.004359245300293, "learning_rate": 1.170419331712943e-05, "loss": 1.3049, "step": 5240 }, { "epoch": 3.4157449577098244, "grad_norm": 5.522491455078125, "learning_rate": 1.161688391762763e-05, "loss": 1.3072, "step": 5250 }, { "epoch": 3.4222511385816525, "grad_norm": 4.746038913726807, "learning_rate": 1.1529802717981475e-05, "loss": 1.3494, "step": 5260 }, { "epoch": 3.4287573194534806, "grad_norm": 6.5835700035095215, "learning_rate": 1.1442951203040775e-05, "loss": 1.3316, "step": 5270 }, { "epoch": 3.435263500325309, "grad_norm": 5.831517219543457, "learning_rate": 1.1356330853738906e-05, "loss": 1.2939, "step": 5280 }, { "epoch": 3.441769681197137, "grad_norm": 7.093508720397949, "learning_rate": 1.1269943147067535e-05, "loss": 1.3359, "step": 5290 }, { "epoch": 3.4482758620689653, "grad_norm": 6.217349529266357, "learning_rate": 1.1183789556051508e-05, "loss": 1.3395, "step": 5300 }, { "epoch": 3.454782042940794, "grad_norm": 7.403109550476074, "learning_rate": 1.1097871549723629e-05, "loss": 1.3365, "step": 5310 }, { "epoch": 3.461288223812622, "grad_norm": 5.734516143798828, "learning_rate": 1.1012190593099744e-05, "loss": 1.2857, "step": 5320 }, { "epoch": 3.46779440468445, "grad_norm": 4.754746913909912, "learning_rate": 1.0926748147153648e-05, "loss": 1.3678, "step": 5330 }, { "epoch": 3.4743005855562785, "grad_norm": 6.582971096038818, "learning_rate": 1.08415456687922e-05, "loss": 1.3164, "step": 5340 }, { "epoch": 3.4808067664281066, "grad_norm": 5.222586631774902, "learning_rate": 1.0756584610830523e-05, "loss": 1.318, "step": 5350 }, { "epoch": 3.487312947299935, "grad_norm": 4.8354411125183105, "learning_rate": 1.0671866421967175e-05, "loss": 1.3271, "step": 5360 }, { "epoch": 3.493819128171763, "grad_norm": 4.462943077087402, "learning_rate": 1.0587392546759498e-05, "loss": 1.308, "step": 5370 }, { "epoch": 3.5003253090435913, "grad_norm": 4.869028568267822, "learning_rate": 1.050316442559896e-05, "loss": 1.3107, "step": 5380 }, { "epoch": 3.5068314899154194, "grad_norm": 7.337380409240723, "learning_rate": 1.0419183494686574e-05, "loss": 1.34, "step": 5390 }, { "epoch": 3.513337670787248, "grad_norm": 5.1599040031433105, "learning_rate": 1.0335451186008454e-05, "loss": 1.3171, "step": 5400 }, { "epoch": 3.519843851659076, "grad_norm": 5.68267297744751, "learning_rate": 1.0251968927311384e-05, "loss": 1.3029, "step": 5410 }, { "epoch": 3.5263500325309045, "grad_norm": 5.19374942779541, "learning_rate": 1.0168738142078429e-05, "loss": 1.3027, "step": 5420 }, { "epoch": 3.5328562134027326, "grad_norm": 5.05349063873291, "learning_rate": 1.0085760249504728e-05, "loss": 1.3408, "step": 5430 }, { "epoch": 3.5393623942745607, "grad_norm": 6.524130821228027, "learning_rate": 1.0003036664473267e-05, "loss": 1.3258, "step": 5440 }, { "epoch": 3.545868575146389, "grad_norm": 4.899120807647705, "learning_rate": 9.920568797530716e-06, "loss": 1.3016, "step": 5450 }, { "epoch": 3.5523747560182173, "grad_norm": 5.650516986846924, "learning_rate": 9.83835805486347e-06, "loss": 1.3369, "step": 5460 }, { "epoch": 3.558880936890046, "grad_norm": 5.218024253845215, "learning_rate": 9.756405838273558e-06, "loss": 1.2865, "step": 5470 }, { "epoch": 3.565387117761874, "grad_norm": 5.262111186981201, "learning_rate": 9.674713545154831e-06, "loss": 1.3133, "step": 5480 }, { "epoch": 3.571893298633702, "grad_norm": 5.540369510650635, "learning_rate": 9.5932825684691e-06, "loss": 1.3381, "step": 5490 }, { "epoch": 3.5783994795055305, "grad_norm": 5.584460735321045, "learning_rate": 9.51211429672236e-06, "loss": 1.3108, "step": 5500 }, { "epoch": 3.5849056603773586, "grad_norm": 5.3172688484191895, "learning_rate": 9.431210113941169e-06, "loss": 1.2797, "step": 5510 }, { "epoch": 3.5914118412491867, "grad_norm": 4.457610130310059, "learning_rate": 9.350571399648988e-06, "loss": 1.332, "step": 5520 }, { "epoch": 3.597918022121015, "grad_norm": 4.220865249633789, "learning_rate": 9.270199528842715e-06, "loss": 1.3334, "step": 5530 }, { "epoch": 3.6044242029928433, "grad_norm": 4.474475383758545, "learning_rate": 9.19009587196921e-06, "loss": 1.2893, "step": 5540 }, { "epoch": 3.6109303838646714, "grad_norm": 5.967447757720947, "learning_rate": 9.110261794901903e-06, "loss": 1.3338, "step": 5550 }, { "epoch": 3.6174365647365, "grad_norm": 5.03196382522583, "learning_rate": 9.030698658917566e-06, "loss": 1.3391, "step": 5560 }, { "epoch": 3.623942745608328, "grad_norm": 5.669247627258301, "learning_rate": 8.951407820673058e-06, "loss": 1.3041, "step": 5570 }, { "epoch": 3.630448926480156, "grad_norm": 4.971733093261719, "learning_rate": 8.872390632182175e-06, "loss": 1.3084, "step": 5580 }, { "epoch": 3.6369551073519846, "grad_norm": 4.8311591148376465, "learning_rate": 8.793648440792654e-06, "loss": 1.3193, "step": 5590 }, { "epoch": 3.6434612882238127, "grad_norm": 4.955861568450928, "learning_rate": 8.715182589163153e-06, "loss": 1.3383, "step": 5600 }, { "epoch": 3.6499674690956407, "grad_norm": 6.267313003540039, "learning_rate": 8.636994415240376e-06, "loss": 1.3225, "step": 5610 }, { "epoch": 3.6564736499674693, "grad_norm": 5.251039505004883, "learning_rate": 8.559085252236259e-06, "loss": 1.3195, "step": 5620 }, { "epoch": 3.6629798308392973, "grad_norm": 5.265859603881836, "learning_rate": 8.481456428605205e-06, "loss": 1.3141, "step": 5630 }, { "epoch": 3.6694860117111254, "grad_norm": 5.500703811645508, "learning_rate": 8.404109268021493e-06, "loss": 1.3271, "step": 5640 }, { "epoch": 3.675992192582954, "grad_norm": 5.094318389892578, "learning_rate": 8.327045089356663e-06, "loss": 1.3316, "step": 5650 }, { "epoch": 3.682498373454782, "grad_norm": 4.512362480163574, "learning_rate": 8.250265206657025e-06, "loss": 1.3141, "step": 5660 }, { "epoch": 3.68900455432661, "grad_norm": 5.620757579803467, "learning_rate": 8.17377092912128e-06, "loss": 1.3361, "step": 5670 }, { "epoch": 3.6955107351984386, "grad_norm": 5.749104976654053, "learning_rate": 8.097563561078193e-06, "loss": 1.334, "step": 5680 }, { "epoch": 3.7020169160702667, "grad_norm": 5.941647529602051, "learning_rate": 8.021644401964305e-06, "loss": 1.309, "step": 5690 }, { "epoch": 3.708523096942095, "grad_norm": 5.45607852935791, "learning_rate": 7.946014746301858e-06, "loss": 1.3178, "step": 5700 }, { "epoch": 3.7150292778139233, "grad_norm": 6.094242572784424, "learning_rate": 7.87067588367664e-06, "loss": 1.275, "step": 5710 }, { "epoch": 3.7215354586857514, "grad_norm": 4.6306304931640625, "learning_rate": 7.795629098716045e-06, "loss": 1.3355, "step": 5720 }, { "epoch": 3.7280416395575795, "grad_norm": 5.950948715209961, "learning_rate": 7.720875671067188e-06, "loss": 1.2968, "step": 5730 }, { "epoch": 3.734547820429408, "grad_norm": 7.5676679611206055, "learning_rate": 7.646416875374992e-06, "loss": 1.3, "step": 5740 }, { "epoch": 3.741054001301236, "grad_norm": 5.311653137207031, "learning_rate": 7.572253981260571e-06, "loss": 1.3018, "step": 5750 }, { "epoch": 3.747560182173064, "grad_norm": 4.659637928009033, "learning_rate": 7.498388253299482e-06, "loss": 1.3141, "step": 5760 }, { "epoch": 3.7540663630448927, "grad_norm": 5.579241752624512, "learning_rate": 7.424820951000233e-06, "loss": 1.3363, "step": 5770 }, { "epoch": 3.760572543916721, "grad_norm": 4.9434919357299805, "learning_rate": 7.351553328782779e-06, "loss": 1.3076, "step": 5780 }, { "epoch": 3.767078724788549, "grad_norm": 4.733918190002441, "learning_rate": 7.278586635957107e-06, "loss": 1.3242, "step": 5790 }, { "epoch": 3.7735849056603774, "grad_norm": 4.652329921722412, "learning_rate": 7.205922116701985e-06, "loss": 1.3459, "step": 5800 }, { "epoch": 3.7800910865322055, "grad_norm": 5.1576337814331055, "learning_rate": 7.133561010043724e-06, "loss": 1.3189, "step": 5810 }, { "epoch": 3.7865972674040336, "grad_norm": 4.887834072113037, "learning_rate": 7.0615045498350215e-06, "loss": 1.3124, "step": 5820 }, { "epoch": 3.793103448275862, "grad_norm": 5.797942161560059, "learning_rate": 6.9897539647339725e-06, "loss": 1.3154, "step": 5830 }, { "epoch": 3.79960962914769, "grad_norm": 4.32541036605835, "learning_rate": 6.918310478183093e-06, "loss": 1.3041, "step": 5840 }, { "epoch": 3.8061158100195187, "grad_norm": 4.879608631134033, "learning_rate": 6.847175308388451e-06, "loss": 1.3216, "step": 5850 }, { "epoch": 3.812621990891347, "grad_norm": 4.810073375701904, "learning_rate": 6.776349668298912e-06, "loss": 1.3305, "step": 5860 }, { "epoch": 3.819128171763175, "grad_norm": 5.070435047149658, "learning_rate": 6.705834765585459e-06, "loss": 1.3342, "step": 5870 }, { "epoch": 3.8256343526350034, "grad_norm": 3.9700376987457275, "learning_rate": 6.635631802620576e-06, "loss": 1.332, "step": 5880 }, { "epoch": 3.8321405335068315, "grad_norm": 5.695364475250244, "learning_rate": 6.565741976457782e-06, "loss": 1.3342, "step": 5890 }, { "epoch": 3.8386467143786596, "grad_norm": 5.030066013336182, "learning_rate": 6.496166478811164e-06, "loss": 1.3246, "step": 5900 }, { "epoch": 3.845152895250488, "grad_norm": 4.794107913970947, "learning_rate": 6.426906496035129e-06, "loss": 1.317, "step": 5910 }, { "epoch": 3.851659076122316, "grad_norm": 6.574291706085205, "learning_rate": 6.357963209104106e-06, "loss": 1.2834, "step": 5920 }, { "epoch": 3.8581652569941447, "grad_norm": 6.288641929626465, "learning_rate": 6.289337793592468e-06, "loss": 1.3154, "step": 5930 }, { "epoch": 3.864671437865973, "grad_norm": 5.040285110473633, "learning_rate": 6.221031419654444e-06, "loss": 1.3141, "step": 5940 }, { "epoch": 3.871177618737801, "grad_norm": 5.231017112731934, "learning_rate": 6.153045252004177e-06, "loss": 1.3195, "step": 5950 }, { "epoch": 3.8776837996096294, "grad_norm": 4.824458599090576, "learning_rate": 6.08538044989588e-06, "loss": 1.2854, "step": 5960 }, { "epoch": 3.8841899804814575, "grad_norm": 5.37669563293457, "learning_rate": 6.0180381671040596e-06, "loss": 1.3254, "step": 5970 }, { "epoch": 3.8906961613532856, "grad_norm": 5.023409366607666, "learning_rate": 5.9510195519038245e-06, "loss": 1.3195, "step": 5980 }, { "epoch": 3.897202342225114, "grad_norm": 5.021458625793457, "learning_rate": 5.884325747051336e-06, "loss": 1.3154, "step": 5990 }, { "epoch": 3.903708523096942, "grad_norm": 5.258111000061035, "learning_rate": 5.817957889764308e-06, "loss": 1.2885, "step": 6000 }, { "epoch": 3.9102147039687702, "grad_norm": 7.723100662231445, "learning_rate": 5.751917111702612e-06, "loss": 1.317, "step": 6010 }, { "epoch": 3.9167208848405988, "grad_norm": 5.029079914093018, "learning_rate": 5.686204538948997e-06, "loss": 1.3234, "step": 6020 }, { "epoch": 3.923227065712427, "grad_norm": 4.3268938064575195, "learning_rate": 5.62082129198985e-06, "loss": 1.3219, "step": 6030 }, { "epoch": 3.929733246584255, "grad_norm": 5.595819473266602, "learning_rate": 5.555768485696144e-06, "loss": 1.3043, "step": 6040 }, { "epoch": 3.9362394274560835, "grad_norm": 4.387709140777588, "learning_rate": 5.491047229304397e-06, "loss": 1.2824, "step": 6050 }, { "epoch": 3.9427456083279115, "grad_norm": 5.619391441345215, "learning_rate": 5.42665862639774e-06, "loss": 1.3104, "step": 6060 }, { "epoch": 3.9492517891997396, "grad_norm": 5.790836334228516, "learning_rate": 5.3626037748871565e-06, "loss": 1.3139, "step": 6070 }, { "epoch": 3.955757970071568, "grad_norm": 6.1346869468688965, "learning_rate": 5.29888376699269e-06, "loss": 1.3135, "step": 6080 }, { "epoch": 3.9622641509433962, "grad_norm": 4.424135208129883, "learning_rate": 5.235499689224885e-06, "loss": 1.2979, "step": 6090 }, { "epoch": 3.9687703318152243, "grad_norm": 4.595534801483154, "learning_rate": 5.172452622366228e-06, "loss": 1.2908, "step": 6100 }, { "epoch": 3.975276512687053, "grad_norm": 5.033629417419434, "learning_rate": 5.109743641452699e-06, "loss": 1.3213, "step": 6110 }, { "epoch": 3.981782693558881, "grad_norm": 4.468408584594727, "learning_rate": 5.047373815755496e-06, "loss": 1.3041, "step": 6120 }, { "epoch": 3.988288874430709, "grad_norm": 5.089435577392578, "learning_rate": 4.985344208762757e-06, "loss": 1.2879, "step": 6130 }, { "epoch": 3.9947950553025375, "grad_norm": 5.855978965759277, "learning_rate": 4.92365587816144e-06, "loss": 1.3121, "step": 6140 }, { "epoch": 4.0, "eval_f1": 82.05317459768847, "eval_loss": 0.4393366277217865, "eval_precision": 82.08726639714278, "eval_recall": 82.02752678333142, "eval_runtime": 62.8918, "eval_samples_per_second": 6255.741, "eval_steps_per_second": 6.122, "step": 6148 }, { "epoch": 4.001301236174366, "grad_norm": 4.1740241050720215, "learning_rate": 4.862309875819299e-06, "loss": 1.2947, "step": 6150 }, { "epoch": 4.007807417046194, "grad_norm": 5.509492874145508, "learning_rate": 4.801307247766912e-06, "loss": 1.2148, "step": 6160 }, { "epoch": 4.014313597918022, "grad_norm": 4.662569999694824, "learning_rate": 4.740649034179898e-06, "loss": 1.2176, "step": 6170 }, { "epoch": 4.020819778789851, "grad_norm": 5.486327171325684, "learning_rate": 4.680336269361146e-06, "loss": 1.1814, "step": 6180 }, { "epoch": 4.027325959661678, "grad_norm": 5.3234357833862305, "learning_rate": 4.620369981723174e-06, "loss": 1.2045, "step": 6190 }, { "epoch": 4.033832140533507, "grad_norm": 5.096307754516602, "learning_rate": 4.560751193770619e-06, "loss": 1.1968, "step": 6200 }, { "epoch": 4.040338321405335, "grad_norm": 6.199032783508301, "learning_rate": 4.501480922082787e-06, "loss": 1.1888, "step": 6210 }, { "epoch": 4.046844502277163, "grad_norm": 5.72098445892334, "learning_rate": 4.442560177296307e-06, "loss": 1.2028, "step": 6220 }, { "epoch": 4.053350683148992, "grad_norm": 5.6385416984558105, "learning_rate": 4.383989964087923e-06, "loss": 1.1975, "step": 6230 }, { "epoch": 4.05985686402082, "grad_norm": 4.712112903594971, "learning_rate": 4.325771281157356e-06, "loss": 1.1878, "step": 6240 }, { "epoch": 4.066363044892648, "grad_norm": 5.062438488006592, "learning_rate": 4.267905121210253e-06, "loss": 1.202, "step": 6250 }, { "epoch": 4.072869225764476, "grad_norm": 5.3925557136535645, "learning_rate": 4.210392470941288e-06, "loss": 1.1696, "step": 6260 }, { "epoch": 4.079375406636305, "grad_norm": 5.509522914886475, "learning_rate": 4.153234311017332e-06, "loss": 1.1744, "step": 6270 }, { "epoch": 4.0858815875081325, "grad_norm": 4.856545448303223, "learning_rate": 4.096431616060717e-06, "loss": 1.2011, "step": 6280 }, { "epoch": 4.092387768379961, "grad_norm": 6.018407344818115, "learning_rate": 4.039985354632633e-06, "loss": 1.2034, "step": 6290 }, { "epoch": 4.0988939492517895, "grad_norm": 5.942923069000244, "learning_rate": 3.983896489216596e-06, "loss": 1.1546, "step": 6300 }, { "epoch": 4.105400130123617, "grad_norm": 5.557123184204102, "learning_rate": 3.928165976202058e-06, "loss": 1.2093, "step": 6310 }, { "epoch": 4.111906310995446, "grad_norm": 5.105636119842529, "learning_rate": 3.872794765868079e-06, "loss": 1.1903, "step": 6320 }, { "epoch": 4.118412491867274, "grad_norm": 5.262782573699951, "learning_rate": 3.817783802367137e-06, "loss": 1.1986, "step": 6330 }, { "epoch": 4.124918672739102, "grad_norm": 5.6022186279296875, "learning_rate": 3.763134023709031e-06, "loss": 1.2109, "step": 6340 }, { "epoch": 4.13142485361093, "grad_norm": 6.153190612792969, "learning_rate": 3.7088463617448637e-06, "loss": 1.1838, "step": 6350 }, { "epoch": 4.137931034482759, "grad_norm": 6.382300853729248, "learning_rate": 3.6549217421511795e-06, "loss": 1.1712, "step": 6360 }, { "epoch": 4.1444372153545865, "grad_norm": 7.045514106750488, "learning_rate": 3.601361084414176e-06, "loss": 1.1548, "step": 6370 }, { "epoch": 4.150943396226415, "grad_norm": 5.8788862228393555, "learning_rate": 3.5481653018139995e-06, "loss": 1.2092, "step": 6380 }, { "epoch": 4.157449577098244, "grad_norm": 4.951000213623047, "learning_rate": 3.4953353014092057e-06, "loss": 1.1869, "step": 6390 }, { "epoch": 4.163955757970071, "grad_norm": 8.229723930358887, "learning_rate": 3.4428719840212814e-06, "loss": 1.1801, "step": 6400 }, { "epoch": 4.1704619388419, "grad_norm": 5.354263782501221, "learning_rate": 3.3907762442192735e-06, "loss": 1.2002, "step": 6410 }, { "epoch": 4.176968119713728, "grad_norm": 5.551435470581055, "learning_rate": 3.3390489703045593e-06, "loss": 1.1893, "step": 6420 }, { "epoch": 4.183474300585556, "grad_norm": 6.283133506774902, "learning_rate": 3.2876910442956573e-06, "loss": 1.2097, "step": 6430 }, { "epoch": 4.189980481457384, "grad_norm": 5.303994178771973, "learning_rate": 3.2367033419132388e-06, "loss": 1.1891, "step": 6440 }, { "epoch": 4.196486662329213, "grad_norm": 5.536159992218018, "learning_rate": 3.1860867325651717e-06, "loss": 1.1842, "step": 6450 }, { "epoch": 4.202992843201041, "grad_norm": 5.999258041381836, "learning_rate": 3.1358420793316744e-06, "loss": 1.2111, "step": 6460 }, { "epoch": 4.209499024072869, "grad_norm": 6.4903388023376465, "learning_rate": 3.085970238950653e-06, "loss": 1.2196, "step": 6470 }, { "epoch": 4.216005204944698, "grad_norm": 6.352773189544678, "learning_rate": 3.036472061803025e-06, "loss": 1.1658, "step": 6480 }, { "epoch": 4.222511385816525, "grad_norm": 7.125950336456299, "learning_rate": 2.987348391898284e-06, "loss": 1.1744, "step": 6490 }, { "epoch": 4.229017566688354, "grad_norm": 6.506474494934082, "learning_rate": 2.9386000668600698e-06, "loss": 1.1476, "step": 6500 }, { "epoch": 4.235523747560182, "grad_norm": 5.437373161315918, "learning_rate": 2.8902279179118837e-06, "loss": 1.1911, "step": 6510 }, { "epoch": 4.24202992843201, "grad_norm": 5.792574405670166, "learning_rate": 2.8422327698629405e-06, "loss": 1.1865, "step": 6520 }, { "epoch": 4.2485361093038385, "grad_norm": 5.307682514190674, "learning_rate": 2.794615441094095e-06, "loss": 1.1951, "step": 6530 }, { "epoch": 4.255042290175667, "grad_norm": 5.282588958740234, "learning_rate": 2.747376743543853e-06, "loss": 1.1709, "step": 6540 }, { "epoch": 4.261548471047496, "grad_norm": 5.824193000793457, "learning_rate": 2.7005174826946004e-06, "loss": 1.2019, "step": 6550 }, { "epoch": 4.268054651919323, "grad_norm": 5.809417247772217, "learning_rate": 2.6540384575587885e-06, "loss": 1.1595, "step": 6560 }, { "epoch": 4.274560832791152, "grad_norm": 5.6779913902282715, "learning_rate": 2.607940460665359e-06, "loss": 1.1611, "step": 6570 }, { "epoch": 4.28106701366298, "grad_norm": 5.9347686767578125, "learning_rate": 2.5622242780462243e-06, "loss": 1.1898, "step": 6580 }, { "epoch": 4.287573194534808, "grad_norm": 7.421865463256836, "learning_rate": 2.516890689222845e-06, "loss": 1.1953, "step": 6590 }, { "epoch": 4.294079375406636, "grad_norm": 5.2877702713012695, "learning_rate": 2.471940467192957e-06, "loss": 1.1642, "step": 6600 }, { "epoch": 4.300585556278465, "grad_norm": 5.69005012512207, "learning_rate": 2.427374378417388e-06, "loss": 1.1986, "step": 6610 }, { "epoch": 4.307091737150293, "grad_norm": 5.4011549949646, "learning_rate": 2.383193182806978e-06, "loss": 1.1752, "step": 6620 }, { "epoch": 4.313597918022121, "grad_norm": 5.179370403289795, "learning_rate": 2.3393976337096334e-06, "loss": 1.1898, "step": 6630 }, { "epoch": 4.32010409889395, "grad_norm": 5.5664544105529785, "learning_rate": 2.2959884778974735e-06, "loss": 1.1629, "step": 6640 }, { "epoch": 4.326610279765777, "grad_norm": 6.538241386413574, "learning_rate": 2.252966455554101e-06, "loss": 1.1515, "step": 6650 }, { "epoch": 4.333116460637606, "grad_norm": 6.5795979499816895, "learning_rate": 2.2103323002619857e-06, "loss": 1.1835, "step": 6660 }, { "epoch": 4.339622641509434, "grad_norm": 5.8081889152526855, "learning_rate": 2.1680867389899355e-06, "loss": 1.1843, "step": 6670 }, { "epoch": 4.346128822381262, "grad_norm": 5.710062503814697, "learning_rate": 2.126230492080744e-06, "loss": 1.2036, "step": 6680 }, { "epoch": 4.3526350032530905, "grad_norm": 5.167084217071533, "learning_rate": 2.0847642732388457e-06, "loss": 1.1424, "step": 6690 }, { "epoch": 4.359141184124919, "grad_norm": 5.835074424743652, "learning_rate": 2.0436887895182e-06, "loss": 1.1528, "step": 6700 }, { "epoch": 4.365647364996747, "grad_norm": 6.528144836425781, "learning_rate": 2.003004741310216e-06, "loss": 1.1679, "step": 6710 }, { "epoch": 4.372153545868575, "grad_norm": 5.71160888671875, "learning_rate": 1.9627128223317942e-06, "loss": 1.17, "step": 6720 }, { "epoch": 4.378659726740404, "grad_norm": 5.882184982299805, "learning_rate": 1.9228137196135254e-06, "loss": 1.1768, "step": 6730 }, { "epoch": 4.385165907612231, "grad_norm": 5.519601345062256, "learning_rate": 1.8833081134879637e-06, "loss": 1.1655, "step": 6740 }, { "epoch": 4.39167208848406, "grad_norm": 5.781759262084961, "learning_rate": 1.8441966775780112e-06, "loss": 1.1664, "step": 6750 }, { "epoch": 4.398178269355888, "grad_norm": 5.19895076751709, "learning_rate": 1.8054800787854569e-06, "loss": 1.172, "step": 6760 }, { "epoch": 4.404684450227716, "grad_norm": 5.598287582397461, "learning_rate": 1.767158977279601e-06, "loss": 1.1639, "step": 6770 }, { "epoch": 4.411190631099545, "grad_norm": 5.8030171394348145, "learning_rate": 1.729234026485968e-06, "loss": 1.1988, "step": 6780 }, { "epoch": 4.417696811971373, "grad_norm": 6.493566513061523, "learning_rate": 1.691705873075211e-06, "loss": 1.1965, "step": 6790 }, { "epoch": 4.424202992843201, "grad_norm": 7.035156726837158, "learning_rate": 1.654575156952054e-06, "loss": 1.1637, "step": 6800 }, { "epoch": 4.430709173715029, "grad_norm": 7.03166389465332, "learning_rate": 1.6178425112443774e-06, "loss": 1.1727, "step": 6810 }, { "epoch": 4.437215354586858, "grad_norm": 6.267909526824951, "learning_rate": 1.5815085622924607e-06, "loss": 1.1748, "step": 6820 }, { "epoch": 4.443721535458685, "grad_norm": 6.7049946784973145, "learning_rate": 1.5455739296382442e-06, "loss": 1.1641, "step": 6830 }, { "epoch": 4.450227716330514, "grad_norm": 7.1322431564331055, "learning_rate": 1.51003922601482e-06, "loss": 1.1724, "step": 6840 }, { "epoch": 4.4567338972023425, "grad_norm": 6.178156852722168, "learning_rate": 1.474905057335954e-06, "loss": 1.2027, "step": 6850 }, { "epoch": 4.46324007807417, "grad_norm": 5.851823806762695, "learning_rate": 1.4401720226857485e-06, "loss": 1.1806, "step": 6860 }, { "epoch": 4.469746258945999, "grad_norm": 6.205535888671875, "learning_rate": 1.4058407143084596e-06, "loss": 1.1637, "step": 6870 }, { "epoch": 4.476252439817827, "grad_norm": 6.5300517082214355, "learning_rate": 1.371911717598362e-06, "loss": 1.1857, "step": 6880 }, { "epoch": 4.482758620689655, "grad_norm": 5.84266471862793, "learning_rate": 1.3383856110897901e-06, "loss": 1.1683, "step": 6890 }, { "epoch": 4.489264801561483, "grad_norm": 6.5396728515625, "learning_rate": 1.3052629664472738e-06, "loss": 1.1915, "step": 6900 }, { "epoch": 4.495770982433312, "grad_norm": 7.348312854766846, "learning_rate": 1.2725443484557675e-06, "loss": 1.1924, "step": 6910 }, { "epoch": 4.5022771633051395, "grad_norm": 5.832183837890625, "learning_rate": 1.2402303150110455e-06, "loss": 1.1522, "step": 6920 }, { "epoch": 4.508783344176968, "grad_norm": 5.630441188812256, "learning_rate": 1.2083214171101893e-06, "loss": 1.1698, "step": 6930 }, { "epoch": 4.5152895250487965, "grad_norm": 6.512978553771973, "learning_rate": 1.1768181988421583e-06, "loss": 1.1803, "step": 6940 }, { "epoch": 4.521795705920624, "grad_norm": 5.784249305725098, "learning_rate": 1.1457211973785632e-06, "loss": 1.2045, "step": 6950 }, { "epoch": 4.528301886792453, "grad_norm": 5.739207744598389, "learning_rate": 1.1150309429644623e-06, "loss": 1.1972, "step": 6960 }, { "epoch": 4.534808067664281, "grad_norm": 6.033750534057617, "learning_rate": 1.0847479589093435e-06, "loss": 1.1856, "step": 6970 }, { "epoch": 4.541314248536109, "grad_norm": 6.38392448425293, "learning_rate": 1.0548727615782017e-06, "loss": 1.1763, "step": 6980 }, { "epoch": 4.547820429407937, "grad_norm": 5.401200771331787, "learning_rate": 1.0254058603827139e-06, "loss": 1.1637, "step": 6990 }, { "epoch": 4.554326610279766, "grad_norm": 5.801834583282471, "learning_rate": 9.96347757772581e-07, "loss": 1.1795, "step": 7000 }, { "epoch": 4.560832791151594, "grad_norm": 6.14340353012085, "learning_rate": 9.676989492269417e-07, "loss": 1.1776, "step": 7010 }, { "epoch": 4.567338972023422, "grad_norm": 5.303777694702148, "learning_rate": 9.394599232459222e-07, "loss": 1.1889, "step": 7020 }, { "epoch": 4.573845152895251, "grad_norm": 5.7834038734436035, "learning_rate": 9.116311613423272e-07, "loss": 1.1778, "step": 7030 }, { "epoch": 4.580351333767078, "grad_norm": 6.606414318084717, "learning_rate": 8.842131380334017e-07, "loss": 1.1692, "step": 7040 }, { "epoch": 4.586857514638907, "grad_norm": 5.640596389770508, "learning_rate": 8.572063208327568e-07, "loss": 1.1918, "step": 7050 }, { "epoch": 4.593363695510735, "grad_norm": 6.887298583984375, "learning_rate": 8.306111702424069e-07, "loss": 1.2011, "step": 7060 }, { "epoch": 4.599869876382563, "grad_norm": 6.672758102416992, "learning_rate": 8.044281397448894e-07, "loss": 1.189, "step": 7070 }, { "epoch": 4.6063760572543915, "grad_norm": 5.722778797149658, "learning_rate": 7.786576757955521e-07, "loss": 1.154, "step": 7080 }, { "epoch": 4.61288223812622, "grad_norm": 7.151712894439697, "learning_rate": 7.533002178149451e-07, "loss": 1.1897, "step": 7090 }, { "epoch": 4.6193884189980485, "grad_norm": 6.262500762939453, "learning_rate": 7.283561981813125e-07, "loss": 1.2021, "step": 7100 }, { "epoch": 4.625894599869876, "grad_norm": 5.623366355895996, "learning_rate": 7.038260422232296e-07, "loss": 1.1975, "step": 7110 }, { "epoch": 4.632400780741705, "grad_norm": 5.359673023223877, "learning_rate": 6.797101682123414e-07, "loss": 1.2075, "step": 7120 }, { "epoch": 4.638906961613533, "grad_norm": 5.430258274078369, "learning_rate": 6.560089873562464e-07, "loss": 1.1648, "step": 7130 }, { "epoch": 4.645413142485361, "grad_norm": 5.5312275886535645, "learning_rate": 6.327229037914717e-07, "loss": 1.1593, "step": 7140 }, { "epoch": 4.651919323357189, "grad_norm": 6.830257892608643, "learning_rate": 6.09852314576581e-07, "loss": 1.1959, "step": 7150 }, { "epoch": 4.658425504229018, "grad_norm": 5.554884433746338, "learning_rate": 5.873976096854195e-07, "loss": 1.1661, "step": 7160 }, { "epoch": 4.6649316851008455, "grad_norm": 6.155848979949951, "learning_rate": 5.65359172000443e-07, "loss": 1.1686, "step": 7170 }, { "epoch": 4.671437865972674, "grad_norm": 5.273393154144287, "learning_rate": 5.437373773062082e-07, "loss": 1.1631, "step": 7180 }, { "epoch": 4.677944046844503, "grad_norm": 6.339115142822266, "learning_rate": 5.225325942829512e-07, "loss": 1.1623, "step": 7190 }, { "epoch": 4.68445022771633, "grad_norm": 6.310550212860107, "learning_rate": 5.017451845003074e-07, "loss": 1.1534, "step": 7200 }, { "epoch": 4.690956408588159, "grad_norm": 6.055802345275879, "learning_rate": 4.813755024111522e-07, "loss": 1.1544, "step": 7210 }, { "epoch": 4.697462589459987, "grad_norm": 6.022665977478027, "learning_rate": 4.614238953455391e-07, "loss": 1.1804, "step": 7220 }, { "epoch": 4.703968770331815, "grad_norm": 8.092384338378906, "learning_rate": 4.41890703504802e-07, "loss": 1.1734, "step": 7230 }, { "epoch": 4.7104749512036435, "grad_norm": 6.269159317016602, "learning_rate": 4.2277625995573445e-07, "loss": 1.182, "step": 7240 }, { "epoch": 4.716981132075472, "grad_norm": 5.261497974395752, "learning_rate": 4.040808906249083e-07, "loss": 1.1509, "step": 7250 }, { "epoch": 4.7234873129473, "grad_norm": 6.042685508728027, "learning_rate": 3.858049142931419e-07, "loss": 1.1609, "step": 7260 }, { "epoch": 4.729993493819128, "grad_norm": 6.550515174865723, "learning_rate": 3.6794864259003527e-07, "loss": 1.1892, "step": 7270 }, { "epoch": 4.736499674690957, "grad_norm": 5.826042652130127, "learning_rate": 3.5051237998867394e-07, "loss": 1.1344, "step": 7280 }, { "epoch": 4.743005855562784, "grad_norm": 5.940308094024658, "learning_rate": 3.334964238004279e-07, "loss": 1.1803, "step": 7290 }, { "epoch": 4.749512036434613, "grad_norm": 6.260829925537109, "learning_rate": 3.1690106416988885e-07, "loss": 1.1756, "step": 7300 }, { "epoch": 4.756018217306441, "grad_norm": 6.480372428894043, "learning_rate": 3.0072658406992137e-07, "loss": 1.1969, "step": 7310 }, { "epoch": 4.762524398178269, "grad_norm": 7.373197555541992, "learning_rate": 2.8497325929683624e-07, "loss": 1.2117, "step": 7320 }, { "epoch": 4.7690305790500975, "grad_norm": 5.685016632080078, "learning_rate": 2.696413584656832e-07, "loss": 1.1583, "step": 7330 }, { "epoch": 4.775536759921926, "grad_norm": 6.192778587341309, "learning_rate": 2.547311430056792e-07, "loss": 1.1674, "step": 7340 }, { "epoch": 4.782042940793754, "grad_norm": 5.0014824867248535, "learning_rate": 2.4024286715574883e-07, "loss": 1.1625, "step": 7350 }, { "epoch": 4.788549121665582, "grad_norm": 5.7597761154174805, "learning_rate": 2.2617677796018534e-07, "loss": 1.1614, "step": 7360 }, { "epoch": 4.795055302537411, "grad_norm": 5.913872718811035, "learning_rate": 2.1253311526444052e-07, "loss": 1.1797, "step": 7370 }, { "epoch": 4.801561483409239, "grad_norm": 6.681622505187988, "learning_rate": 1.9931211171103359e-07, "loss": 1.1852, "step": 7380 }, { "epoch": 4.808067664281067, "grad_norm": 5.706891059875488, "learning_rate": 1.8651399273559022e-07, "loss": 1.2021, "step": 7390 }, { "epoch": 4.814573845152895, "grad_norm": 5.723352432250977, "learning_rate": 1.7413897656298472e-07, "loss": 1.1699, "step": 7400 }, { "epoch": 4.821080026024724, "grad_norm": 5.888330936431885, "learning_rate": 1.6218727420364288e-07, "loss": 1.1658, "step": 7410 }, { "epoch": 4.827586206896552, "grad_norm": 5.5335798263549805, "learning_rate": 1.5065908944991159e-07, "loss": 1.1735, "step": 7420 }, { "epoch": 4.83409238776838, "grad_norm": 6.942819118499756, "learning_rate": 1.3955461887261435e-07, "loss": 1.1908, "step": 7430 }, { "epoch": 4.840598568640209, "grad_norm": 5.177826881408691, "learning_rate": 1.2887405181768174e-07, "loss": 1.1845, "step": 7440 }, { "epoch": 4.847104749512036, "grad_norm": 5.800699710845947, "learning_rate": 1.1861757040293186e-07, "loss": 1.1645, "step": 7450 }, { "epoch": 4.853610930383865, "grad_norm": 5.468136310577393, "learning_rate": 1.0878534951495323e-07, "loss": 1.1849, "step": 7460 }, { "epoch": 4.860117111255693, "grad_norm": 6.139671325683594, "learning_rate": 9.937755680613781e-08, "loss": 1.1844, "step": 7470 }, { "epoch": 4.866623292127521, "grad_norm": 5.556206703186035, "learning_rate": 9.039435269181384e-08, "loss": 1.1822, "step": 7480 }, { "epoch": 4.8731294729993495, "grad_norm": 5.747623920440674, "learning_rate": 8.183589034750639e-08, "loss": 1.1878, "step": 7490 }, { "epoch": 4.879635653871178, "grad_norm": 6.927537441253662, "learning_rate": 7.370231570633656e-08, "loss": 1.1935, "step": 7500 }, { "epoch": 4.886141834743006, "grad_norm": 5.887487888336182, "learning_rate": 6.599376745652641e-08, "loss": 1.1562, "step": 7510 }, { "epoch": 4.892648015614834, "grad_norm": 5.907958984375, "learning_rate": 5.8710377039031264e-08, "loss": 1.169, "step": 7520 }, { "epoch": 4.899154196486663, "grad_norm": 5.584612846374512, "learning_rate": 5.185226864530546e-08, "loss": 1.2084, "step": 7530 }, { "epoch": 4.90566037735849, "grad_norm": 7.526587963104248, "learning_rate": 4.541955921518182e-08, "loss": 1.17, "step": 7540 }, { "epoch": 4.912166558230319, "grad_norm": 6.845060348510742, "learning_rate": 3.9412358434876003e-08, "loss": 1.1763, "step": 7550 }, { "epoch": 4.918672739102147, "grad_norm": 5.705447673797607, "learning_rate": 3.383076873511859e-08, "loss": 1.1433, "step": 7560 }, { "epoch": 4.925178919973975, "grad_norm": 6.1236348152160645, "learning_rate": 2.867488528940643e-08, "loss": 1.1628, "step": 7570 }, { "epoch": 4.931685100845804, "grad_norm": 5.873434543609619, "learning_rate": 2.3944796012381754e-08, "loss": 1.16, "step": 7580 }, { "epoch": 4.938191281717632, "grad_norm": 5.979989528656006, "learning_rate": 1.9640581558330594e-08, "loss": 1.166, "step": 7590 }, { "epoch": 4.94469746258946, "grad_norm": 5.885493755340576, "learning_rate": 1.5762315319814425e-08, "loss": 1.1647, "step": 7600 }, { "epoch": 4.951203643461288, "grad_norm": 6.1328535079956055, "learning_rate": 1.2310063426404506e-08, "loss": 1.1896, "step": 7610 }, { "epoch": 4.957709824333117, "grad_norm": 5.883079528808594, "learning_rate": 9.28388474357167e-09, "loss": 1.1923, "step": 7620 }, { "epoch": 4.964216005204944, "grad_norm": 6.423701763153076, "learning_rate": 6.683830871667685e-09, "loss": 1.1847, "step": 7630 }, { "epoch": 4.970722186076773, "grad_norm": 6.050955295562744, "learning_rate": 4.509946145059285e-09, "loss": 1.1996, "step": 7640 }, { "epoch": 4.9772283669486015, "grad_norm": 6.264808177947998, "learning_rate": 2.762267631356563e-09, "loss": 1.1441, "step": 7650 }, { "epoch": 4.983734547820429, "grad_norm": 6.791917324066162, "learning_rate": 1.440825130796797e-09, "loss": 1.1742, "step": 7660 }, { "epoch": 4.990240728692258, "grad_norm": 6.346398830413818, "learning_rate": 5.456411757198688e-10, "loss": 1.1946, "step": 7670 }, { "epoch": 4.996746909564086, "grad_norm": 5.792771339416504, "learning_rate": 7.673103020189309e-11, "loss": 1.1733, "step": 7680 }, { "epoch": 5.0, "eval_f1": 81.83177398493193, "eval_loss": 0.4605475664138794, "eval_precision": 81.83610539261306, "eval_recall": 81.83206882966691, "eval_runtime": 62.5471, "eval_samples_per_second": 6290.224, "eval_steps_per_second": 6.155, "step": 7685 }, { "epoch": 5.0, "step": 7685, "total_flos": 4.0941379391754076e+18, "train_loss": 1.548930292371503, "train_runtime": 4577.3403, "train_samples_per_second": 3438.108, "train_steps_per_second": 1.679 } ], "logging_steps": 10, "max_steps": 7685, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.0941379391754076e+18, "train_batch_size": 512, "trial_name": null, "trial_params": null }