{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9977695167286247, "eval_steps": 500, "global_step": 504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005947955390334572, "grad_norm": 1.824343204498291, "learning_rate": 0.0002, "loss": 3.1087, "step": 1 }, { "epoch": 0.011895910780669145, "grad_norm": 1.374577283859253, "learning_rate": 0.0002, "loss": 2.9898, "step": 2 }, { "epoch": 0.017843866171003718, "grad_norm": 1.5655758380889893, "learning_rate": 0.0002, "loss": 3.0999, "step": 3 }, { "epoch": 0.02379182156133829, "grad_norm": 5.276195049285889, "learning_rate": 0.0002, "loss": 2.7586, "step": 4 }, { "epoch": 0.02973977695167286, "grad_norm": 1.3304948806762695, "learning_rate": 0.0002, "loss": 2.7295, "step": 5 }, { "epoch": 0.035687732342007436, "grad_norm": 1.5444062948226929, "learning_rate": 0.0002, "loss": 2.5949, "step": 6 }, { "epoch": 0.041635687732342004, "grad_norm": 1.2866592407226562, "learning_rate": 0.0002, "loss": 2.4865, "step": 7 }, { "epoch": 0.04758364312267658, "grad_norm": 1.6175459623336792, "learning_rate": 0.0002, "loss": 2.3327, "step": 8 }, { "epoch": 0.053531598513011154, "grad_norm": 1.503796100616455, "learning_rate": 0.0002, "loss": 2.274, "step": 9 }, { "epoch": 0.05947955390334572, "grad_norm": 1.5973471403121948, "learning_rate": 0.0002, "loss": 2.3809, "step": 10 }, { "epoch": 0.0654275092936803, "grad_norm": 1.4748364686965942, "learning_rate": 0.0002, "loss": 2.1347, "step": 11 }, { "epoch": 0.07137546468401487, "grad_norm": 1.7337145805358887, "learning_rate": 0.0002, "loss": 1.9428, "step": 12 }, { "epoch": 0.07732342007434945, "grad_norm": 2.0839593410491943, "learning_rate": 0.0002, "loss": 2.0188, "step": 13 }, { "epoch": 0.08327137546468401, "grad_norm": 2.5018622875213623, "learning_rate": 0.0002, "loss": 1.7943, "step": 14 }, { "epoch": 0.08921933085501858, "grad_norm": 3.416013479232788, "learning_rate": 0.0002, "loss": 1.6812, "step": 15 }, { "epoch": 0.09516728624535316, "grad_norm": 4.241348743438721, "learning_rate": 0.0002, "loss": 1.6478, "step": 16 }, { "epoch": 0.10111524163568773, "grad_norm": 2.6027214527130127, "learning_rate": 0.0002, "loss": 1.5114, "step": 17 }, { "epoch": 0.10706319702602231, "grad_norm": 2.1649773120880127, "learning_rate": 0.0002, "loss": 1.5063, "step": 18 }, { "epoch": 0.11301115241635688, "grad_norm": 1.9704638719558716, "learning_rate": 0.0002, "loss": 1.3781, "step": 19 }, { "epoch": 0.11895910780669144, "grad_norm": 1.7165110111236572, "learning_rate": 0.0002, "loss": 1.3058, "step": 20 }, { "epoch": 0.12490706319702602, "grad_norm": 1.3949488401412964, "learning_rate": 0.0002, "loss": 1.3533, "step": 21 }, { "epoch": 0.1308550185873606, "grad_norm": 1.5300015211105347, "learning_rate": 0.0002, "loss": 1.2578, "step": 22 }, { "epoch": 0.13680297397769517, "grad_norm": 1.9964842796325684, "learning_rate": 0.0002, "loss": 1.2485, "step": 23 }, { "epoch": 0.14275092936802974, "grad_norm": 1.322247862815857, "learning_rate": 0.0002, "loss": 1.1887, "step": 24 }, { "epoch": 0.14869888475836432, "grad_norm": 1.2447245121002197, "learning_rate": 0.0002, "loss": 1.0807, "step": 25 }, { "epoch": 0.1546468401486989, "grad_norm": 1.2943564653396606, "learning_rate": 0.0002, "loss": 1.1295, "step": 26 }, { "epoch": 0.16059479553903347, "grad_norm": 1.2561174631118774, "learning_rate": 0.0002, "loss": 1.1011, "step": 27 }, { "epoch": 0.16654275092936802, "grad_norm": 1.305808663368225, "learning_rate": 0.0002, "loss": 1.1217, "step": 28 }, { "epoch": 0.1724907063197026, "grad_norm": 1.1935081481933594, "learning_rate": 0.0002, "loss": 1.0971, "step": 29 }, { "epoch": 0.17843866171003717, "grad_norm": 0.9849146008491516, "learning_rate": 0.0002, "loss": 0.9949, "step": 30 }, { "epoch": 0.18438661710037174, "grad_norm": 1.236385703086853, "learning_rate": 0.0002, "loss": 1.0533, "step": 31 }, { "epoch": 0.19033457249070632, "grad_norm": 1.091674566268921, "learning_rate": 0.0002, "loss": 0.9963, "step": 32 }, { "epoch": 0.1962825278810409, "grad_norm": 1.4563655853271484, "learning_rate": 0.0002, "loss": 1.0137, "step": 33 }, { "epoch": 0.20223048327137547, "grad_norm": 1.1690599918365479, "learning_rate": 0.0002, "loss": 0.9163, "step": 34 }, { "epoch": 0.20817843866171004, "grad_norm": 1.2094273567199707, "learning_rate": 0.0002, "loss": 1.0236, "step": 35 }, { "epoch": 0.21412639405204462, "grad_norm": 1.388743281364441, "learning_rate": 0.0002, "loss": 0.9591, "step": 36 }, { "epoch": 0.2200743494423792, "grad_norm": 1.2390081882476807, "learning_rate": 0.0002, "loss": 0.8751, "step": 37 }, { "epoch": 0.22602230483271377, "grad_norm": 2.5453145503997803, "learning_rate": 0.0002, "loss": 0.8459, "step": 38 }, { "epoch": 0.2319702602230483, "grad_norm": 1.3461544513702393, "learning_rate": 0.0002, "loss": 0.9373, "step": 39 }, { "epoch": 0.2379182156133829, "grad_norm": 1.2979869842529297, "learning_rate": 0.0002, "loss": 0.9373, "step": 40 }, { "epoch": 0.24386617100371746, "grad_norm": 53.14031219482422, "learning_rate": 0.0002, "loss": 0.7923, "step": 41 }, { "epoch": 0.24981412639405204, "grad_norm": 3.772839307785034, "learning_rate": 0.0002, "loss": 1.0239, "step": 42 }, { "epoch": 0.25576208178438664, "grad_norm": 1.684868574142456, "learning_rate": 0.0002, "loss": 0.8878, "step": 43 }, { "epoch": 0.2617100371747212, "grad_norm": 1.1423863172531128, "learning_rate": 0.0002, "loss": 0.896, "step": 44 }, { "epoch": 0.26765799256505574, "grad_norm": 1.3496270179748535, "learning_rate": 0.0002, "loss": 1.0198, "step": 45 }, { "epoch": 0.27360594795539034, "grad_norm": 1.2799283266067505, "learning_rate": 0.0002, "loss": 0.88, "step": 46 }, { "epoch": 0.2795539033457249, "grad_norm": 1.1613731384277344, "learning_rate": 0.0002, "loss": 0.9708, "step": 47 }, { "epoch": 0.2855018587360595, "grad_norm": 1.1430435180664062, "learning_rate": 0.0002, "loss": 0.8913, "step": 48 }, { "epoch": 0.29144981412639404, "grad_norm": 0.9552589654922485, "learning_rate": 0.0002, "loss": 0.8091, "step": 49 }, { "epoch": 0.29739776951672864, "grad_norm": 1.1052002906799316, "learning_rate": 0.0002, "loss": 0.9737, "step": 50 }, { "epoch": 0.3033457249070632, "grad_norm": 1.3066654205322266, "learning_rate": 0.0002, "loss": 0.9281, "step": 51 }, { "epoch": 0.3092936802973978, "grad_norm": 1.0277180671691895, "learning_rate": 0.0002, "loss": 0.96, "step": 52 }, { "epoch": 0.31524163568773234, "grad_norm": 1.208615779876709, "learning_rate": 0.0002, "loss": 0.8485, "step": 53 }, { "epoch": 0.32118959107806694, "grad_norm": 1.0156666040420532, "learning_rate": 0.0002, "loss": 0.9602, "step": 54 }, { "epoch": 0.3271375464684015, "grad_norm": 1.0822789669036865, "learning_rate": 0.0002, "loss": 0.891, "step": 55 }, { "epoch": 0.33308550185873603, "grad_norm": 1.063072681427002, "learning_rate": 0.0002, "loss": 0.7898, "step": 56 }, { "epoch": 0.33903345724907064, "grad_norm": 1.061710238456726, "learning_rate": 0.0002, "loss": 0.7903, "step": 57 }, { "epoch": 0.3449814126394052, "grad_norm": 1.1998765468597412, "learning_rate": 0.0002, "loss": 0.8346, "step": 58 }, { "epoch": 0.3509293680297398, "grad_norm": 1.083093285560608, "learning_rate": 0.0002, "loss": 0.8103, "step": 59 }, { "epoch": 0.35687732342007433, "grad_norm": 1.0685770511627197, "learning_rate": 0.0002, "loss": 0.7284, "step": 60 }, { "epoch": 0.36282527881040894, "grad_norm": 1.3935203552246094, "learning_rate": 0.0002, "loss": 0.8349, "step": 61 }, { "epoch": 0.3687732342007435, "grad_norm": 1.005191445350647, "learning_rate": 0.0002, "loss": 0.8611, "step": 62 }, { "epoch": 0.3747211895910781, "grad_norm": 1.1198813915252686, "learning_rate": 0.0002, "loss": 0.7994, "step": 63 }, { "epoch": 0.38066914498141263, "grad_norm": 1.454626202583313, "learning_rate": 0.0002, "loss": 0.7984, "step": 64 }, { "epoch": 0.38661710037174724, "grad_norm": 1.1353782415390015, "learning_rate": 0.0002, "loss": 0.7505, "step": 65 }, { "epoch": 0.3925650557620818, "grad_norm": 1.1953253746032715, "learning_rate": 0.0002, "loss": 0.7754, "step": 66 }, { "epoch": 0.39851301115241633, "grad_norm": 1.0996239185333252, "learning_rate": 0.0002, "loss": 0.8747, "step": 67 }, { "epoch": 0.40446096654275093, "grad_norm": 1.5701665878295898, "learning_rate": 0.0002, "loss": 0.8047, "step": 68 }, { "epoch": 0.4104089219330855, "grad_norm": 1.29320228099823, "learning_rate": 0.0002, "loss": 0.7546, "step": 69 }, { "epoch": 0.4163568773234201, "grad_norm": 1.284342646598816, "learning_rate": 0.0002, "loss": 0.7324, "step": 70 }, { "epoch": 0.42230483271375463, "grad_norm": 1.0330944061279297, "learning_rate": 0.0002, "loss": 0.8614, "step": 71 }, { "epoch": 0.42825278810408923, "grad_norm": 1.0411959886550903, "learning_rate": 0.0002, "loss": 0.7431, "step": 72 }, { "epoch": 0.4342007434944238, "grad_norm": 1.2095258235931396, "learning_rate": 0.0002, "loss": 0.7909, "step": 73 }, { "epoch": 0.4401486988847584, "grad_norm": 1.3570586442947388, "learning_rate": 0.0002, "loss": 0.8113, "step": 74 }, { "epoch": 0.44609665427509293, "grad_norm": 1.0079586505889893, "learning_rate": 0.0002, "loss": 0.8457, "step": 75 }, { "epoch": 0.45204460966542753, "grad_norm": 0.9446130990982056, "learning_rate": 0.0002, "loss": 0.7934, "step": 76 }, { "epoch": 0.4579925650557621, "grad_norm": 1.0489394664764404, "learning_rate": 0.0002, "loss": 0.856, "step": 77 }, { "epoch": 0.4639405204460966, "grad_norm": 1.0112191438674927, "learning_rate": 0.0002, "loss": 0.7455, "step": 78 }, { "epoch": 0.46988847583643123, "grad_norm": 0.9976668953895569, "learning_rate": 0.0002, "loss": 0.8143, "step": 79 }, { "epoch": 0.4758364312267658, "grad_norm": 1.0991159677505493, "learning_rate": 0.0002, "loss": 0.6766, "step": 80 }, { "epoch": 0.4817843866171004, "grad_norm": 1.1794909238815308, "learning_rate": 0.0002, "loss": 0.6635, "step": 81 }, { "epoch": 0.4877323420074349, "grad_norm": 1.0414669513702393, "learning_rate": 0.0002, "loss": 0.8228, "step": 82 }, { "epoch": 0.49368029739776953, "grad_norm": 1.0767929553985596, "learning_rate": 0.0002, "loss": 0.8092, "step": 83 }, { "epoch": 0.4996282527881041, "grad_norm": 1.3375307321548462, "learning_rate": 0.0002, "loss": 0.7361, "step": 84 }, { "epoch": 0.5055762081784386, "grad_norm": 1.2492313385009766, "learning_rate": 0.0002, "loss": 0.8343, "step": 85 }, { "epoch": 0.5115241635687733, "grad_norm": 1.0948379039764404, "learning_rate": 0.0002, "loss": 0.7559, "step": 86 }, { "epoch": 0.5174721189591078, "grad_norm": 1.1456286907196045, "learning_rate": 0.0002, "loss": 0.7819, "step": 87 }, { "epoch": 0.5234200743494424, "grad_norm": 0.9729915857315063, "learning_rate": 0.0002, "loss": 0.7507, "step": 88 }, { "epoch": 0.5293680297397769, "grad_norm": 1.089845895767212, "learning_rate": 0.0002, "loss": 0.857, "step": 89 }, { "epoch": 0.5353159851301115, "grad_norm": 1.0552901029586792, "learning_rate": 0.0002, "loss": 0.8387, "step": 90 }, { "epoch": 0.5412639405204461, "grad_norm": 1.2134290933609009, "learning_rate": 0.0002, "loss": 0.845, "step": 91 }, { "epoch": 0.5472118959107807, "grad_norm": 1.2725780010223389, "learning_rate": 0.0002, "loss": 0.8203, "step": 92 }, { "epoch": 0.5531598513011152, "grad_norm": 1.3931224346160889, "learning_rate": 0.0002, "loss": 0.8428, "step": 93 }, { "epoch": 0.5591078066914498, "grad_norm": 1.0805130004882812, "learning_rate": 0.0002, "loss": 0.7855, "step": 94 }, { "epoch": 0.5650557620817844, "grad_norm": 1.018471598625183, "learning_rate": 0.0002, "loss": 0.6318, "step": 95 }, { "epoch": 0.571003717472119, "grad_norm": 3.2651963233947754, "learning_rate": 0.0002, "loss": 0.7039, "step": 96 }, { "epoch": 0.5769516728624535, "grad_norm": 0.9978213906288147, "learning_rate": 0.0002, "loss": 0.6734, "step": 97 }, { "epoch": 0.5828996282527881, "grad_norm": 0.8679234385490417, "learning_rate": 0.0002, "loss": 0.7277, "step": 98 }, { "epoch": 0.5888475836431227, "grad_norm": 1.1249589920043945, "learning_rate": 0.0002, "loss": 0.7392, "step": 99 }, { "epoch": 0.5947955390334573, "grad_norm": 0.9032052755355835, "learning_rate": 0.0002, "loss": 0.6876, "step": 100 }, { "epoch": 0.6007434944237918, "grad_norm": 0.9359114170074463, "learning_rate": 0.0002, "loss": 0.6448, "step": 101 }, { "epoch": 0.6066914498141264, "grad_norm": 1.076899528503418, "learning_rate": 0.0002, "loss": 0.7399, "step": 102 }, { "epoch": 0.6126394052044609, "grad_norm": 1.4630522727966309, "learning_rate": 0.0002, "loss": 0.6902, "step": 103 }, { "epoch": 0.6185873605947956, "grad_norm": 1.0862653255462646, "learning_rate": 0.0002, "loss": 0.854, "step": 104 }, { "epoch": 0.6245353159851301, "grad_norm": 1.3135863542556763, "learning_rate": 0.0002, "loss": 0.8188, "step": 105 }, { "epoch": 0.6304832713754647, "grad_norm": 0.9794917106628418, "learning_rate": 0.0002, "loss": 0.6275, "step": 106 }, { "epoch": 0.6364312267657992, "grad_norm": 1.02755868434906, "learning_rate": 0.0002, "loss": 0.7058, "step": 107 }, { "epoch": 0.6423791821561339, "grad_norm": 0.9642486572265625, "learning_rate": 0.0002, "loss": 0.8053, "step": 108 }, { "epoch": 0.6483271375464684, "grad_norm": 1.1192632913589478, "learning_rate": 0.0002, "loss": 0.7515, "step": 109 }, { "epoch": 0.654275092936803, "grad_norm": 1.1808356046676636, "learning_rate": 0.0002, "loss": 0.678, "step": 110 }, { "epoch": 0.6602230483271375, "grad_norm": 1.2461023330688477, "learning_rate": 0.0002, "loss": 0.6892, "step": 111 }, { "epoch": 0.6661710037174721, "grad_norm": 1.0632222890853882, "learning_rate": 0.0002, "loss": 0.698, "step": 112 }, { "epoch": 0.6721189591078067, "grad_norm": 1.0353591442108154, "learning_rate": 0.0002, "loss": 0.7453, "step": 113 }, { "epoch": 0.6780669144981413, "grad_norm": 1.124794602394104, "learning_rate": 0.0002, "loss": 0.815, "step": 114 }, { "epoch": 0.6840148698884758, "grad_norm": 1.0341081619262695, "learning_rate": 0.0002, "loss": 0.7816, "step": 115 }, { "epoch": 0.6899628252788104, "grad_norm": 1.082952857017517, "learning_rate": 0.0002, "loss": 0.6825, "step": 116 }, { "epoch": 0.695910780669145, "grad_norm": 0.9126180410385132, "learning_rate": 0.0002, "loss": 0.7042, "step": 117 }, { "epoch": 0.7018587360594796, "grad_norm": 1.2339016199111938, "learning_rate": 0.0002, "loss": 0.7759, "step": 118 }, { "epoch": 0.7078066914498141, "grad_norm": 1.5227537155151367, "learning_rate": 0.0002, "loss": 0.8574, "step": 119 }, { "epoch": 0.7137546468401487, "grad_norm": 1.0859841108322144, "learning_rate": 0.0002, "loss": 0.7241, "step": 120 }, { "epoch": 0.7197026022304833, "grad_norm": 1.0609203577041626, "learning_rate": 0.0002, "loss": 0.7391, "step": 121 }, { "epoch": 0.7256505576208179, "grad_norm": 0.9025185704231262, "learning_rate": 0.0002, "loss": 0.7538, "step": 122 }, { "epoch": 0.7315985130111524, "grad_norm": 0.9280850291252136, "learning_rate": 0.0002, "loss": 0.8023, "step": 123 }, { "epoch": 0.737546468401487, "grad_norm": 1.0120896100997925, "learning_rate": 0.0002, "loss": 0.797, "step": 124 }, { "epoch": 0.7434944237918215, "grad_norm": 0.9294270277023315, "learning_rate": 0.0002, "loss": 0.7939, "step": 125 }, { "epoch": 0.7494423791821562, "grad_norm": 1.001685380935669, "learning_rate": 0.0002, "loss": 0.7297, "step": 126 }, { "epoch": 0.7553903345724907, "grad_norm": 1.0650714635849, "learning_rate": 0.0002, "loss": 0.7955, "step": 127 }, { "epoch": 0.7613382899628253, "grad_norm": 0.9343367218971252, "learning_rate": 0.0002, "loss": 0.7008, "step": 128 }, { "epoch": 0.7672862453531598, "grad_norm": 1.0042743682861328, "learning_rate": 0.0002, "loss": 0.8086, "step": 129 }, { "epoch": 0.7732342007434945, "grad_norm": 0.9538952708244324, "learning_rate": 0.0002, "loss": 0.6839, "step": 130 }, { "epoch": 0.779182156133829, "grad_norm": 1.0010913610458374, "learning_rate": 0.0002, "loss": 0.6247, "step": 131 }, { "epoch": 0.7851301115241636, "grad_norm": 0.8673060536384583, "learning_rate": 0.0002, "loss": 0.7232, "step": 132 }, { "epoch": 0.7910780669144981, "grad_norm": 1.070591688156128, "learning_rate": 0.0002, "loss": 0.785, "step": 133 }, { "epoch": 0.7970260223048327, "grad_norm": 1.0302468538284302, "learning_rate": 0.0002, "loss": 0.8044, "step": 134 }, { "epoch": 0.8029739776951673, "grad_norm": 1.0886098146438599, "learning_rate": 0.0002, "loss": 0.6984, "step": 135 }, { "epoch": 0.8089219330855019, "grad_norm": 0.9349246025085449, "learning_rate": 0.0002, "loss": 0.6711, "step": 136 }, { "epoch": 0.8148698884758364, "grad_norm": 1.2482446432113647, "learning_rate": 0.0002, "loss": 0.6468, "step": 137 }, { "epoch": 0.820817843866171, "grad_norm": 1.184043049812317, "learning_rate": 0.0002, "loss": 0.6909, "step": 138 }, { "epoch": 0.8267657992565056, "grad_norm": 0.8721855878829956, "learning_rate": 0.0002, "loss": 0.6553, "step": 139 }, { "epoch": 0.8327137546468402, "grad_norm": 1.30323326587677, "learning_rate": 0.0002, "loss": 0.7109, "step": 140 }, { "epoch": 0.8386617100371747, "grad_norm": 1.0187689065933228, "learning_rate": 0.0002, "loss": 0.6682, "step": 141 }, { "epoch": 0.8446096654275093, "grad_norm": 2.3475165367126465, "learning_rate": 0.0002, "loss": 0.7615, "step": 142 }, { "epoch": 0.8505576208178439, "grad_norm": 0.9803043603897095, "learning_rate": 0.0002, "loss": 0.7179, "step": 143 }, { "epoch": 0.8565055762081785, "grad_norm": 1.2290213108062744, "learning_rate": 0.0002, "loss": 0.8696, "step": 144 }, { "epoch": 0.862453531598513, "grad_norm": 1.1041066646575928, "learning_rate": 0.0002, "loss": 0.8196, "step": 145 }, { "epoch": 0.8684014869888476, "grad_norm": 0.9638866186141968, "learning_rate": 0.0002, "loss": 0.655, "step": 146 }, { "epoch": 0.8743494423791821, "grad_norm": 0.9777591824531555, "learning_rate": 0.0002, "loss": 0.8053, "step": 147 }, { "epoch": 0.8802973977695168, "grad_norm": 0.8717353343963623, "learning_rate": 0.0002, "loss": 0.6726, "step": 148 }, { "epoch": 0.8862453531598513, "grad_norm": 1.1772398948669434, "learning_rate": 0.0002, "loss": 0.7188, "step": 149 }, { "epoch": 0.8921933085501859, "grad_norm": 1.1113988161087036, "learning_rate": 0.0002, "loss": 0.7177, "step": 150 }, { "epoch": 0.8981412639405204, "grad_norm": 1.6691763401031494, "learning_rate": 0.0002, "loss": 0.5822, "step": 151 }, { "epoch": 0.9040892193308551, "grad_norm": 1.0139896869659424, "learning_rate": 0.0002, "loss": 0.7416, "step": 152 }, { "epoch": 0.9100371747211896, "grad_norm": 1.2538039684295654, "learning_rate": 0.0002, "loss": 0.7822, "step": 153 }, { "epoch": 0.9159851301115242, "grad_norm": 0.833595335483551, "learning_rate": 0.0002, "loss": 0.6617, "step": 154 }, { "epoch": 0.9219330855018587, "grad_norm": 0.869482696056366, "learning_rate": 0.0002, "loss": 0.6146, "step": 155 }, { "epoch": 0.9278810408921933, "grad_norm": 0.973523736000061, "learning_rate": 0.0002, "loss": 0.6685, "step": 156 }, { "epoch": 0.9338289962825279, "grad_norm": 0.982566237449646, "learning_rate": 0.0002, "loss": 0.6685, "step": 157 }, { "epoch": 0.9397769516728625, "grad_norm": 1.0534875392913818, "learning_rate": 0.0002, "loss": 0.656, "step": 158 }, { "epoch": 0.945724907063197, "grad_norm": 1.11860990524292, "learning_rate": 0.0002, "loss": 0.7261, "step": 159 }, { "epoch": 0.9516728624535316, "grad_norm": 1.0286844968795776, "learning_rate": 0.0002, "loss": 0.7271, "step": 160 }, { "epoch": 0.9576208178438662, "grad_norm": 0.8426340818405151, "learning_rate": 0.0002, "loss": 0.718, "step": 161 }, { "epoch": 0.9635687732342008, "grad_norm": 0.990667998790741, "learning_rate": 0.0002, "loss": 0.7795, "step": 162 }, { "epoch": 0.9695167286245353, "grad_norm": 1.1110923290252686, "learning_rate": 0.0002, "loss": 0.6946, "step": 163 }, { "epoch": 0.9754646840148699, "grad_norm": 1.0378597974777222, "learning_rate": 0.0002, "loss": 0.8106, "step": 164 }, { "epoch": 0.9814126394052045, "grad_norm": 0.9507467746734619, "learning_rate": 0.0002, "loss": 0.7031, "step": 165 }, { "epoch": 0.9873605947955391, "grad_norm": 0.8636868596076965, "learning_rate": 0.0002, "loss": 0.6636, "step": 166 }, { "epoch": 0.9933085501858736, "grad_norm": 1.0482003688812256, "learning_rate": 0.0002, "loss": 0.7204, "step": 167 }, { "epoch": 0.9992565055762082, "grad_norm": 0.9490022659301758, "learning_rate": 0.0002, "loss": 0.5626, "step": 168 }, { "epoch": 1.0052044609665427, "grad_norm": 0.8918917179107666, "learning_rate": 0.0002, "loss": 0.6822, "step": 169 }, { "epoch": 1.0111524163568772, "grad_norm": 0.9100430011749268, "learning_rate": 0.0002, "loss": 0.7147, "step": 170 }, { "epoch": 1.0171003717472118, "grad_norm": 0.9007158279418945, "learning_rate": 0.0002, "loss": 0.6015, "step": 171 }, { "epoch": 1.0230483271375466, "grad_norm": 0.9267099499702454, "learning_rate": 0.0002, "loss": 0.6562, "step": 172 }, { "epoch": 1.0289962825278811, "grad_norm": 1.0618972778320312, "learning_rate": 0.0002, "loss": 0.6601, "step": 173 }, { "epoch": 1.0349442379182157, "grad_norm": 1.1782737970352173, "learning_rate": 0.0002, "loss": 0.6172, "step": 174 }, { "epoch": 1.0408921933085502, "grad_norm": 1.141661286354065, "learning_rate": 0.0002, "loss": 0.5854, "step": 175 }, { "epoch": 1.0468401486988848, "grad_norm": 1.1038105487823486, "learning_rate": 0.0002, "loss": 0.6655, "step": 176 }, { "epoch": 1.0527881040892193, "grad_norm": 1.1518810987472534, "learning_rate": 0.0002, "loss": 0.493, "step": 177 }, { "epoch": 1.0587360594795538, "grad_norm": 1.0501494407653809, "learning_rate": 0.0002, "loss": 0.6256, "step": 178 }, { "epoch": 1.0646840148698884, "grad_norm": 0.9064037799835205, "learning_rate": 0.0002, "loss": 0.5695, "step": 179 }, { "epoch": 1.070631970260223, "grad_norm": 1.1978446245193481, "learning_rate": 0.0002, "loss": 0.617, "step": 180 }, { "epoch": 1.0765799256505577, "grad_norm": 0.9782500267028809, "learning_rate": 0.0002, "loss": 0.6542, "step": 181 }, { "epoch": 1.0825278810408923, "grad_norm": 1.6090043783187866, "learning_rate": 0.0002, "loss": 0.6827, "step": 182 }, { "epoch": 1.0884758364312268, "grad_norm": 1.2153990268707275, "learning_rate": 0.0002, "loss": 0.7469, "step": 183 }, { "epoch": 1.0944237918215614, "grad_norm": 0.9915666580200195, "learning_rate": 0.0002, "loss": 0.6475, "step": 184 }, { "epoch": 1.100371747211896, "grad_norm": 1.1319215297698975, "learning_rate": 0.0002, "loss": 0.7879, "step": 185 }, { "epoch": 1.1063197026022304, "grad_norm": 1.0497454404830933, "learning_rate": 0.0002, "loss": 0.6386, "step": 186 }, { "epoch": 1.112267657992565, "grad_norm": 1.1735246181488037, "learning_rate": 0.0002, "loss": 0.7276, "step": 187 }, { "epoch": 1.1182156133828995, "grad_norm": 1.280543327331543, "learning_rate": 0.0002, "loss": 0.6403, "step": 188 }, { "epoch": 1.124163568773234, "grad_norm": 1.0103743076324463, "learning_rate": 0.0002, "loss": 0.5913, "step": 189 }, { "epoch": 1.1301115241635689, "grad_norm": 1.0629348754882812, "learning_rate": 0.0002, "loss": 0.708, "step": 190 }, { "epoch": 1.1360594795539034, "grad_norm": 0.9152292609214783, "learning_rate": 0.0002, "loss": 0.8295, "step": 191 }, { "epoch": 1.142007434944238, "grad_norm": 0.9847885370254517, "learning_rate": 0.0002, "loss": 0.6569, "step": 192 }, { "epoch": 1.1479553903345725, "grad_norm": 1.5211213827133179, "learning_rate": 0.0002, "loss": 0.676, "step": 193 }, { "epoch": 1.153903345724907, "grad_norm": 1.0376240015029907, "learning_rate": 0.0002, "loss": 0.5558, "step": 194 }, { "epoch": 1.1598513011152416, "grad_norm": 0.9746745824813843, "learning_rate": 0.0002, "loss": 0.6051, "step": 195 }, { "epoch": 1.1657992565055761, "grad_norm": 1.0810937881469727, "learning_rate": 0.0002, "loss": 0.5947, "step": 196 }, { "epoch": 1.1717472118959107, "grad_norm": 0.8687509894371033, "learning_rate": 0.0002, "loss": 0.5834, "step": 197 }, { "epoch": 1.1776951672862452, "grad_norm": 1.3437882661819458, "learning_rate": 0.0002, "loss": 0.7133, "step": 198 }, { "epoch": 1.18364312267658, "grad_norm": 0.9247745871543884, "learning_rate": 0.0002, "loss": 0.668, "step": 199 }, { "epoch": 1.1895910780669146, "grad_norm": 1.116870403289795, "learning_rate": 0.0002, "loss": 0.5763, "step": 200 }, { "epoch": 1.195539033457249, "grad_norm": 1.0791046619415283, "learning_rate": 0.0002, "loss": 0.7535, "step": 201 }, { "epoch": 1.2014869888475836, "grad_norm": 1.1156578063964844, "learning_rate": 0.0002, "loss": 0.623, "step": 202 }, { "epoch": 1.2074349442379182, "grad_norm": 1.3306505680084229, "learning_rate": 0.0002, "loss": 0.6239, "step": 203 }, { "epoch": 1.2133828996282527, "grad_norm": 1.1251856088638306, "learning_rate": 0.0002, "loss": 0.6122, "step": 204 }, { "epoch": 1.2193308550185873, "grad_norm": 0.9659932255744934, "learning_rate": 0.0002, "loss": 0.7163, "step": 205 }, { "epoch": 1.2252788104089218, "grad_norm": 1.1080381870269775, "learning_rate": 0.0002, "loss": 0.5558, "step": 206 }, { "epoch": 1.2312267657992564, "grad_norm": 1.1085773706436157, "learning_rate": 0.0002, "loss": 0.6306, "step": 207 }, { "epoch": 1.2371747211895912, "grad_norm": 1.5555293560028076, "learning_rate": 0.0002, "loss": 0.7266, "step": 208 }, { "epoch": 1.2431226765799257, "grad_norm": 1.1494146585464478, "learning_rate": 0.0002, "loss": 0.6902, "step": 209 }, { "epoch": 1.2490706319702602, "grad_norm": 1.0835429430007935, "learning_rate": 0.0002, "loss": 0.7219, "step": 210 }, { "epoch": 1.2550185873605948, "grad_norm": 1.1306850910186768, "learning_rate": 0.0002, "loss": 0.5931, "step": 211 }, { "epoch": 1.2609665427509293, "grad_norm": 1.148278832435608, "learning_rate": 0.0002, "loss": 0.6452, "step": 212 }, { "epoch": 1.266914498141264, "grad_norm": 1.097596526145935, "learning_rate": 0.0002, "loss": 0.6486, "step": 213 }, { "epoch": 1.2728624535315984, "grad_norm": 1.3407150506973267, "learning_rate": 0.0002, "loss": 0.5874, "step": 214 }, { "epoch": 1.2788104089219332, "grad_norm": 0.8781871199607849, "learning_rate": 0.0002, "loss": 0.6079, "step": 215 }, { "epoch": 1.2847583643122675, "grad_norm": 0.998188853263855, "learning_rate": 0.0002, "loss": 0.7292, "step": 216 }, { "epoch": 1.2907063197026023, "grad_norm": 1.0128471851348877, "learning_rate": 0.0002, "loss": 0.6729, "step": 217 }, { "epoch": 1.2966542750929368, "grad_norm": 1.2172327041625977, "learning_rate": 0.0002, "loss": 0.7529, "step": 218 }, { "epoch": 1.3026022304832714, "grad_norm": 0.9904006719589233, "learning_rate": 0.0002, "loss": 0.639, "step": 219 }, { "epoch": 1.308550185873606, "grad_norm": 0.8886059522628784, "learning_rate": 0.0002, "loss": 0.6128, "step": 220 }, { "epoch": 1.3144981412639405, "grad_norm": 1.0350927114486694, "learning_rate": 0.0002, "loss": 0.6668, "step": 221 }, { "epoch": 1.320446096654275, "grad_norm": 1.0321650505065918, "learning_rate": 0.0002, "loss": 0.6493, "step": 222 }, { "epoch": 1.3263940520446096, "grad_norm": 0.8952768445014954, "learning_rate": 0.0002, "loss": 0.7005, "step": 223 }, { "epoch": 1.3323420074349444, "grad_norm": 1.3372063636779785, "learning_rate": 0.0002, "loss": 0.6347, "step": 224 }, { "epoch": 1.3382899628252787, "grad_norm": 0.9312218427658081, "learning_rate": 0.0002, "loss": 0.5963, "step": 225 }, { "epoch": 1.3442379182156134, "grad_norm": 0.8845749497413635, "learning_rate": 0.0002, "loss": 0.5445, "step": 226 }, { "epoch": 1.350185873605948, "grad_norm": 1.292598843574524, "learning_rate": 0.0002, "loss": 0.6662, "step": 227 }, { "epoch": 1.3561338289962825, "grad_norm": 1.0537996292114258, "learning_rate": 0.0002, "loss": 0.7332, "step": 228 }, { "epoch": 1.362081784386617, "grad_norm": 0.9492632150650024, "learning_rate": 0.0002, "loss": 0.6157, "step": 229 }, { "epoch": 1.3680297397769516, "grad_norm": 1.0352752208709717, "learning_rate": 0.0002, "loss": 0.5248, "step": 230 }, { "epoch": 1.3739776951672862, "grad_norm": 1.085534930229187, "learning_rate": 0.0002, "loss": 0.6358, "step": 231 }, { "epoch": 1.3799256505576207, "grad_norm": 1.098999261856079, "learning_rate": 0.0002, "loss": 0.6181, "step": 232 }, { "epoch": 1.3858736059479555, "grad_norm": 1.114450454711914, "learning_rate": 0.0002, "loss": 0.7067, "step": 233 }, { "epoch": 1.3918215613382898, "grad_norm": 1.3746989965438843, "learning_rate": 0.0002, "loss": 0.6016, "step": 234 }, { "epoch": 1.3977695167286246, "grad_norm": 0.916519820690155, "learning_rate": 0.0002, "loss": 0.6579, "step": 235 }, { "epoch": 1.4037174721189591, "grad_norm": 1.0786117315292358, "learning_rate": 0.0002, "loss": 0.6516, "step": 236 }, { "epoch": 1.4096654275092937, "grad_norm": 0.9264970421791077, "learning_rate": 0.0002, "loss": 0.5927, "step": 237 }, { "epoch": 1.4156133828996282, "grad_norm": 1.0969526767730713, "learning_rate": 0.0002, "loss": 0.6747, "step": 238 }, { "epoch": 1.4215613382899628, "grad_norm": 0.9945991635322571, "learning_rate": 0.0002, "loss": 0.6719, "step": 239 }, { "epoch": 1.4275092936802973, "grad_norm": 1.0272929668426514, "learning_rate": 0.0002, "loss": 0.7015, "step": 240 }, { "epoch": 1.4334572490706319, "grad_norm": 1.2321354150772095, "learning_rate": 0.0002, "loss": 0.6904, "step": 241 }, { "epoch": 1.4394052044609666, "grad_norm": 1.1331416368484497, "learning_rate": 0.0002, "loss": 0.6444, "step": 242 }, { "epoch": 1.4453531598513012, "grad_norm": 1.0527664422988892, "learning_rate": 0.0002, "loss": 0.6135, "step": 243 }, { "epoch": 1.4513011152416357, "grad_norm": 1.0586967468261719, "learning_rate": 0.0002, "loss": 0.6705, "step": 244 }, { "epoch": 1.4572490706319703, "grad_norm": 1.0302836894989014, "learning_rate": 0.0002, "loss": 0.6757, "step": 245 }, { "epoch": 1.4631970260223048, "grad_norm": 0.9323686957359314, "learning_rate": 0.0002, "loss": 0.7458, "step": 246 }, { "epoch": 1.4691449814126394, "grad_norm": 1.103028416633606, "learning_rate": 0.0002, "loss": 0.5832, "step": 247 }, { "epoch": 1.475092936802974, "grad_norm": 1.1638356447219849, "learning_rate": 0.0002, "loss": 0.6286, "step": 248 }, { "epoch": 1.4810408921933085, "grad_norm": 1.0685887336730957, "learning_rate": 0.0002, "loss": 0.6255, "step": 249 }, { "epoch": 1.486988847583643, "grad_norm": 0.9854826927185059, "learning_rate": 0.0002, "loss": 0.7764, "step": 250 }, { "epoch": 1.4929368029739778, "grad_norm": 1.1790441274642944, "learning_rate": 0.0002, "loss": 0.5791, "step": 251 }, { "epoch": 1.4988847583643123, "grad_norm": 0.9097880721092224, "learning_rate": 0.0002, "loss": 0.5517, "step": 252 }, { "epoch": 1.504832713754647, "grad_norm": 1.1351274251937866, "learning_rate": 0.0002, "loss": 0.6725, "step": 253 }, { "epoch": 1.5107806691449814, "grad_norm": 1.2710050344467163, "learning_rate": 0.0002, "loss": 0.7106, "step": 254 }, { "epoch": 1.516728624535316, "grad_norm": 1.2035406827926636, "learning_rate": 0.0002, "loss": 0.623, "step": 255 }, { "epoch": 1.5226765799256505, "grad_norm": 1.100200891494751, "learning_rate": 0.0002, "loss": 0.6418, "step": 256 }, { "epoch": 1.528624535315985, "grad_norm": 1.3976622819900513, "learning_rate": 0.0002, "loss": 0.7007, "step": 257 }, { "epoch": 1.5345724907063198, "grad_norm": 1.2113823890686035, "learning_rate": 0.0002, "loss": 0.7271, "step": 258 }, { "epoch": 1.5405204460966542, "grad_norm": 1.1983304023742676, "learning_rate": 0.0002, "loss": 0.6937, "step": 259 }, { "epoch": 1.546468401486989, "grad_norm": 1.2594386339187622, "learning_rate": 0.0002, "loss": 0.708, "step": 260 }, { "epoch": 1.5524163568773233, "grad_norm": 1.1495513916015625, "learning_rate": 0.0002, "loss": 0.621, "step": 261 }, { "epoch": 1.558364312267658, "grad_norm": 1.0474885702133179, "learning_rate": 0.0002, "loss": 0.5433, "step": 262 }, { "epoch": 1.5643122676579926, "grad_norm": 1.1138205528259277, "learning_rate": 0.0002, "loss": 0.6886, "step": 263 }, { "epoch": 1.5702602230483271, "grad_norm": 0.9678700566291809, "learning_rate": 0.0002, "loss": 0.5945, "step": 264 }, { "epoch": 1.5762081784386617, "grad_norm": 0.928419828414917, "learning_rate": 0.0002, "loss": 0.7483, "step": 265 }, { "epoch": 1.5821561338289962, "grad_norm": 0.8806396126747131, "learning_rate": 0.0002, "loss": 0.6022, "step": 266 }, { "epoch": 1.588104089219331, "grad_norm": 0.9389284253120422, "learning_rate": 0.0002, "loss": 0.6638, "step": 267 }, { "epoch": 1.5940520446096653, "grad_norm": 1.0797287225723267, "learning_rate": 0.0002, "loss": 0.6298, "step": 268 }, { "epoch": 1.6, "grad_norm": 0.9545785784721375, "learning_rate": 0.0002, "loss": 0.6335, "step": 269 }, { "epoch": 1.6059479553903344, "grad_norm": 0.9800273776054382, "learning_rate": 0.0002, "loss": 0.5772, "step": 270 }, { "epoch": 1.6118959107806692, "grad_norm": 1.3683196306228638, "learning_rate": 0.0002, "loss": 0.6181, "step": 271 }, { "epoch": 1.6178438661710037, "grad_norm": 1.1559855937957764, "learning_rate": 0.0002, "loss": 0.62, "step": 272 }, { "epoch": 1.6237918215613383, "grad_norm": 1.1240603923797607, "learning_rate": 0.0002, "loss": 0.682, "step": 273 }, { "epoch": 1.6297397769516728, "grad_norm": 0.9673051834106445, "learning_rate": 0.0002, "loss": 0.6835, "step": 274 }, { "epoch": 1.6356877323420074, "grad_norm": 1.1218955516815186, "learning_rate": 0.0002, "loss": 0.6591, "step": 275 }, { "epoch": 1.6416356877323421, "grad_norm": 1.2360399961471558, "learning_rate": 0.0002, "loss": 0.7957, "step": 276 }, { "epoch": 1.6475836431226765, "grad_norm": 1.2180172204971313, "learning_rate": 0.0002, "loss": 0.6951, "step": 277 }, { "epoch": 1.6535315985130112, "grad_norm": 1.2104121446609497, "learning_rate": 0.0002, "loss": 0.6549, "step": 278 }, { "epoch": 1.6594795539033456, "grad_norm": 0.9836241006851196, "learning_rate": 0.0002, "loss": 0.6077, "step": 279 }, { "epoch": 1.6654275092936803, "grad_norm": 0.8980191349983215, "learning_rate": 0.0002, "loss": 0.7533, "step": 280 }, { "epoch": 1.6713754646840149, "grad_norm": 1.056117296218872, "learning_rate": 0.0002, "loss": 0.6283, "step": 281 }, { "epoch": 1.6773234200743494, "grad_norm": 1.0315310955047607, "learning_rate": 0.0002, "loss": 0.7099, "step": 282 }, { "epoch": 1.683271375464684, "grad_norm": 1.1293710470199585, "learning_rate": 0.0002, "loss": 0.7628, "step": 283 }, { "epoch": 1.6892193308550185, "grad_norm": 0.8841990232467651, "learning_rate": 0.0002, "loss": 0.6076, "step": 284 }, { "epoch": 1.6951672862453533, "grad_norm": 1.0221779346466064, "learning_rate": 0.0002, "loss": 0.6003, "step": 285 }, { "epoch": 1.7011152416356876, "grad_norm": 0.9923282861709595, "learning_rate": 0.0002, "loss": 0.5743, "step": 286 }, { "epoch": 1.7070631970260224, "grad_norm": 1.1585432291030884, "learning_rate": 0.0002, "loss": 0.6121, "step": 287 }, { "epoch": 1.713011152416357, "grad_norm": 0.9201356172561646, "learning_rate": 0.0002, "loss": 0.6343, "step": 288 }, { "epoch": 1.7189591078066915, "grad_norm": 1.164581298828125, "learning_rate": 0.0002, "loss": 0.615, "step": 289 }, { "epoch": 1.724907063197026, "grad_norm": 0.9991989135742188, "learning_rate": 0.0002, "loss": 0.62, "step": 290 }, { "epoch": 1.7308550185873606, "grad_norm": 1.0976234674453735, "learning_rate": 0.0002, "loss": 0.687, "step": 291 }, { "epoch": 1.7368029739776951, "grad_norm": 1.1581001281738281, "learning_rate": 0.0002, "loss": 0.6227, "step": 292 }, { "epoch": 1.7427509293680297, "grad_norm": 1.0079922676086426, "learning_rate": 0.0002, "loss": 0.675, "step": 293 }, { "epoch": 1.7486988847583644, "grad_norm": 1.0962276458740234, "learning_rate": 0.0002, "loss": 0.6694, "step": 294 }, { "epoch": 1.7546468401486988, "grad_norm": 1.0988850593566895, "learning_rate": 0.0002, "loss": 0.6114, "step": 295 }, { "epoch": 1.7605947955390335, "grad_norm": 1.4446635246276855, "learning_rate": 0.0002, "loss": 0.5885, "step": 296 }, { "epoch": 1.766542750929368, "grad_norm": 1.2141138315200806, "learning_rate": 0.0002, "loss": 0.7856, "step": 297 }, { "epoch": 1.7724907063197026, "grad_norm": 1.1908177137374878, "learning_rate": 0.0002, "loss": 0.7033, "step": 298 }, { "epoch": 1.7784386617100372, "grad_norm": 1.019839882850647, "learning_rate": 0.0002, "loss": 0.6763, "step": 299 }, { "epoch": 1.7843866171003717, "grad_norm": 1.039696216583252, "learning_rate": 0.0002, "loss": 0.6279, "step": 300 }, { "epoch": 1.7903345724907063, "grad_norm": 0.974805474281311, "learning_rate": 0.0002, "loss": 0.5777, "step": 301 }, { "epoch": 1.7962825278810408, "grad_norm": 1.1052793264389038, "learning_rate": 0.0002, "loss": 0.7294, "step": 302 }, { "epoch": 1.8022304832713756, "grad_norm": 1.4657918214797974, "learning_rate": 0.0002, "loss": 0.5142, "step": 303 }, { "epoch": 1.80817843866171, "grad_norm": 1.0391294956207275, "learning_rate": 0.0002, "loss": 0.6388, "step": 304 }, { "epoch": 1.8141263940520447, "grad_norm": 1.0521687269210815, "learning_rate": 0.0002, "loss": 0.6944, "step": 305 }, { "epoch": 1.8200743494423792, "grad_norm": 1.0755914449691772, "learning_rate": 0.0002, "loss": 0.6974, "step": 306 }, { "epoch": 1.8260223048327138, "grad_norm": 1.128304123878479, "learning_rate": 0.0002, "loss": 0.7687, "step": 307 }, { "epoch": 1.8319702602230483, "grad_norm": 1.0178970098495483, "learning_rate": 0.0002, "loss": 0.6074, "step": 308 }, { "epoch": 1.8379182156133829, "grad_norm": 0.9115421772003174, "learning_rate": 0.0002, "loss": 0.725, "step": 309 }, { "epoch": 1.8438661710037176, "grad_norm": 1.0200258493423462, "learning_rate": 0.0002, "loss": 0.6866, "step": 310 }, { "epoch": 1.849814126394052, "grad_norm": 1.286431908607483, "learning_rate": 0.0002, "loss": 0.6618, "step": 311 }, { "epoch": 1.8557620817843867, "grad_norm": 1.0576943159103394, "learning_rate": 0.0002, "loss": 0.6217, "step": 312 }, { "epoch": 1.861710037174721, "grad_norm": 0.9450961351394653, "learning_rate": 0.0002, "loss": 0.7285, "step": 313 }, { "epoch": 1.8676579925650558, "grad_norm": 1.2659786939620972, "learning_rate": 0.0002, "loss": 0.6004, "step": 314 }, { "epoch": 1.8736059479553904, "grad_norm": 1.0950329303741455, "learning_rate": 0.0002, "loss": 0.6792, "step": 315 }, { "epoch": 1.879553903345725, "grad_norm": 1.0434305667877197, "learning_rate": 0.0002, "loss": 0.6651, "step": 316 }, { "epoch": 1.8855018587360595, "grad_norm": 2.390085458755493, "learning_rate": 0.0002, "loss": 0.6394, "step": 317 }, { "epoch": 1.891449814126394, "grad_norm": 1.1428786516189575, "learning_rate": 0.0002, "loss": 0.6519, "step": 318 }, { "epoch": 1.8973977695167288, "grad_norm": 1.1516354084014893, "learning_rate": 0.0002, "loss": 0.5967, "step": 319 }, { "epoch": 1.903345724907063, "grad_norm": 0.9553952813148499, "learning_rate": 0.0002, "loss": 0.6626, "step": 320 }, { "epoch": 1.9092936802973979, "grad_norm": 1.1295243501663208, "learning_rate": 0.0002, "loss": 0.6151, "step": 321 }, { "epoch": 1.9152416356877322, "grad_norm": 1.1593585014343262, "learning_rate": 0.0002, "loss": 0.6412, "step": 322 }, { "epoch": 1.921189591078067, "grad_norm": 1.830063819885254, "learning_rate": 0.0002, "loss": 0.6746, "step": 323 }, { "epoch": 1.9271375464684015, "grad_norm": 1.3906419277191162, "learning_rate": 0.0002, "loss": 0.6351, "step": 324 }, { "epoch": 1.933085501858736, "grad_norm": 1.4869827032089233, "learning_rate": 0.0002, "loss": 0.6806, "step": 325 }, { "epoch": 1.9390334572490706, "grad_norm": 1.110323429107666, "learning_rate": 0.0002, "loss": 0.5748, "step": 326 }, { "epoch": 1.9449814126394052, "grad_norm": 1.4225271940231323, "learning_rate": 0.0002, "loss": 0.6572, "step": 327 }, { "epoch": 1.95092936802974, "grad_norm": 1.0343074798583984, "learning_rate": 0.0002, "loss": 0.5376, "step": 328 }, { "epoch": 1.9568773234200743, "grad_norm": 0.9949336647987366, "learning_rate": 0.0002, "loss": 0.69, "step": 329 }, { "epoch": 1.962825278810409, "grad_norm": 1.0380656719207764, "learning_rate": 0.0002, "loss": 0.63, "step": 330 }, { "epoch": 1.9687732342007433, "grad_norm": 1.390371561050415, "learning_rate": 0.0002, "loss": 0.7011, "step": 331 }, { "epoch": 1.9747211895910781, "grad_norm": 1.4939589500427246, "learning_rate": 0.0002, "loss": 0.5689, "step": 332 }, { "epoch": 1.9806691449814127, "grad_norm": 1.2601418495178223, "learning_rate": 0.0002, "loss": 0.5838, "step": 333 }, { "epoch": 1.9866171003717472, "grad_norm": 2.679206132888794, "learning_rate": 0.0002, "loss": 0.7981, "step": 334 }, { "epoch": 1.9925650557620818, "grad_norm": 1.1042869091033936, "learning_rate": 0.0002, "loss": 0.6212, "step": 335 }, { "epoch": 1.9985130111524163, "grad_norm": 1.4491620063781738, "learning_rate": 0.0002, "loss": 0.6194, "step": 336 }, { "epoch": 2.004460966542751, "grad_norm": 0.9622808694839478, "learning_rate": 0.0002, "loss": 0.6566, "step": 337 }, { "epoch": 2.0104089219330854, "grad_norm": 1.5044182538986206, "learning_rate": 0.0002, "loss": 0.6093, "step": 338 }, { "epoch": 2.01635687732342, "grad_norm": 1.699040174484253, "learning_rate": 0.0002, "loss": 0.5237, "step": 339 }, { "epoch": 2.0223048327137545, "grad_norm": 1.1767878532409668, "learning_rate": 0.0002, "loss": 0.6145, "step": 340 }, { "epoch": 2.0282527881040893, "grad_norm": 1.2151747941970825, "learning_rate": 0.0002, "loss": 0.6171, "step": 341 }, { "epoch": 2.0342007434944236, "grad_norm": 1.2429864406585693, "learning_rate": 0.0002, "loss": 0.6489, "step": 342 }, { "epoch": 2.0401486988847584, "grad_norm": 1.164552092552185, "learning_rate": 0.0002, "loss": 0.509, "step": 343 }, { "epoch": 2.046096654275093, "grad_norm": 1.1822024583816528, "learning_rate": 0.0002, "loss": 0.6568, "step": 344 }, { "epoch": 2.0520446096654275, "grad_norm": 7.130686283111572, "learning_rate": 0.0002, "loss": 0.5688, "step": 345 }, { "epoch": 2.0579925650557622, "grad_norm": 1.1000553369522095, "learning_rate": 0.0002, "loss": 0.5202, "step": 346 }, { "epoch": 2.0639405204460965, "grad_norm": 1.0652920007705688, "learning_rate": 0.0002, "loss": 0.5797, "step": 347 }, { "epoch": 2.0698884758364313, "grad_norm": 1.3442667722702026, "learning_rate": 0.0002, "loss": 0.6164, "step": 348 }, { "epoch": 2.0758364312267656, "grad_norm": 1.1383881568908691, "learning_rate": 0.0002, "loss": 0.5462, "step": 349 }, { "epoch": 2.0817843866171004, "grad_norm": 0.9077207446098328, "learning_rate": 0.0002, "loss": 0.4876, "step": 350 }, { "epoch": 2.0877323420074347, "grad_norm": 1.0893263816833496, "learning_rate": 0.0002, "loss": 0.6335, "step": 351 }, { "epoch": 2.0936802973977695, "grad_norm": 0.9917628169059753, "learning_rate": 0.0002, "loss": 0.5745, "step": 352 }, { "epoch": 2.0996282527881043, "grad_norm": 1.0131194591522217, "learning_rate": 0.0002, "loss": 0.5427, "step": 353 }, { "epoch": 2.1055762081784386, "grad_norm": 2.081542730331421, "learning_rate": 0.0002, "loss": 0.5692, "step": 354 }, { "epoch": 2.1115241635687734, "grad_norm": 1.659941554069519, "learning_rate": 0.0002, "loss": 0.4985, "step": 355 }, { "epoch": 2.1174721189591077, "grad_norm": 1.309465765953064, "learning_rate": 0.0002, "loss": 0.5439, "step": 356 }, { "epoch": 2.1234200743494425, "grad_norm": 1.2212107181549072, "learning_rate": 0.0002, "loss": 0.5133, "step": 357 }, { "epoch": 2.129368029739777, "grad_norm": 1.0576850175857544, "learning_rate": 0.0002, "loss": 0.6627, "step": 358 }, { "epoch": 2.1353159851301116, "grad_norm": 1.2587406635284424, "learning_rate": 0.0002, "loss": 0.5235, "step": 359 }, { "epoch": 2.141263940520446, "grad_norm": 1.05579674243927, "learning_rate": 0.0002, "loss": 0.5994, "step": 360 }, { "epoch": 2.1472118959107807, "grad_norm": 1.2232978343963623, "learning_rate": 0.0002, "loss": 0.586, "step": 361 }, { "epoch": 2.1531598513011154, "grad_norm": 1.0725406408309937, "learning_rate": 0.0002, "loss": 0.4801, "step": 362 }, { "epoch": 2.1591078066914497, "grad_norm": 1.0593106746673584, "learning_rate": 0.0002, "loss": 0.4877, "step": 363 }, { "epoch": 2.1650557620817845, "grad_norm": 2.2175445556640625, "learning_rate": 0.0002, "loss": 0.57, "step": 364 }, { "epoch": 2.171003717472119, "grad_norm": 1.1013628244400024, "learning_rate": 0.0002, "loss": 0.5797, "step": 365 }, { "epoch": 2.1769516728624536, "grad_norm": 1.186463713645935, "learning_rate": 0.0002, "loss": 0.5938, "step": 366 }, { "epoch": 2.182899628252788, "grad_norm": 1.1608301401138306, "learning_rate": 0.0002, "loss": 0.4446, "step": 367 }, { "epoch": 2.1888475836431227, "grad_norm": 1.2304465770721436, "learning_rate": 0.0002, "loss": 0.5154, "step": 368 }, { "epoch": 2.194795539033457, "grad_norm": 1.1233623027801514, "learning_rate": 0.0002, "loss": 0.6199, "step": 369 }, { "epoch": 2.200743494423792, "grad_norm": 1.2339355945587158, "learning_rate": 0.0002, "loss": 0.584, "step": 370 }, { "epoch": 2.2066914498141266, "grad_norm": 1.6794264316558838, "learning_rate": 0.0002, "loss": 0.5609, "step": 371 }, { "epoch": 2.212639405204461, "grad_norm": 1.1440285444259644, "learning_rate": 0.0002, "loss": 0.5306, "step": 372 }, { "epoch": 2.2185873605947957, "grad_norm": 1.6425179243087769, "learning_rate": 0.0002, "loss": 0.6556, "step": 373 }, { "epoch": 2.22453531598513, "grad_norm": 1.1068412065505981, "learning_rate": 0.0002, "loss": 0.6442, "step": 374 }, { "epoch": 2.2304832713754648, "grad_norm": 1.1996163129806519, "learning_rate": 0.0002, "loss": 0.51, "step": 375 }, { "epoch": 2.236431226765799, "grad_norm": 1.193741798400879, "learning_rate": 0.0002, "loss": 0.5323, "step": 376 }, { "epoch": 2.242379182156134, "grad_norm": 1.3267923593521118, "learning_rate": 0.0002, "loss": 0.5576, "step": 377 }, { "epoch": 2.248327137546468, "grad_norm": 1.1256170272827148, "learning_rate": 0.0002, "loss": 0.6218, "step": 378 }, { "epoch": 2.254275092936803, "grad_norm": 1.1282093524932861, "learning_rate": 0.0002, "loss": 0.5095, "step": 379 }, { "epoch": 2.2602230483271377, "grad_norm": 1.156480073928833, "learning_rate": 0.0002, "loss": 0.5423, "step": 380 }, { "epoch": 2.266171003717472, "grad_norm": 1.2083227634429932, "learning_rate": 0.0002, "loss": 0.5714, "step": 381 }, { "epoch": 2.272118959107807, "grad_norm": 1.5389211177825928, "learning_rate": 0.0002, "loss": 0.4797, "step": 382 }, { "epoch": 2.278066914498141, "grad_norm": 1.1418561935424805, "learning_rate": 0.0002, "loss": 0.585, "step": 383 }, { "epoch": 2.284014869888476, "grad_norm": 1.0242007970809937, "learning_rate": 0.0002, "loss": 0.562, "step": 384 }, { "epoch": 2.2899628252788102, "grad_norm": 1.4750384092330933, "learning_rate": 0.0002, "loss": 0.558, "step": 385 }, { "epoch": 2.295910780669145, "grad_norm": 1.176080346107483, "learning_rate": 0.0002, "loss": 0.4977, "step": 386 }, { "epoch": 2.3018587360594793, "grad_norm": 1.1733489036560059, "learning_rate": 0.0002, "loss": 0.5984, "step": 387 }, { "epoch": 2.307806691449814, "grad_norm": 1.0431591272354126, "learning_rate": 0.0002, "loss": 0.5805, "step": 388 }, { "epoch": 2.313754646840149, "grad_norm": 1.1595654487609863, "learning_rate": 0.0002, "loss": 0.5639, "step": 389 }, { "epoch": 2.319702602230483, "grad_norm": 1.2077865600585938, "learning_rate": 0.0002, "loss": 0.5021, "step": 390 }, { "epoch": 2.325650557620818, "grad_norm": 1.4747991561889648, "learning_rate": 0.0002, "loss": 0.5809, "step": 391 }, { "epoch": 2.3315985130111523, "grad_norm": 1.0486669540405273, "learning_rate": 0.0002, "loss": 0.6334, "step": 392 }, { "epoch": 2.337546468401487, "grad_norm": 1.8145817518234253, "learning_rate": 0.0002, "loss": 0.5223, "step": 393 }, { "epoch": 2.3434944237918214, "grad_norm": 1.393776535987854, "learning_rate": 0.0002, "loss": 0.4034, "step": 394 }, { "epoch": 2.349442379182156, "grad_norm": 1.208957314491272, "learning_rate": 0.0002, "loss": 0.5929, "step": 395 }, { "epoch": 2.3553903345724905, "grad_norm": 1.3021739721298218, "learning_rate": 0.0002, "loss": 0.6176, "step": 396 }, { "epoch": 2.3613382899628252, "grad_norm": 1.4258112907409668, "learning_rate": 0.0002, "loss": 0.6765, "step": 397 }, { "epoch": 2.36728624535316, "grad_norm": 1.2623789310455322, "learning_rate": 0.0002, "loss": 0.5973, "step": 398 }, { "epoch": 2.3732342007434943, "grad_norm": 1.1510920524597168, "learning_rate": 0.0002, "loss": 0.6167, "step": 399 }, { "epoch": 2.379182156133829, "grad_norm": 1.0992542505264282, "learning_rate": 0.0002, "loss": 0.4328, "step": 400 }, { "epoch": 2.3851301115241634, "grad_norm": 1.3745630979537964, "learning_rate": 0.0002, "loss": 0.6485, "step": 401 }, { "epoch": 2.391078066914498, "grad_norm": 1.2044932842254639, "learning_rate": 0.0002, "loss": 0.6345, "step": 402 }, { "epoch": 2.3970260223048325, "grad_norm": 1.4290401935577393, "learning_rate": 0.0002, "loss": 0.5706, "step": 403 }, { "epoch": 2.4029739776951673, "grad_norm": 1.3580994606018066, "learning_rate": 0.0002, "loss": 0.5373, "step": 404 }, { "epoch": 2.4089219330855016, "grad_norm": 1.0747463703155518, "learning_rate": 0.0002, "loss": 0.5597, "step": 405 }, { "epoch": 2.4148698884758364, "grad_norm": 1.288228154182434, "learning_rate": 0.0002, "loss": 0.6366, "step": 406 }, { "epoch": 2.420817843866171, "grad_norm": 1.2379798889160156, "learning_rate": 0.0002, "loss": 0.474, "step": 407 }, { "epoch": 2.4267657992565055, "grad_norm": 1.234220266342163, "learning_rate": 0.0002, "loss": 0.5363, "step": 408 }, { "epoch": 2.4327137546468403, "grad_norm": 1.2338114976882935, "learning_rate": 0.0002, "loss": 0.4992, "step": 409 }, { "epoch": 2.4386617100371746, "grad_norm": 1.3846346139907837, "learning_rate": 0.0002, "loss": 0.6412, "step": 410 }, { "epoch": 2.4446096654275093, "grad_norm": 1.2423279285430908, "learning_rate": 0.0002, "loss": 0.528, "step": 411 }, { "epoch": 2.4505576208178437, "grad_norm": 1.235088586807251, "learning_rate": 0.0002, "loss": 0.5763, "step": 412 }, { "epoch": 2.4565055762081784, "grad_norm": 1.3832026720046997, "learning_rate": 0.0002, "loss": 0.6355, "step": 413 }, { "epoch": 2.4624535315985128, "grad_norm": 1.214076280593872, "learning_rate": 0.0002, "loss": 0.6345, "step": 414 }, { "epoch": 2.4684014869888475, "grad_norm": 1.463728904724121, "learning_rate": 0.0002, "loss": 0.6186, "step": 415 }, { "epoch": 2.4743494423791823, "grad_norm": 1.0485203266143799, "learning_rate": 0.0002, "loss": 0.4723, "step": 416 }, { "epoch": 2.4802973977695166, "grad_norm": 1.5590802431106567, "learning_rate": 0.0002, "loss": 0.5688, "step": 417 }, { "epoch": 2.4862453531598514, "grad_norm": 1.1843955516815186, "learning_rate": 0.0002, "loss": 0.5043, "step": 418 }, { "epoch": 2.4921933085501857, "grad_norm": 1.579487919807434, "learning_rate": 0.0002, "loss": 0.5257, "step": 419 }, { "epoch": 2.4981412639405205, "grad_norm": 1.2086743116378784, "learning_rate": 0.0002, "loss": 0.6798, "step": 420 }, { "epoch": 2.5040892193308553, "grad_norm": 1.195058822631836, "learning_rate": 0.0002, "loss": 0.6485, "step": 421 }, { "epoch": 2.5100371747211896, "grad_norm": 1.283530354499817, "learning_rate": 0.0002, "loss": 0.5873, "step": 422 }, { "epoch": 2.515985130111524, "grad_norm": 1.2394181489944458, "learning_rate": 0.0002, "loss": 0.6391, "step": 423 }, { "epoch": 2.5219330855018587, "grad_norm": 1.2939765453338623, "learning_rate": 0.0002, "loss": 0.6709, "step": 424 }, { "epoch": 2.5278810408921935, "grad_norm": 1.1421490907669067, "learning_rate": 0.0002, "loss": 0.563, "step": 425 }, { "epoch": 2.533828996282528, "grad_norm": 1.1861711740493774, "learning_rate": 0.0002, "loss": 0.5176, "step": 426 }, { "epoch": 2.5397769516728625, "grad_norm": 1.3675614595413208, "learning_rate": 0.0002, "loss": 0.529, "step": 427 }, { "epoch": 2.545724907063197, "grad_norm": 1.5121257305145264, "learning_rate": 0.0002, "loss": 0.7019, "step": 428 }, { "epoch": 2.5516728624535316, "grad_norm": 1.169859766960144, "learning_rate": 0.0002, "loss": 0.5985, "step": 429 }, { "epoch": 2.5576208178438664, "grad_norm": 1.3540085554122925, "learning_rate": 0.0002, "loss": 0.5499, "step": 430 }, { "epoch": 2.5635687732342007, "grad_norm": 1.0933575630187988, "learning_rate": 0.0002, "loss": 0.5208, "step": 431 }, { "epoch": 2.569516728624535, "grad_norm": 1.1712511777877808, "learning_rate": 0.0002, "loss": 0.536, "step": 432 }, { "epoch": 2.57546468401487, "grad_norm": 1.4346905946731567, "learning_rate": 0.0002, "loss": 0.6115, "step": 433 }, { "epoch": 2.5814126394052046, "grad_norm": 2.1174967288970947, "learning_rate": 0.0002, "loss": 0.5341, "step": 434 }, { "epoch": 2.587360594795539, "grad_norm": 1.3911654949188232, "learning_rate": 0.0002, "loss": 0.5707, "step": 435 }, { "epoch": 2.5933085501858737, "grad_norm": 1.3274894952774048, "learning_rate": 0.0002, "loss": 0.6272, "step": 436 }, { "epoch": 2.599256505576208, "grad_norm": 0.9820629358291626, "learning_rate": 0.0002, "loss": 0.4664, "step": 437 }, { "epoch": 2.605204460966543, "grad_norm": 1.1450122594833374, "learning_rate": 0.0002, "loss": 0.4652, "step": 438 }, { "epoch": 2.6111524163568776, "grad_norm": 2.7575511932373047, "learning_rate": 0.0002, "loss": 0.5652, "step": 439 }, { "epoch": 2.617100371747212, "grad_norm": 1.4760148525238037, "learning_rate": 0.0002, "loss": 0.6331, "step": 440 }, { "epoch": 2.623048327137546, "grad_norm": 1.2463843822479248, "learning_rate": 0.0002, "loss": 0.5721, "step": 441 }, { "epoch": 2.628996282527881, "grad_norm": 1.2081701755523682, "learning_rate": 0.0002, "loss": 0.752, "step": 442 }, { "epoch": 2.6349442379182157, "grad_norm": 1.1693692207336426, "learning_rate": 0.0002, "loss": 0.5505, "step": 443 }, { "epoch": 2.64089219330855, "grad_norm": 1.3918544054031372, "learning_rate": 0.0002, "loss": 0.5387, "step": 444 }, { "epoch": 2.646840148698885, "grad_norm": 1.3081449270248413, "learning_rate": 0.0002, "loss": 0.4156, "step": 445 }, { "epoch": 2.652788104089219, "grad_norm": 1.1178191900253296, "learning_rate": 0.0002, "loss": 0.6099, "step": 446 }, { "epoch": 2.658736059479554, "grad_norm": 1.172034740447998, "learning_rate": 0.0002, "loss": 0.6112, "step": 447 }, { "epoch": 2.6646840148698887, "grad_norm": 1.3142459392547607, "learning_rate": 0.0002, "loss": 0.6123, "step": 448 }, { "epoch": 2.670631970260223, "grad_norm": 1.2363723516464233, "learning_rate": 0.0002, "loss": 0.5472, "step": 449 }, { "epoch": 2.6765799256505574, "grad_norm": 1.2881202697753906, "learning_rate": 0.0002, "loss": 0.4572, "step": 450 }, { "epoch": 2.682527881040892, "grad_norm": 1.0761253833770752, "learning_rate": 0.0002, "loss": 0.5366, "step": 451 }, { "epoch": 2.688475836431227, "grad_norm": 1.0405654907226562, "learning_rate": 0.0002, "loss": 0.6858, "step": 452 }, { "epoch": 2.694423791821561, "grad_norm": 1.3384194374084473, "learning_rate": 0.0002, "loss": 0.616, "step": 453 }, { "epoch": 2.700371747211896, "grad_norm": 8.933956146240234, "learning_rate": 0.0002, "loss": 0.6136, "step": 454 }, { "epoch": 2.7063197026022303, "grad_norm": 1.1435190439224243, "learning_rate": 0.0002, "loss": 0.5598, "step": 455 }, { "epoch": 2.712267657992565, "grad_norm": 1.2891956567764282, "learning_rate": 0.0002, "loss": 0.5945, "step": 456 }, { "epoch": 2.7182156133829, "grad_norm": 1.3077706098556519, "learning_rate": 0.0002, "loss": 0.5793, "step": 457 }, { "epoch": 2.724163568773234, "grad_norm": 1.1445353031158447, "learning_rate": 0.0002, "loss": 0.5644, "step": 458 }, { "epoch": 2.7301115241635685, "grad_norm": 1.1466567516326904, "learning_rate": 0.0002, "loss": 0.5748, "step": 459 }, { "epoch": 2.7360594795539033, "grad_norm": 1.0083645582199097, "learning_rate": 0.0002, "loss": 0.5676, "step": 460 }, { "epoch": 2.742007434944238, "grad_norm": 0.9980899691581726, "learning_rate": 0.0002, "loss": 0.5975, "step": 461 }, { "epoch": 2.7479553903345724, "grad_norm": 1.0702303647994995, "learning_rate": 0.0002, "loss": 0.5235, "step": 462 }, { "epoch": 2.753903345724907, "grad_norm": 1.3305853605270386, "learning_rate": 0.0002, "loss": 0.6493, "step": 463 }, { "epoch": 2.7598513011152415, "grad_norm": 1.4583408832550049, "learning_rate": 0.0002, "loss": 0.5948, "step": 464 }, { "epoch": 2.7657992565055762, "grad_norm": 1.1704531908035278, "learning_rate": 0.0002, "loss": 0.5631, "step": 465 }, { "epoch": 2.771747211895911, "grad_norm": 1.1165651082992554, "learning_rate": 0.0002, "loss": 0.6674, "step": 466 }, { "epoch": 2.7776951672862453, "grad_norm": 1.2043639421463013, "learning_rate": 0.0002, "loss": 0.649, "step": 467 }, { "epoch": 2.7836431226765797, "grad_norm": 1.0930832624435425, "learning_rate": 0.0002, "loss": 0.541, "step": 468 }, { "epoch": 2.7895910780669144, "grad_norm": 1.1153466701507568, "learning_rate": 0.0002, "loss": 0.4666, "step": 469 }, { "epoch": 2.795539033457249, "grad_norm": 3.27708101272583, "learning_rate": 0.0002, "loss": 0.6981, "step": 470 }, { "epoch": 2.8014869888475835, "grad_norm": 1.200003981590271, "learning_rate": 0.0002, "loss": 0.5579, "step": 471 }, { "epoch": 2.8074349442379183, "grad_norm": 1.2021151781082153, "learning_rate": 0.0002, "loss": 0.5383, "step": 472 }, { "epoch": 2.8133828996282526, "grad_norm": 1.0844088792800903, "learning_rate": 0.0002, "loss": 0.6027, "step": 473 }, { "epoch": 2.8193308550185874, "grad_norm": 1.1981035470962524, "learning_rate": 0.0002, "loss": 0.5589, "step": 474 }, { "epoch": 2.825278810408922, "grad_norm": 1.23332679271698, "learning_rate": 0.0002, "loss": 0.6733, "step": 475 }, { "epoch": 2.8312267657992565, "grad_norm": 1.2242364883422852, "learning_rate": 0.0002, "loss": 0.6392, "step": 476 }, { "epoch": 2.8371747211895912, "grad_norm": 1.2482764720916748, "learning_rate": 0.0002, "loss": 0.6185, "step": 477 }, { "epoch": 2.8431226765799256, "grad_norm": 1.3755487203598022, "learning_rate": 0.0002, "loss": 0.5761, "step": 478 }, { "epoch": 2.8490706319702603, "grad_norm": 1.2065231800079346, "learning_rate": 0.0002, "loss": 0.5185, "step": 479 }, { "epoch": 2.8550185873605947, "grad_norm": 1.1161603927612305, "learning_rate": 0.0002, "loss": 0.5697, "step": 480 }, { "epoch": 2.8609665427509294, "grad_norm": 1.7466390132904053, "learning_rate": 0.0002, "loss": 0.5835, "step": 481 }, { "epoch": 2.8669144981412638, "grad_norm": 1.371319055557251, "learning_rate": 0.0002, "loss": 0.6031, "step": 482 }, { "epoch": 2.8728624535315985, "grad_norm": 1.4363592863082886, "learning_rate": 0.0002, "loss": 0.6028, "step": 483 }, { "epoch": 2.8788104089219333, "grad_norm": 1.1025314331054688, "learning_rate": 0.0002, "loss": 0.492, "step": 484 }, { "epoch": 2.8847583643122676, "grad_norm": 1.04302978515625, "learning_rate": 0.0002, "loss": 0.6032, "step": 485 }, { "epoch": 2.8907063197026024, "grad_norm": 1.5093481540679932, "learning_rate": 0.0002, "loss": 0.6832, "step": 486 }, { "epoch": 2.8966542750929367, "grad_norm": 1.068484902381897, "learning_rate": 0.0002, "loss": 0.4942, "step": 487 }, { "epoch": 2.9026022304832715, "grad_norm": 2.1092681884765625, "learning_rate": 0.0002, "loss": 0.4909, "step": 488 }, { "epoch": 2.908550185873606, "grad_norm": 1.22842276096344, "learning_rate": 0.0002, "loss": 0.6226, "step": 489 }, { "epoch": 2.9144981412639406, "grad_norm": 1.1664717197418213, "learning_rate": 0.0002, "loss": 0.6093, "step": 490 }, { "epoch": 2.920446096654275, "grad_norm": 1.2886883020401, "learning_rate": 0.0002, "loss": 0.5866, "step": 491 }, { "epoch": 2.9263940520446097, "grad_norm": 1.1186504364013672, "learning_rate": 0.0002, "loss": 0.5942, "step": 492 }, { "epoch": 2.9323420074349444, "grad_norm": 1.2734028100967407, "learning_rate": 0.0002, "loss": 0.5871, "step": 493 }, { "epoch": 2.9382899628252788, "grad_norm": 1.1976778507232666, "learning_rate": 0.0002, "loss": 0.6766, "step": 494 }, { "epoch": 2.9442379182156135, "grad_norm": 1.524681806564331, "learning_rate": 0.0002, "loss": 0.602, "step": 495 }, { "epoch": 2.950185873605948, "grad_norm": 1.4174754619598389, "learning_rate": 0.0002, "loss": 0.6079, "step": 496 }, { "epoch": 2.9561338289962826, "grad_norm": 1.1006587743759155, "learning_rate": 0.0002, "loss": 0.6393, "step": 497 }, { "epoch": 2.962081784386617, "grad_norm": 1.3037843704223633, "learning_rate": 0.0002, "loss": 0.6065, "step": 498 }, { "epoch": 2.9680297397769517, "grad_norm": 1.5767035484313965, "learning_rate": 0.0002, "loss": 0.6444, "step": 499 }, { "epoch": 2.973977695167286, "grad_norm": 1.2918823957443237, "learning_rate": 0.0002, "loss": 0.5221, "step": 500 }, { "epoch": 2.979925650557621, "grad_norm": 1.0898538827896118, "learning_rate": 0.0002, "loss": 0.687, "step": 501 }, { "epoch": 2.9858736059479556, "grad_norm": 1.6398361921310425, "learning_rate": 0.0002, "loss": 0.5781, "step": 502 }, { "epoch": 2.99182156133829, "grad_norm": 1.2504217624664307, "learning_rate": 0.0002, "loss": 0.5916, "step": 503 }, { "epoch": 2.9977695167286247, "grad_norm": 1.8298507928848267, "learning_rate": 0.0002, "loss": 0.5777, "step": 504 } ], "logging_steps": 1, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.641281849472852e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }