{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 7820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00128, "grad_norm": 13.502479553222656, "learning_rate": 0.0, "loss": 10.9785, "step": 1 }, { "epoch": 0.00256, "grad_norm": 13.624500274658203, "learning_rate": 7.672634271099745e-07, "loss": 10.98, "step": 2 }, { "epoch": 0.00384, "grad_norm": 13.133468627929688, "learning_rate": 1.534526854219949e-06, "loss": 10.9409, "step": 3 }, { "epoch": 0.00512, "grad_norm": 13.427982330322266, "learning_rate": 2.301790281329923e-06, "loss": 10.8518, "step": 4 }, { "epoch": 0.0064, "grad_norm": 12.44352912902832, "learning_rate": 3.069053708439898e-06, "loss": 10.738, "step": 5 }, { "epoch": 0.00768, "grad_norm": 10.861345291137695, "learning_rate": 3.836317135549872e-06, "loss": 10.6095, "step": 6 }, { "epoch": 0.00896, "grad_norm": 9.260466575622559, "learning_rate": 4.603580562659846e-06, "loss": 10.4604, "step": 7 }, { "epoch": 0.01024, "grad_norm": 7.927541255950928, "learning_rate": 5.3708439897698205e-06, "loss": 10.3281, "step": 8 }, { "epoch": 0.01152, "grad_norm": 6.898108959197998, "learning_rate": 6.138107416879796e-06, "loss": 10.1985, "step": 9 }, { "epoch": 0.0128, "grad_norm": 5.914000988006592, "learning_rate": 6.905370843989769e-06, "loss": 10.0951, "step": 10 }, { "epoch": 0.01408, "grad_norm": 5.054530143737793, "learning_rate": 7.672634271099744e-06, "loss": 10.012, "step": 11 }, { "epoch": 0.01536, "grad_norm": 4.351410865783691, "learning_rate": 8.439897698209718e-06, "loss": 9.9117, "step": 12 }, { "epoch": 0.01664, "grad_norm": 3.752089500427246, "learning_rate": 9.207161125319692e-06, "loss": 9.8225, "step": 13 }, { "epoch": 0.01792, "grad_norm": 3.252004623413086, "learning_rate": 9.974424552429668e-06, "loss": 9.7603, "step": 14 }, { "epoch": 0.0192, "grad_norm": 2.8720457553863525, "learning_rate": 1.0741687979539641e-05, "loss": 9.7285, "step": 15 }, { "epoch": 0.02048, "grad_norm": 2.666605234146118, "learning_rate": 1.1508951406649615e-05, "loss": 9.6632, "step": 16 }, { "epoch": 0.02176, "grad_norm": 2.497063398361206, "learning_rate": 1.2276214833759591e-05, "loss": 9.6141, "step": 17 }, { "epoch": 0.02304, "grad_norm": 2.3066694736480713, "learning_rate": 1.3043478260869564e-05, "loss": 9.6125, "step": 18 }, { "epoch": 0.02432, "grad_norm": 2.267719268798828, "learning_rate": 1.3810741687979538e-05, "loss": 9.5742, "step": 19 }, { "epoch": 0.0256, "grad_norm": 2.268841505050659, "learning_rate": 1.4578005115089511e-05, "loss": 9.5505, "step": 20 }, { "epoch": 0.02688, "grad_norm": 2.2032310962677, "learning_rate": 1.5345268542199487e-05, "loss": 9.5406, "step": 21 }, { "epoch": 0.02816, "grad_norm": 2.1540212631225586, "learning_rate": 1.611253196930946e-05, "loss": 9.5205, "step": 22 }, { "epoch": 0.02944, "grad_norm": 2.1648716926574707, "learning_rate": 1.6879795396419436e-05, "loss": 9.5023, "step": 23 }, { "epoch": 0.03072, "grad_norm": 2.1777994632720947, "learning_rate": 1.764705882352941e-05, "loss": 9.4566, "step": 24 }, { "epoch": 0.032, "grad_norm": 2.093733310699463, "learning_rate": 1.8414322250639385e-05, "loss": 9.4797, "step": 25 }, { "epoch": 0.03328, "grad_norm": 2.14025616645813, "learning_rate": 1.918158567774936e-05, "loss": 9.4326, "step": 26 }, { "epoch": 0.03456, "grad_norm": 2.243187665939331, "learning_rate": 1.9948849104859337e-05, "loss": 9.3569, "step": 27 }, { "epoch": 0.03584, "grad_norm": 2.1062655448913574, "learning_rate": 2.0716112531969308e-05, "loss": 9.3849, "step": 28 }, { "epoch": 0.03712, "grad_norm": 2.0766758918762207, "learning_rate": 2.1483375959079282e-05, "loss": 9.351, "step": 29 }, { "epoch": 0.0384, "grad_norm": 2.101067304611206, "learning_rate": 2.2250639386189256e-05, "loss": 9.3139, "step": 30 }, { "epoch": 0.03968, "grad_norm": 2.0506746768951416, "learning_rate": 2.301790281329923e-05, "loss": 9.2755, "step": 31 }, { "epoch": 0.04096, "grad_norm": 2.015533924102783, "learning_rate": 2.3785166240409205e-05, "loss": 9.2446, "step": 32 }, { "epoch": 0.04224, "grad_norm": 1.9934455156326294, "learning_rate": 2.4552429667519183e-05, "loss": 9.2283, "step": 33 }, { "epoch": 0.04352, "grad_norm": 1.9839218854904175, "learning_rate": 2.5319693094629154e-05, "loss": 9.1785, "step": 34 }, { "epoch": 0.0448, "grad_norm": 2.0591323375701904, "learning_rate": 2.6086956521739128e-05, "loss": 9.1733, "step": 35 }, { "epoch": 0.04608, "grad_norm": 1.8989123106002808, "learning_rate": 2.6854219948849103e-05, "loss": 9.1775, "step": 36 }, { "epoch": 0.04736, "grad_norm": 2.2588517665863037, "learning_rate": 2.7621483375959077e-05, "loss": 9.1317, "step": 37 }, { "epoch": 0.04864, "grad_norm": 1.869653582572937, "learning_rate": 2.838874680306905e-05, "loss": 9.0794, "step": 38 }, { "epoch": 0.04992, "grad_norm": 2.098316192626953, "learning_rate": 2.9156010230179022e-05, "loss": 9.0853, "step": 39 }, { "epoch": 0.0512, "grad_norm": 1.8606550693511963, "learning_rate": 2.9923273657289e-05, "loss": 9.0243, "step": 40 }, { "epoch": 0.05248, "grad_norm": 1.9760617017745972, "learning_rate": 3.0690537084398974e-05, "loss": 8.9874, "step": 41 }, { "epoch": 0.05376, "grad_norm": 1.8577723503112793, "learning_rate": 3.145780051150895e-05, "loss": 8.9687, "step": 42 }, { "epoch": 0.05504, "grad_norm": 1.7823419570922852, "learning_rate": 3.222506393861892e-05, "loss": 8.9302, "step": 43 }, { "epoch": 0.05632, "grad_norm": 1.8468880653381348, "learning_rate": 3.29923273657289e-05, "loss": 8.8693, "step": 44 }, { "epoch": 0.0576, "grad_norm": 1.7553874254226685, "learning_rate": 3.375959079283887e-05, "loss": 8.8835, "step": 45 }, { "epoch": 0.05888, "grad_norm": 1.8048585653305054, "learning_rate": 3.4526854219948846e-05, "loss": 8.8015, "step": 46 }, { "epoch": 0.06016, "grad_norm": 1.8145036697387695, "learning_rate": 3.529411764705882e-05, "loss": 8.7847, "step": 47 }, { "epoch": 0.06144, "grad_norm": 1.6603283882141113, "learning_rate": 3.6061381074168795e-05, "loss": 8.7738, "step": 48 }, { "epoch": 0.06272, "grad_norm": 1.6837772130966187, "learning_rate": 3.682864450127877e-05, "loss": 8.7088, "step": 49 }, { "epoch": 0.064, "grad_norm": 1.686160683631897, "learning_rate": 3.7595907928388744e-05, "loss": 8.6635, "step": 50 }, { "epoch": 0.06528, "grad_norm": 1.7713919878005981, "learning_rate": 3.836317135549872e-05, "loss": 8.6265, "step": 51 }, { "epoch": 0.06656, "grad_norm": 1.8491101264953613, "learning_rate": 3.913043478260869e-05, "loss": 8.5384, "step": 52 }, { "epoch": 0.06784, "grad_norm": 1.6337801218032837, "learning_rate": 3.989769820971867e-05, "loss": 8.5446, "step": 53 }, { "epoch": 0.06912, "grad_norm": 1.5614173412322998, "learning_rate": 4.066496163682864e-05, "loss": 8.5252, "step": 54 }, { "epoch": 0.0704, "grad_norm": 1.5374433994293213, "learning_rate": 4.1432225063938615e-05, "loss": 8.4633, "step": 55 }, { "epoch": 0.07168, "grad_norm": 1.6636866331100464, "learning_rate": 4.219948849104859e-05, "loss": 8.4685, "step": 56 }, { "epoch": 0.07296, "grad_norm": 1.6900804042816162, "learning_rate": 4.2966751918158564e-05, "loss": 8.4324, "step": 57 }, { "epoch": 0.07424, "grad_norm": 1.5862479209899902, "learning_rate": 4.373401534526854e-05, "loss": 8.359, "step": 58 }, { "epoch": 0.07552, "grad_norm": 1.5928101539611816, "learning_rate": 4.450127877237851e-05, "loss": 8.3375, "step": 59 }, { "epoch": 0.0768, "grad_norm": 1.5766217708587646, "learning_rate": 4.526854219948848e-05, "loss": 8.2851, "step": 60 }, { "epoch": 0.07808, "grad_norm": 1.5101016759872437, "learning_rate": 4.603580562659846e-05, "loss": 8.2516, "step": 61 }, { "epoch": 0.07936, "grad_norm": 1.3688981533050537, "learning_rate": 4.6803069053708436e-05, "loss": 8.268, "step": 62 }, { "epoch": 0.08064, "grad_norm": 1.4474924802780151, "learning_rate": 4.757033248081841e-05, "loss": 8.2099, "step": 63 }, { "epoch": 0.08192, "grad_norm": 1.4008480310440063, "learning_rate": 4.8337595907928385e-05, "loss": 8.1661, "step": 64 }, { "epoch": 0.0832, "grad_norm": 1.4576811790466309, "learning_rate": 4.9104859335038366e-05, "loss": 8.1582, "step": 65 }, { "epoch": 0.08448, "grad_norm": 1.4062941074371338, "learning_rate": 4.987212276214833e-05, "loss": 8.0852, "step": 66 }, { "epoch": 0.08576, "grad_norm": 1.3305187225341797, "learning_rate": 5.063938618925831e-05, "loss": 8.0746, "step": 67 }, { "epoch": 0.08704, "grad_norm": 1.3217337131500244, "learning_rate": 5.140664961636828e-05, "loss": 8.0454, "step": 68 }, { "epoch": 0.08832, "grad_norm": 1.3334351778030396, "learning_rate": 5.2173913043478256e-05, "loss": 7.9983, "step": 69 }, { "epoch": 0.0896, "grad_norm": 1.2380847930908203, "learning_rate": 5.294117647058824e-05, "loss": 7.9684, "step": 70 }, { "epoch": 0.09088, "grad_norm": 1.3053793907165527, "learning_rate": 5.3708439897698205e-05, "loss": 7.9135, "step": 71 }, { "epoch": 0.09216, "grad_norm": 1.3208932876586914, "learning_rate": 5.447570332480817e-05, "loss": 7.9277, "step": 72 }, { "epoch": 0.09344, "grad_norm": 1.3776912689208984, "learning_rate": 5.5242966751918154e-05, "loss": 7.861, "step": 73 }, { "epoch": 0.09472, "grad_norm": 1.1503986120224, "learning_rate": 5.601023017902813e-05, "loss": 7.8479, "step": 74 }, { "epoch": 0.096, "grad_norm": 1.2197959423065186, "learning_rate": 5.67774936061381e-05, "loss": 7.7553, "step": 75 }, { "epoch": 0.09728, "grad_norm": 1.0848344564437866, "learning_rate": 5.754475703324808e-05, "loss": 7.7622, "step": 76 }, { "epoch": 0.09856, "grad_norm": 1.0727125406265259, "learning_rate": 5.8312020460358044e-05, "loss": 7.6749, "step": 77 }, { "epoch": 0.09984, "grad_norm": 1.0996843576431274, "learning_rate": 5.9079283887468026e-05, "loss": 7.7144, "step": 78 }, { "epoch": 0.10112, "grad_norm": 1.086287498474121, "learning_rate": 5.9846547314578e-05, "loss": 7.644, "step": 79 }, { "epoch": 0.1024, "grad_norm": 0.9513853192329407, "learning_rate": 6.0613810741687974e-05, "loss": 7.6277, "step": 80 }, { "epoch": 0.10368, "grad_norm": 1.0651500225067139, "learning_rate": 6.138107416879795e-05, "loss": 7.6083, "step": 81 }, { "epoch": 0.10496, "grad_norm": 1.1370134353637695, "learning_rate": 6.214833759590792e-05, "loss": 7.5705, "step": 82 }, { "epoch": 0.10624, "grad_norm": 1.0704236030578613, "learning_rate": 6.29156010230179e-05, "loss": 7.5459, "step": 83 }, { "epoch": 0.10752, "grad_norm": 1.046200156211853, "learning_rate": 6.368286445012787e-05, "loss": 7.5118, "step": 84 }, { "epoch": 0.1088, "grad_norm": 0.8698644638061523, "learning_rate": 6.445012787723785e-05, "loss": 7.4862, "step": 85 }, { "epoch": 0.11008, "grad_norm": 1.1209050416946411, "learning_rate": 6.521739130434782e-05, "loss": 7.485, "step": 86 }, { "epoch": 0.11136, "grad_norm": 1.3683184385299683, "learning_rate": 6.59846547314578e-05, "loss": 7.4883, "step": 87 }, { "epoch": 0.11264, "grad_norm": 0.9742879867553711, "learning_rate": 6.675191815856777e-05, "loss": 7.3843, "step": 88 }, { "epoch": 0.11392, "grad_norm": 1.3988070487976074, "learning_rate": 6.751918158567774e-05, "loss": 7.394, "step": 89 }, { "epoch": 0.1152, "grad_norm": 1.1061723232269287, "learning_rate": 6.828644501278772e-05, "loss": 7.3124, "step": 90 }, { "epoch": 0.11648, "grad_norm": 1.0231846570968628, "learning_rate": 6.905370843989769e-05, "loss": 7.3708, "step": 91 }, { "epoch": 0.11776, "grad_norm": 1.3509619235992432, "learning_rate": 6.982097186700767e-05, "loss": 7.3974, "step": 92 }, { "epoch": 0.11904, "grad_norm": 0.8934109807014465, "learning_rate": 7.058823529411764e-05, "loss": 7.3336, "step": 93 }, { "epoch": 0.12032, "grad_norm": 0.7775417566299438, "learning_rate": 7.135549872122762e-05, "loss": 7.3351, "step": 94 }, { "epoch": 0.1216, "grad_norm": 1.0462260246276855, "learning_rate": 7.212276214833759e-05, "loss": 7.3454, "step": 95 }, { "epoch": 0.12288, "grad_norm": 0.9035119414329529, "learning_rate": 7.289002557544756e-05, "loss": 7.2667, "step": 96 }, { "epoch": 0.12416, "grad_norm": 0.8907987475395203, "learning_rate": 7.365728900255754e-05, "loss": 7.2154, "step": 97 }, { "epoch": 0.12544, "grad_norm": 0.9043131470680237, "learning_rate": 7.442455242966751e-05, "loss": 7.2902, "step": 98 }, { "epoch": 0.12672, "grad_norm": 0.9289432764053345, "learning_rate": 7.519181585677749e-05, "loss": 7.2135, "step": 99 }, { "epoch": 0.128, "grad_norm": 0.6835809350013733, "learning_rate": 7.595907928388747e-05, "loss": 7.1967, "step": 100 }, { "epoch": 0.12928, "grad_norm": 1.0327857732772827, "learning_rate": 7.672634271099744e-05, "loss": 7.1981, "step": 101 }, { "epoch": 0.13056, "grad_norm": 0.8814536929130554, "learning_rate": 7.749360613810741e-05, "loss": 7.1923, "step": 102 }, { "epoch": 0.13184, "grad_norm": 0.8455113172531128, "learning_rate": 7.826086956521738e-05, "loss": 7.2318, "step": 103 }, { "epoch": 0.13312, "grad_norm": 1.2715808153152466, "learning_rate": 7.902813299232736e-05, "loss": 7.1884, "step": 104 }, { "epoch": 0.1344, "grad_norm": 0.904243528842926, "learning_rate": 7.979539641943735e-05, "loss": 7.1839, "step": 105 }, { "epoch": 0.13568, "grad_norm": 0.9426031112670898, "learning_rate": 8.056265984654731e-05, "loss": 7.1849, "step": 106 }, { "epoch": 0.13696, "grad_norm": 0.7906021475791931, "learning_rate": 8.132992327365728e-05, "loss": 7.1946, "step": 107 }, { "epoch": 0.13824, "grad_norm": 0.8392283320426941, "learning_rate": 8.209718670076726e-05, "loss": 7.1438, "step": 108 }, { "epoch": 0.13952, "grad_norm": 0.7050894498825073, "learning_rate": 8.286445012787723e-05, "loss": 7.1547, "step": 109 }, { "epoch": 0.1408, "grad_norm": 0.8787212371826172, "learning_rate": 8.363171355498722e-05, "loss": 7.1193, "step": 110 }, { "epoch": 0.14208, "grad_norm": 0.5892102122306824, "learning_rate": 8.439897698209718e-05, "loss": 7.0926, "step": 111 }, { "epoch": 0.14336, "grad_norm": 1.2569445371627808, "learning_rate": 8.516624040920715e-05, "loss": 7.1459, "step": 112 }, { "epoch": 0.14464, "grad_norm": 0.6565424799919128, "learning_rate": 8.593350383631713e-05, "loss": 7.0536, "step": 113 }, { "epoch": 0.14592, "grad_norm": 0.875479519367218, "learning_rate": 8.670076726342709e-05, "loss": 7.1119, "step": 114 }, { "epoch": 0.1472, "grad_norm": 0.6395849585533142, "learning_rate": 8.746803069053708e-05, "loss": 7.0775, "step": 115 }, { "epoch": 0.14848, "grad_norm": 0.7110955715179443, "learning_rate": 8.823529411764705e-05, "loss": 7.0741, "step": 116 }, { "epoch": 0.14976, "grad_norm": 0.6394131779670715, "learning_rate": 8.900255754475703e-05, "loss": 7.0442, "step": 117 }, { "epoch": 0.15104, "grad_norm": 0.9272814393043518, "learning_rate": 8.9769820971867e-05, "loss": 7.0344, "step": 118 }, { "epoch": 0.15232, "grad_norm": 0.858862578868866, "learning_rate": 9.053708439897696e-05, "loss": 7.0646, "step": 119 }, { "epoch": 0.1536, "grad_norm": 1.1841069459915161, "learning_rate": 9.130434782608695e-05, "loss": 7.0339, "step": 120 }, { "epoch": 0.15488, "grad_norm": 0.9319404363632202, "learning_rate": 9.207161125319692e-05, "loss": 7.0008, "step": 121 }, { "epoch": 0.15616, "grad_norm": 0.8237971067428589, "learning_rate": 9.28388746803069e-05, "loss": 7.0618, "step": 122 }, { "epoch": 0.15744, "grad_norm": 1.379276156425476, "learning_rate": 9.360613810741687e-05, "loss": 6.9875, "step": 123 }, { "epoch": 0.15872, "grad_norm": 0.9294237494468689, "learning_rate": 9.437340153452683e-05, "loss": 6.9856, "step": 124 }, { "epoch": 0.16, "grad_norm": 0.9710653424263, "learning_rate": 9.514066496163682e-05, "loss": 6.9368, "step": 125 }, { "epoch": 0.16128, "grad_norm": 1.0589849948883057, "learning_rate": 9.59079283887468e-05, "loss": 6.9884, "step": 126 }, { "epoch": 0.16256, "grad_norm": 0.6673716902732849, "learning_rate": 9.667519181585677e-05, "loss": 6.9719, "step": 127 }, { "epoch": 0.16384, "grad_norm": 0.7187034487724304, "learning_rate": 9.744245524296674e-05, "loss": 6.9662, "step": 128 }, { "epoch": 0.16512, "grad_norm": 0.7513381242752075, "learning_rate": 9.820971867007673e-05, "loss": 6.8949, "step": 129 }, { "epoch": 0.1664, "grad_norm": 0.6384944319725037, "learning_rate": 9.897698209718669e-05, "loss": 6.9296, "step": 130 }, { "epoch": 0.16768, "grad_norm": 0.6383824348449707, "learning_rate": 9.974424552429667e-05, "loss": 6.956, "step": 131 }, { "epoch": 0.16896, "grad_norm": 0.8002307415008545, "learning_rate": 0.00010051150895140664, "loss": 6.9135, "step": 132 }, { "epoch": 0.17024, "grad_norm": 0.6587149500846863, "learning_rate": 0.00010127877237851662, "loss": 6.8886, "step": 133 }, { "epoch": 0.17152, "grad_norm": 0.7700603604316711, "learning_rate": 0.0001020460358056266, "loss": 6.9318, "step": 134 }, { "epoch": 0.1728, "grad_norm": 0.7758870720863342, "learning_rate": 0.00010281329923273656, "loss": 6.9266, "step": 135 }, { "epoch": 0.17408, "grad_norm": 0.818088948726654, "learning_rate": 0.00010358056265984654, "loss": 6.8917, "step": 136 }, { "epoch": 0.17536, "grad_norm": 1.1521923542022705, "learning_rate": 0.00010434782608695651, "loss": 6.9264, "step": 137 }, { "epoch": 0.17664, "grad_norm": 0.7598437070846558, "learning_rate": 0.00010511508951406649, "loss": 6.8756, "step": 138 }, { "epoch": 0.17792, "grad_norm": 0.8913196921348572, "learning_rate": 0.00010588235294117647, "loss": 6.8535, "step": 139 }, { "epoch": 0.1792, "grad_norm": 1.0058693885803223, "learning_rate": 0.00010664961636828644, "loss": 6.88, "step": 140 }, { "epoch": 0.18048, "grad_norm": 0.9338283538818359, "learning_rate": 0.00010741687979539641, "loss": 6.8383, "step": 141 }, { "epoch": 0.18176, "grad_norm": 0.6229548454284668, "learning_rate": 0.00010818414322250638, "loss": 6.7874, "step": 142 }, { "epoch": 0.18304, "grad_norm": 0.7639989256858826, "learning_rate": 0.00010895140664961635, "loss": 6.7625, "step": 143 }, { "epoch": 0.18432, "grad_norm": 0.6291145086288452, "learning_rate": 0.00010971867007672633, "loss": 6.7944, "step": 144 }, { "epoch": 0.1856, "grad_norm": 0.8313521146774292, "learning_rate": 0.00011048593350383631, "loss": 6.859, "step": 145 }, { "epoch": 0.18688, "grad_norm": 1.0272287130355835, "learning_rate": 0.00011125319693094628, "loss": 6.8568, "step": 146 }, { "epoch": 0.18816, "grad_norm": 1.1838618516921997, "learning_rate": 0.00011202046035805626, "loss": 6.7789, "step": 147 }, { "epoch": 0.18944, "grad_norm": 0.5190697312355042, "learning_rate": 0.00011278772378516622, "loss": 6.8148, "step": 148 }, { "epoch": 0.19072, "grad_norm": 0.7801781892776489, "learning_rate": 0.0001135549872122762, "loss": 6.8162, "step": 149 }, { "epoch": 0.192, "grad_norm": 0.8845781087875366, "learning_rate": 0.00011432225063938618, "loss": 6.7817, "step": 150 }, { "epoch": 0.19328, "grad_norm": 0.7235129475593567, "learning_rate": 0.00011508951406649615, "loss": 6.8149, "step": 151 }, { "epoch": 0.19456, "grad_norm": 0.827165961265564, "learning_rate": 0.00011585677749360613, "loss": 6.6936, "step": 152 }, { "epoch": 0.19584, "grad_norm": 0.6322879791259766, "learning_rate": 0.00011662404092071609, "loss": 6.7753, "step": 153 }, { "epoch": 0.19712, "grad_norm": 0.6085387468338013, "learning_rate": 0.00011739130434782608, "loss": 6.7522, "step": 154 }, { "epoch": 0.1984, "grad_norm": 0.5771993398666382, "learning_rate": 0.00011815856777493605, "loss": 6.7441, "step": 155 }, { "epoch": 0.19968, "grad_norm": 0.6479660272598267, "learning_rate": 0.00011892583120204603, "loss": 6.795, "step": 156 }, { "epoch": 0.20096, "grad_norm": 0.7639020085334778, "learning_rate": 0.000119693094629156, "loss": 6.7089, "step": 157 }, { "epoch": 0.20224, "grad_norm": 0.7963739037513733, "learning_rate": 0.00012046035805626599, "loss": 6.7321, "step": 158 }, { "epoch": 0.20352, "grad_norm": 0.8144316077232361, "learning_rate": 0.00012122762148337595, "loss": 6.7284, "step": 159 }, { "epoch": 0.2048, "grad_norm": 0.8595559000968933, "learning_rate": 0.00012199488491048592, "loss": 6.7329, "step": 160 }, { "epoch": 0.20608, "grad_norm": 0.8563242554664612, "learning_rate": 0.0001227621483375959, "loss": 6.7072, "step": 161 }, { "epoch": 0.20736, "grad_norm": 0.9249243140220642, "learning_rate": 0.00012352941176470587, "loss": 6.7389, "step": 162 }, { "epoch": 0.20864, "grad_norm": 0.7681375741958618, "learning_rate": 0.00012429667519181585, "loss": 6.6333, "step": 163 }, { "epoch": 0.20992, "grad_norm": 0.8071373105049133, "learning_rate": 0.00012506393861892582, "loss": 6.7132, "step": 164 }, { "epoch": 0.2112, "grad_norm": 0.7506120204925537, "learning_rate": 0.0001258312020460358, "loss": 6.6808, "step": 165 }, { "epoch": 0.21248, "grad_norm": 0.8336671590805054, "learning_rate": 0.00012659846547314577, "loss": 6.7015, "step": 166 }, { "epoch": 0.21376, "grad_norm": 0.9540615677833557, "learning_rate": 0.00012736572890025574, "loss": 6.6975, "step": 167 }, { "epoch": 0.21504, "grad_norm": 0.8073275089263916, "learning_rate": 0.00012813299232736572, "loss": 6.6884, "step": 168 }, { "epoch": 0.21632, "grad_norm": 0.6824653148651123, "learning_rate": 0.0001289002557544757, "loss": 6.6723, "step": 169 }, { "epoch": 0.2176, "grad_norm": 0.5908713340759277, "learning_rate": 0.00012966751918158567, "loss": 6.6807, "step": 170 }, { "epoch": 0.21888, "grad_norm": 0.7622809410095215, "learning_rate": 0.00013043478260869564, "loss": 6.6472, "step": 171 }, { "epoch": 0.22016, "grad_norm": 0.6623175740242004, "learning_rate": 0.00013120204603580562, "loss": 6.6533, "step": 172 }, { "epoch": 0.22144, "grad_norm": 0.7238216400146484, "learning_rate": 0.0001319693094629156, "loss": 6.654, "step": 173 }, { "epoch": 0.22272, "grad_norm": 0.6670112013816833, "learning_rate": 0.00013273657289002556, "loss": 6.6262, "step": 174 }, { "epoch": 0.224, "grad_norm": 0.6878064274787903, "learning_rate": 0.00013350383631713554, "loss": 6.6101, "step": 175 }, { "epoch": 0.22528, "grad_norm": 0.7931588292121887, "learning_rate": 0.0001342710997442455, "loss": 6.6616, "step": 176 }, { "epoch": 0.22656, "grad_norm": 1.0632978677749634, "learning_rate": 0.0001350383631713555, "loss": 6.6203, "step": 177 }, { "epoch": 0.22784, "grad_norm": 0.9125176668167114, "learning_rate": 0.00013580562659846546, "loss": 6.5786, "step": 178 }, { "epoch": 0.22912, "grad_norm": 0.6771340370178223, "learning_rate": 0.00013657289002557544, "loss": 6.6114, "step": 179 }, { "epoch": 0.2304, "grad_norm": 0.7708578109741211, "learning_rate": 0.0001373401534526854, "loss": 6.663, "step": 180 }, { "epoch": 0.23168, "grad_norm": 0.7952269315719604, "learning_rate": 0.00013810741687979538, "loss": 6.5768, "step": 181 }, { "epoch": 0.23296, "grad_norm": 0.9929698705673218, "learning_rate": 0.00013887468030690536, "loss": 6.5578, "step": 182 }, { "epoch": 0.23424, "grad_norm": 1.016129493713379, "learning_rate": 0.00013964194373401533, "loss": 6.6065, "step": 183 }, { "epoch": 0.23552, "grad_norm": 1.0715256929397583, "learning_rate": 0.0001404092071611253, "loss": 6.6292, "step": 184 }, { "epoch": 0.2368, "grad_norm": 1.1549087762832642, "learning_rate": 0.00014117647058823528, "loss": 6.5751, "step": 185 }, { "epoch": 0.23808, "grad_norm": 0.736225962638855, "learning_rate": 0.00014194373401534526, "loss": 6.5302, "step": 186 }, { "epoch": 0.23936, "grad_norm": 0.6689443588256836, "learning_rate": 0.00014271099744245523, "loss": 6.6393, "step": 187 }, { "epoch": 0.24064, "grad_norm": 0.9276636838912964, "learning_rate": 0.0001434782608695652, "loss": 6.5304, "step": 188 }, { "epoch": 0.24192, "grad_norm": 1.093260645866394, "learning_rate": 0.00014424552429667518, "loss": 6.5826, "step": 189 }, { "epoch": 0.2432, "grad_norm": 0.9400092959403992, "learning_rate": 0.00014501278772378515, "loss": 6.5214, "step": 190 }, { "epoch": 0.24448, "grad_norm": 0.9401909708976746, "learning_rate": 0.00014578005115089513, "loss": 6.5743, "step": 191 }, { "epoch": 0.24576, "grad_norm": 0.8417365550994873, "learning_rate": 0.0001465473145780051, "loss": 6.5711, "step": 192 }, { "epoch": 0.24704, "grad_norm": 0.8696411848068237, "learning_rate": 0.00014731457800511508, "loss": 6.5372, "step": 193 }, { "epoch": 0.24832, "grad_norm": 0.696698784828186, "learning_rate": 0.00014808184143222505, "loss": 6.5471, "step": 194 }, { "epoch": 0.2496, "grad_norm": 0.8627312779426575, "learning_rate": 0.00014884910485933503, "loss": 6.495, "step": 195 }, { "epoch": 0.25088, "grad_norm": 0.8102883100509644, "learning_rate": 0.000149616368286445, "loss": 6.5341, "step": 196 }, { "epoch": 0.25216, "grad_norm": 0.8614912629127502, "learning_rate": 0.00015038363171355497, "loss": 6.5825, "step": 197 }, { "epoch": 0.25344, "grad_norm": 0.6828194260597229, "learning_rate": 0.00015115089514066495, "loss": 6.5386, "step": 198 }, { "epoch": 0.25472, "grad_norm": 0.6624521613121033, "learning_rate": 0.00015191815856777495, "loss": 6.4857, "step": 199 }, { "epoch": 0.256, "grad_norm": 0.6489097476005554, "learning_rate": 0.0001526854219948849, "loss": 6.5139, "step": 200 }, { "epoch": 0.25728, "grad_norm": 0.6855762004852295, "learning_rate": 0.00015345268542199487, "loss": 6.4764, "step": 201 }, { "epoch": 0.25856, "grad_norm": 0.8213603496551514, "learning_rate": 0.00015421994884910485, "loss": 6.5242, "step": 202 }, { "epoch": 0.25984, "grad_norm": 0.824782133102417, "learning_rate": 0.00015498721227621482, "loss": 6.5478, "step": 203 }, { "epoch": 0.26112, "grad_norm": 0.9565497636795044, "learning_rate": 0.00015575447570332482, "loss": 6.5082, "step": 204 }, { "epoch": 0.2624, "grad_norm": 0.9202224612236023, "learning_rate": 0.00015652173913043477, "loss": 6.4114, "step": 205 }, { "epoch": 0.26368, "grad_norm": 0.992260217666626, "learning_rate": 0.00015728900255754474, "loss": 6.4625, "step": 206 }, { "epoch": 0.26496, "grad_norm": 0.8465138077735901, "learning_rate": 0.00015805626598465472, "loss": 6.439, "step": 207 }, { "epoch": 0.26624, "grad_norm": 0.7151897549629211, "learning_rate": 0.0001588235294117647, "loss": 6.4956, "step": 208 }, { "epoch": 0.26752, "grad_norm": 0.8685120940208435, "learning_rate": 0.0001595907928388747, "loss": 6.4803, "step": 209 }, { "epoch": 0.2688, "grad_norm": 0.81340491771698, "learning_rate": 0.00016035805626598464, "loss": 6.4193, "step": 210 }, { "epoch": 0.27008, "grad_norm": 0.7921631932258606, "learning_rate": 0.00016112531969309462, "loss": 6.4279, "step": 211 }, { "epoch": 0.27136, "grad_norm": 0.6297151446342468, "learning_rate": 0.0001618925831202046, "loss": 6.5347, "step": 212 }, { "epoch": 0.27264, "grad_norm": 0.6811320781707764, "learning_rate": 0.00016265984654731456, "loss": 6.4414, "step": 213 }, { "epoch": 0.27392, "grad_norm": 0.6986665725708008, "learning_rate": 0.00016342710997442457, "loss": 6.4284, "step": 214 }, { "epoch": 0.2752, "grad_norm": 0.6655412316322327, "learning_rate": 0.0001641943734015345, "loss": 6.4398, "step": 215 }, { "epoch": 0.27648, "grad_norm": 0.6471274495124817, "learning_rate": 0.0001649616368286445, "loss": 6.4229, "step": 216 }, { "epoch": 0.27776, "grad_norm": 0.7184582948684692, "learning_rate": 0.00016572890025575446, "loss": 6.4255, "step": 217 }, { "epoch": 0.27904, "grad_norm": 0.7616591453552246, "learning_rate": 0.00016649616368286444, "loss": 6.4472, "step": 218 }, { "epoch": 0.28032, "grad_norm": 0.6204221248626709, "learning_rate": 0.00016726342710997444, "loss": 6.3404, "step": 219 }, { "epoch": 0.2816, "grad_norm": 0.7307862639427185, "learning_rate": 0.00016803069053708438, "loss": 6.4221, "step": 220 }, { "epoch": 0.28288, "grad_norm": 0.6093372702598572, "learning_rate": 0.00016879795396419436, "loss": 6.3769, "step": 221 }, { "epoch": 0.28416, "grad_norm": 0.7051405906677246, "learning_rate": 0.00016956521739130433, "loss": 6.4473, "step": 222 }, { "epoch": 0.28544, "grad_norm": 0.6887394785881042, "learning_rate": 0.0001703324808184143, "loss": 6.4528, "step": 223 }, { "epoch": 0.28672, "grad_norm": 0.7406412959098816, "learning_rate": 0.00017109974424552428, "loss": 6.375, "step": 224 }, { "epoch": 0.288, "grad_norm": 0.6635006666183472, "learning_rate": 0.00017186700767263426, "loss": 6.4649, "step": 225 }, { "epoch": 0.28928, "grad_norm": 0.7995834946632385, "learning_rate": 0.00017263427109974423, "loss": 6.3737, "step": 226 }, { "epoch": 0.29056, "grad_norm": 0.9514179825782776, "learning_rate": 0.00017340153452685418, "loss": 6.3449, "step": 227 }, { "epoch": 0.29184, "grad_norm": 1.0529365539550781, "learning_rate": 0.00017416879795396418, "loss": 6.3666, "step": 228 }, { "epoch": 0.29312, "grad_norm": 1.0295612812042236, "learning_rate": 0.00017493606138107415, "loss": 6.4025, "step": 229 }, { "epoch": 0.2944, "grad_norm": 1.07936692237854, "learning_rate": 0.00017570332480818413, "loss": 6.3812, "step": 230 }, { "epoch": 0.29568, "grad_norm": 0.9472998976707458, "learning_rate": 0.0001764705882352941, "loss": 6.3867, "step": 231 }, { "epoch": 0.29696, "grad_norm": 0.6875970959663391, "learning_rate": 0.00017723785166240405, "loss": 6.4047, "step": 232 }, { "epoch": 0.29824, "grad_norm": 0.8679647445678711, "learning_rate": 0.00017800511508951405, "loss": 6.3743, "step": 233 }, { "epoch": 0.29952, "grad_norm": 0.9203290939331055, "learning_rate": 0.00017877237851662403, "loss": 6.3606, "step": 234 }, { "epoch": 0.3008, "grad_norm": 0.8777357935905457, "learning_rate": 0.000179539641943734, "loss": 6.3564, "step": 235 }, { "epoch": 0.30208, "grad_norm": 0.8760378956794739, "learning_rate": 0.00018030690537084397, "loss": 6.3576, "step": 236 }, { "epoch": 0.30336, "grad_norm": 0.7797200083732605, "learning_rate": 0.00018107416879795392, "loss": 6.2981, "step": 237 }, { "epoch": 0.30464, "grad_norm": 0.7287745475769043, "learning_rate": 0.00018184143222506392, "loss": 6.342, "step": 238 }, { "epoch": 0.30592, "grad_norm": 0.7142657041549683, "learning_rate": 0.0001826086956521739, "loss": 6.4097, "step": 239 }, { "epoch": 0.3072, "grad_norm": 0.8947247266769409, "learning_rate": 0.00018337595907928387, "loss": 6.4198, "step": 240 }, { "epoch": 0.30848, "grad_norm": 0.7610101103782654, "learning_rate": 0.00018414322250639385, "loss": 6.3638, "step": 241 }, { "epoch": 0.30976, "grad_norm": 0.7212010622024536, "learning_rate": 0.0001849104859335038, "loss": 6.3509, "step": 242 }, { "epoch": 0.31104, "grad_norm": 0.711169421672821, "learning_rate": 0.0001856777493606138, "loss": 6.2656, "step": 243 }, { "epoch": 0.31232, "grad_norm": 0.6636462807655334, "learning_rate": 0.00018644501278772377, "loss": 6.3338, "step": 244 }, { "epoch": 0.3136, "grad_norm": 0.6644899249076843, "learning_rate": 0.00018721227621483374, "loss": 6.3888, "step": 245 }, { "epoch": 0.31488, "grad_norm": 0.8558899760246277, "learning_rate": 0.00018797953964194372, "loss": 6.383, "step": 246 }, { "epoch": 0.31616, "grad_norm": 0.8236832618713379, "learning_rate": 0.00018874680306905366, "loss": 6.3142, "step": 247 }, { "epoch": 0.31744, "grad_norm": 0.6856957674026489, "learning_rate": 0.00018951406649616367, "loss": 6.3231, "step": 248 }, { "epoch": 0.31872, "grad_norm": 0.8850679993629456, "learning_rate": 0.00019028132992327364, "loss": 6.334, "step": 249 }, { "epoch": 0.32, "grad_norm": 0.9059402942657471, "learning_rate": 0.00019104859335038361, "loss": 6.3826, "step": 250 }, { "epoch": 0.32128, "grad_norm": 0.7600975036621094, "learning_rate": 0.0001918158567774936, "loss": 6.3067, "step": 251 }, { "epoch": 0.32256, "grad_norm": 0.73809814453125, "learning_rate": 0.00019258312020460354, "loss": 6.3276, "step": 252 }, { "epoch": 0.32384, "grad_norm": 0.7253278493881226, "learning_rate": 0.00019335038363171354, "loss": 6.282, "step": 253 }, { "epoch": 0.32512, "grad_norm": 0.6514863967895508, "learning_rate": 0.0001941176470588235, "loss": 6.2885, "step": 254 }, { "epoch": 0.3264, "grad_norm": 0.8405912518501282, "learning_rate": 0.0001948849104859335, "loss": 6.3166, "step": 255 }, { "epoch": 0.32768, "grad_norm": 1.0162445306777954, "learning_rate": 0.00019565217391304346, "loss": 6.2927, "step": 256 }, { "epoch": 0.32896, "grad_norm": 1.0011142492294312, "learning_rate": 0.00019641943734015346, "loss": 6.2595, "step": 257 }, { "epoch": 0.33024, "grad_norm": 0.8047503232955933, "learning_rate": 0.0001971867007672634, "loss": 6.333, "step": 258 }, { "epoch": 0.33152, "grad_norm": 0.7660624384880066, "learning_rate": 0.00019795396419437338, "loss": 6.2664, "step": 259 }, { "epoch": 0.3328, "grad_norm": 0.7502520680427551, "learning_rate": 0.00019872122762148336, "loss": 6.2721, "step": 260 }, { "epoch": 0.33408, "grad_norm": 0.9337821006774902, "learning_rate": 0.00019948849104859333, "loss": 6.3069, "step": 261 }, { "epoch": 0.33536, "grad_norm": 0.6733538508415222, "learning_rate": 0.00020025575447570333, "loss": 6.221, "step": 262 }, { "epoch": 0.33664, "grad_norm": 0.7917484045028687, "learning_rate": 0.00020102301790281328, "loss": 6.2338, "step": 263 }, { "epoch": 0.33792, "grad_norm": 0.7609951496124268, "learning_rate": 0.00020179028132992326, "loss": 6.3031, "step": 264 }, { "epoch": 0.3392, "grad_norm": 0.6764228940010071, "learning_rate": 0.00020255754475703323, "loss": 6.2349, "step": 265 }, { "epoch": 0.34048, "grad_norm": 0.7090582847595215, "learning_rate": 0.0002033248081841432, "loss": 6.244, "step": 266 }, { "epoch": 0.34176, "grad_norm": 0.7496779561042786, "learning_rate": 0.0002040920716112532, "loss": 6.2315, "step": 267 }, { "epoch": 0.34304, "grad_norm": 0.987930417060852, "learning_rate": 0.00020485933503836315, "loss": 6.2223, "step": 268 }, { "epoch": 0.34432, "grad_norm": 0.8948729634284973, "learning_rate": 0.00020562659846547313, "loss": 6.2722, "step": 269 }, { "epoch": 0.3456, "grad_norm": 0.7875823974609375, "learning_rate": 0.0002063938618925831, "loss": 6.228, "step": 270 }, { "epoch": 0.34688, "grad_norm": 0.7225289940834045, "learning_rate": 0.00020716112531969308, "loss": 6.1819, "step": 271 }, { "epoch": 0.34816, "grad_norm": 0.9579319953918457, "learning_rate": 0.00020792838874680308, "loss": 6.2365, "step": 272 }, { "epoch": 0.34944, "grad_norm": 1.02460777759552, "learning_rate": 0.00020869565217391303, "loss": 6.2907, "step": 273 }, { "epoch": 0.35072, "grad_norm": 1.275744915008545, "learning_rate": 0.000209462915601023, "loss": 6.259, "step": 274 }, { "epoch": 0.352, "grad_norm": 0.8093612194061279, "learning_rate": 0.00021023017902813297, "loss": 6.1701, "step": 275 }, { "epoch": 0.35328, "grad_norm": 0.9731954336166382, "learning_rate": 0.00021099744245524295, "loss": 6.259, "step": 276 }, { "epoch": 0.35456, "grad_norm": 1.0986355543136597, "learning_rate": 0.00021176470588235295, "loss": 6.1786, "step": 277 }, { "epoch": 0.35584, "grad_norm": 1.1325825452804565, "learning_rate": 0.0002125319693094629, "loss": 6.253, "step": 278 }, { "epoch": 0.35712, "grad_norm": 0.7239522337913513, "learning_rate": 0.00021329923273657287, "loss": 6.2677, "step": 279 }, { "epoch": 0.3584, "grad_norm": 0.9211587905883789, "learning_rate": 0.00021406649616368285, "loss": 6.1859, "step": 280 }, { "epoch": 0.35968, "grad_norm": 0.8542487621307373, "learning_rate": 0.00021483375959079282, "loss": 6.1534, "step": 281 }, { "epoch": 0.36096, "grad_norm": 0.7646416425704956, "learning_rate": 0.0002156010230179028, "loss": 6.1181, "step": 282 }, { "epoch": 0.36224, "grad_norm": 0.7755529880523682, "learning_rate": 0.00021636828644501277, "loss": 6.1676, "step": 283 }, { "epoch": 0.36352, "grad_norm": 0.6631358861923218, "learning_rate": 0.00021713554987212274, "loss": 6.2041, "step": 284 }, { "epoch": 0.3648, "grad_norm": 0.676986038684845, "learning_rate": 0.0002179028132992327, "loss": 6.178, "step": 285 }, { "epoch": 0.36608, "grad_norm": 0.7331904172897339, "learning_rate": 0.0002186700767263427, "loss": 6.1775, "step": 286 }, { "epoch": 0.36736, "grad_norm": 0.7227631211280823, "learning_rate": 0.00021943734015345267, "loss": 6.1277, "step": 287 }, { "epoch": 0.36864, "grad_norm": 0.7674806118011475, "learning_rate": 0.00022020460358056264, "loss": 6.1654, "step": 288 }, { "epoch": 0.36992, "grad_norm": 0.7369276881217957, "learning_rate": 0.00022097186700767261, "loss": 6.1768, "step": 289 }, { "epoch": 0.3712, "grad_norm": 0.7614607214927673, "learning_rate": 0.00022173913043478256, "loss": 6.175, "step": 290 }, { "epoch": 0.37248, "grad_norm": 0.6262795329093933, "learning_rate": 0.00022250639386189256, "loss": 6.1554, "step": 291 }, { "epoch": 0.37376, "grad_norm": 0.7323906421661377, "learning_rate": 0.00022327365728900254, "loss": 6.1748, "step": 292 }, { "epoch": 0.37504, "grad_norm": 0.6152296662330627, "learning_rate": 0.0002240409207161125, "loss": 6.1391, "step": 293 }, { "epoch": 0.37632, "grad_norm": 0.672359824180603, "learning_rate": 0.0002248081841432225, "loss": 6.1529, "step": 294 }, { "epoch": 0.3776, "grad_norm": 0.8039406538009644, "learning_rate": 0.00022557544757033243, "loss": 6.1391, "step": 295 }, { "epoch": 0.37888, "grad_norm": 0.9391714334487915, "learning_rate": 0.00022634271099744244, "loss": 6.1234, "step": 296 }, { "epoch": 0.38016, "grad_norm": 0.6732305884361267, "learning_rate": 0.0002271099744245524, "loss": 6.1335, "step": 297 }, { "epoch": 0.38144, "grad_norm": 0.8230670094490051, "learning_rate": 0.00022787723785166238, "loss": 6.136, "step": 298 }, { "epoch": 0.38272, "grad_norm": 0.5934028625488281, "learning_rate": 0.00022864450127877236, "loss": 6.1398, "step": 299 }, { "epoch": 0.384, "grad_norm": 0.8017310500144958, "learning_rate": 0.0002294117647058823, "loss": 6.1337, "step": 300 }, { "epoch": 0.38528, "grad_norm": 0.7498769760131836, "learning_rate": 0.0002301790281329923, "loss": 6.1117, "step": 301 }, { "epoch": 0.38656, "grad_norm": 0.727734386920929, "learning_rate": 0.00023094629156010228, "loss": 6.1219, "step": 302 }, { "epoch": 0.38784, "grad_norm": 0.7669496536254883, "learning_rate": 0.00023171355498721226, "loss": 6.1272, "step": 303 }, { "epoch": 0.38912, "grad_norm": 0.7476457357406616, "learning_rate": 0.00023248081841432223, "loss": 6.0301, "step": 304 }, { "epoch": 0.3904, "grad_norm": 0.7540501952171326, "learning_rate": 0.00023324808184143218, "loss": 6.1143, "step": 305 }, { "epoch": 0.39168, "grad_norm": 1.032804012298584, "learning_rate": 0.00023401534526854218, "loss": 6.1371, "step": 306 }, { "epoch": 0.39296, "grad_norm": 1.1408971548080444, "learning_rate": 0.00023478260869565215, "loss": 6.0916, "step": 307 }, { "epoch": 0.39424, "grad_norm": 1.027319312095642, "learning_rate": 0.00023554987212276213, "loss": 6.1407, "step": 308 }, { "epoch": 0.39552, "grad_norm": 0.8574281930923462, "learning_rate": 0.0002363171355498721, "loss": 6.0626, "step": 309 }, { "epoch": 0.3968, "grad_norm": 0.937127411365509, "learning_rate": 0.0002370843989769821, "loss": 6.1034, "step": 310 }, { "epoch": 0.39808, "grad_norm": 0.7400676012039185, "learning_rate": 0.00023785166240409205, "loss": 6.0996, "step": 311 }, { "epoch": 0.39936, "grad_norm": 0.7361345291137695, "learning_rate": 0.00023861892583120203, "loss": 6.047, "step": 312 }, { "epoch": 0.40064, "grad_norm": 0.7439408898353577, "learning_rate": 0.000239386189258312, "loss": 6.0906, "step": 313 }, { "epoch": 0.40192, "grad_norm": 0.7513951659202576, "learning_rate": 0.00024015345268542197, "loss": 6.0956, "step": 314 }, { "epoch": 0.4032, "grad_norm": 0.7911155223846436, "learning_rate": 0.00024092071611253198, "loss": 6.0411, "step": 315 }, { "epoch": 0.40448, "grad_norm": 0.7218300700187683, "learning_rate": 0.00024168797953964192, "loss": 6.1029, "step": 316 }, { "epoch": 0.40576, "grad_norm": 0.8403254151344299, "learning_rate": 0.0002424552429667519, "loss": 6.0944, "step": 317 }, { "epoch": 0.40704, "grad_norm": 0.676937460899353, "learning_rate": 0.00024322250639386187, "loss": 6.037, "step": 318 }, { "epoch": 0.40832, "grad_norm": 0.8323513865470886, "learning_rate": 0.00024398976982097185, "loss": 6.0856, "step": 319 }, { "epoch": 0.4096, "grad_norm": 0.9101656675338745, "learning_rate": 0.0002447570332480818, "loss": 6.089, "step": 320 }, { "epoch": 0.41088, "grad_norm": 0.7529237270355225, "learning_rate": 0.0002455242966751918, "loss": 6.0525, "step": 321 }, { "epoch": 0.41216, "grad_norm": 0.6977245211601257, "learning_rate": 0.00024629156010230177, "loss": 6.0561, "step": 322 }, { "epoch": 0.41344, "grad_norm": 0.9390827417373657, "learning_rate": 0.00024705882352941174, "loss": 6.0536, "step": 323 }, { "epoch": 0.41472, "grad_norm": 0.895576536655426, "learning_rate": 0.0002478260869565217, "loss": 6.0667, "step": 324 }, { "epoch": 0.416, "grad_norm": 0.9296811819076538, "learning_rate": 0.0002485933503836317, "loss": 6.0086, "step": 325 }, { "epoch": 0.41728, "grad_norm": 0.9841163754463196, "learning_rate": 0.00024936061381074167, "loss": 6.0587, "step": 326 }, { "epoch": 0.41856, "grad_norm": 0.7630248665809631, "learning_rate": 0.00025012787723785164, "loss": 5.9773, "step": 327 }, { "epoch": 0.41984, "grad_norm": 0.8875618577003479, "learning_rate": 0.0002508951406649616, "loss": 6.0148, "step": 328 }, { "epoch": 0.42112, "grad_norm": 0.8923934102058411, "learning_rate": 0.0002516624040920716, "loss": 6.0467, "step": 329 }, { "epoch": 0.4224, "grad_norm": 0.8564996719360352, "learning_rate": 0.00025242966751918156, "loss": 6.0135, "step": 330 }, { "epoch": 0.42368, "grad_norm": 0.803805410861969, "learning_rate": 0.00025319693094629154, "loss": 5.9922, "step": 331 }, { "epoch": 0.42496, "grad_norm": 0.6739373803138733, "learning_rate": 0.0002539641943734015, "loss": 6.0465, "step": 332 }, { "epoch": 0.42624, "grad_norm": 0.7415125966072083, "learning_rate": 0.0002547314578005115, "loss": 6.0081, "step": 333 }, { "epoch": 0.42752, "grad_norm": 0.6618335247039795, "learning_rate": 0.00025549872122762146, "loss": 5.9794, "step": 334 }, { "epoch": 0.4288, "grad_norm": 0.7585832476615906, "learning_rate": 0.00025626598465473144, "loss": 5.9738, "step": 335 }, { "epoch": 0.43008, "grad_norm": 0.7203150987625122, "learning_rate": 0.0002570332480818414, "loss": 6.0329, "step": 336 }, { "epoch": 0.43136, "grad_norm": 0.7902894020080566, "learning_rate": 0.0002578005115089514, "loss": 5.9299, "step": 337 }, { "epoch": 0.43264, "grad_norm": 0.725581705570221, "learning_rate": 0.00025856777493606136, "loss": 5.9855, "step": 338 }, { "epoch": 0.43392, "grad_norm": 0.8299371004104614, "learning_rate": 0.00025933503836317133, "loss": 6.0628, "step": 339 }, { "epoch": 0.4352, "grad_norm": 0.7858306765556335, "learning_rate": 0.0002601023017902813, "loss": 6.0055, "step": 340 }, { "epoch": 0.43648, "grad_norm": 0.8693034052848816, "learning_rate": 0.0002608695652173913, "loss": 6.032, "step": 341 }, { "epoch": 0.43776, "grad_norm": 0.8078804612159729, "learning_rate": 0.00026163682864450126, "loss": 5.9117, "step": 342 }, { "epoch": 0.43904, "grad_norm": 0.6977505683898926, "learning_rate": 0.00026240409207161123, "loss": 6.0052, "step": 343 }, { "epoch": 0.44032, "grad_norm": 0.7459436058998108, "learning_rate": 0.0002631713554987212, "loss": 6.0064, "step": 344 }, { "epoch": 0.4416, "grad_norm": 0.8525255918502808, "learning_rate": 0.0002639386189258312, "loss": 5.9702, "step": 345 }, { "epoch": 0.44288, "grad_norm": 0.8468485474586487, "learning_rate": 0.00026470588235294115, "loss": 6.0015, "step": 346 }, { "epoch": 0.44416, "grad_norm": 0.8215516805648804, "learning_rate": 0.00026547314578005113, "loss": 5.9744, "step": 347 }, { "epoch": 0.44544, "grad_norm": 0.7234410643577576, "learning_rate": 0.0002662404092071611, "loss": 5.9851, "step": 348 }, { "epoch": 0.44672, "grad_norm": 0.7426914572715759, "learning_rate": 0.0002670076726342711, "loss": 5.9737, "step": 349 }, { "epoch": 0.448, "grad_norm": 0.7684586048126221, "learning_rate": 0.00026777493606138105, "loss": 5.9324, "step": 350 }, { "epoch": 0.44928, "grad_norm": 0.8180778622627258, "learning_rate": 0.000268542199488491, "loss": 5.8851, "step": 351 }, { "epoch": 0.45056, "grad_norm": 0.7742370367050171, "learning_rate": 0.000269309462915601, "loss": 5.9474, "step": 352 }, { "epoch": 0.45184, "grad_norm": 0.9160585403442383, "learning_rate": 0.000270076726342711, "loss": 5.958, "step": 353 }, { "epoch": 0.45312, "grad_norm": 0.9427086710929871, "learning_rate": 0.00027084398976982095, "loss": 5.9811, "step": 354 }, { "epoch": 0.4544, "grad_norm": 1.0058586597442627, "learning_rate": 0.0002716112531969309, "loss": 5.9625, "step": 355 }, { "epoch": 0.45568, "grad_norm": 1.0322132110595703, "learning_rate": 0.0002723785166240409, "loss": 5.8985, "step": 356 }, { "epoch": 0.45696, "grad_norm": 0.8881152868270874, "learning_rate": 0.00027314578005115087, "loss": 6.0119, "step": 357 }, { "epoch": 0.45824, "grad_norm": 0.7439785599708557, "learning_rate": 0.00027391304347826085, "loss": 5.9378, "step": 358 }, { "epoch": 0.45952, "grad_norm": 0.8855685591697693, "learning_rate": 0.0002746803069053708, "loss": 5.9244, "step": 359 }, { "epoch": 0.4608, "grad_norm": 0.6977657079696655, "learning_rate": 0.0002754475703324808, "loss": 5.9455, "step": 360 }, { "epoch": 0.46208, "grad_norm": 0.6967872381210327, "learning_rate": 0.00027621483375959077, "loss": 5.9463, "step": 361 }, { "epoch": 0.46336, "grad_norm": 0.6706664562225342, "learning_rate": 0.00027698209718670074, "loss": 5.9024, "step": 362 }, { "epoch": 0.46464, "grad_norm": 1.084336757659912, "learning_rate": 0.0002777493606138107, "loss": 5.864, "step": 363 }, { "epoch": 0.46592, "grad_norm": 1.0996636152267456, "learning_rate": 0.0002785166240409207, "loss": 5.9009, "step": 364 }, { "epoch": 0.4672, "grad_norm": 0.9186033606529236, "learning_rate": 0.00027928388746803067, "loss": 5.8365, "step": 365 }, { "epoch": 0.46848, "grad_norm": 0.8409336805343628, "learning_rate": 0.00028005115089514064, "loss": 5.908, "step": 366 }, { "epoch": 0.46976, "grad_norm": 0.8734238147735596, "learning_rate": 0.0002808184143222506, "loss": 5.8922, "step": 367 }, { "epoch": 0.47104, "grad_norm": 1.0424097776412964, "learning_rate": 0.0002815856777493606, "loss": 5.975, "step": 368 }, { "epoch": 0.47232, "grad_norm": 1.0325592756271362, "learning_rate": 0.00028235294117647056, "loss": 5.8882, "step": 369 }, { "epoch": 0.4736, "grad_norm": 0.8004043102264404, "learning_rate": 0.00028312020460358054, "loss": 5.8663, "step": 370 }, { "epoch": 0.47488, "grad_norm": 0.6563780307769775, "learning_rate": 0.0002838874680306905, "loss": 5.8704, "step": 371 }, { "epoch": 0.47616, "grad_norm": 0.7133100032806396, "learning_rate": 0.0002846547314578005, "loss": 5.8959, "step": 372 }, { "epoch": 0.47744, "grad_norm": 0.656963050365448, "learning_rate": 0.00028542199488491046, "loss": 5.866, "step": 373 }, { "epoch": 0.47872, "grad_norm": 0.761816143989563, "learning_rate": 0.00028618925831202044, "loss": 5.8776, "step": 374 }, { "epoch": 0.48, "grad_norm": 0.900486171245575, "learning_rate": 0.0002869565217391304, "loss": 5.9083, "step": 375 }, { "epoch": 0.48128, "grad_norm": 1.0122328996658325, "learning_rate": 0.0002877237851662404, "loss": 5.8945, "step": 376 }, { "epoch": 0.48256, "grad_norm": 0.9674193859100342, "learning_rate": 0.00028849104859335036, "loss": 5.8972, "step": 377 }, { "epoch": 0.48384, "grad_norm": 0.7003923058509827, "learning_rate": 0.00028925831202046033, "loss": 5.8883, "step": 378 }, { "epoch": 0.48512, "grad_norm": 0.7299737930297852, "learning_rate": 0.0002900255754475703, "loss": 5.8642, "step": 379 }, { "epoch": 0.4864, "grad_norm": 0.68450927734375, "learning_rate": 0.0002907928388746803, "loss": 5.8051, "step": 380 }, { "epoch": 0.48768, "grad_norm": 0.744691789150238, "learning_rate": 0.00029156010230179026, "loss": 5.8283, "step": 381 }, { "epoch": 0.48896, "grad_norm": 0.8168903589248657, "learning_rate": 0.00029232736572890023, "loss": 5.8316, "step": 382 }, { "epoch": 0.49024, "grad_norm": 1.2333744764328003, "learning_rate": 0.0002930946291560102, "loss": 5.8244, "step": 383 }, { "epoch": 0.49152, "grad_norm": 0.9197435975074768, "learning_rate": 0.0002938618925831202, "loss": 5.8464, "step": 384 }, { "epoch": 0.4928, "grad_norm": 1.0070711374282837, "learning_rate": 0.00029462915601023015, "loss": 5.8809, "step": 385 }, { "epoch": 0.49408, "grad_norm": 0.9315182566642761, "learning_rate": 0.00029539641943734013, "loss": 5.8207, "step": 386 }, { "epoch": 0.49536, "grad_norm": 0.7878130078315735, "learning_rate": 0.0002961636828644501, "loss": 5.8888, "step": 387 }, { "epoch": 0.49664, "grad_norm": 0.8333479762077332, "learning_rate": 0.0002969309462915601, "loss": 5.8616, "step": 388 }, { "epoch": 0.49792, "grad_norm": 0.8158892393112183, "learning_rate": 0.00029769820971867005, "loss": 5.8188, "step": 389 }, { "epoch": 0.4992, "grad_norm": 0.6871780157089233, "learning_rate": 0.00029846547314578, "loss": 5.8525, "step": 390 }, { "epoch": 0.50048, "grad_norm": 0.7237544059753418, "learning_rate": 0.00029923273657289, "loss": 5.8758, "step": 391 }, { "epoch": 0.50176, "grad_norm": 0.6453325152397156, "learning_rate": 0.0003, "loss": 5.7895, "step": 392 }, { "epoch": 0.50304, "grad_norm": 0.5974975824356079, "learning_rate": 0.0002999596177143626, "loss": 5.8014, "step": 393 }, { "epoch": 0.50432, "grad_norm": 0.6222047209739685, "learning_rate": 0.00029991923542872523, "loss": 5.7825, "step": 394 }, { "epoch": 0.5056, "grad_norm": 0.6509471535682678, "learning_rate": 0.00029987885314308786, "loss": 5.806, "step": 395 }, { "epoch": 0.50688, "grad_norm": 0.6112677454948425, "learning_rate": 0.0002998384708574505, "loss": 5.7888, "step": 396 }, { "epoch": 0.50816, "grad_norm": 0.67631596326828, "learning_rate": 0.0002997980885718131, "loss": 5.7826, "step": 397 }, { "epoch": 0.50944, "grad_norm": 0.6522372961044312, "learning_rate": 0.00029975770628617575, "loss": 5.7987, "step": 398 }, { "epoch": 0.51072, "grad_norm": 0.5814371109008789, "learning_rate": 0.00029971732400053844, "loss": 5.7732, "step": 399 }, { "epoch": 0.512, "grad_norm": 0.7316078543663025, "learning_rate": 0.00029967694171490107, "loss": 5.7616, "step": 400 }, { "epoch": 0.51328, "grad_norm": 0.817568302154541, "learning_rate": 0.00029963655942926364, "loss": 5.8326, "step": 401 }, { "epoch": 0.51456, "grad_norm": 0.6369678378105164, "learning_rate": 0.0002995961771436263, "loss": 5.7762, "step": 402 }, { "epoch": 0.51584, "grad_norm": 0.6471033096313477, "learning_rate": 0.00029955579485798896, "loss": 5.7027, "step": 403 }, { "epoch": 0.51712, "grad_norm": 0.6128319501876831, "learning_rate": 0.0002995154125723516, "loss": 5.754, "step": 404 }, { "epoch": 0.5184, "grad_norm": 0.7475748062133789, "learning_rate": 0.0002994750302867142, "loss": 5.7301, "step": 405 }, { "epoch": 0.51968, "grad_norm": 0.6675015091896057, "learning_rate": 0.00029943464800107685, "loss": 5.7793, "step": 406 }, { "epoch": 0.52096, "grad_norm": 0.640458345413208, "learning_rate": 0.0002993942657154395, "loss": 5.735, "step": 407 }, { "epoch": 0.52224, "grad_norm": 0.6345043778419495, "learning_rate": 0.0002993538834298021, "loss": 5.6872, "step": 408 }, { "epoch": 0.52352, "grad_norm": 0.6826564073562622, "learning_rate": 0.00029931350114416474, "loss": 5.691, "step": 409 }, { "epoch": 0.5248, "grad_norm": 0.576991856098175, "learning_rate": 0.00029927311885852737, "loss": 5.7254, "step": 410 }, { "epoch": 0.52608, "grad_norm": 0.572139322757721, "learning_rate": 0.00029923273657289, "loss": 5.7618, "step": 411 }, { "epoch": 0.52736, "grad_norm": 0.7463775277137756, "learning_rate": 0.00029919235428725263, "loss": 5.7217, "step": 412 }, { "epoch": 0.52864, "grad_norm": 0.7591479420661926, "learning_rate": 0.00029915197200161526, "loss": 5.745, "step": 413 }, { "epoch": 0.52992, "grad_norm": 0.7695474624633789, "learning_rate": 0.0002991115897159779, "loss": 5.7294, "step": 414 }, { "epoch": 0.5312, "grad_norm": 0.704517126083374, "learning_rate": 0.0002990712074303405, "loss": 5.7123, "step": 415 }, { "epoch": 0.53248, "grad_norm": 0.7229870557785034, "learning_rate": 0.0002990308251447032, "loss": 5.6996, "step": 416 }, { "epoch": 0.53376, "grad_norm": 0.7659012079238892, "learning_rate": 0.0002989904428590658, "loss": 5.7188, "step": 417 }, { "epoch": 0.53504, "grad_norm": 0.8571960926055908, "learning_rate": 0.0002989500605734284, "loss": 5.7277, "step": 418 }, { "epoch": 0.53632, "grad_norm": 0.9707953929901123, "learning_rate": 0.00029890967828779104, "loss": 5.668, "step": 419 }, { "epoch": 0.5376, "grad_norm": 0.9887204766273499, "learning_rate": 0.00029886929600215367, "loss": 5.7147, "step": 420 }, { "epoch": 0.53888, "grad_norm": 0.8890637159347534, "learning_rate": 0.00029882891371651635, "loss": 5.6983, "step": 421 }, { "epoch": 0.54016, "grad_norm": 0.9526740908622742, "learning_rate": 0.000298788531430879, "loss": 5.7069, "step": 422 }, { "epoch": 0.54144, "grad_norm": 1.1278778314590454, "learning_rate": 0.0002987481491452416, "loss": 5.6393, "step": 423 }, { "epoch": 0.54272, "grad_norm": 1.0044233798980713, "learning_rate": 0.0002987077668596042, "loss": 5.7143, "step": 424 }, { "epoch": 0.544, "grad_norm": 0.9827817678451538, "learning_rate": 0.0002986673845739669, "loss": 5.7981, "step": 425 }, { "epoch": 0.54528, "grad_norm": 0.8233329057693481, "learning_rate": 0.0002986270022883295, "loss": 5.7221, "step": 426 }, { "epoch": 0.54656, "grad_norm": 0.7597877383232117, "learning_rate": 0.00029858662000269213, "loss": 5.698, "step": 427 }, { "epoch": 0.54784, "grad_norm": 0.8371834754943848, "learning_rate": 0.00029854623771705476, "loss": 5.6721, "step": 428 }, { "epoch": 0.54912, "grad_norm": 0.6564275026321411, "learning_rate": 0.0002985058554314174, "loss": 5.6973, "step": 429 }, { "epoch": 0.5504, "grad_norm": 0.7076083421707153, "learning_rate": 0.00029846547314578, "loss": 5.6448, "step": 430 }, { "epoch": 0.55168, "grad_norm": 0.7080224752426147, "learning_rate": 0.00029842509086014266, "loss": 5.6874, "step": 431 }, { "epoch": 0.55296, "grad_norm": 0.792473554611206, "learning_rate": 0.0002983847085745053, "loss": 5.7225, "step": 432 }, { "epoch": 0.55424, "grad_norm": 1.0753412246704102, "learning_rate": 0.0002983443262888679, "loss": 5.6673, "step": 433 }, { "epoch": 0.55552, "grad_norm": 1.0193465948104858, "learning_rate": 0.00029830394400323055, "loss": 5.7159, "step": 434 }, { "epoch": 0.5568, "grad_norm": 1.0127590894699097, "learning_rate": 0.0002982635617175932, "loss": 5.6806, "step": 435 }, { "epoch": 0.55808, "grad_norm": 1.2227166891098022, "learning_rate": 0.0002982231794319558, "loss": 5.66, "step": 436 }, { "epoch": 0.55936, "grad_norm": 0.7891542315483093, "learning_rate": 0.00029818279714631844, "loss": 5.6076, "step": 437 }, { "epoch": 0.56064, "grad_norm": 0.7889094948768616, "learning_rate": 0.0002981424148606811, "loss": 5.6662, "step": 438 }, { "epoch": 0.56192, "grad_norm": 0.7303580045700073, "learning_rate": 0.00029810203257504375, "loss": 5.6123, "step": 439 }, { "epoch": 0.5632, "grad_norm": 0.7027185559272766, "learning_rate": 0.0002980616502894063, "loss": 5.6627, "step": 440 }, { "epoch": 0.56448, "grad_norm": 0.6738660931587219, "learning_rate": 0.00029802126800376896, "loss": 5.6118, "step": 441 }, { "epoch": 0.56576, "grad_norm": 0.7443022727966309, "learning_rate": 0.00029798088571813164, "loss": 5.629, "step": 442 }, { "epoch": 0.56704, "grad_norm": 0.685570478439331, "learning_rate": 0.00029794050343249427, "loss": 5.6223, "step": 443 }, { "epoch": 0.56832, "grad_norm": 0.672305703163147, "learning_rate": 0.0002979001211468569, "loss": 5.6039, "step": 444 }, { "epoch": 0.5696, "grad_norm": 0.7415111064910889, "learning_rate": 0.00029785973886121953, "loss": 5.6167, "step": 445 }, { "epoch": 0.57088, "grad_norm": 0.7740527987480164, "learning_rate": 0.00029781935657558216, "loss": 5.5913, "step": 446 }, { "epoch": 0.57216, "grad_norm": 0.7420479655265808, "learning_rate": 0.0002977789742899448, "loss": 5.6269, "step": 447 }, { "epoch": 0.57344, "grad_norm": 0.7080816626548767, "learning_rate": 0.0002977385920043074, "loss": 5.7041, "step": 448 }, { "epoch": 0.57472, "grad_norm": 0.7158259153366089, "learning_rate": 0.00029769820971867005, "loss": 5.6587, "step": 449 }, { "epoch": 0.576, "grad_norm": 0.7243630290031433, "learning_rate": 0.0002976578274330327, "loss": 5.6178, "step": 450 }, { "epoch": 0.57728, "grad_norm": 0.6919866800308228, "learning_rate": 0.0002976174451473953, "loss": 5.6674, "step": 451 }, { "epoch": 0.57856, "grad_norm": 0.7445707321166992, "learning_rate": 0.00029757706286175794, "loss": 5.6193, "step": 452 }, { "epoch": 0.57984, "grad_norm": 0.7470800876617432, "learning_rate": 0.00029753668057612057, "loss": 5.6655, "step": 453 }, { "epoch": 0.58112, "grad_norm": 0.7181954383850098, "learning_rate": 0.0002974962982904832, "loss": 5.607, "step": 454 }, { "epoch": 0.5824, "grad_norm": 0.7144619822502136, "learning_rate": 0.0002974559160048459, "loss": 5.6176, "step": 455 }, { "epoch": 0.58368, "grad_norm": 0.6728463768959045, "learning_rate": 0.00029741553371920846, "loss": 5.5554, "step": 456 }, { "epoch": 0.58496, "grad_norm": 0.7108640074729919, "learning_rate": 0.0002973751514335711, "loss": 5.5815, "step": 457 }, { "epoch": 0.58624, "grad_norm": 0.5689119696617126, "learning_rate": 0.0002973347691479337, "loss": 5.5097, "step": 458 }, { "epoch": 0.58752, "grad_norm": 0.6072651743888855, "learning_rate": 0.0002972943868622964, "loss": 5.5147, "step": 459 }, { "epoch": 0.5888, "grad_norm": 0.5563207268714905, "learning_rate": 0.00029725400457665904, "loss": 5.5191, "step": 460 }, { "epoch": 0.59008, "grad_norm": 0.6030489802360535, "learning_rate": 0.00029721362229102167, "loss": 5.5896, "step": 461 }, { "epoch": 0.59136, "grad_norm": 0.6141897439956665, "learning_rate": 0.0002971732400053843, "loss": 5.6277, "step": 462 }, { "epoch": 0.59264, "grad_norm": 0.6721770167350769, "learning_rate": 0.00029713285771974687, "loss": 5.5537, "step": 463 }, { "epoch": 0.59392, "grad_norm": 0.6960611343383789, "learning_rate": 0.00029709247543410956, "loss": 5.569, "step": 464 }, { "epoch": 0.5952, "grad_norm": 0.6458708047866821, "learning_rate": 0.0002970520931484722, "loss": 5.5418, "step": 465 }, { "epoch": 0.59648, "grad_norm": 0.7103924751281738, "learning_rate": 0.0002970117108628348, "loss": 5.5574, "step": 466 }, { "epoch": 0.59776, "grad_norm": 0.677963376045227, "learning_rate": 0.00029697132857719745, "loss": 5.5791, "step": 467 }, { "epoch": 0.59904, "grad_norm": 0.6947761178016663, "learning_rate": 0.0002969309462915601, "loss": 5.5745, "step": 468 }, { "epoch": 0.60032, "grad_norm": 0.6401717662811279, "learning_rate": 0.0002968905640059227, "loss": 5.5222, "step": 469 }, { "epoch": 0.6016, "grad_norm": 0.6908291578292847, "learning_rate": 0.00029685018172028534, "loss": 5.5118, "step": 470 }, { "epoch": 0.60288, "grad_norm": 0.7326011061668396, "learning_rate": 0.00029680979943464797, "loss": 5.5127, "step": 471 }, { "epoch": 0.60416, "grad_norm": 0.8172990679740906, "learning_rate": 0.00029676941714901065, "loss": 5.5237, "step": 472 }, { "epoch": 0.60544, "grad_norm": 0.7394744157791138, "learning_rate": 0.0002967290348633732, "loss": 5.4849, "step": 473 }, { "epoch": 0.60672, "grad_norm": 0.6962910294532776, "learning_rate": 0.00029668865257773586, "loss": 5.5167, "step": 474 }, { "epoch": 0.608, "grad_norm": 0.6705038547515869, "learning_rate": 0.0002966482702920985, "loss": 5.5084, "step": 475 }, { "epoch": 0.60928, "grad_norm": 0.8199055790901184, "learning_rate": 0.0002966078880064611, "loss": 5.5338, "step": 476 }, { "epoch": 0.61056, "grad_norm": 0.833018958568573, "learning_rate": 0.0002965675057208238, "loss": 5.5654, "step": 477 }, { "epoch": 0.61184, "grad_norm": 0.7582036852836609, "learning_rate": 0.00029652712343518643, "loss": 5.5814, "step": 478 }, { "epoch": 0.61312, "grad_norm": 0.8647993803024292, "learning_rate": 0.000296486741149549, "loss": 5.5089, "step": 479 }, { "epoch": 0.6144, "grad_norm": 1.1151838302612305, "learning_rate": 0.00029644635886391164, "loss": 5.5816, "step": 480 }, { "epoch": 0.61568, "grad_norm": 0.8047844767570496, "learning_rate": 0.0002964059765782743, "loss": 5.5261, "step": 481 }, { "epoch": 0.61696, "grad_norm": 0.7434077858924866, "learning_rate": 0.00029636559429263695, "loss": 5.5063, "step": 482 }, { "epoch": 0.61824, "grad_norm": 0.7304471135139465, "learning_rate": 0.0002963252120069996, "loss": 5.5246, "step": 483 }, { "epoch": 0.61952, "grad_norm": 0.686316728591919, "learning_rate": 0.0002962848297213622, "loss": 5.4783, "step": 484 }, { "epoch": 0.6208, "grad_norm": 0.6505491733551025, "learning_rate": 0.00029624444743572484, "loss": 5.4989, "step": 485 }, { "epoch": 0.62208, "grad_norm": 0.6548778414726257, "learning_rate": 0.00029620406515008747, "loss": 5.4819, "step": 486 }, { "epoch": 0.62336, "grad_norm": 0.6001530289649963, "learning_rate": 0.0002961636828644501, "loss": 5.4859, "step": 487 }, { "epoch": 0.62464, "grad_norm": 0.6683222055435181, "learning_rate": 0.00029612330057881273, "loss": 5.4872, "step": 488 }, { "epoch": 0.62592, "grad_norm": 0.7493375539779663, "learning_rate": 0.00029608291829317536, "loss": 5.5236, "step": 489 }, { "epoch": 0.6272, "grad_norm": 0.7252801060676575, "learning_rate": 0.000296042536007538, "loss": 5.4432, "step": 490 }, { "epoch": 0.62848, "grad_norm": 0.6479570269584656, "learning_rate": 0.0002960021537219006, "loss": 5.4957, "step": 491 }, { "epoch": 0.62976, "grad_norm": 0.6929423809051514, "learning_rate": 0.00029596177143626325, "loss": 5.4412, "step": 492 }, { "epoch": 0.63104, "grad_norm": 0.6217756271362305, "learning_rate": 0.0002959213891506259, "loss": 5.4922, "step": 493 }, { "epoch": 0.63232, "grad_norm": 0.7749668955802917, "learning_rate": 0.00029588100686498857, "loss": 5.5026, "step": 494 }, { "epoch": 0.6336, "grad_norm": 0.7131249308586121, "learning_rate": 0.0002958406245793512, "loss": 5.4615, "step": 495 }, { "epoch": 0.63488, "grad_norm": 0.6029372215270996, "learning_rate": 0.00029580024229371377, "loss": 5.5491, "step": 496 }, { "epoch": 0.63616, "grad_norm": 0.8320589661598206, "learning_rate": 0.0002957598600080764, "loss": 5.4513, "step": 497 }, { "epoch": 0.63744, "grad_norm": 0.8968410491943359, "learning_rate": 0.0002957194777224391, "loss": 5.5575, "step": 498 }, { "epoch": 0.63872, "grad_norm": 0.8323341012001038, "learning_rate": 0.0002956790954368017, "loss": 5.4464, "step": 499 }, { "epoch": 0.64, "grad_norm": 0.7660730481147766, "learning_rate": 0.00029563871315116435, "loss": 5.5199, "step": 500 }, { "epoch": 0.64128, "grad_norm": 0.8838502764701843, "learning_rate": 0.000295598330865527, "loss": 5.4954, "step": 501 }, { "epoch": 0.64256, "grad_norm": 0.8735381960868835, "learning_rate": 0.0002955579485798896, "loss": 5.4527, "step": 502 }, { "epoch": 0.64384, "grad_norm": 0.7608069777488708, "learning_rate": 0.00029551756629425224, "loss": 5.4398, "step": 503 }, { "epoch": 0.64512, "grad_norm": 0.7376892566680908, "learning_rate": 0.00029547718400861487, "loss": 5.4521, "step": 504 }, { "epoch": 0.6464, "grad_norm": 0.6925538778305054, "learning_rate": 0.0002954368017229775, "loss": 5.504, "step": 505 }, { "epoch": 0.64768, "grad_norm": 0.6140308380126953, "learning_rate": 0.00029539641943734013, "loss": 5.4147, "step": 506 }, { "epoch": 0.64896, "grad_norm": 0.6824973225593567, "learning_rate": 0.00029535603715170276, "loss": 5.4963, "step": 507 }, { "epoch": 0.65024, "grad_norm": 0.736219584941864, "learning_rate": 0.0002953156548660654, "loss": 5.4566, "step": 508 }, { "epoch": 0.65152, "grad_norm": 0.6932424902915955, "learning_rate": 0.000295275272580428, "loss": 5.3826, "step": 509 }, { "epoch": 0.6528, "grad_norm": 0.6957204341888428, "learning_rate": 0.00029523489029479065, "loss": 5.4521, "step": 510 }, { "epoch": 0.65408, "grad_norm": 0.5883374810218811, "learning_rate": 0.00029519450800915333, "loss": 5.4681, "step": 511 }, { "epoch": 0.65536, "grad_norm": 0.6524292826652527, "learning_rate": 0.0002951541257235159, "loss": 5.3794, "step": 512 }, { "epoch": 0.65664, "grad_norm": 0.7906764149665833, "learning_rate": 0.00029511374343787854, "loss": 5.443, "step": 513 }, { "epoch": 0.65792, "grad_norm": 0.7420827746391296, "learning_rate": 0.00029507336115224117, "loss": 5.4216, "step": 514 }, { "epoch": 0.6592, "grad_norm": 0.7884359955787659, "learning_rate": 0.0002950329788666038, "loss": 5.4701, "step": 515 }, { "epoch": 0.66048, "grad_norm": 0.7779830694198608, "learning_rate": 0.0002949925965809665, "loss": 5.4779, "step": 516 }, { "epoch": 0.66176, "grad_norm": 0.7066437602043152, "learning_rate": 0.0002949522142953291, "loss": 5.4453, "step": 517 }, { "epoch": 0.66304, "grad_norm": 0.8100753426551819, "learning_rate": 0.00029491183200969174, "loss": 5.4251, "step": 518 }, { "epoch": 0.66432, "grad_norm": 0.7143018245697021, "learning_rate": 0.0002948714497240543, "loss": 5.4786, "step": 519 }, { "epoch": 0.6656, "grad_norm": 0.7291150689125061, "learning_rate": 0.000294831067438417, "loss": 5.4265, "step": 520 }, { "epoch": 0.66688, "grad_norm": 0.6859623193740845, "learning_rate": 0.00029479068515277963, "loss": 5.4724, "step": 521 }, { "epoch": 0.66816, "grad_norm": 0.6909458637237549, "learning_rate": 0.00029475030286714226, "loss": 5.3719, "step": 522 }, { "epoch": 0.66944, "grad_norm": 0.6806208491325378, "learning_rate": 0.0002947099205815049, "loss": 5.3872, "step": 523 }, { "epoch": 0.67072, "grad_norm": 0.6667022109031677, "learning_rate": 0.0002946695382958675, "loss": 5.3614, "step": 524 }, { "epoch": 0.672, "grad_norm": 0.6555402278900146, "learning_rate": 0.00029462915601023015, "loss": 5.4534, "step": 525 }, { "epoch": 0.67328, "grad_norm": 0.685303270816803, "learning_rate": 0.0002945887737245928, "loss": 5.426, "step": 526 }, { "epoch": 0.67456, "grad_norm": 0.7246394753456116, "learning_rate": 0.0002945483914389554, "loss": 5.4041, "step": 527 }, { "epoch": 0.67584, "grad_norm": 0.765101969242096, "learning_rate": 0.00029450800915331804, "loss": 5.386, "step": 528 }, { "epoch": 0.67712, "grad_norm": 0.7084211111068726, "learning_rate": 0.0002944676268676807, "loss": 5.3894, "step": 529 }, { "epoch": 0.6784, "grad_norm": 0.6788263916969299, "learning_rate": 0.0002944272445820433, "loss": 5.375, "step": 530 }, { "epoch": 0.67968, "grad_norm": 0.7256125807762146, "learning_rate": 0.00029438686229640593, "loss": 5.3544, "step": 531 }, { "epoch": 0.68096, "grad_norm": 0.8374799489974976, "learning_rate": 0.00029434648001076856, "loss": 5.3758, "step": 532 }, { "epoch": 0.68224, "grad_norm": 0.8988571166992188, "learning_rate": 0.00029430609772513125, "loss": 5.4003, "step": 533 }, { "epoch": 0.68352, "grad_norm": 0.7412604093551636, "learning_rate": 0.0002942657154394939, "loss": 5.3558, "step": 534 }, { "epoch": 0.6848, "grad_norm": 0.7604132890701294, "learning_rate": 0.00029422533315385645, "loss": 5.4506, "step": 535 }, { "epoch": 0.68608, "grad_norm": 0.9959605932235718, "learning_rate": 0.0002941849508682191, "loss": 5.3877, "step": 536 }, { "epoch": 0.68736, "grad_norm": 0.834484875202179, "learning_rate": 0.00029414456858258177, "loss": 5.3407, "step": 537 }, { "epoch": 0.68864, "grad_norm": 0.6473014950752258, "learning_rate": 0.0002941041862969444, "loss": 5.3553, "step": 538 }, { "epoch": 0.68992, "grad_norm": 0.8236899375915527, "learning_rate": 0.00029406380401130703, "loss": 5.3946, "step": 539 }, { "epoch": 0.6912, "grad_norm": 0.7361433506011963, "learning_rate": 0.00029402342172566966, "loss": 5.3825, "step": 540 }, { "epoch": 0.69248, "grad_norm": 0.5942022800445557, "learning_rate": 0.0002939830394400323, "loss": 5.3754, "step": 541 }, { "epoch": 0.69376, "grad_norm": 0.6257084608078003, "learning_rate": 0.0002939426571543949, "loss": 5.3383, "step": 542 }, { "epoch": 0.69504, "grad_norm": 0.5751544237136841, "learning_rate": 0.00029390227486875755, "loss": 5.3087, "step": 543 }, { "epoch": 0.69632, "grad_norm": 0.6479839086532593, "learning_rate": 0.0002938618925831202, "loss": 5.4463, "step": 544 }, { "epoch": 0.6976, "grad_norm": 0.6127558350563049, "learning_rate": 0.0002938215102974828, "loss": 5.4066, "step": 545 }, { "epoch": 0.69888, "grad_norm": 0.6831675171852112, "learning_rate": 0.00029378112801184544, "loss": 5.3043, "step": 546 }, { "epoch": 0.70016, "grad_norm": 0.69155353307724, "learning_rate": 0.00029374074572620807, "loss": 5.3776, "step": 547 }, { "epoch": 0.70144, "grad_norm": 0.6620556116104126, "learning_rate": 0.0002937003634405707, "loss": 5.3669, "step": 548 }, { "epoch": 0.70272, "grad_norm": 0.5906518697738647, "learning_rate": 0.00029365998115493333, "loss": 5.3356, "step": 549 }, { "epoch": 0.704, "grad_norm": 0.6667118668556213, "learning_rate": 0.000293619598869296, "loss": 5.327, "step": 550 }, { "epoch": 0.70528, "grad_norm": 0.7207005023956299, "learning_rate": 0.0002935792165836586, "loss": 5.3675, "step": 551 }, { "epoch": 0.70656, "grad_norm": 0.8886672854423523, "learning_rate": 0.0002935388342980212, "loss": 5.3172, "step": 552 }, { "epoch": 0.70784, "grad_norm": 0.9381269216537476, "learning_rate": 0.00029349845201238385, "loss": 5.3404, "step": 553 }, { "epoch": 0.70912, "grad_norm": 0.790340006351471, "learning_rate": 0.0002934580697267465, "loss": 5.3174, "step": 554 }, { "epoch": 0.7104, "grad_norm": 0.636177659034729, "learning_rate": 0.00029341768744110916, "loss": 5.3767, "step": 555 }, { "epoch": 0.71168, "grad_norm": 0.7448100447654724, "learning_rate": 0.0002933773051554718, "loss": 5.3201, "step": 556 }, { "epoch": 0.71296, "grad_norm": 0.7365016341209412, "learning_rate": 0.0002933369228698344, "loss": 5.318, "step": 557 }, { "epoch": 0.71424, "grad_norm": 0.6620506048202515, "learning_rate": 0.000293296540584197, "loss": 5.3417, "step": 558 }, { "epoch": 0.71552, "grad_norm": 0.5934759378433228, "learning_rate": 0.0002932561582985597, "loss": 5.3751, "step": 559 }, { "epoch": 0.7168, "grad_norm": 0.6874362230300903, "learning_rate": 0.0002932157760129223, "loss": 5.3089, "step": 560 }, { "epoch": 0.71808, "grad_norm": 0.6296939253807068, "learning_rate": 0.00029317539372728494, "loss": 5.3057, "step": 561 }, { "epoch": 0.71936, "grad_norm": 0.5342876315116882, "learning_rate": 0.0002931350114416476, "loss": 5.3055, "step": 562 }, { "epoch": 0.72064, "grad_norm": 0.7313459515571594, "learning_rate": 0.0002930946291560102, "loss": 5.3002, "step": 563 }, { "epoch": 0.72192, "grad_norm": 0.7934847474098206, "learning_rate": 0.00029305424687037283, "loss": 5.394, "step": 564 }, { "epoch": 0.7232, "grad_norm": 0.6675227284431458, "learning_rate": 0.00029301386458473546, "loss": 5.2447, "step": 565 }, { "epoch": 0.72448, "grad_norm": 0.6422837972640991, "learning_rate": 0.0002929734822990981, "loss": 5.3177, "step": 566 }, { "epoch": 0.72576, "grad_norm": 0.6145399212837219, "learning_rate": 0.0002929331000134607, "loss": 5.3537, "step": 567 }, { "epoch": 0.72704, "grad_norm": 0.5638757944107056, "learning_rate": 0.00029289271772782335, "loss": 5.2738, "step": 568 }, { "epoch": 0.72832, "grad_norm": 0.6802723407745361, "learning_rate": 0.000292852335442186, "loss": 5.3411, "step": 569 }, { "epoch": 0.7296, "grad_norm": 0.8496166467666626, "learning_rate": 0.0002928119531565486, "loss": 5.3137, "step": 570 }, { "epoch": 0.73088, "grad_norm": 0.8245171904563904, "learning_rate": 0.00029277157087091125, "loss": 5.3141, "step": 571 }, { "epoch": 0.73216, "grad_norm": 0.6584154963493347, "learning_rate": 0.00029273118858527393, "loss": 5.3264, "step": 572 }, { "epoch": 0.73344, "grad_norm": 0.6775988936424255, "learning_rate": 0.00029269080629963656, "loss": 5.2959, "step": 573 }, { "epoch": 0.73472, "grad_norm": 0.6198107600212097, "learning_rate": 0.0002926504240139992, "loss": 5.2696, "step": 574 }, { "epoch": 0.736, "grad_norm": 0.7053045034408569, "learning_rate": 0.00029261004172836177, "loss": 5.3243, "step": 575 }, { "epoch": 0.73728, "grad_norm": 0.7388597130775452, "learning_rate": 0.00029256965944272445, "loss": 5.3298, "step": 576 }, { "epoch": 0.73856, "grad_norm": 0.7207250595092773, "learning_rate": 0.0002925292771570871, "loss": 5.3113, "step": 577 }, { "epoch": 0.73984, "grad_norm": 0.7020829319953918, "learning_rate": 0.0002924888948714497, "loss": 5.2813, "step": 578 }, { "epoch": 0.74112, "grad_norm": 0.8059638738632202, "learning_rate": 0.00029244851258581234, "loss": 5.2436, "step": 579 }, { "epoch": 0.7424, "grad_norm": 1.0253353118896484, "learning_rate": 0.00029240813030017497, "loss": 5.2739, "step": 580 }, { "epoch": 0.74368, "grad_norm": 0.963039755821228, "learning_rate": 0.0002923677480145376, "loss": 5.2639, "step": 581 }, { "epoch": 0.74496, "grad_norm": 0.7356200814247131, "learning_rate": 0.00029232736572890023, "loss": 5.2874, "step": 582 }, { "epoch": 0.74624, "grad_norm": 0.7162445187568665, "learning_rate": 0.00029228698344326286, "loss": 5.3408, "step": 583 }, { "epoch": 0.74752, "grad_norm": 0.708344042301178, "learning_rate": 0.0002922466011576255, "loss": 5.2798, "step": 584 }, { "epoch": 0.7488, "grad_norm": 0.5786714553833008, "learning_rate": 0.0002922062188719881, "loss": 5.285, "step": 585 }, { "epoch": 0.75008, "grad_norm": 0.6987271904945374, "learning_rate": 0.00029216583658635075, "loss": 5.2781, "step": 586 }, { "epoch": 0.75136, "grad_norm": 0.6231822967529297, "learning_rate": 0.0002921254543007134, "loss": 5.2242, "step": 587 }, { "epoch": 0.75264, "grad_norm": 0.679735004901886, "learning_rate": 0.000292085072015076, "loss": 5.3149, "step": 588 }, { "epoch": 0.75392, "grad_norm": 0.6461718678474426, "learning_rate": 0.0002920446897294387, "loss": 5.2661, "step": 589 }, { "epoch": 0.7552, "grad_norm": 0.6281011700630188, "learning_rate": 0.0002920043074438013, "loss": 5.1899, "step": 590 }, { "epoch": 0.75648, "grad_norm": 0.6488833427429199, "learning_rate": 0.0002919639251581639, "loss": 5.2635, "step": 591 }, { "epoch": 0.75776, "grad_norm": 0.7168968319892883, "learning_rate": 0.00029192354287252653, "loss": 5.1908, "step": 592 }, { "epoch": 0.75904, "grad_norm": 0.7377144694328308, "learning_rate": 0.0002918831605868892, "loss": 5.2676, "step": 593 }, { "epoch": 0.76032, "grad_norm": 0.5983268022537231, "learning_rate": 0.00029184277830125185, "loss": 5.26, "step": 594 }, { "epoch": 0.7616, "grad_norm": 0.6470866203308105, "learning_rate": 0.0002918023960156145, "loss": 5.228, "step": 595 }, { "epoch": 0.76288, "grad_norm": 0.7147027850151062, "learning_rate": 0.0002917620137299771, "loss": 5.2475, "step": 596 }, { "epoch": 0.76416, "grad_norm": 0.686434268951416, "learning_rate": 0.00029172163144433974, "loss": 5.2174, "step": 597 }, { "epoch": 0.76544, "grad_norm": 0.7053865194320679, "learning_rate": 0.00029168124915870237, "loss": 5.2822, "step": 598 }, { "epoch": 0.76672, "grad_norm": 0.8161548972129822, "learning_rate": 0.000291640866873065, "loss": 5.1864, "step": 599 }, { "epoch": 0.768, "grad_norm": 0.6877032518386841, "learning_rate": 0.0002916004845874276, "loss": 5.281, "step": 600 }, { "epoch": 0.76928, "grad_norm": 0.5830395817756653, "learning_rate": 0.00029156010230179026, "loss": 5.2073, "step": 601 }, { "epoch": 0.77056, "grad_norm": 0.7007346749305725, "learning_rate": 0.0002915197200161529, "loss": 5.3173, "step": 602 }, { "epoch": 0.77184, "grad_norm": 0.7271237969398499, "learning_rate": 0.0002914793377305155, "loss": 5.235, "step": 603 }, { "epoch": 0.77312, "grad_norm": 0.5913999080657959, "learning_rate": 0.00029143895544487815, "loss": 5.2552, "step": 604 }, { "epoch": 0.7744, "grad_norm": 0.6722851991653442, "learning_rate": 0.0002913985731592408, "loss": 5.2112, "step": 605 }, { "epoch": 0.77568, "grad_norm": 0.6968867182731628, "learning_rate": 0.00029135819087360346, "loss": 5.2505, "step": 606 }, { "epoch": 0.77696, "grad_norm": 0.6645686030387878, "learning_rate": 0.00029131780858796604, "loss": 5.1846, "step": 607 }, { "epoch": 0.77824, "grad_norm": 0.6209181547164917, "learning_rate": 0.00029127742630232867, "loss": 5.2269, "step": 608 }, { "epoch": 0.77952, "grad_norm": 0.7518525123596191, "learning_rate": 0.0002912370440166913, "loss": 5.1761, "step": 609 }, { "epoch": 0.7808, "grad_norm": 0.6878477334976196, "learning_rate": 0.0002911966617310539, "loss": 5.2348, "step": 610 }, { "epoch": 0.78208, "grad_norm": 0.7975960969924927, "learning_rate": 0.0002911562794454166, "loss": 5.2519, "step": 611 }, { "epoch": 0.78336, "grad_norm": 0.7792649269104004, "learning_rate": 0.00029111589715977924, "loss": 5.1812, "step": 612 }, { "epoch": 0.78464, "grad_norm": 0.7435747385025024, "learning_rate": 0.00029107551487414187, "loss": 5.1505, "step": 613 }, { "epoch": 0.78592, "grad_norm": 0.7414288520812988, "learning_rate": 0.00029103513258850445, "loss": 5.1972, "step": 614 }, { "epoch": 0.7872, "grad_norm": 0.7069399356842041, "learning_rate": 0.00029099475030286713, "loss": 5.1766, "step": 615 }, { "epoch": 0.78848, "grad_norm": 0.5848967432975769, "learning_rate": 0.00029095436801722976, "loss": 5.1791, "step": 616 }, { "epoch": 0.78976, "grad_norm": 0.6382315158843994, "learning_rate": 0.0002909139857315924, "loss": 5.2266, "step": 617 }, { "epoch": 0.79104, "grad_norm": 0.6295589208602905, "learning_rate": 0.000290873603445955, "loss": 5.2489, "step": 618 }, { "epoch": 0.79232, "grad_norm": 0.6399298310279846, "learning_rate": 0.00029083322116031765, "loss": 5.2295, "step": 619 }, { "epoch": 0.7936, "grad_norm": 0.6969920992851257, "learning_rate": 0.0002907928388746803, "loss": 5.1619, "step": 620 }, { "epoch": 0.79488, "grad_norm": 0.6793097853660583, "learning_rate": 0.0002907524565890429, "loss": 5.2213, "step": 621 }, { "epoch": 0.79616, "grad_norm": 0.6430673003196716, "learning_rate": 0.00029071207430340554, "loss": 5.2848, "step": 622 }, { "epoch": 0.79744, "grad_norm": 0.7029622793197632, "learning_rate": 0.00029067169201776817, "loss": 5.2335, "step": 623 }, { "epoch": 0.79872, "grad_norm": 0.5936407446861267, "learning_rate": 0.0002906313097321308, "loss": 5.2245, "step": 624 }, { "epoch": 0.8, "grad_norm": 0.7349820733070374, "learning_rate": 0.00029059092744649343, "loss": 5.2356, "step": 625 }, { "epoch": 0.80128, "grad_norm": 0.747024655342102, "learning_rate": 0.00029055054516085606, "loss": 5.1537, "step": 626 }, { "epoch": 0.80256, "grad_norm": 0.5645571351051331, "learning_rate": 0.0002905101628752187, "loss": 5.1635, "step": 627 }, { "epoch": 0.80384, "grad_norm": 0.7009657621383667, "learning_rate": 0.0002904697805895814, "loss": 5.2236, "step": 628 }, { "epoch": 0.80512, "grad_norm": 0.7518696188926697, "learning_rate": 0.000290429398303944, "loss": 5.132, "step": 629 }, { "epoch": 0.8064, "grad_norm": 0.7385541796684265, "learning_rate": 0.0002903890160183066, "loss": 5.184, "step": 630 }, { "epoch": 0.80768, "grad_norm": 0.6494706869125366, "learning_rate": 0.0002903486337326692, "loss": 5.2385, "step": 631 }, { "epoch": 0.80896, "grad_norm": 0.7356922626495361, "learning_rate": 0.0002903082514470319, "loss": 5.1194, "step": 632 }, { "epoch": 0.81024, "grad_norm": 0.5700814723968506, "learning_rate": 0.0002902678691613945, "loss": 5.1652, "step": 633 }, { "epoch": 0.81152, "grad_norm": 0.5895199775695801, "learning_rate": 0.00029022748687575716, "loss": 5.1429, "step": 634 }, { "epoch": 0.8128, "grad_norm": 0.5886778831481934, "learning_rate": 0.0002901871045901198, "loss": 5.1932, "step": 635 }, { "epoch": 0.81408, "grad_norm": 0.6429824829101562, "learning_rate": 0.0002901467223044824, "loss": 5.1488, "step": 636 }, { "epoch": 0.81536, "grad_norm": 0.5817999243736267, "learning_rate": 0.00029010634001884505, "loss": 5.1306, "step": 637 }, { "epoch": 0.81664, "grad_norm": 0.6249139308929443, "learning_rate": 0.0002900659577332077, "loss": 5.1539, "step": 638 }, { "epoch": 0.81792, "grad_norm": 0.6901640295982361, "learning_rate": 0.0002900255754475703, "loss": 5.1751, "step": 639 }, { "epoch": 0.8192, "grad_norm": 0.7093073725700378, "learning_rate": 0.00028998519316193294, "loss": 5.1692, "step": 640 }, { "epoch": 0.82048, "grad_norm": 0.6962242722511292, "learning_rate": 0.00028994481087629557, "loss": 5.1768, "step": 641 }, { "epoch": 0.82176, "grad_norm": 0.712927520275116, "learning_rate": 0.0002899044285906582, "loss": 5.1236, "step": 642 }, { "epoch": 0.82304, "grad_norm": 0.600985586643219, "learning_rate": 0.00028986404630502083, "loss": 5.2116, "step": 643 }, { "epoch": 0.82432, "grad_norm": 0.6715438365936279, "learning_rate": 0.00028982366401938346, "loss": 5.1825, "step": 644 }, { "epoch": 0.8256, "grad_norm": 0.854724645614624, "learning_rate": 0.00028978328173374614, "loss": 5.1579, "step": 645 }, { "epoch": 0.82688, "grad_norm": 0.8737854957580566, "learning_rate": 0.00028974289944810877, "loss": 5.1176, "step": 646 }, { "epoch": 0.82816, "grad_norm": 0.8141425251960754, "learning_rate": 0.00028970251716247135, "loss": 5.172, "step": 647 }, { "epoch": 0.82944, "grad_norm": 0.6891692280769348, "learning_rate": 0.000289662134876834, "loss": 5.2035, "step": 648 }, { "epoch": 0.83072, "grad_norm": 0.6787595748901367, "learning_rate": 0.0002896217525911966, "loss": 5.1818, "step": 649 }, { "epoch": 0.832, "grad_norm": 0.8094693422317505, "learning_rate": 0.0002895813703055593, "loss": 5.0946, "step": 650 }, { "epoch": 0.83328, "grad_norm": 0.6562574505805969, "learning_rate": 0.0002895409880199219, "loss": 5.093, "step": 651 }, { "epoch": 0.83456, "grad_norm": 0.5972064733505249, "learning_rate": 0.00028950060573428455, "loss": 5.2078, "step": 652 }, { "epoch": 0.83584, "grad_norm": 0.6744362115859985, "learning_rate": 0.00028946022344864713, "loss": 5.0914, "step": 653 }, { "epoch": 0.83712, "grad_norm": 0.7293115258216858, "learning_rate": 0.0002894198411630098, "loss": 5.0954, "step": 654 }, { "epoch": 0.8384, "grad_norm": 0.6300185918807983, "learning_rate": 0.00028937945887737244, "loss": 5.2061, "step": 655 }, { "epoch": 0.83968, "grad_norm": 0.729161262512207, "learning_rate": 0.00028933907659173507, "loss": 5.1129, "step": 656 }, { "epoch": 0.84096, "grad_norm": 0.7147690653800964, "learning_rate": 0.0002892986943060977, "loss": 5.1773, "step": 657 }, { "epoch": 0.84224, "grad_norm": 0.6832327842712402, "learning_rate": 0.00028925831202046033, "loss": 5.1379, "step": 658 }, { "epoch": 0.84352, "grad_norm": 0.7403486371040344, "learning_rate": 0.00028921792973482296, "loss": 5.1094, "step": 659 }, { "epoch": 0.8448, "grad_norm": 0.7538831830024719, "learning_rate": 0.0002891775474491856, "loss": 5.0802, "step": 660 }, { "epoch": 0.84608, "grad_norm": 0.7270022630691528, "learning_rate": 0.0002891371651635482, "loss": 5.1342, "step": 661 }, { "epoch": 0.84736, "grad_norm": 0.6888076066970825, "learning_rate": 0.00028909678287791085, "loss": 5.118, "step": 662 }, { "epoch": 0.84864, "grad_norm": 0.6924152374267578, "learning_rate": 0.0002890564005922735, "loss": 5.1728, "step": 663 }, { "epoch": 0.84992, "grad_norm": 0.6232258081436157, "learning_rate": 0.0002890160183066361, "loss": 5.1357, "step": 664 }, { "epoch": 0.8512, "grad_norm": 0.7234542369842529, "learning_rate": 0.00028897563602099874, "loss": 5.0607, "step": 665 }, { "epoch": 0.85248, "grad_norm": 0.7320601940155029, "learning_rate": 0.0002889352537353614, "loss": 5.1074, "step": 666 }, { "epoch": 0.85376, "grad_norm": 0.6856744289398193, "learning_rate": 0.00028889487144972406, "loss": 5.0398, "step": 667 }, { "epoch": 0.85504, "grad_norm": 0.6123003363609314, "learning_rate": 0.0002888544891640867, "loss": 5.178, "step": 668 }, { "epoch": 0.85632, "grad_norm": 0.6712738871574402, "learning_rate": 0.0002888141068784493, "loss": 5.1151, "step": 669 }, { "epoch": 0.8576, "grad_norm": 0.6770631670951843, "learning_rate": 0.0002887737245928119, "loss": 5.1691, "step": 670 }, { "epoch": 0.85888, "grad_norm": 0.603068470954895, "learning_rate": 0.0002887333423071746, "loss": 5.0652, "step": 671 }, { "epoch": 0.86016, "grad_norm": 0.7057216167449951, "learning_rate": 0.0002886929600215372, "loss": 5.1038, "step": 672 }, { "epoch": 0.86144, "grad_norm": 0.708411693572998, "learning_rate": 0.00028865257773589984, "loss": 5.1423, "step": 673 }, { "epoch": 0.86272, "grad_norm": 0.634673535823822, "learning_rate": 0.00028861219545026247, "loss": 5.0765, "step": 674 }, { "epoch": 0.864, "grad_norm": 0.6029425263404846, "learning_rate": 0.0002885718131646251, "loss": 5.0668, "step": 675 }, { "epoch": 0.86528, "grad_norm": 0.6047025918960571, "learning_rate": 0.00028853143087898773, "loss": 5.0427, "step": 676 }, { "epoch": 0.86656, "grad_norm": 0.6101155281066895, "learning_rate": 0.00028849104859335036, "loss": 5.1077, "step": 677 }, { "epoch": 0.86784, "grad_norm": 0.6415575742721558, "learning_rate": 0.000288450666307713, "loss": 5.1074, "step": 678 }, { "epoch": 0.86912, "grad_norm": 0.6284694671630859, "learning_rate": 0.0002884102840220756, "loss": 5.1415, "step": 679 }, { "epoch": 0.8704, "grad_norm": 0.6168042421340942, "learning_rate": 0.00028836990173643825, "loss": 5.06, "step": 680 }, { "epoch": 0.87168, "grad_norm": 0.6429285407066345, "learning_rate": 0.0002883295194508009, "loss": 5.1369, "step": 681 }, { "epoch": 0.87296, "grad_norm": 0.6893006563186646, "learning_rate": 0.0002882891371651635, "loss": 5.0639, "step": 682 }, { "epoch": 0.87424, "grad_norm": 0.5782660841941833, "learning_rate": 0.00028824875487952614, "loss": 5.0679, "step": 683 }, { "epoch": 0.87552, "grad_norm": 0.6747514009475708, "learning_rate": 0.0002882083725938888, "loss": 5.097, "step": 684 }, { "epoch": 0.8768, "grad_norm": 0.7318249940872192, "learning_rate": 0.00028816799030825145, "loss": 5.1338, "step": 685 }, { "epoch": 0.87808, "grad_norm": 0.6769218444824219, "learning_rate": 0.00028812760802261403, "loss": 5.1185, "step": 686 }, { "epoch": 0.87936, "grad_norm": 0.6341337561607361, "learning_rate": 0.00028808722573697666, "loss": 5.1188, "step": 687 }, { "epoch": 0.88064, "grad_norm": 0.6396981477737427, "learning_rate": 0.0002880468434513393, "loss": 5.0718, "step": 688 }, { "epoch": 0.88192, "grad_norm": 0.703349232673645, "learning_rate": 0.000288006461165702, "loss": 5.1783, "step": 689 }, { "epoch": 0.8832, "grad_norm": 0.7219293713569641, "learning_rate": 0.0002879660788800646, "loss": 5.0716, "step": 690 }, { "epoch": 0.88448, "grad_norm": 0.6729134321212769, "learning_rate": 0.00028792569659442723, "loss": 5.0406, "step": 691 }, { "epoch": 0.88576, "grad_norm": 0.6320032477378845, "learning_rate": 0.00028788531430878986, "loss": 5.0302, "step": 692 }, { "epoch": 0.88704, "grad_norm": 0.662146806716919, "learning_rate": 0.0002878449320231525, "loss": 5.0877, "step": 693 }, { "epoch": 0.88832, "grad_norm": 0.6499378085136414, "learning_rate": 0.0002878045497375151, "loss": 4.983, "step": 694 }, { "epoch": 0.8896, "grad_norm": 0.600250780582428, "learning_rate": 0.00028776416745187775, "loss": 5.0135, "step": 695 }, { "epoch": 0.89088, "grad_norm": 0.6061322689056396, "learning_rate": 0.0002877237851662404, "loss": 5.0913, "step": 696 }, { "epoch": 0.89216, "grad_norm": 0.6367159485816956, "learning_rate": 0.000287683402880603, "loss": 5.0589, "step": 697 }, { "epoch": 0.89344, "grad_norm": 0.6972000598907471, "learning_rate": 0.00028764302059496564, "loss": 5.057, "step": 698 }, { "epoch": 0.89472, "grad_norm": 0.7460048794746399, "learning_rate": 0.0002876026383093283, "loss": 4.9804, "step": 699 }, { "epoch": 0.896, "grad_norm": 0.7880580425262451, "learning_rate": 0.0002875622560236909, "loss": 5.0366, "step": 700 }, { "epoch": 0.89728, "grad_norm": 0.857755720615387, "learning_rate": 0.00028752187373805353, "loss": 5.1399, "step": 701 }, { "epoch": 0.89856, "grad_norm": 0.8949594497680664, "learning_rate": 0.00028748149145241616, "loss": 5.0537, "step": 702 }, { "epoch": 0.89984, "grad_norm": 0.6883854866027832, "learning_rate": 0.0002874411091667788, "loss": 5.0771, "step": 703 }, { "epoch": 0.90112, "grad_norm": 0.7915716171264648, "learning_rate": 0.0002874007268811414, "loss": 5.1028, "step": 704 }, { "epoch": 0.9024, "grad_norm": 0.6309002041816711, "learning_rate": 0.00028736034459550405, "loss": 5.079, "step": 705 }, { "epoch": 0.90368, "grad_norm": 0.639817476272583, "learning_rate": 0.00028731996230986674, "loss": 5.1016, "step": 706 }, { "epoch": 0.90496, "grad_norm": 0.6250396966934204, "learning_rate": 0.00028727958002422937, "loss": 4.9772, "step": 707 }, { "epoch": 0.90624, "grad_norm": 0.5922684669494629, "learning_rate": 0.000287239197738592, "loss": 5.0849, "step": 708 }, { "epoch": 0.90752, "grad_norm": 0.6167359352111816, "learning_rate": 0.0002871988154529546, "loss": 5.0408, "step": 709 }, { "epoch": 0.9088, "grad_norm": 0.6019271612167358, "learning_rate": 0.00028715843316731726, "loss": 5.0312, "step": 710 }, { "epoch": 0.91008, "grad_norm": 0.6929312348365784, "learning_rate": 0.0002871180508816799, "loss": 5.0345, "step": 711 }, { "epoch": 0.91136, "grad_norm": 0.6549073457717896, "learning_rate": 0.0002870776685960425, "loss": 5.0344, "step": 712 }, { "epoch": 0.91264, "grad_norm": 0.6558994054794312, "learning_rate": 0.00028703728631040515, "loss": 4.9554, "step": 713 }, { "epoch": 0.91392, "grad_norm": 0.5860869884490967, "learning_rate": 0.0002869969040247678, "loss": 4.9838, "step": 714 }, { "epoch": 0.9152, "grad_norm": 0.6493767499923706, "learning_rate": 0.0002869565217391304, "loss": 5.0245, "step": 715 }, { "epoch": 0.91648, "grad_norm": 0.6401461958885193, "learning_rate": 0.00028691613945349304, "loss": 5.0256, "step": 716 }, { "epoch": 0.91776, "grad_norm": 0.8375266194343567, "learning_rate": 0.00028687575716785567, "loss": 4.9672, "step": 717 }, { "epoch": 0.91904, "grad_norm": 0.7304956912994385, "learning_rate": 0.0002868353748822183, "loss": 5.024, "step": 718 }, { "epoch": 0.92032, "grad_norm": 0.7382602095603943, "learning_rate": 0.00028679499259658093, "loss": 5.0483, "step": 719 }, { "epoch": 0.9216, "grad_norm": 0.7157966494560242, "learning_rate": 0.00028675461031094356, "loss": 4.9645, "step": 720 }, { "epoch": 0.92288, "grad_norm": 0.7719393372535706, "learning_rate": 0.0002867142280253062, "loss": 5.0018, "step": 721 }, { "epoch": 0.92416, "grad_norm": 0.6828843355178833, "learning_rate": 0.0002866738457396688, "loss": 4.9844, "step": 722 }, { "epoch": 0.92544, "grad_norm": 0.6229972243309021, "learning_rate": 0.0002866334634540315, "loss": 4.9949, "step": 723 }, { "epoch": 0.92672, "grad_norm": 0.6336907744407654, "learning_rate": 0.00028659308116839413, "loss": 5.0498, "step": 724 }, { "epoch": 0.928, "grad_norm": 0.6664116978645325, "learning_rate": 0.0002865526988827567, "loss": 5.0202, "step": 725 }, { "epoch": 0.92928, "grad_norm": 0.6601686477661133, "learning_rate": 0.00028651231659711934, "loss": 4.892, "step": 726 }, { "epoch": 0.93056, "grad_norm": 0.713094174861908, "learning_rate": 0.000286471934311482, "loss": 4.9817, "step": 727 }, { "epoch": 0.93184, "grad_norm": 0.5439857840538025, "learning_rate": 0.00028643155202584465, "loss": 5.0493, "step": 728 }, { "epoch": 0.93312, "grad_norm": 0.6119688749313354, "learning_rate": 0.0002863911697402073, "loss": 5.0149, "step": 729 }, { "epoch": 0.9344, "grad_norm": 0.6848023533821106, "learning_rate": 0.0002863507874545699, "loss": 5.0056, "step": 730 }, { "epoch": 0.93568, "grad_norm": 0.6950737833976746, "learning_rate": 0.00028631040516893255, "loss": 5.0121, "step": 731 }, { "epoch": 0.93696, "grad_norm": 0.825659453868866, "learning_rate": 0.0002862700228832952, "loss": 4.9852, "step": 732 }, { "epoch": 0.93824, "grad_norm": 0.7373353242874146, "learning_rate": 0.0002862296405976578, "loss": 4.943, "step": 733 }, { "epoch": 0.93952, "grad_norm": 0.6202784776687622, "learning_rate": 0.00028618925831202044, "loss": 5.0669, "step": 734 }, { "epoch": 0.9408, "grad_norm": 0.6890573501586914, "learning_rate": 0.00028614887602638307, "loss": 4.9571, "step": 735 }, { "epoch": 0.94208, "grad_norm": 0.6824938654899597, "learning_rate": 0.0002861084937407457, "loss": 4.9799, "step": 736 }, { "epoch": 0.94336, "grad_norm": 0.693109393119812, "learning_rate": 0.0002860681114551083, "loss": 5.0417, "step": 737 }, { "epoch": 0.94464, "grad_norm": 0.5777782201766968, "learning_rate": 0.00028602772916947096, "loss": 5.0035, "step": 738 }, { "epoch": 0.94592, "grad_norm": 0.6748936772346497, "learning_rate": 0.0002859873468838336, "loss": 4.9764, "step": 739 }, { "epoch": 0.9472, "grad_norm": 0.7863640785217285, "learning_rate": 0.00028594696459819627, "loss": 4.9554, "step": 740 }, { "epoch": 0.94848, "grad_norm": 0.7226070761680603, "learning_rate": 0.0002859065823125589, "loss": 4.9605, "step": 741 }, { "epoch": 0.94976, "grad_norm": 0.6418464779853821, "learning_rate": 0.0002858662000269215, "loss": 4.9418, "step": 742 }, { "epoch": 0.95104, "grad_norm": 0.8660367727279663, "learning_rate": 0.0002858258177412841, "loss": 5.0473, "step": 743 }, { "epoch": 0.95232, "grad_norm": 0.7218934297561646, "learning_rate": 0.00028578543545564674, "loss": 5.0287, "step": 744 }, { "epoch": 0.9536, "grad_norm": 0.6135517954826355, "learning_rate": 0.0002857450531700094, "loss": 4.9141, "step": 745 }, { "epoch": 0.95488, "grad_norm": 0.7062588334083557, "learning_rate": 0.00028570467088437205, "loss": 4.974, "step": 746 }, { "epoch": 0.95616, "grad_norm": 0.6527339220046997, "learning_rate": 0.0002856642885987347, "loss": 4.9275, "step": 747 }, { "epoch": 0.95744, "grad_norm": 0.601574718952179, "learning_rate": 0.0002856239063130973, "loss": 5.0131, "step": 748 }, { "epoch": 0.95872, "grad_norm": 0.6972563862800598, "learning_rate": 0.00028558352402745994, "loss": 5.0037, "step": 749 }, { "epoch": 0.96, "grad_norm": 0.6463833451271057, "learning_rate": 0.00028554314174182257, "loss": 5.0173, "step": 750 }, { "epoch": 0.96128, "grad_norm": 0.7086387872695923, "learning_rate": 0.0002855027594561852, "loss": 5.0418, "step": 751 }, { "epoch": 0.96256, "grad_norm": 0.6756019592285156, "learning_rate": 0.00028546237717054783, "loss": 4.9527, "step": 752 }, { "epoch": 0.96384, "grad_norm": 0.7218304872512817, "learning_rate": 0.00028542199488491046, "loss": 4.99, "step": 753 }, { "epoch": 0.96512, "grad_norm": 0.6159854531288147, "learning_rate": 0.0002853816125992731, "loss": 4.9612, "step": 754 }, { "epoch": 0.9664, "grad_norm": 0.7188109159469604, "learning_rate": 0.0002853412303136357, "loss": 4.957, "step": 755 }, { "epoch": 0.96768, "grad_norm": 0.7729025483131409, "learning_rate": 0.00028530084802799835, "loss": 5.03, "step": 756 }, { "epoch": 0.96896, "grad_norm": 0.6761441230773926, "learning_rate": 0.000285260465742361, "loss": 4.9148, "step": 757 }, { "epoch": 0.97024, "grad_norm": 0.6675111055374146, "learning_rate": 0.0002852200834567236, "loss": 4.9832, "step": 758 }, { "epoch": 0.97152, "grad_norm": 0.6335251927375793, "learning_rate": 0.00028517970117108624, "loss": 4.9268, "step": 759 }, { "epoch": 0.9728, "grad_norm": 0.633287787437439, "learning_rate": 0.00028513931888544887, "loss": 4.9717, "step": 760 }, { "epoch": 0.97408, "grad_norm": 0.5922399163246155, "learning_rate": 0.0002850989365998115, "loss": 4.9945, "step": 761 }, { "epoch": 0.97536, "grad_norm": 0.5817521214485168, "learning_rate": 0.0002850585543141742, "loss": 4.9261, "step": 762 }, { "epoch": 0.97664, "grad_norm": 0.6280835270881653, "learning_rate": 0.0002850181720285368, "loss": 4.9935, "step": 763 }, { "epoch": 0.97792, "grad_norm": 0.6975805163383484, "learning_rate": 0.00028497778974289945, "loss": 4.9161, "step": 764 }, { "epoch": 0.9792, "grad_norm": 0.63359135389328, "learning_rate": 0.000284937407457262, "loss": 4.9866, "step": 765 }, { "epoch": 0.98048, "grad_norm": 0.5709059238433838, "learning_rate": 0.0002848970251716247, "loss": 5.0262, "step": 766 }, { "epoch": 0.98176, "grad_norm": 0.5760507583618164, "learning_rate": 0.00028485664288598734, "loss": 4.9359, "step": 767 }, { "epoch": 0.98304, "grad_norm": 0.5946138501167297, "learning_rate": 0.00028481626060034997, "loss": 4.9843, "step": 768 }, { "epoch": 0.98432, "grad_norm": 0.6102321743965149, "learning_rate": 0.0002847758783147126, "loss": 4.9541, "step": 769 }, { "epoch": 0.9856, "grad_norm": 0.7592500448226929, "learning_rate": 0.0002847354960290752, "loss": 4.9617, "step": 770 }, { "epoch": 0.98688, "grad_norm": 0.7633534669876099, "learning_rate": 0.00028469511374343786, "loss": 4.9587, "step": 771 }, { "epoch": 0.98816, "grad_norm": 0.778260350227356, "learning_rate": 0.0002846547314578005, "loss": 4.9701, "step": 772 }, { "epoch": 0.98944, "grad_norm": 0.6937357187271118, "learning_rate": 0.0002846143491721631, "loss": 4.8862, "step": 773 }, { "epoch": 0.99072, "grad_norm": 0.7559827566146851, "learning_rate": 0.00028457396688652575, "loss": 4.9598, "step": 774 }, { "epoch": 0.992, "grad_norm": 0.699320375919342, "learning_rate": 0.0002845335846008884, "loss": 4.9923, "step": 775 }, { "epoch": 0.99328, "grad_norm": 0.6152313947677612, "learning_rate": 0.000284493202315251, "loss": 4.9211, "step": 776 }, { "epoch": 0.99456, "grad_norm": 0.6833812594413757, "learning_rate": 0.00028445282002961364, "loss": 4.9163, "step": 777 }, { "epoch": 0.99584, "grad_norm": 0.6250735521316528, "learning_rate": 0.00028441243774397627, "loss": 4.8906, "step": 778 }, { "epoch": 0.99712, "grad_norm": 0.592545747756958, "learning_rate": 0.00028437205545833895, "loss": 4.8642, "step": 779 }, { "epoch": 0.9984, "grad_norm": 0.6293628811836243, "learning_rate": 0.0002843316731727016, "loss": 4.899, "step": 780 }, { "epoch": 0.99968, "grad_norm": 0.8118374943733215, "learning_rate": 0.00028429129088706416, "loss": 4.9125, "step": 781 }, { "epoch": 1.0, "grad_norm": 1.0649632215499878, "learning_rate": 0.0002842509086014268, "loss": 4.8757, "step": 782 }, { "epoch": 1.00128, "grad_norm": 0.8830782175064087, "learning_rate": 0.0002842105263157894, "loss": 4.9041, "step": 783 }, { "epoch": 1.00256, "grad_norm": 0.9660894870758057, "learning_rate": 0.0002841701440301521, "loss": 4.9646, "step": 784 }, { "epoch": 1.00384, "grad_norm": 0.8279880285263062, "learning_rate": 0.00028412976174451473, "loss": 4.9266, "step": 785 }, { "epoch": 1.00512, "grad_norm": 0.8470652103424072, "learning_rate": 0.00028408937945887736, "loss": 4.8514, "step": 786 }, { "epoch": 1.0064, "grad_norm": 0.7699068784713745, "learning_rate": 0.00028404899717324, "loss": 4.809, "step": 787 }, { "epoch": 1.00768, "grad_norm": 0.7924918532371521, "learning_rate": 0.0002840086148876026, "loss": 4.8487, "step": 788 }, { "epoch": 1.00896, "grad_norm": 0.7347840070724487, "learning_rate": 0.00028396823260196525, "loss": 4.8208, "step": 789 }, { "epoch": 1.01024, "grad_norm": 0.6861421465873718, "learning_rate": 0.0002839278503163279, "loss": 4.8625, "step": 790 }, { "epoch": 1.01152, "grad_norm": 0.6445014476776123, "learning_rate": 0.0002838874680306905, "loss": 4.916, "step": 791 }, { "epoch": 1.0128, "grad_norm": 0.6200584769248962, "learning_rate": 0.00028384708574505314, "loss": 4.955, "step": 792 }, { "epoch": 1.01408, "grad_norm": 0.5568555593490601, "learning_rate": 0.00028380670345941577, "loss": 4.9134, "step": 793 }, { "epoch": 1.01536, "grad_norm": 0.6153519749641418, "learning_rate": 0.0002837663211737784, "loss": 4.8438, "step": 794 }, { "epoch": 1.01664, "grad_norm": 0.7319142818450928, "learning_rate": 0.00028372593888814103, "loss": 4.919, "step": 795 }, { "epoch": 1.01792, "grad_norm": 0.7483083605766296, "learning_rate": 0.00028368555660250366, "loss": 4.8444, "step": 796 }, { "epoch": 1.0192, "grad_norm": 0.7571709156036377, "learning_rate": 0.0002836451743168663, "loss": 4.907, "step": 797 }, { "epoch": 1.02048, "grad_norm": 0.7250765562057495, "learning_rate": 0.0002836047920312289, "loss": 4.9101, "step": 798 }, { "epoch": 1.02176, "grad_norm": 0.7267122864723206, "learning_rate": 0.00028356440974559155, "loss": 4.8398, "step": 799 }, { "epoch": 1.02304, "grad_norm": 0.7116459608078003, "learning_rate": 0.0002835240274599542, "loss": 4.8418, "step": 800 }, { "epoch": 1.02432, "grad_norm": 0.6556388735771179, "learning_rate": 0.00028348364517431687, "loss": 4.8538, "step": 801 }, { "epoch": 1.0256, "grad_norm": 0.6552951335906982, "learning_rate": 0.0002834432628886795, "loss": 4.8909, "step": 802 }, { "epoch": 1.02688, "grad_norm": 0.7316877841949463, "learning_rate": 0.00028340288060304213, "loss": 4.8016, "step": 803 }, { "epoch": 1.02816, "grad_norm": 0.7442478537559509, "learning_rate": 0.0002833624983174047, "loss": 4.8387, "step": 804 }, { "epoch": 1.02944, "grad_norm": 0.6588963866233826, "learning_rate": 0.0002833221160317674, "loss": 4.8561, "step": 805 }, { "epoch": 1.03072, "grad_norm": 0.6899812817573547, "learning_rate": 0.00028328173374613, "loss": 4.914, "step": 806 }, { "epoch": 1.032, "grad_norm": 0.7177838087081909, "learning_rate": 0.00028324135146049265, "loss": 4.7727, "step": 807 }, { "epoch": 1.03328, "grad_norm": 0.7071174383163452, "learning_rate": 0.0002832009691748553, "loss": 4.9105, "step": 808 }, { "epoch": 1.03456, "grad_norm": 0.6587151288986206, "learning_rate": 0.0002831605868892179, "loss": 4.8851, "step": 809 }, { "epoch": 1.03584, "grad_norm": 0.6153483390808105, "learning_rate": 0.00028312020460358054, "loss": 4.8638, "step": 810 }, { "epoch": 1.03712, "grad_norm": 0.594373345375061, "learning_rate": 0.00028307982231794317, "loss": 4.865, "step": 811 }, { "epoch": 1.0384, "grad_norm": 0.6585094332695007, "learning_rate": 0.0002830394400323058, "loss": 4.8542, "step": 812 }, { "epoch": 1.03968, "grad_norm": 0.6621379852294922, "learning_rate": 0.00028299905774666843, "loss": 4.8349, "step": 813 }, { "epoch": 1.04096, "grad_norm": 0.5880494713783264, "learning_rate": 0.00028295867546103106, "loss": 4.8509, "step": 814 }, { "epoch": 1.04224, "grad_norm": 0.6204310059547424, "learning_rate": 0.0002829182931753937, "loss": 4.8359, "step": 815 }, { "epoch": 1.04352, "grad_norm": 0.6215956211090088, "learning_rate": 0.0002828779108897563, "loss": 4.8127, "step": 816 }, { "epoch": 1.0448, "grad_norm": 0.6148485541343689, "learning_rate": 0.00028283752860411895, "loss": 4.8714, "step": 817 }, { "epoch": 1.04608, "grad_norm": 0.6803502440452576, "learning_rate": 0.00028279714631848163, "loss": 4.8163, "step": 818 }, { "epoch": 1.04736, "grad_norm": 0.6470843553543091, "learning_rate": 0.00028275676403284426, "loss": 4.8039, "step": 819 }, { "epoch": 1.04864, "grad_norm": 0.5897905230522156, "learning_rate": 0.0002827163817472069, "loss": 4.8399, "step": 820 }, { "epoch": 1.04992, "grad_norm": 0.6008836030960083, "learning_rate": 0.00028267599946156947, "loss": 4.8523, "step": 821 }, { "epoch": 1.0512, "grad_norm": 0.5722599029541016, "learning_rate": 0.0002826356171759321, "loss": 4.8057, "step": 822 }, { "epoch": 1.05248, "grad_norm": 0.5825332403182983, "learning_rate": 0.0002825952348902948, "loss": 4.8341, "step": 823 }, { "epoch": 1.05376, "grad_norm": 0.6503397822380066, "learning_rate": 0.0002825548526046574, "loss": 4.8925, "step": 824 }, { "epoch": 1.05504, "grad_norm": 0.6947764158248901, "learning_rate": 0.00028251447031902004, "loss": 4.7832, "step": 825 }, { "epoch": 1.05632, "grad_norm": 0.6167904734611511, "learning_rate": 0.0002824740880333827, "loss": 4.7814, "step": 826 }, { "epoch": 1.0576, "grad_norm": 0.624954342842102, "learning_rate": 0.0002824337057477453, "loss": 4.7072, "step": 827 }, { "epoch": 1.05888, "grad_norm": 0.7460287809371948, "learning_rate": 0.00028239332346210793, "loss": 4.7652, "step": 828 }, { "epoch": 1.06016, "grad_norm": 0.6951721906661987, "learning_rate": 0.00028235294117647056, "loss": 4.8089, "step": 829 }, { "epoch": 1.06144, "grad_norm": 0.7647305130958557, "learning_rate": 0.0002823125588908332, "loss": 4.8414, "step": 830 }, { "epoch": 1.06272, "grad_norm": 0.7322429418563843, "learning_rate": 0.0002822721766051958, "loss": 4.787, "step": 831 }, { "epoch": 1.064, "grad_norm": 0.6130174398422241, "learning_rate": 0.00028223179431955845, "loss": 4.8232, "step": 832 }, { "epoch": 1.06528, "grad_norm": 0.7291455268859863, "learning_rate": 0.0002821914120339211, "loss": 4.9358, "step": 833 }, { "epoch": 1.06656, "grad_norm": 0.7091724872589111, "learning_rate": 0.0002821510297482837, "loss": 4.7688, "step": 834 }, { "epoch": 1.06784, "grad_norm": 0.6038631200790405, "learning_rate": 0.0002821106474626464, "loss": 4.8498, "step": 835 }, { "epoch": 1.06912, "grad_norm": 0.5576080083847046, "learning_rate": 0.00028207026517700903, "loss": 4.7541, "step": 836 }, { "epoch": 1.0704, "grad_norm": 0.6607750058174133, "learning_rate": 0.0002820298828913716, "loss": 4.7774, "step": 837 }, { "epoch": 1.07168, "grad_norm": 0.6005157232284546, "learning_rate": 0.00028198950060573423, "loss": 4.8224, "step": 838 }, { "epoch": 1.07296, "grad_norm": 0.7134353518486023, "learning_rate": 0.00028194911832009686, "loss": 4.7846, "step": 839 }, { "epoch": 1.07424, "grad_norm": 0.8062997460365295, "learning_rate": 0.00028190873603445955, "loss": 4.8548, "step": 840 }, { "epoch": 1.07552, "grad_norm": 0.7981602549552917, "learning_rate": 0.0002818683537488222, "loss": 4.8362, "step": 841 }, { "epoch": 1.0768, "grad_norm": 0.7718906998634338, "learning_rate": 0.0002818279714631848, "loss": 4.8407, "step": 842 }, { "epoch": 1.07808, "grad_norm": 0.7952736020088196, "learning_rate": 0.00028178758917754744, "loss": 4.8551, "step": 843 }, { "epoch": 1.07936, "grad_norm": 0.7064546346664429, "learning_rate": 0.00028174720689191007, "loss": 4.755, "step": 844 }, { "epoch": 1.08064, "grad_norm": 0.6656110882759094, "learning_rate": 0.0002817068246062727, "loss": 4.7786, "step": 845 }, { "epoch": 1.08192, "grad_norm": 0.6304865479469299, "learning_rate": 0.00028166644232063533, "loss": 4.8153, "step": 846 }, { "epoch": 1.0832, "grad_norm": 0.6220903992652893, "learning_rate": 0.00028162606003499796, "loss": 4.8118, "step": 847 }, { "epoch": 1.08448, "grad_norm": 0.7010989189147949, "learning_rate": 0.0002815856777493606, "loss": 4.8627, "step": 848 }, { "epoch": 1.08576, "grad_norm": 0.6893423795700073, "learning_rate": 0.0002815452954637232, "loss": 4.7906, "step": 849 }, { "epoch": 1.08704, "grad_norm": 0.6807447075843811, "learning_rate": 0.00028150491317808585, "loss": 4.7955, "step": 850 }, { "epoch": 1.08832, "grad_norm": 0.6272568702697754, "learning_rate": 0.0002814645308924485, "loss": 4.6878, "step": 851 }, { "epoch": 1.0896, "grad_norm": 0.6356240510940552, "learning_rate": 0.0002814241486068111, "loss": 4.7682, "step": 852 }, { "epoch": 1.09088, "grad_norm": 0.7134535312652588, "learning_rate": 0.00028138376632117374, "loss": 4.7629, "step": 853 }, { "epoch": 1.09216, "grad_norm": 0.6460220813751221, "learning_rate": 0.00028134338403553637, "loss": 4.7132, "step": 854 }, { "epoch": 1.09344, "grad_norm": 0.746244490146637, "learning_rate": 0.000281303001749899, "loss": 4.8273, "step": 855 }, { "epoch": 1.09472, "grad_norm": 0.7077825665473938, "learning_rate": 0.00028126261946426163, "loss": 4.7677, "step": 856 }, { "epoch": 1.096, "grad_norm": 0.652271568775177, "learning_rate": 0.0002812222371786243, "loss": 4.812, "step": 857 }, { "epoch": 1.09728, "grad_norm": 0.6102200150489807, "learning_rate": 0.00028118185489298694, "loss": 4.7964, "step": 858 }, { "epoch": 1.09856, "grad_norm": 0.7953206896781921, "learning_rate": 0.0002811414726073496, "loss": 4.7903, "step": 859 }, { "epoch": 1.09984, "grad_norm": 0.641959547996521, "learning_rate": 0.00028110109032171215, "loss": 4.8177, "step": 860 }, { "epoch": 1.10112, "grad_norm": 0.7065843939781189, "learning_rate": 0.00028106070803607483, "loss": 4.7971, "step": 861 }, { "epoch": 1.1024, "grad_norm": 0.6949476003646851, "learning_rate": 0.00028102032575043746, "loss": 4.836, "step": 862 }, { "epoch": 1.10368, "grad_norm": 0.7160281538963318, "learning_rate": 0.0002809799434648001, "loss": 4.7041, "step": 863 }, { "epoch": 1.10496, "grad_norm": 0.6318966150283813, "learning_rate": 0.0002809395611791627, "loss": 4.7332, "step": 864 }, { "epoch": 1.1062400000000001, "grad_norm": 0.6665629148483276, "learning_rate": 0.00028089917889352535, "loss": 4.8316, "step": 865 }, { "epoch": 1.10752, "grad_norm": 0.6924526691436768, "learning_rate": 0.000280858796607888, "loss": 4.7881, "step": 866 }, { "epoch": 1.1088, "grad_norm": 0.6507744789123535, "learning_rate": 0.0002808184143222506, "loss": 4.8198, "step": 867 }, { "epoch": 1.11008, "grad_norm": 0.7218488454818726, "learning_rate": 0.00028077803203661324, "loss": 4.8512, "step": 868 }, { "epoch": 1.11136, "grad_norm": 0.7143361568450928, "learning_rate": 0.0002807376497509759, "loss": 4.795, "step": 869 }, { "epoch": 1.11264, "grad_norm": 0.6624107956886292, "learning_rate": 0.0002806972674653385, "loss": 4.7502, "step": 870 }, { "epoch": 1.11392, "grad_norm": 0.6978575587272644, "learning_rate": 0.00028065688517970114, "loss": 4.8148, "step": 871 }, { "epoch": 1.1152, "grad_norm": 0.6641087532043457, "learning_rate": 0.00028061650289406377, "loss": 4.7369, "step": 872 }, { "epoch": 1.11648, "grad_norm": 0.7032644748687744, "learning_rate": 0.0002805761206084264, "loss": 4.7203, "step": 873 }, { "epoch": 1.11776, "grad_norm": 0.6251493096351624, "learning_rate": 0.0002805357383227891, "loss": 4.8037, "step": 874 }, { "epoch": 1.11904, "grad_norm": 0.6594140529632568, "learning_rate": 0.0002804953560371517, "loss": 4.7187, "step": 875 }, { "epoch": 1.12032, "grad_norm": 0.6820217370986938, "learning_rate": 0.0002804549737515143, "loss": 4.7954, "step": 876 }, { "epoch": 1.1216, "grad_norm": 0.7188774347305298, "learning_rate": 0.0002804145914658769, "loss": 4.7619, "step": 877 }, { "epoch": 1.12288, "grad_norm": 0.7069806456565857, "learning_rate": 0.00028037420918023955, "loss": 4.7879, "step": 878 }, { "epoch": 1.12416, "grad_norm": 0.5844770669937134, "learning_rate": 0.00028033382689460223, "loss": 4.7458, "step": 879 }, { "epoch": 1.12544, "grad_norm": 0.6429082751274109, "learning_rate": 0.00028029344460896486, "loss": 4.6233, "step": 880 }, { "epoch": 1.12672, "grad_norm": 0.736958384513855, "learning_rate": 0.0002802530623233275, "loss": 4.745, "step": 881 }, { "epoch": 1.1280000000000001, "grad_norm": 0.6151041984558105, "learning_rate": 0.0002802126800376901, "loss": 4.6771, "step": 882 }, { "epoch": 1.12928, "grad_norm": 0.6079396605491638, "learning_rate": 0.00028017229775205275, "loss": 4.8183, "step": 883 }, { "epoch": 1.13056, "grad_norm": 0.6472486853599548, "learning_rate": 0.0002801319154664154, "loss": 4.713, "step": 884 }, { "epoch": 1.13184, "grad_norm": 0.6157152056694031, "learning_rate": 0.000280091533180778, "loss": 4.727, "step": 885 }, { "epoch": 1.13312, "grad_norm": 0.5990360975265503, "learning_rate": 0.00028005115089514064, "loss": 4.8425, "step": 886 }, { "epoch": 1.1344, "grad_norm": 0.6508432030677795, "learning_rate": 0.00028001076860950327, "loss": 4.7481, "step": 887 }, { "epoch": 1.13568, "grad_norm": 0.6805848479270935, "learning_rate": 0.0002799703863238659, "loss": 4.7301, "step": 888 }, { "epoch": 1.13696, "grad_norm": 0.6594050526618958, "learning_rate": 0.00027993000403822853, "loss": 4.726, "step": 889 }, { "epoch": 1.13824, "grad_norm": 0.5740160942077637, "learning_rate": 0.00027988962175259116, "loss": 4.7209, "step": 890 }, { "epoch": 1.13952, "grad_norm": 0.6788250207901001, "learning_rate": 0.0002798492394669538, "loss": 4.7201, "step": 891 }, { "epoch": 1.1408, "grad_norm": 0.6584210395812988, "learning_rate": 0.0002798088571813165, "loss": 4.7145, "step": 892 }, { "epoch": 1.14208, "grad_norm": 0.6468678116798401, "learning_rate": 0.00027976847489567905, "loss": 4.6967, "step": 893 }, { "epoch": 1.14336, "grad_norm": 0.6821480989456177, "learning_rate": 0.0002797280926100417, "loss": 4.7826, "step": 894 }, { "epoch": 1.1446399999999999, "grad_norm": 0.6273735165596008, "learning_rate": 0.0002796877103244043, "loss": 4.7037, "step": 895 }, { "epoch": 1.14592, "grad_norm": 0.6939446926116943, "learning_rate": 0.000279647328038767, "loss": 4.7438, "step": 896 }, { "epoch": 1.1472, "grad_norm": 0.7489894032478333, "learning_rate": 0.0002796069457531296, "loss": 4.6979, "step": 897 }, { "epoch": 1.14848, "grad_norm": 0.7742366194725037, "learning_rate": 0.00027956656346749226, "loss": 4.691, "step": 898 }, { "epoch": 1.1497600000000001, "grad_norm": 0.7484803199768066, "learning_rate": 0.00027952618118185483, "loss": 4.8254, "step": 899 }, { "epoch": 1.15104, "grad_norm": 0.8308618068695068, "learning_rate": 0.0002794857988962175, "loss": 4.7098, "step": 900 }, { "epoch": 1.15232, "grad_norm": 0.6708440780639648, "learning_rate": 0.00027944541661058015, "loss": 4.8018, "step": 901 }, { "epoch": 1.1536, "grad_norm": 0.6915727853775024, "learning_rate": 0.0002794050343249428, "loss": 4.6875, "step": 902 }, { "epoch": 1.15488, "grad_norm": 0.6630460619926453, "learning_rate": 0.0002793646520393054, "loss": 4.7815, "step": 903 }, { "epoch": 1.15616, "grad_norm": 0.6760913133621216, "learning_rate": 0.00027932426975366804, "loss": 4.7237, "step": 904 }, { "epoch": 1.15744, "grad_norm": 0.6769623160362244, "learning_rate": 0.00027928388746803067, "loss": 4.814, "step": 905 }, { "epoch": 1.15872, "grad_norm": 0.6976802945137024, "learning_rate": 0.0002792435051823933, "loss": 4.7954, "step": 906 }, { "epoch": 1.16, "grad_norm": 0.5799634456634521, "learning_rate": 0.0002792031228967559, "loss": 4.6929, "step": 907 }, { "epoch": 1.16128, "grad_norm": 0.6125800609588623, "learning_rate": 0.00027916274061111856, "loss": 4.6535, "step": 908 }, { "epoch": 1.16256, "grad_norm": 0.6879367828369141, "learning_rate": 0.0002791223583254812, "loss": 4.7521, "step": 909 }, { "epoch": 1.16384, "grad_norm": 0.737657368183136, "learning_rate": 0.0002790819760398438, "loss": 4.7687, "step": 910 }, { "epoch": 1.16512, "grad_norm": 0.6375575065612793, "learning_rate": 0.00027904159375420645, "loss": 4.8383, "step": 911 }, { "epoch": 1.1663999999999999, "grad_norm": 0.5931581258773804, "learning_rate": 0.0002790012114685691, "loss": 4.7189, "step": 912 }, { "epoch": 1.16768, "grad_norm": 0.7057203054428101, "learning_rate": 0.00027896082918293176, "loss": 4.6494, "step": 913 }, { "epoch": 1.16896, "grad_norm": 0.7094667553901672, "learning_rate": 0.0002789204468972944, "loss": 4.7491, "step": 914 }, { "epoch": 1.17024, "grad_norm": 0.6365569233894348, "learning_rate": 0.000278880064611657, "loss": 4.7078, "step": 915 }, { "epoch": 1.1715200000000001, "grad_norm": 0.7238393425941467, "learning_rate": 0.0002788396823260196, "loss": 4.6681, "step": 916 }, { "epoch": 1.1728, "grad_norm": 0.7257333993911743, "learning_rate": 0.0002787993000403822, "loss": 4.759, "step": 917 }, { "epoch": 1.17408, "grad_norm": 0.644520103931427, "learning_rate": 0.0002787589177547449, "loss": 4.7318, "step": 918 }, { "epoch": 1.17536, "grad_norm": 0.6309005618095398, "learning_rate": 0.00027871853546910754, "loss": 4.7544, "step": 919 }, { "epoch": 1.17664, "grad_norm": 0.715380847454071, "learning_rate": 0.00027867815318347017, "loss": 4.7209, "step": 920 }, { "epoch": 1.17792, "grad_norm": 0.7284442186355591, "learning_rate": 0.0002786377708978328, "loss": 4.7191, "step": 921 }, { "epoch": 1.1792, "grad_norm": 0.6391010284423828, "learning_rate": 0.00027859738861219543, "loss": 4.7282, "step": 922 }, { "epoch": 1.18048, "grad_norm": 0.6188117861747742, "learning_rate": 0.00027855700632655806, "loss": 4.6801, "step": 923 }, { "epoch": 1.18176, "grad_norm": 0.6303733587265015, "learning_rate": 0.0002785166240409207, "loss": 4.7246, "step": 924 }, { "epoch": 1.18304, "grad_norm": 0.5999323129653931, "learning_rate": 0.0002784762417552833, "loss": 4.749, "step": 925 }, { "epoch": 1.18432, "grad_norm": 0.6512132883071899, "learning_rate": 0.00027843585946964595, "loss": 4.6749, "step": 926 }, { "epoch": 1.1856, "grad_norm": 0.6530429124832153, "learning_rate": 0.0002783954771840086, "loss": 4.7483, "step": 927 }, { "epoch": 1.18688, "grad_norm": 0.6326019763946533, "learning_rate": 0.0002783550948983712, "loss": 4.6818, "step": 928 }, { "epoch": 1.1881599999999999, "grad_norm": 0.6044508218765259, "learning_rate": 0.00027831471261273384, "loss": 4.737, "step": 929 }, { "epoch": 1.18944, "grad_norm": 0.6268877983093262, "learning_rate": 0.00027827433032709647, "loss": 4.6278, "step": 930 }, { "epoch": 1.19072, "grad_norm": 0.6803573966026306, "learning_rate": 0.00027823394804145916, "loss": 4.6907, "step": 931 }, { "epoch": 1.192, "grad_norm": 0.7021913528442383, "learning_rate": 0.00027819356575582173, "loss": 4.683, "step": 932 }, { "epoch": 1.1932800000000001, "grad_norm": 0.6417155861854553, "learning_rate": 0.00027815318347018436, "loss": 4.7198, "step": 933 }, { "epoch": 1.19456, "grad_norm": 0.6091512441635132, "learning_rate": 0.000278112801184547, "loss": 4.694, "step": 934 }, { "epoch": 1.19584, "grad_norm": 0.7351169586181641, "learning_rate": 0.0002780724188989097, "loss": 4.697, "step": 935 }, { "epoch": 1.19712, "grad_norm": 0.7421384453773499, "learning_rate": 0.0002780320366132723, "loss": 4.6373, "step": 936 }, { "epoch": 1.1984, "grad_norm": 0.745575487613678, "learning_rate": 0.00027799165432763494, "loss": 4.6504, "step": 937 }, { "epoch": 1.19968, "grad_norm": 0.650227963924408, "learning_rate": 0.00027795127204199757, "loss": 4.6666, "step": 938 }, { "epoch": 1.20096, "grad_norm": 0.6333515048027039, "learning_rate": 0.0002779108897563602, "loss": 4.6865, "step": 939 }, { "epoch": 1.20224, "grad_norm": 0.6844230890274048, "learning_rate": 0.00027787050747072283, "loss": 4.6709, "step": 940 }, { "epoch": 1.20352, "grad_norm": 0.6432576775550842, "learning_rate": 0.00027783012518508546, "loss": 4.7449, "step": 941 }, { "epoch": 1.2048, "grad_norm": 0.7197453379631042, "learning_rate": 0.0002777897428994481, "loss": 4.6855, "step": 942 }, { "epoch": 1.20608, "grad_norm": 0.8134915232658386, "learning_rate": 0.0002777493606138107, "loss": 4.7429, "step": 943 }, { "epoch": 1.20736, "grad_norm": 0.7857049107551575, "learning_rate": 0.00027770897832817335, "loss": 4.7009, "step": 944 }, { "epoch": 1.20864, "grad_norm": 0.6796789765357971, "learning_rate": 0.000277668596042536, "loss": 4.7261, "step": 945 }, { "epoch": 1.2099199999999999, "grad_norm": 0.7167194485664368, "learning_rate": 0.0002776282137568986, "loss": 4.6099, "step": 946 }, { "epoch": 1.2112, "grad_norm": 0.7006614804267883, "learning_rate": 0.00027758783147126124, "loss": 4.6235, "step": 947 }, { "epoch": 1.21248, "grad_norm": 0.7764577865600586, "learning_rate": 0.00027754744918562387, "loss": 4.667, "step": 948 }, { "epoch": 1.21376, "grad_norm": 0.6722946166992188, "learning_rate": 0.0002775070668999865, "loss": 4.7208, "step": 949 }, { "epoch": 1.2150400000000001, "grad_norm": 0.6938252449035645, "learning_rate": 0.00027746668461434913, "loss": 4.6394, "step": 950 }, { "epoch": 1.21632, "grad_norm": 0.7050387859344482, "learning_rate": 0.00027742630232871176, "loss": 4.6333, "step": 951 }, { "epoch": 1.2176, "grad_norm": 0.7311280965805054, "learning_rate": 0.00027738592004307444, "loss": 4.6614, "step": 952 }, { "epoch": 1.21888, "grad_norm": 0.6674593091011047, "learning_rate": 0.00027734553775743707, "loss": 4.739, "step": 953 }, { "epoch": 1.22016, "grad_norm": 0.698934018611908, "learning_rate": 0.0002773051554717997, "loss": 4.6763, "step": 954 }, { "epoch": 1.22144, "grad_norm": 0.7208249568939209, "learning_rate": 0.0002772647731861623, "loss": 4.6478, "step": 955 }, { "epoch": 1.22272, "grad_norm": 0.6473603248596191, "learning_rate": 0.00027722439090052496, "loss": 4.6642, "step": 956 }, { "epoch": 1.224, "grad_norm": 0.7811574339866638, "learning_rate": 0.0002771840086148876, "loss": 4.7188, "step": 957 }, { "epoch": 1.22528, "grad_norm": 0.7508848309516907, "learning_rate": 0.0002771436263292502, "loss": 4.6873, "step": 958 }, { "epoch": 1.22656, "grad_norm": 0.6528134942054749, "learning_rate": 0.00027710324404361285, "loss": 4.6341, "step": 959 }, { "epoch": 1.22784, "grad_norm": 0.7064421772956848, "learning_rate": 0.0002770628617579755, "loss": 4.6409, "step": 960 }, { "epoch": 1.22912, "grad_norm": 0.7718419432640076, "learning_rate": 0.0002770224794723381, "loss": 4.7507, "step": 961 }, { "epoch": 1.2304, "grad_norm": 0.6676574945449829, "learning_rate": 0.00027698209718670074, "loss": 4.6491, "step": 962 }, { "epoch": 1.2316799999999999, "grad_norm": 0.7909667491912842, "learning_rate": 0.0002769417149010634, "loss": 4.6478, "step": 963 }, { "epoch": 1.23296, "grad_norm": 0.8206857442855835, "learning_rate": 0.000276901332615426, "loss": 4.6306, "step": 964 }, { "epoch": 1.23424, "grad_norm": 0.6872743964195251, "learning_rate": 0.00027686095032978863, "loss": 4.6992, "step": 965 }, { "epoch": 1.23552, "grad_norm": 0.659874439239502, "learning_rate": 0.00027682056804415126, "loss": 4.6612, "step": 966 }, { "epoch": 1.2368000000000001, "grad_norm": 0.7639236450195312, "learning_rate": 0.0002767801857585139, "loss": 4.6523, "step": 967 }, { "epoch": 1.23808, "grad_norm": 0.6640061140060425, "learning_rate": 0.0002767398034728765, "loss": 4.6604, "step": 968 }, { "epoch": 1.23936, "grad_norm": 0.6309702396392822, "learning_rate": 0.0002766994211872392, "loss": 4.6036, "step": 969 }, { "epoch": 1.24064, "grad_norm": 0.6865962147712708, "learning_rate": 0.00027665903890160184, "loss": 4.6821, "step": 970 }, { "epoch": 1.24192, "grad_norm": 0.7177154421806335, "learning_rate": 0.0002766186566159644, "loss": 4.5896, "step": 971 }, { "epoch": 1.2432, "grad_norm": 0.7251400351524353, "learning_rate": 0.00027657827433032704, "loss": 4.567, "step": 972 }, { "epoch": 1.24448, "grad_norm": 0.6386240124702454, "learning_rate": 0.0002765378920446897, "loss": 4.5937, "step": 973 }, { "epoch": 1.24576, "grad_norm": 0.666797935962677, "learning_rate": 0.00027649750975905236, "loss": 4.6854, "step": 974 }, { "epoch": 1.24704, "grad_norm": 0.6357764601707458, "learning_rate": 0.000276457127473415, "loss": 4.623, "step": 975 }, { "epoch": 1.24832, "grad_norm": 0.6851757168769836, "learning_rate": 0.0002764167451877776, "loss": 4.6517, "step": 976 }, { "epoch": 1.2496, "grad_norm": 0.6265616416931152, "learning_rate": 0.00027637636290214025, "loss": 4.6597, "step": 977 }, { "epoch": 1.25088, "grad_norm": 0.698387861251831, "learning_rate": 0.0002763359806165029, "loss": 4.5611, "step": 978 }, { "epoch": 1.25216, "grad_norm": 0.6262152194976807, "learning_rate": 0.0002762955983308655, "loss": 4.6248, "step": 979 }, { "epoch": 1.2534399999999999, "grad_norm": 0.675564706325531, "learning_rate": 0.00027625521604522814, "loss": 4.6137, "step": 980 }, { "epoch": 1.25472, "grad_norm": 0.5722922682762146, "learning_rate": 0.00027621483375959077, "loss": 4.6744, "step": 981 }, { "epoch": 1.256, "grad_norm": 0.6412468552589417, "learning_rate": 0.0002761744514739534, "loss": 4.5666, "step": 982 }, { "epoch": 1.25728, "grad_norm": 0.7287204265594482, "learning_rate": 0.00027613406918831603, "loss": 4.6355, "step": 983 }, { "epoch": 1.2585600000000001, "grad_norm": 0.742250382900238, "learning_rate": 0.00027609368690267866, "loss": 4.667, "step": 984 }, { "epoch": 1.25984, "grad_norm": 0.6279268860816956, "learning_rate": 0.0002760533046170413, "loss": 4.5517, "step": 985 }, { "epoch": 1.26112, "grad_norm": 0.6083707213401794, "learning_rate": 0.0002760129223314039, "loss": 4.5803, "step": 986 }, { "epoch": 1.2624, "grad_norm": 0.6249653697013855, "learning_rate": 0.0002759725400457666, "loss": 4.6421, "step": 987 }, { "epoch": 1.26368, "grad_norm": 0.5808601975440979, "learning_rate": 0.0002759321577601292, "loss": 4.6447, "step": 988 }, { "epoch": 1.2649599999999999, "grad_norm": 0.6520289778709412, "learning_rate": 0.0002758917754744918, "loss": 4.7274, "step": 989 }, { "epoch": 1.26624, "grad_norm": 0.7377476692199707, "learning_rate": 0.00027585139318885444, "loss": 4.625, "step": 990 }, { "epoch": 1.26752, "grad_norm": 0.6483809947967529, "learning_rate": 0.0002758110109032171, "loss": 4.6238, "step": 991 }, { "epoch": 1.2688, "grad_norm": 0.625088632106781, "learning_rate": 0.00027577062861757975, "loss": 4.6473, "step": 992 }, { "epoch": 1.27008, "grad_norm": 0.6513147950172424, "learning_rate": 0.0002757302463319424, "loss": 4.5971, "step": 993 }, { "epoch": 1.27136, "grad_norm": 0.6792516112327576, "learning_rate": 0.000275689864046305, "loss": 4.6855, "step": 994 }, { "epoch": 1.27264, "grad_norm": 0.6334472894668579, "learning_rate": 0.00027564948176066764, "loss": 4.6, "step": 995 }, { "epoch": 1.27392, "grad_norm": 0.6187386512756348, "learning_rate": 0.0002756090994750303, "loss": 4.6733, "step": 996 }, { "epoch": 1.2752, "grad_norm": 0.6294391751289368, "learning_rate": 0.0002755687171893929, "loss": 4.6067, "step": 997 }, { "epoch": 1.27648, "grad_norm": 0.6575160026550293, "learning_rate": 0.00027552833490375553, "loss": 4.662, "step": 998 }, { "epoch": 1.27776, "grad_norm": 0.6640904545783997, "learning_rate": 0.00027548795261811816, "loss": 4.6074, "step": 999 }, { "epoch": 1.27904, "grad_norm": 0.630664050579071, "learning_rate": 0.0002754475703324808, "loss": 4.6227, "step": 1000 }, { "epoch": 1.2803200000000001, "grad_norm": 0.7079930901527405, "learning_rate": 0.0002754071880468434, "loss": 4.6341, "step": 1001 }, { "epoch": 1.2816, "grad_norm": 0.7556226253509521, "learning_rate": 0.00027536680576120605, "loss": 4.6352, "step": 1002 }, { "epoch": 1.28288, "grad_norm": 0.5955154895782471, "learning_rate": 0.0002753264234755687, "loss": 4.5975, "step": 1003 }, { "epoch": 1.28416, "grad_norm": 0.7106191515922546, "learning_rate": 0.0002752860411899313, "loss": 4.6302, "step": 1004 }, { "epoch": 1.28544, "grad_norm": 0.7254298329353333, "learning_rate": 0.00027524565890429394, "loss": 4.6382, "step": 1005 }, { "epoch": 1.2867199999999999, "grad_norm": 0.6350436210632324, "learning_rate": 0.0002752052766186566, "loss": 4.6083, "step": 1006 }, { "epoch": 1.288, "grad_norm": 0.7333962321281433, "learning_rate": 0.0002751648943330192, "loss": 4.6096, "step": 1007 }, { "epoch": 1.28928, "grad_norm": 0.5859498977661133, "learning_rate": 0.0002751245120473819, "loss": 4.6632, "step": 1008 }, { "epoch": 1.29056, "grad_norm": 0.7196768522262573, "learning_rate": 0.0002750841297617445, "loss": 4.7089, "step": 1009 }, { "epoch": 1.29184, "grad_norm": 0.6264676451683044, "learning_rate": 0.00027504374747610715, "loss": 4.609, "step": 1010 }, { "epoch": 1.29312, "grad_norm": 0.6764931082725525, "learning_rate": 0.0002750033651904697, "loss": 4.5912, "step": 1011 }, { "epoch": 1.2944, "grad_norm": 0.6790111064910889, "learning_rate": 0.00027496298290483236, "loss": 4.6433, "step": 1012 }, { "epoch": 1.29568, "grad_norm": 0.6634168028831482, "learning_rate": 0.00027492260061919504, "loss": 4.6097, "step": 1013 }, { "epoch": 1.29696, "grad_norm": 0.63548344373703, "learning_rate": 0.00027488221833355767, "loss": 4.5944, "step": 1014 }, { "epoch": 1.29824, "grad_norm": 0.6369843482971191, "learning_rate": 0.0002748418360479203, "loss": 4.5632, "step": 1015 }, { "epoch": 1.29952, "grad_norm": 0.6853580474853516, "learning_rate": 0.00027480145376228293, "loss": 4.6141, "step": 1016 }, { "epoch": 1.3008, "grad_norm": 0.648949921131134, "learning_rate": 0.00027476107147664556, "loss": 4.5685, "step": 1017 }, { "epoch": 1.3020800000000001, "grad_norm": 0.5883113145828247, "learning_rate": 0.0002747206891910082, "loss": 4.6188, "step": 1018 }, { "epoch": 1.30336, "grad_norm": 0.6754709482192993, "learning_rate": 0.0002746803069053708, "loss": 4.6567, "step": 1019 }, { "epoch": 1.30464, "grad_norm": 0.6051660776138306, "learning_rate": 0.00027463992461973345, "loss": 4.6101, "step": 1020 }, { "epoch": 1.30592, "grad_norm": 0.609302282333374, "learning_rate": 0.0002745995423340961, "loss": 4.5448, "step": 1021 }, { "epoch": 1.3072, "grad_norm": 0.6241468191146851, "learning_rate": 0.0002745591600484587, "loss": 4.6168, "step": 1022 }, { "epoch": 1.3084799999999999, "grad_norm": 0.6044094562530518, "learning_rate": 0.00027451877776282134, "loss": 4.6363, "step": 1023 }, { "epoch": 1.30976, "grad_norm": 0.6301230788230896, "learning_rate": 0.00027447839547718397, "loss": 4.5699, "step": 1024 }, { "epoch": 1.31104, "grad_norm": 0.6775204539299011, "learning_rate": 0.0002744380131915466, "loss": 4.582, "step": 1025 }, { "epoch": 1.31232, "grad_norm": 0.631717324256897, "learning_rate": 0.0002743976309059093, "loss": 4.559, "step": 1026 }, { "epoch": 1.3136, "grad_norm": 0.631300687789917, "learning_rate": 0.00027435724862027186, "loss": 4.5394, "step": 1027 }, { "epoch": 1.31488, "grad_norm": 0.6998112201690674, "learning_rate": 0.0002743168663346345, "loss": 4.5337, "step": 1028 }, { "epoch": 1.31616, "grad_norm": 0.6501991748809814, "learning_rate": 0.0002742764840489971, "loss": 4.5545, "step": 1029 }, { "epoch": 1.31744, "grad_norm": 0.6726256608963013, "learning_rate": 0.0002742361017633598, "loss": 4.5374, "step": 1030 }, { "epoch": 1.31872, "grad_norm": 0.6180437803268433, "learning_rate": 0.00027419571947772244, "loss": 4.5881, "step": 1031 }, { "epoch": 1.32, "grad_norm": 0.6200023889541626, "learning_rate": 0.00027415533719208507, "loss": 4.5298, "step": 1032 }, { "epoch": 1.32128, "grad_norm": 0.6141354441642761, "learning_rate": 0.0002741149549064477, "loss": 4.4838, "step": 1033 }, { "epoch": 1.32256, "grad_norm": 0.600661039352417, "learning_rate": 0.0002740745726208103, "loss": 4.5947, "step": 1034 }, { "epoch": 1.3238400000000001, "grad_norm": 0.7037731409072876, "learning_rate": 0.00027403419033517296, "loss": 4.544, "step": 1035 }, { "epoch": 1.32512, "grad_norm": 0.6471102237701416, "learning_rate": 0.0002739938080495356, "loss": 4.5557, "step": 1036 }, { "epoch": 1.3264, "grad_norm": 0.6341505646705627, "learning_rate": 0.0002739534257638982, "loss": 4.662, "step": 1037 }, { "epoch": 1.32768, "grad_norm": 0.660749614238739, "learning_rate": 0.00027391304347826085, "loss": 4.628, "step": 1038 }, { "epoch": 1.32896, "grad_norm": 0.6301653385162354, "learning_rate": 0.0002738726611926235, "loss": 4.5595, "step": 1039 }, { "epoch": 1.3302399999999999, "grad_norm": 0.6750319004058838, "learning_rate": 0.0002738322789069861, "loss": 4.6398, "step": 1040 }, { "epoch": 1.33152, "grad_norm": 0.6259351372718811, "learning_rate": 0.00027379189662134874, "loss": 4.6368, "step": 1041 }, { "epoch": 1.3328, "grad_norm": 0.6357682347297668, "learning_rate": 0.00027375151433571137, "loss": 4.5332, "step": 1042 }, { "epoch": 1.33408, "grad_norm": 0.6520000100135803, "learning_rate": 0.000273711132050074, "loss": 4.6516, "step": 1043 }, { "epoch": 1.33536, "grad_norm": 0.7159348130226135, "learning_rate": 0.0002736707497644366, "loss": 4.5656, "step": 1044 }, { "epoch": 1.33664, "grad_norm": 0.7513425946235657, "learning_rate": 0.00027363036747879926, "loss": 4.6232, "step": 1045 }, { "epoch": 1.33792, "grad_norm": 0.7019796967506409, "learning_rate": 0.0002735899851931619, "loss": 4.516, "step": 1046 }, { "epoch": 1.3392, "grad_norm": 0.5741701722145081, "learning_rate": 0.00027354960290752457, "loss": 4.5341, "step": 1047 }, { "epoch": 1.34048, "grad_norm": 0.6854413151741028, "learning_rate": 0.0002735092206218872, "loss": 4.5648, "step": 1048 }, { "epoch": 1.34176, "grad_norm": 0.599707305431366, "learning_rate": 0.00027346883833624983, "loss": 4.5988, "step": 1049 }, { "epoch": 1.34304, "grad_norm": 0.6394950151443481, "learning_rate": 0.0002734284560506124, "loss": 4.5769, "step": 1050 }, { "epoch": 1.34432, "grad_norm": 0.5611634850502014, "learning_rate": 0.00027338807376497504, "loss": 4.5538, "step": 1051 }, { "epoch": 1.3456000000000001, "grad_norm": 0.6633642315864563, "learning_rate": 0.0002733476914793377, "loss": 4.4661, "step": 1052 }, { "epoch": 1.34688, "grad_norm": 0.6434425711631775, "learning_rate": 0.00027330730919370035, "loss": 4.5914, "step": 1053 }, { "epoch": 1.34816, "grad_norm": 0.6126766204833984, "learning_rate": 0.000273266926908063, "loss": 4.5852, "step": 1054 }, { "epoch": 1.34944, "grad_norm": 0.6653432250022888, "learning_rate": 0.0002732265446224256, "loss": 4.603, "step": 1055 }, { "epoch": 1.35072, "grad_norm": 0.7112525701522827, "learning_rate": 0.00027318616233678824, "loss": 4.5842, "step": 1056 }, { "epoch": 1.3519999999999999, "grad_norm": 0.6654365062713623, "learning_rate": 0.00027314578005115087, "loss": 4.5841, "step": 1057 }, { "epoch": 1.35328, "grad_norm": 0.6465290784835815, "learning_rate": 0.0002731053977655135, "loss": 4.5549, "step": 1058 }, { "epoch": 1.35456, "grad_norm": 0.6904200911521912, "learning_rate": 0.00027306501547987613, "loss": 4.6345, "step": 1059 }, { "epoch": 1.35584, "grad_norm": 0.5807105302810669, "learning_rate": 0.00027302463319423876, "loss": 4.4827, "step": 1060 }, { "epoch": 1.35712, "grad_norm": 0.6408985257148743, "learning_rate": 0.0002729842509086014, "loss": 4.5378, "step": 1061 }, { "epoch": 1.3584, "grad_norm": 0.5737316608428955, "learning_rate": 0.000272943868622964, "loss": 4.5322, "step": 1062 }, { "epoch": 1.35968, "grad_norm": 0.6886394023895264, "learning_rate": 0.00027290348633732665, "loss": 4.5859, "step": 1063 }, { "epoch": 1.36096, "grad_norm": 0.6456865072250366, "learning_rate": 0.0002728631040516893, "loss": 4.6009, "step": 1064 }, { "epoch": 1.36224, "grad_norm": 0.6285141110420227, "learning_rate": 0.00027282272176605197, "loss": 4.6286, "step": 1065 }, { "epoch": 1.36352, "grad_norm": 0.5843459963798523, "learning_rate": 0.00027278233948041454, "loss": 4.5497, "step": 1066 }, { "epoch": 1.3648, "grad_norm": 0.6061887145042419, "learning_rate": 0.00027274195719477717, "loss": 4.6066, "step": 1067 }, { "epoch": 1.36608, "grad_norm": 0.668018639087677, "learning_rate": 0.0002727015749091398, "loss": 4.5314, "step": 1068 }, { "epoch": 1.3673600000000001, "grad_norm": 0.6002050042152405, "learning_rate": 0.0002726611926235025, "loss": 4.5271, "step": 1069 }, { "epoch": 1.36864, "grad_norm": 0.6185527443885803, "learning_rate": 0.0002726208103378651, "loss": 4.5401, "step": 1070 }, { "epoch": 1.36992, "grad_norm": 0.6066594123840332, "learning_rate": 0.00027258042805222775, "loss": 4.5988, "step": 1071 }, { "epoch": 1.3712, "grad_norm": 0.6096265912055969, "learning_rate": 0.0002725400457665904, "loss": 4.5043, "step": 1072 }, { "epoch": 1.37248, "grad_norm": 0.6521106958389282, "learning_rate": 0.000272499663480953, "loss": 4.5546, "step": 1073 }, { "epoch": 1.3737599999999999, "grad_norm": 0.582575798034668, "learning_rate": 0.00027245928119531564, "loss": 4.4859, "step": 1074 }, { "epoch": 1.37504, "grad_norm": 0.5959002375602722, "learning_rate": 0.00027241889890967827, "loss": 4.5375, "step": 1075 }, { "epoch": 1.37632, "grad_norm": 0.7106321454048157, "learning_rate": 0.0002723785166240409, "loss": 4.53, "step": 1076 }, { "epoch": 1.3776, "grad_norm": 0.648299515247345, "learning_rate": 0.0002723381343384035, "loss": 4.6495, "step": 1077 }, { "epoch": 1.37888, "grad_norm": 0.7060708403587341, "learning_rate": 0.00027229775205276616, "loss": 4.5456, "step": 1078 }, { "epoch": 1.38016, "grad_norm": 0.655817985534668, "learning_rate": 0.0002722573697671288, "loss": 4.5239, "step": 1079 }, { "epoch": 1.38144, "grad_norm": 0.625787079334259, "learning_rate": 0.0002722169874814914, "loss": 4.5517, "step": 1080 }, { "epoch": 1.38272, "grad_norm": 0.6610313057899475, "learning_rate": 0.00027217660519585405, "loss": 4.4966, "step": 1081 }, { "epoch": 1.384, "grad_norm": 0.6236038208007812, "learning_rate": 0.00027213622291021673, "loss": 4.5368, "step": 1082 }, { "epoch": 1.38528, "grad_norm": 0.7284337878227234, "learning_rate": 0.0002720958406245793, "loss": 4.5835, "step": 1083 }, { "epoch": 1.38656, "grad_norm": 0.6679523587226868, "learning_rate": 0.00027205545833894194, "loss": 4.4663, "step": 1084 }, { "epoch": 1.38784, "grad_norm": 0.6354249715805054, "learning_rate": 0.00027201507605330457, "loss": 4.4995, "step": 1085 }, { "epoch": 1.3891200000000001, "grad_norm": 0.5772947669029236, "learning_rate": 0.00027197469376766725, "loss": 4.4817, "step": 1086 }, { "epoch": 1.3904, "grad_norm": 0.6011262536048889, "learning_rate": 0.0002719343114820299, "loss": 4.5965, "step": 1087 }, { "epoch": 1.39168, "grad_norm": 0.6319146752357483, "learning_rate": 0.0002718939291963925, "loss": 4.4057, "step": 1088 }, { "epoch": 1.39296, "grad_norm": 0.6585641503334045, "learning_rate": 0.00027185354691075514, "loss": 4.4783, "step": 1089 }, { "epoch": 1.39424, "grad_norm": 0.6011718511581421, "learning_rate": 0.00027181316462511777, "loss": 4.5192, "step": 1090 }, { "epoch": 1.3955199999999999, "grad_norm": 0.6210798621177673, "learning_rate": 0.0002717727823394804, "loss": 4.5047, "step": 1091 }, { "epoch": 1.3968, "grad_norm": 0.591758131980896, "learning_rate": 0.00027173240005384303, "loss": 4.5768, "step": 1092 }, { "epoch": 1.39808, "grad_norm": 0.636916995048523, "learning_rate": 0.00027169201776820566, "loss": 4.5244, "step": 1093 }, { "epoch": 1.39936, "grad_norm": 0.697331428527832, "learning_rate": 0.0002716516354825683, "loss": 4.5199, "step": 1094 }, { "epoch": 1.40064, "grad_norm": 0.6302866339683533, "learning_rate": 0.0002716112531969309, "loss": 4.5211, "step": 1095 }, { "epoch": 1.40192, "grad_norm": 0.7289062738418579, "learning_rate": 0.00027157087091129355, "loss": 4.4961, "step": 1096 }, { "epoch": 1.4032, "grad_norm": 0.6740280985832214, "learning_rate": 0.0002715304886256562, "loss": 4.5848, "step": 1097 }, { "epoch": 1.40448, "grad_norm": 0.6113433241844177, "learning_rate": 0.0002714901063400188, "loss": 4.482, "step": 1098 }, { "epoch": 1.40576, "grad_norm": 0.6321492195129395, "learning_rate": 0.00027144972405438144, "loss": 4.463, "step": 1099 }, { "epoch": 1.40704, "grad_norm": 0.6910666227340698, "learning_rate": 0.00027140934176874407, "loss": 4.53, "step": 1100 }, { "epoch": 1.40832, "grad_norm": 0.6177242994308472, "learning_rate": 0.0002713689594831067, "loss": 4.5152, "step": 1101 }, { "epoch": 1.4096, "grad_norm": 0.6206321716308594, "learning_rate": 0.00027132857719746933, "loss": 4.5393, "step": 1102 }, { "epoch": 1.4108800000000001, "grad_norm": 0.6209645867347717, "learning_rate": 0.000271288194911832, "loss": 4.4624, "step": 1103 }, { "epoch": 1.41216, "grad_norm": 0.7263031601905823, "learning_rate": 0.00027124781262619465, "loss": 4.5301, "step": 1104 }, { "epoch": 1.41344, "grad_norm": 0.6254947781562805, "learning_rate": 0.0002712074303405573, "loss": 4.5234, "step": 1105 }, { "epoch": 1.41472, "grad_norm": 0.6119322180747986, "learning_rate": 0.00027116704805491985, "loss": 4.4959, "step": 1106 }, { "epoch": 1.416, "grad_norm": 0.6006280183792114, "learning_rate": 0.0002711266657692825, "loss": 4.5361, "step": 1107 }, { "epoch": 1.4172799999999999, "grad_norm": 0.6143077611923218, "learning_rate": 0.00027108628348364517, "loss": 4.4948, "step": 1108 }, { "epoch": 1.41856, "grad_norm": 0.6156766414642334, "learning_rate": 0.0002710459011980078, "loss": 4.502, "step": 1109 }, { "epoch": 1.41984, "grad_norm": 0.6217614412307739, "learning_rate": 0.00027100551891237043, "loss": 4.4685, "step": 1110 }, { "epoch": 1.42112, "grad_norm": 0.592921793460846, "learning_rate": 0.00027096513662673306, "loss": 4.5341, "step": 1111 }, { "epoch": 1.4224, "grad_norm": 0.5901917815208435, "learning_rate": 0.0002709247543410957, "loss": 4.4953, "step": 1112 }, { "epoch": 1.42368, "grad_norm": 0.7119625210762024, "learning_rate": 0.0002708843720554583, "loss": 4.5269, "step": 1113 }, { "epoch": 1.42496, "grad_norm": 0.7488095164299011, "learning_rate": 0.00027084398976982095, "loss": 4.5553, "step": 1114 }, { "epoch": 1.42624, "grad_norm": 0.6430398225784302, "learning_rate": 0.0002708036074841836, "loss": 4.524, "step": 1115 }, { "epoch": 1.42752, "grad_norm": 0.6690952777862549, "learning_rate": 0.0002707632251985462, "loss": 4.5503, "step": 1116 }, { "epoch": 1.4288, "grad_norm": 0.7325178980827332, "learning_rate": 0.00027072284291290884, "loss": 4.4675, "step": 1117 }, { "epoch": 1.43008, "grad_norm": 0.6277335286140442, "learning_rate": 0.00027068246062727147, "loss": 4.5683, "step": 1118 }, { "epoch": 1.43136, "grad_norm": 0.6613750457763672, "learning_rate": 0.0002706420783416341, "loss": 4.5421, "step": 1119 }, { "epoch": 1.4326400000000001, "grad_norm": 0.6405130624771118, "learning_rate": 0.00027060169605599673, "loss": 4.4973, "step": 1120 }, { "epoch": 1.43392, "grad_norm": 0.7388433218002319, "learning_rate": 0.0002705613137703594, "loss": 4.4663, "step": 1121 }, { "epoch": 1.4352, "grad_norm": 0.7195430397987366, "learning_rate": 0.000270520931484722, "loss": 4.4404, "step": 1122 }, { "epoch": 1.43648, "grad_norm": 0.5988063812255859, "learning_rate": 0.0002704805491990846, "loss": 4.4924, "step": 1123 }, { "epoch": 1.43776, "grad_norm": 0.6605175733566284, "learning_rate": 0.00027044016691344725, "loss": 4.4393, "step": 1124 }, { "epoch": 1.4390399999999999, "grad_norm": 0.6142287850379944, "learning_rate": 0.00027039978462780993, "loss": 4.4968, "step": 1125 }, { "epoch": 1.44032, "grad_norm": 0.6107270121574402, "learning_rate": 0.00027035940234217256, "loss": 4.5411, "step": 1126 }, { "epoch": 1.4416, "grad_norm": 0.5510809421539307, "learning_rate": 0.0002703190200565352, "loss": 4.5815, "step": 1127 }, { "epoch": 1.44288, "grad_norm": 0.6231082081794739, "learning_rate": 0.0002702786377708978, "loss": 4.4505, "step": 1128 }, { "epoch": 1.44416, "grad_norm": 0.6307341456413269, "learning_rate": 0.00027023825548526045, "loss": 4.5774, "step": 1129 }, { "epoch": 1.44544, "grad_norm": 0.5655580759048462, "learning_rate": 0.0002701978731996231, "loss": 4.451, "step": 1130 }, { "epoch": 1.44672, "grad_norm": 0.6464011669158936, "learning_rate": 0.0002701574909139857, "loss": 4.446, "step": 1131 }, { "epoch": 1.448, "grad_norm": 0.6590990424156189, "learning_rate": 0.00027011710862834834, "loss": 4.5092, "step": 1132 }, { "epoch": 1.44928, "grad_norm": 0.5793240666389465, "learning_rate": 0.000270076726342711, "loss": 4.4777, "step": 1133 }, { "epoch": 1.45056, "grad_norm": 0.6022570729255676, "learning_rate": 0.0002700363440570736, "loss": 4.4337, "step": 1134 }, { "epoch": 1.45184, "grad_norm": 0.6188995838165283, "learning_rate": 0.00026999596177143623, "loss": 4.5174, "step": 1135 }, { "epoch": 1.45312, "grad_norm": 0.6144684553146362, "learning_rate": 0.00026995557948579886, "loss": 4.4565, "step": 1136 }, { "epoch": 1.4544000000000001, "grad_norm": 0.6763806939125061, "learning_rate": 0.0002699151972001615, "loss": 4.4995, "step": 1137 }, { "epoch": 1.45568, "grad_norm": 0.7390356063842773, "learning_rate": 0.0002698748149145241, "loss": 4.5674, "step": 1138 }, { "epoch": 1.45696, "grad_norm": 0.6634504199028015, "learning_rate": 0.00026983443262888675, "loss": 4.5301, "step": 1139 }, { "epoch": 1.45824, "grad_norm": 0.6377415060997009, "learning_rate": 0.0002697940503432494, "loss": 4.4919, "step": 1140 }, { "epoch": 1.45952, "grad_norm": 0.6241118907928467, "learning_rate": 0.000269753668057612, "loss": 4.5093, "step": 1141 }, { "epoch": 1.4607999999999999, "grad_norm": 0.676541805267334, "learning_rate": 0.0002697132857719747, "loss": 4.5714, "step": 1142 }, { "epoch": 1.46208, "grad_norm": 0.6231991648674011, "learning_rate": 0.00026967290348633733, "loss": 4.494, "step": 1143 }, { "epoch": 1.46336, "grad_norm": 0.6391655802726746, "learning_rate": 0.00026963252120069996, "loss": 4.5113, "step": 1144 }, { "epoch": 1.46464, "grad_norm": 0.6239742636680603, "learning_rate": 0.00026959213891506253, "loss": 4.4657, "step": 1145 }, { "epoch": 1.4659200000000001, "grad_norm": 0.6046785116195679, "learning_rate": 0.00026955175662942516, "loss": 4.5132, "step": 1146 }, { "epoch": 1.4672, "grad_norm": 0.663732647895813, "learning_rate": 0.00026951137434378785, "loss": 4.5261, "step": 1147 }, { "epoch": 1.46848, "grad_norm": 0.6262268424034119, "learning_rate": 0.0002694709920581505, "loss": 4.5257, "step": 1148 }, { "epoch": 1.46976, "grad_norm": 0.552179217338562, "learning_rate": 0.0002694306097725131, "loss": 4.5232, "step": 1149 }, { "epoch": 1.47104, "grad_norm": 0.6553817987442017, "learning_rate": 0.00026939022748687574, "loss": 4.442, "step": 1150 }, { "epoch": 1.47232, "grad_norm": 0.6651473641395569, "learning_rate": 0.00026934984520123837, "loss": 4.5369, "step": 1151 }, { "epoch": 1.4736, "grad_norm": 0.6147942543029785, "learning_rate": 0.000269309462915601, "loss": 4.5399, "step": 1152 }, { "epoch": 1.47488, "grad_norm": 0.662200927734375, "learning_rate": 0.00026926908062996363, "loss": 4.5464, "step": 1153 }, { "epoch": 1.4761600000000001, "grad_norm": 0.6553072929382324, "learning_rate": 0.00026922869834432626, "loss": 4.5623, "step": 1154 }, { "epoch": 1.47744, "grad_norm": 0.5930311679840088, "learning_rate": 0.0002691883160586889, "loss": 4.4286, "step": 1155 }, { "epoch": 1.47872, "grad_norm": 0.6711944341659546, "learning_rate": 0.0002691479337730515, "loss": 4.4298, "step": 1156 }, { "epoch": 1.48, "grad_norm": 0.5974631905555725, "learning_rate": 0.00026910755148741415, "loss": 4.4365, "step": 1157 }, { "epoch": 1.48128, "grad_norm": 0.6224148273468018, "learning_rate": 0.0002690671692017768, "loss": 4.4888, "step": 1158 }, { "epoch": 1.4825599999999999, "grad_norm": 0.5823886394500732, "learning_rate": 0.0002690267869161394, "loss": 4.48, "step": 1159 }, { "epoch": 1.48384, "grad_norm": 0.6274778842926025, "learning_rate": 0.0002689864046305021, "loss": 4.5036, "step": 1160 }, { "epoch": 1.48512, "grad_norm": 0.6259602904319763, "learning_rate": 0.0002689460223448647, "loss": 4.5225, "step": 1161 }, { "epoch": 1.4864, "grad_norm": 0.5894110798835754, "learning_rate": 0.0002689056400592273, "loss": 4.5004, "step": 1162 }, { "epoch": 1.4876800000000001, "grad_norm": 0.6239898204803467, "learning_rate": 0.00026886525777358993, "loss": 4.5111, "step": 1163 }, { "epoch": 1.48896, "grad_norm": 0.6786986589431763, "learning_rate": 0.0002688248754879526, "loss": 4.5008, "step": 1164 }, { "epoch": 1.49024, "grad_norm": 0.5871908664703369, "learning_rate": 0.00026878449320231524, "loss": 4.5222, "step": 1165 }, { "epoch": 1.49152, "grad_norm": 0.7175498604774475, "learning_rate": 0.0002687441109166779, "loss": 4.5113, "step": 1166 }, { "epoch": 1.4928, "grad_norm": 0.6610713005065918, "learning_rate": 0.0002687037286310405, "loss": 4.489, "step": 1167 }, { "epoch": 1.49408, "grad_norm": 0.6492409110069275, "learning_rate": 0.00026866334634540313, "loss": 4.5002, "step": 1168 }, { "epoch": 1.49536, "grad_norm": 0.637315034866333, "learning_rate": 0.00026862296405976576, "loss": 4.503, "step": 1169 }, { "epoch": 1.49664, "grad_norm": 0.6214714646339417, "learning_rate": 0.0002685825817741284, "loss": 4.5694, "step": 1170 }, { "epoch": 1.49792, "grad_norm": 0.6326239109039307, "learning_rate": 0.000268542199488491, "loss": 4.4157, "step": 1171 }, { "epoch": 1.4992, "grad_norm": 0.726486086845398, "learning_rate": 0.00026850181720285366, "loss": 4.445, "step": 1172 }, { "epoch": 1.50048, "grad_norm": 0.5670982599258423, "learning_rate": 0.0002684614349172163, "loss": 4.5223, "step": 1173 }, { "epoch": 1.50176, "grad_norm": 0.6748657822608948, "learning_rate": 0.0002684210526315789, "loss": 4.4798, "step": 1174 }, { "epoch": 1.50304, "grad_norm": 0.6664227843284607, "learning_rate": 0.00026838067034594155, "loss": 4.5394, "step": 1175 }, { "epoch": 1.5043199999999999, "grad_norm": 0.6093580722808838, "learning_rate": 0.0002683402880603042, "loss": 4.4771, "step": 1176 }, { "epoch": 1.5056, "grad_norm": 0.593794047832489, "learning_rate": 0.00026829990577466686, "loss": 4.4568, "step": 1177 }, { "epoch": 1.50688, "grad_norm": 0.5962923765182495, "learning_rate": 0.00026825952348902944, "loss": 4.5111, "step": 1178 }, { "epoch": 1.50816, "grad_norm": 0.5938080549240112, "learning_rate": 0.00026821914120339207, "loss": 4.4462, "step": 1179 }, { "epoch": 1.5094400000000001, "grad_norm": 0.600837767124176, "learning_rate": 0.0002681787589177547, "loss": 4.5177, "step": 1180 }, { "epoch": 1.51072, "grad_norm": 0.601696252822876, "learning_rate": 0.0002681383766321174, "loss": 4.4331, "step": 1181 }, { "epoch": 1.512, "grad_norm": 0.5914311408996582, "learning_rate": 0.00026809799434648, "loss": 4.4859, "step": 1182 }, { "epoch": 1.51328, "grad_norm": 0.6770086288452148, "learning_rate": 0.00026805761206084264, "loss": 4.4447, "step": 1183 }, { "epoch": 1.51456, "grad_norm": 0.6460147500038147, "learning_rate": 0.00026801722977520527, "loss": 4.4399, "step": 1184 }, { "epoch": 1.5158399999999999, "grad_norm": 0.5961685180664062, "learning_rate": 0.00026797684748956785, "loss": 4.4354, "step": 1185 }, { "epoch": 1.51712, "grad_norm": 0.6054593324661255, "learning_rate": 0.00026793646520393053, "loss": 4.3969, "step": 1186 }, { "epoch": 1.5184, "grad_norm": 0.6508775949478149, "learning_rate": 0.00026789608291829316, "loss": 4.4683, "step": 1187 }, { "epoch": 1.5196800000000001, "grad_norm": 0.7032009363174438, "learning_rate": 0.0002678557006326558, "loss": 4.4864, "step": 1188 }, { "epoch": 1.52096, "grad_norm": 0.6030545234680176, "learning_rate": 0.0002678153183470184, "loss": 4.389, "step": 1189 }, { "epoch": 1.52224, "grad_norm": 0.5884782075881958, "learning_rate": 0.00026777493606138105, "loss": 4.5031, "step": 1190 }, { "epoch": 1.52352, "grad_norm": 0.6099117994308472, "learning_rate": 0.0002677345537757437, "loss": 4.4668, "step": 1191 }, { "epoch": 1.5248, "grad_norm": 0.5954748392105103, "learning_rate": 0.0002676941714901063, "loss": 4.4554, "step": 1192 }, { "epoch": 1.5260799999999999, "grad_norm": 0.5383332371711731, "learning_rate": 0.00026765378920446894, "loss": 4.5648, "step": 1193 }, { "epoch": 1.52736, "grad_norm": 0.5823171138763428, "learning_rate": 0.00026761340691883157, "loss": 4.5046, "step": 1194 }, { "epoch": 1.52864, "grad_norm": 0.6584584712982178, "learning_rate": 0.0002675730246331942, "loss": 4.4529, "step": 1195 }, { "epoch": 1.52992, "grad_norm": 0.5978401899337769, "learning_rate": 0.00026753264234755683, "loss": 4.4333, "step": 1196 }, { "epoch": 1.5312000000000001, "grad_norm": 0.591083288192749, "learning_rate": 0.00026749226006191946, "loss": 4.4279, "step": 1197 }, { "epoch": 1.53248, "grad_norm": 0.5882396101951599, "learning_rate": 0.00026745187777628215, "loss": 4.3951, "step": 1198 }, { "epoch": 1.53376, "grad_norm": 0.571337103843689, "learning_rate": 0.0002674114954906448, "loss": 4.4579, "step": 1199 }, { "epoch": 1.53504, "grad_norm": 0.5543642044067383, "learning_rate": 0.0002673711132050074, "loss": 4.4323, "step": 1200 }, { "epoch": 1.53632, "grad_norm": 0.5652990341186523, "learning_rate": 0.00026733073091937, "loss": 4.4284, "step": 1201 }, { "epoch": 1.5375999999999999, "grad_norm": 0.5995742678642273, "learning_rate": 0.0002672903486337326, "loss": 4.4789, "step": 1202 }, { "epoch": 1.53888, "grad_norm": 0.5706351399421692, "learning_rate": 0.0002672499663480953, "loss": 4.4814, "step": 1203 }, { "epoch": 1.54016, "grad_norm": 0.5830731391906738, "learning_rate": 0.0002672095840624579, "loss": 4.4525, "step": 1204 }, { "epoch": 1.5414400000000001, "grad_norm": 0.6036829352378845, "learning_rate": 0.00026716920177682056, "loss": 4.3816, "step": 1205 }, { "epoch": 1.54272, "grad_norm": 0.5913065671920776, "learning_rate": 0.0002671288194911832, "loss": 4.4997, "step": 1206 }, { "epoch": 1.544, "grad_norm": 0.6033707857131958, "learning_rate": 0.0002670884372055458, "loss": 4.4725, "step": 1207 }, { "epoch": 1.54528, "grad_norm": 0.6108120679855347, "learning_rate": 0.00026704805491990845, "loss": 4.4673, "step": 1208 }, { "epoch": 1.54656, "grad_norm": 0.6634685397148132, "learning_rate": 0.0002670076726342711, "loss": 4.4737, "step": 1209 }, { "epoch": 1.5478399999999999, "grad_norm": 0.5890428423881531, "learning_rate": 0.0002669672903486337, "loss": 4.4499, "step": 1210 }, { "epoch": 1.54912, "grad_norm": 0.6377815008163452, "learning_rate": 0.00026692690806299634, "loss": 4.4546, "step": 1211 }, { "epoch": 1.5504, "grad_norm": 0.6385360956192017, "learning_rate": 0.00026688652577735897, "loss": 4.4226, "step": 1212 }, { "epoch": 1.55168, "grad_norm": 0.5952944159507751, "learning_rate": 0.0002668461434917216, "loss": 4.4762, "step": 1213 }, { "epoch": 1.5529600000000001, "grad_norm": 0.5970460772514343, "learning_rate": 0.0002668057612060842, "loss": 4.4617, "step": 1214 }, { "epoch": 1.55424, "grad_norm": 0.6275203227996826, "learning_rate": 0.00026676537892044686, "loss": 4.4252, "step": 1215 }, { "epoch": 1.55552, "grad_norm": 0.6631754636764526, "learning_rate": 0.00026672499663480954, "loss": 4.5075, "step": 1216 }, { "epoch": 1.5568, "grad_norm": 0.6074349880218506, "learning_rate": 0.0002666846143491721, "loss": 4.442, "step": 1217 }, { "epoch": 1.55808, "grad_norm": 0.6577631235122681, "learning_rate": 0.00026664423206353475, "loss": 4.4871, "step": 1218 }, { "epoch": 1.5593599999999999, "grad_norm": 0.6167892217636108, "learning_rate": 0.0002666038497778974, "loss": 4.4119, "step": 1219 }, { "epoch": 1.56064, "grad_norm": 0.6109944581985474, "learning_rate": 0.00026656346749226006, "loss": 4.439, "step": 1220 }, { "epoch": 1.56192, "grad_norm": 0.6446990966796875, "learning_rate": 0.0002665230852066227, "loss": 4.4472, "step": 1221 }, { "epoch": 1.5632000000000001, "grad_norm": 0.5861752033233643, "learning_rate": 0.0002664827029209853, "loss": 4.4121, "step": 1222 }, { "epoch": 1.56448, "grad_norm": 0.620345413684845, "learning_rate": 0.00026644232063534795, "loss": 4.5141, "step": 1223 }, { "epoch": 1.56576, "grad_norm": 0.6211066842079163, "learning_rate": 0.0002664019383497106, "loss": 4.4676, "step": 1224 }, { "epoch": 1.56704, "grad_norm": 0.5987910628318787, "learning_rate": 0.0002663615560640732, "loss": 4.4051, "step": 1225 }, { "epoch": 1.56832, "grad_norm": 0.6385419368743896, "learning_rate": 0.00026632117377843584, "loss": 4.457, "step": 1226 }, { "epoch": 1.5695999999999999, "grad_norm": 0.6010638475418091, "learning_rate": 0.00026628079149279847, "loss": 4.4322, "step": 1227 }, { "epoch": 1.57088, "grad_norm": 0.6236041784286499, "learning_rate": 0.0002662404092071611, "loss": 4.4491, "step": 1228 }, { "epoch": 1.57216, "grad_norm": 0.6467570066452026, "learning_rate": 0.00026620002692152373, "loss": 4.4418, "step": 1229 }, { "epoch": 1.57344, "grad_norm": 0.5972265601158142, "learning_rate": 0.00026615964463588636, "loss": 4.4185, "step": 1230 }, { "epoch": 1.5747200000000001, "grad_norm": 0.6430701017379761, "learning_rate": 0.000266119262350249, "loss": 4.4144, "step": 1231 }, { "epoch": 1.576, "grad_norm": 0.5837542414665222, "learning_rate": 0.0002660788800646116, "loss": 4.4138, "step": 1232 }, { "epoch": 1.57728, "grad_norm": 0.6243807077407837, "learning_rate": 0.0002660384977789743, "loss": 4.4507, "step": 1233 }, { "epoch": 1.57856, "grad_norm": 0.5919942259788513, "learning_rate": 0.0002659981154933369, "loss": 4.4508, "step": 1234 }, { "epoch": 1.57984, "grad_norm": 0.540010929107666, "learning_rate": 0.0002659577332076995, "loss": 4.3369, "step": 1235 }, { "epoch": 1.5811199999999999, "grad_norm": 0.568360447883606, "learning_rate": 0.00026591735092206214, "loss": 4.4326, "step": 1236 }, { "epoch": 1.5824, "grad_norm": 0.5942397117614746, "learning_rate": 0.0002658769686364248, "loss": 4.412, "step": 1237 }, { "epoch": 1.58368, "grad_norm": 0.5995660424232483, "learning_rate": 0.00026583658635078746, "loss": 4.4086, "step": 1238 }, { "epoch": 1.5849600000000001, "grad_norm": 0.620974063873291, "learning_rate": 0.0002657962040651501, "loss": 4.3333, "step": 1239 }, { "epoch": 1.58624, "grad_norm": 0.6151325702667236, "learning_rate": 0.00026575582177951266, "loss": 4.4844, "step": 1240 }, { "epoch": 1.58752, "grad_norm": 0.640434741973877, "learning_rate": 0.0002657154394938753, "loss": 4.3976, "step": 1241 }, { "epoch": 1.5888, "grad_norm": 0.5995258688926697, "learning_rate": 0.000265675057208238, "loss": 4.3342, "step": 1242 }, { "epoch": 1.59008, "grad_norm": 0.5680806636810303, "learning_rate": 0.0002656346749226006, "loss": 4.4304, "step": 1243 }, { "epoch": 1.5913599999999999, "grad_norm": 0.6433593034744263, "learning_rate": 0.00026559429263696324, "loss": 4.375, "step": 1244 }, { "epoch": 1.5926399999999998, "grad_norm": 0.5471886992454529, "learning_rate": 0.00026555391035132587, "loss": 4.3626, "step": 1245 }, { "epoch": 1.59392, "grad_norm": 0.6657853126525879, "learning_rate": 0.0002655135280656885, "loss": 4.3994, "step": 1246 }, { "epoch": 1.5952, "grad_norm": 0.6484869122505188, "learning_rate": 0.00026547314578005113, "loss": 4.4166, "step": 1247 }, { "epoch": 1.5964800000000001, "grad_norm": 0.6245217323303223, "learning_rate": 0.00026543276349441376, "loss": 4.3975, "step": 1248 }, { "epoch": 1.59776, "grad_norm": 0.6242031455039978, "learning_rate": 0.0002653923812087764, "loss": 4.4047, "step": 1249 }, { "epoch": 1.59904, "grad_norm": 0.6540752649307251, "learning_rate": 0.000265351998923139, "loss": 4.3767, "step": 1250 }, { "epoch": 1.60032, "grad_norm": 0.5985410213470459, "learning_rate": 0.00026531161663750165, "loss": 4.4277, "step": 1251 }, { "epoch": 1.6016, "grad_norm": 0.7160242795944214, "learning_rate": 0.0002652712343518643, "loss": 4.3891, "step": 1252 }, { "epoch": 1.6028799999999999, "grad_norm": 0.5682446360588074, "learning_rate": 0.0002652308520662269, "loss": 4.3845, "step": 1253 }, { "epoch": 1.60416, "grad_norm": 0.6273159980773926, "learning_rate": 0.00026519046978058954, "loss": 4.3962, "step": 1254 }, { "epoch": 1.60544, "grad_norm": 0.631803035736084, "learning_rate": 0.0002651500874949522, "loss": 4.4692, "step": 1255 }, { "epoch": 1.6067200000000001, "grad_norm": 0.5755056142807007, "learning_rate": 0.00026510970520931485, "loss": 4.4311, "step": 1256 }, { "epoch": 1.608, "grad_norm": 0.5929430723190308, "learning_rate": 0.00026506932292367743, "loss": 4.4624, "step": 1257 }, { "epoch": 1.60928, "grad_norm": 0.6523311138153076, "learning_rate": 0.00026502894063804006, "loss": 4.417, "step": 1258 }, { "epoch": 1.61056, "grad_norm": 0.5619366765022278, "learning_rate": 0.00026498855835240274, "loss": 4.3603, "step": 1259 }, { "epoch": 1.61184, "grad_norm": 0.60685795545578, "learning_rate": 0.00026494817606676537, "loss": 4.4113, "step": 1260 }, { "epoch": 1.6131199999999999, "grad_norm": 0.5795391201972961, "learning_rate": 0.000264907793781128, "loss": 4.3658, "step": 1261 }, { "epoch": 1.6143999999999998, "grad_norm": 0.6668011546134949, "learning_rate": 0.00026486741149549063, "loss": 4.3773, "step": 1262 }, { "epoch": 1.61568, "grad_norm": 0.7039081454277039, "learning_rate": 0.00026482702920985326, "loss": 4.4388, "step": 1263 }, { "epoch": 1.61696, "grad_norm": 0.6980475187301636, "learning_rate": 0.0002647866469242159, "loss": 4.4047, "step": 1264 }, { "epoch": 1.6182400000000001, "grad_norm": 0.6274135112762451, "learning_rate": 0.0002647462646385785, "loss": 4.3472, "step": 1265 }, { "epoch": 1.61952, "grad_norm": 0.5991054773330688, "learning_rate": 0.00026470588235294115, "loss": 4.402, "step": 1266 }, { "epoch": 1.6208, "grad_norm": 0.5963894724845886, "learning_rate": 0.0002646655000673038, "loss": 4.4285, "step": 1267 }, { "epoch": 1.62208, "grad_norm": 0.6448376178741455, "learning_rate": 0.0002646251177816664, "loss": 4.3825, "step": 1268 }, { "epoch": 1.62336, "grad_norm": 0.5770543217658997, "learning_rate": 0.00026458473549602904, "loss": 4.3484, "step": 1269 }, { "epoch": 1.6246399999999999, "grad_norm": 0.6031239032745361, "learning_rate": 0.0002645443532103917, "loss": 4.3538, "step": 1270 }, { "epoch": 1.62592, "grad_norm": 0.5922113060951233, "learning_rate": 0.0002645039709247543, "loss": 4.3903, "step": 1271 }, { "epoch": 1.6272, "grad_norm": 0.5830681324005127, "learning_rate": 0.000264463588639117, "loss": 4.4233, "step": 1272 }, { "epoch": 1.6284800000000001, "grad_norm": 0.5209598541259766, "learning_rate": 0.00026442320635347956, "loss": 4.404, "step": 1273 }, { "epoch": 1.62976, "grad_norm": 0.6153559684753418, "learning_rate": 0.0002643828240678422, "loss": 4.3666, "step": 1274 }, { "epoch": 1.63104, "grad_norm": 0.566469132900238, "learning_rate": 0.0002643424417822048, "loss": 4.3731, "step": 1275 }, { "epoch": 1.63232, "grad_norm": 0.5851961374282837, "learning_rate": 0.0002643020594965675, "loss": 4.3827, "step": 1276 }, { "epoch": 1.6336, "grad_norm": 0.5738173127174377, "learning_rate": 0.00026426167721093014, "loss": 4.3779, "step": 1277 }, { "epoch": 1.6348799999999999, "grad_norm": 0.5467177629470825, "learning_rate": 0.00026422129492529277, "loss": 4.3595, "step": 1278 }, { "epoch": 1.6361599999999998, "grad_norm": 0.5850498080253601, "learning_rate": 0.0002641809126396554, "loss": 4.3581, "step": 1279 }, { "epoch": 1.63744, "grad_norm": 0.5361942052841187, "learning_rate": 0.000264140530354018, "loss": 4.4323, "step": 1280 }, { "epoch": 1.63872, "grad_norm": 0.5595226883888245, "learning_rate": 0.00026410014806838066, "loss": 4.4528, "step": 1281 }, { "epoch": 1.6400000000000001, "grad_norm": 0.6067069172859192, "learning_rate": 0.0002640597657827433, "loss": 4.3867, "step": 1282 }, { "epoch": 1.64128, "grad_norm": 0.5635828971862793, "learning_rate": 0.0002640193834971059, "loss": 4.3671, "step": 1283 }, { "epoch": 1.64256, "grad_norm": 0.6461728811264038, "learning_rate": 0.00026397900121146855, "loss": 4.3887, "step": 1284 }, { "epoch": 1.64384, "grad_norm": 0.5938706398010254, "learning_rate": 0.0002639386189258312, "loss": 4.4002, "step": 1285 }, { "epoch": 1.64512, "grad_norm": 0.6693472862243652, "learning_rate": 0.0002638982366401938, "loss": 4.3767, "step": 1286 }, { "epoch": 1.6463999999999999, "grad_norm": 0.6100233197212219, "learning_rate": 0.00026385785435455644, "loss": 4.4038, "step": 1287 }, { "epoch": 1.64768, "grad_norm": 0.5637868642807007, "learning_rate": 0.00026381747206891907, "loss": 4.3205, "step": 1288 }, { "epoch": 1.64896, "grad_norm": 0.6211172342300415, "learning_rate": 0.0002637770897832817, "loss": 4.3709, "step": 1289 }, { "epoch": 1.6502400000000002, "grad_norm": 0.5223026871681213, "learning_rate": 0.00026373670749764433, "loss": 4.381, "step": 1290 }, { "epoch": 1.65152, "grad_norm": 0.5736677646636963, "learning_rate": 0.00026369632521200696, "loss": 4.3631, "step": 1291 }, { "epoch": 1.6528, "grad_norm": 0.5660174489021301, "learning_rate": 0.0002636559429263696, "loss": 4.4151, "step": 1292 }, { "epoch": 1.65408, "grad_norm": 0.5827959775924683, "learning_rate": 0.0002636155606407322, "loss": 4.2954, "step": 1293 }, { "epoch": 1.65536, "grad_norm": 0.6205708384513855, "learning_rate": 0.0002635751783550949, "loss": 4.3476, "step": 1294 }, { "epoch": 1.65664, "grad_norm": 0.599597692489624, "learning_rate": 0.00026353479606945753, "loss": 4.3624, "step": 1295 }, { "epoch": 1.6579199999999998, "grad_norm": 0.6274026036262512, "learning_rate": 0.0002634944137838201, "loss": 4.4197, "step": 1296 }, { "epoch": 1.6592, "grad_norm": 0.570179283618927, "learning_rate": 0.00026345403149818274, "loss": 4.4012, "step": 1297 }, { "epoch": 1.66048, "grad_norm": 0.618751049041748, "learning_rate": 0.0002634136492125454, "loss": 4.3122, "step": 1298 }, { "epoch": 1.6617600000000001, "grad_norm": 0.6431691646575928, "learning_rate": 0.00026337326692690805, "loss": 4.4175, "step": 1299 }, { "epoch": 1.66304, "grad_norm": 0.6071829199790955, "learning_rate": 0.0002633328846412707, "loss": 4.394, "step": 1300 }, { "epoch": 1.66432, "grad_norm": 0.5877499580383301, "learning_rate": 0.0002632925023556333, "loss": 4.3989, "step": 1301 }, { "epoch": 1.6656, "grad_norm": 0.5928570628166199, "learning_rate": 0.00026325212006999594, "loss": 4.4153, "step": 1302 }, { "epoch": 1.66688, "grad_norm": 0.5851948261260986, "learning_rate": 0.0002632117377843586, "loss": 4.4008, "step": 1303 }, { "epoch": 1.6681599999999999, "grad_norm": 0.5977253317832947, "learning_rate": 0.0002631713554987212, "loss": 4.3256, "step": 1304 }, { "epoch": 1.66944, "grad_norm": 0.5974704027175903, "learning_rate": 0.00026313097321308383, "loss": 4.3814, "step": 1305 }, { "epoch": 1.67072, "grad_norm": 0.631009578704834, "learning_rate": 0.00026309059092744646, "loss": 4.3496, "step": 1306 }, { "epoch": 1.6720000000000002, "grad_norm": 0.5566964149475098, "learning_rate": 0.0002630502086418091, "loss": 4.4002, "step": 1307 }, { "epoch": 1.67328, "grad_norm": 0.5787959694862366, "learning_rate": 0.0002630098263561717, "loss": 4.3317, "step": 1308 }, { "epoch": 1.67456, "grad_norm": 0.6714434623718262, "learning_rate": 0.00026296944407053435, "loss": 4.3374, "step": 1309 }, { "epoch": 1.67584, "grad_norm": 0.6594877243041992, "learning_rate": 0.000262929061784897, "loss": 4.4183, "step": 1310 }, { "epoch": 1.67712, "grad_norm": 0.5786787867546082, "learning_rate": 0.00026288867949925967, "loss": 4.3543, "step": 1311 }, { "epoch": 1.6784, "grad_norm": 0.5720179677009583, "learning_rate": 0.00026284829721362225, "loss": 4.3702, "step": 1312 }, { "epoch": 1.6796799999999998, "grad_norm": 0.6159886121749878, "learning_rate": 0.0002628079149279849, "loss": 4.3829, "step": 1313 }, { "epoch": 1.68096, "grad_norm": 0.6541562080383301, "learning_rate": 0.0002627675326423475, "loss": 4.3588, "step": 1314 }, { "epoch": 1.68224, "grad_norm": 0.6164893507957458, "learning_rate": 0.0002627271503567102, "loss": 4.3731, "step": 1315 }, { "epoch": 1.6835200000000001, "grad_norm": 0.611280083656311, "learning_rate": 0.0002626867680710728, "loss": 4.454, "step": 1316 }, { "epoch": 1.6848, "grad_norm": 0.6023463606834412, "learning_rate": 0.00026264638578543545, "loss": 4.3007, "step": 1317 }, { "epoch": 1.68608, "grad_norm": 0.6427450180053711, "learning_rate": 0.0002626060034997981, "loss": 4.3692, "step": 1318 }, { "epoch": 1.68736, "grad_norm": 0.5956165194511414, "learning_rate": 0.00026256562121416066, "loss": 4.352, "step": 1319 }, { "epoch": 1.68864, "grad_norm": 0.5930037498474121, "learning_rate": 0.00026252523892852334, "loss": 4.3185, "step": 1320 }, { "epoch": 1.6899199999999999, "grad_norm": 0.5842078328132629, "learning_rate": 0.00026248485664288597, "loss": 4.3912, "step": 1321 }, { "epoch": 1.6912, "grad_norm": 0.5779310464859009, "learning_rate": 0.0002624444743572486, "loss": 4.4048, "step": 1322 }, { "epoch": 1.69248, "grad_norm": 0.5934463739395142, "learning_rate": 0.00026240409207161123, "loss": 4.3657, "step": 1323 }, { "epoch": 1.6937600000000002, "grad_norm": 0.636505663394928, "learning_rate": 0.00026236370978597386, "loss": 4.4074, "step": 1324 }, { "epoch": 1.69504, "grad_norm": 0.6304707527160645, "learning_rate": 0.0002623233275003365, "loss": 4.4129, "step": 1325 }, { "epoch": 1.69632, "grad_norm": 0.660848081111908, "learning_rate": 0.0002622829452146991, "loss": 4.3167, "step": 1326 }, { "epoch": 1.6976, "grad_norm": 0.5828097462654114, "learning_rate": 0.00026224256292906175, "loss": 4.3415, "step": 1327 }, { "epoch": 1.69888, "grad_norm": 0.6500957608222961, "learning_rate": 0.00026220218064342443, "loss": 4.3257, "step": 1328 }, { "epoch": 1.70016, "grad_norm": 0.5696218013763428, "learning_rate": 0.000262161798357787, "loss": 4.423, "step": 1329 }, { "epoch": 1.7014399999999998, "grad_norm": 0.6378457546234131, "learning_rate": 0.00026212141607214964, "loss": 4.371, "step": 1330 }, { "epoch": 1.70272, "grad_norm": 0.5619131326675415, "learning_rate": 0.00026208103378651227, "loss": 4.3918, "step": 1331 }, { "epoch": 1.704, "grad_norm": 0.607213020324707, "learning_rate": 0.00026204065150087496, "loss": 4.2955, "step": 1332 }, { "epoch": 1.7052800000000001, "grad_norm": 0.6598789095878601, "learning_rate": 0.0002620002692152376, "loss": 4.4082, "step": 1333 }, { "epoch": 1.70656, "grad_norm": 0.6173972487449646, "learning_rate": 0.0002619598869296002, "loss": 4.3991, "step": 1334 }, { "epoch": 1.70784, "grad_norm": 0.709816575050354, "learning_rate": 0.00026191950464396285, "loss": 4.3563, "step": 1335 }, { "epoch": 1.70912, "grad_norm": 0.5555682182312012, "learning_rate": 0.0002618791223583254, "loss": 4.386, "step": 1336 }, { "epoch": 1.7104, "grad_norm": 0.6640469431877136, "learning_rate": 0.0002618387400726881, "loss": 4.3813, "step": 1337 }, { "epoch": 1.7116799999999999, "grad_norm": 0.5649694800376892, "learning_rate": 0.00026179835778705074, "loss": 4.3737, "step": 1338 }, { "epoch": 1.71296, "grad_norm": 0.5696978569030762, "learning_rate": 0.00026175797550141337, "loss": 4.3254, "step": 1339 }, { "epoch": 1.71424, "grad_norm": 0.5677264928817749, "learning_rate": 0.000261717593215776, "loss": 4.3328, "step": 1340 }, { "epoch": 1.7155200000000002, "grad_norm": 0.5705046057701111, "learning_rate": 0.0002616772109301386, "loss": 4.2976, "step": 1341 }, { "epoch": 1.7168, "grad_norm": 0.5434849858283997, "learning_rate": 0.00026163682864450126, "loss": 4.4098, "step": 1342 }, { "epoch": 1.71808, "grad_norm": 0.6120153665542603, "learning_rate": 0.0002615964463588639, "loss": 4.3524, "step": 1343 }, { "epoch": 1.71936, "grad_norm": 0.5820379853248596, "learning_rate": 0.0002615560640732265, "loss": 4.3321, "step": 1344 }, { "epoch": 1.72064, "grad_norm": 0.586432695388794, "learning_rate": 0.00026151568178758915, "loss": 4.3806, "step": 1345 }, { "epoch": 1.72192, "grad_norm": 0.6412893533706665, "learning_rate": 0.0002614752995019518, "loss": 4.3795, "step": 1346 }, { "epoch": 1.7231999999999998, "grad_norm": 0.6333811283111572, "learning_rate": 0.0002614349172163144, "loss": 4.3555, "step": 1347 }, { "epoch": 1.72448, "grad_norm": 0.5543333292007446, "learning_rate": 0.00026139453493067704, "loss": 4.3285, "step": 1348 }, { "epoch": 1.72576, "grad_norm": 0.5972537398338318, "learning_rate": 0.00026135415264503967, "loss": 4.3328, "step": 1349 }, { "epoch": 1.7270400000000001, "grad_norm": 0.5956273078918457, "learning_rate": 0.00026131377035940235, "loss": 4.3246, "step": 1350 }, { "epoch": 1.72832, "grad_norm": 0.6021464467048645, "learning_rate": 0.000261273388073765, "loss": 4.4009, "step": 1351 }, { "epoch": 1.7296, "grad_norm": 0.6117057800292969, "learning_rate": 0.00026123300578812756, "loss": 4.3637, "step": 1352 }, { "epoch": 1.73088, "grad_norm": 0.5697115659713745, "learning_rate": 0.0002611926235024902, "loss": 4.3032, "step": 1353 }, { "epoch": 1.73216, "grad_norm": 0.6455910205841064, "learning_rate": 0.00026115224121685287, "loss": 4.2995, "step": 1354 }, { "epoch": 1.7334399999999999, "grad_norm": 0.5969467759132385, "learning_rate": 0.0002611118589312155, "loss": 4.3408, "step": 1355 }, { "epoch": 1.73472, "grad_norm": 0.6078411340713501, "learning_rate": 0.00026107147664557813, "loss": 4.3799, "step": 1356 }, { "epoch": 1.736, "grad_norm": 0.5551348328590393, "learning_rate": 0.00026103109435994076, "loss": 4.3688, "step": 1357 }, { "epoch": 1.7372800000000002, "grad_norm": 0.6095216870307922, "learning_rate": 0.0002609907120743034, "loss": 4.3867, "step": 1358 }, { "epoch": 1.73856, "grad_norm": 0.5706983208656311, "learning_rate": 0.000260950329788666, "loss": 4.3618, "step": 1359 }, { "epoch": 1.73984, "grad_norm": 0.5468519926071167, "learning_rate": 0.00026090994750302865, "loss": 4.361, "step": 1360 }, { "epoch": 1.74112, "grad_norm": 0.5678277611732483, "learning_rate": 0.0002608695652173913, "loss": 4.3545, "step": 1361 }, { "epoch": 1.7424, "grad_norm": 0.5359609723091125, "learning_rate": 0.0002608291829317539, "loss": 4.4574, "step": 1362 }, { "epoch": 1.74368, "grad_norm": 0.5697743892669678, "learning_rate": 0.00026078880064611654, "loss": 4.295, "step": 1363 }, { "epoch": 1.7449599999999998, "grad_norm": 0.605635404586792, "learning_rate": 0.00026074841836047917, "loss": 4.3309, "step": 1364 }, { "epoch": 1.74624, "grad_norm": 0.5920169353485107, "learning_rate": 0.0002607080360748418, "loss": 4.3717, "step": 1365 }, { "epoch": 1.74752, "grad_norm": 0.6027970910072327, "learning_rate": 0.00026066765378920443, "loss": 4.3445, "step": 1366 }, { "epoch": 1.7488000000000001, "grad_norm": 0.5497733354568481, "learning_rate": 0.0002606272715035671, "loss": 4.3752, "step": 1367 }, { "epoch": 1.75008, "grad_norm": 0.6217249631881714, "learning_rate": 0.0002605868892179297, "loss": 4.3425, "step": 1368 }, { "epoch": 1.75136, "grad_norm": 0.5949388742446899, "learning_rate": 0.0002605465069322923, "loss": 4.3937, "step": 1369 }, { "epoch": 1.75264, "grad_norm": 0.5935553908348083, "learning_rate": 0.00026050612464665495, "loss": 4.318, "step": 1370 }, { "epoch": 1.75392, "grad_norm": 0.5247051119804382, "learning_rate": 0.00026046574236101764, "loss": 4.305, "step": 1371 }, { "epoch": 1.7551999999999999, "grad_norm": 0.5983899235725403, "learning_rate": 0.00026042536007538027, "loss": 4.3383, "step": 1372 }, { "epoch": 1.75648, "grad_norm": 0.5618011951446533, "learning_rate": 0.0002603849777897429, "loss": 4.3788, "step": 1373 }, { "epoch": 1.75776, "grad_norm": 0.5628573298454285, "learning_rate": 0.0002603445955041055, "loss": 4.2755, "step": 1374 }, { "epoch": 1.7590400000000002, "grad_norm": 0.580451488494873, "learning_rate": 0.0002603042132184681, "loss": 4.3533, "step": 1375 }, { "epoch": 1.76032, "grad_norm": 0.5849772691726685, "learning_rate": 0.0002602638309328308, "loss": 4.3453, "step": 1376 }, { "epoch": 1.7616, "grad_norm": 0.6029077172279358, "learning_rate": 0.0002602234486471934, "loss": 4.3619, "step": 1377 }, { "epoch": 1.76288, "grad_norm": 0.5800113677978516, "learning_rate": 0.00026018306636155605, "loss": 4.3204, "step": 1378 }, { "epoch": 1.76416, "grad_norm": 0.5955139994621277, "learning_rate": 0.0002601426840759187, "loss": 4.3915, "step": 1379 }, { "epoch": 1.76544, "grad_norm": 0.5562100410461426, "learning_rate": 0.0002601023017902813, "loss": 4.2759, "step": 1380 }, { "epoch": 1.7667199999999998, "grad_norm": 0.61802738904953, "learning_rate": 0.00026006191950464394, "loss": 4.2538, "step": 1381 }, { "epoch": 1.768, "grad_norm": 0.5850224494934082, "learning_rate": 0.00026002153721900657, "loss": 4.3395, "step": 1382 }, { "epoch": 1.76928, "grad_norm": 0.5995482802391052, "learning_rate": 0.0002599811549333692, "loss": 4.2534, "step": 1383 }, { "epoch": 1.7705600000000001, "grad_norm": 0.5256519913673401, "learning_rate": 0.00025994077264773183, "loss": 4.2427, "step": 1384 }, { "epoch": 1.77184, "grad_norm": 0.5336461067199707, "learning_rate": 0.00025990039036209446, "loss": 4.3061, "step": 1385 }, { "epoch": 1.77312, "grad_norm": 0.5198113322257996, "learning_rate": 0.0002598600080764571, "loss": 4.3705, "step": 1386 }, { "epoch": 1.7744, "grad_norm": 0.5634206533432007, "learning_rate": 0.0002598196257908197, "loss": 4.3167, "step": 1387 }, { "epoch": 1.77568, "grad_norm": 0.598160445690155, "learning_rate": 0.00025977924350518235, "loss": 4.2889, "step": 1388 }, { "epoch": 1.7769599999999999, "grad_norm": 0.6371156573295593, "learning_rate": 0.00025973886121954503, "loss": 4.3066, "step": 1389 }, { "epoch": 1.77824, "grad_norm": 0.5342087149620056, "learning_rate": 0.00025969847893390766, "loss": 4.2872, "step": 1390 }, { "epoch": 1.77952, "grad_norm": 0.6202256083488464, "learning_rate": 0.00025965809664827024, "loss": 4.3316, "step": 1391 }, { "epoch": 1.7808000000000002, "grad_norm": 0.5649963021278381, "learning_rate": 0.00025961771436263287, "loss": 4.3691, "step": 1392 }, { "epoch": 1.78208, "grad_norm": 0.5771881341934204, "learning_rate": 0.00025957733207699555, "loss": 4.3228, "step": 1393 }, { "epoch": 1.78336, "grad_norm": 0.5943997502326965, "learning_rate": 0.0002595369497913582, "loss": 4.4159, "step": 1394 }, { "epoch": 1.78464, "grad_norm": 0.5805171132087708, "learning_rate": 0.0002594965675057208, "loss": 4.287, "step": 1395 }, { "epoch": 1.78592, "grad_norm": 0.5803609490394592, "learning_rate": 0.00025945618522008344, "loss": 4.2801, "step": 1396 }, { "epoch": 1.7872, "grad_norm": 0.5380381345748901, "learning_rate": 0.00025941580293444607, "loss": 4.3378, "step": 1397 }, { "epoch": 1.7884799999999998, "grad_norm": 0.5976554155349731, "learning_rate": 0.0002593754206488087, "loss": 4.3923, "step": 1398 }, { "epoch": 1.78976, "grad_norm": 0.6023809909820557, "learning_rate": 0.00025933503836317133, "loss": 4.3821, "step": 1399 }, { "epoch": 1.79104, "grad_norm": 0.4953809976577759, "learning_rate": 0.00025929465607753396, "loss": 4.3558, "step": 1400 }, { "epoch": 1.7923200000000001, "grad_norm": 0.6279889345169067, "learning_rate": 0.0002592542737918966, "loss": 4.319, "step": 1401 }, { "epoch": 1.7936, "grad_norm": 0.5510340929031372, "learning_rate": 0.0002592138915062592, "loss": 4.3578, "step": 1402 }, { "epoch": 1.79488, "grad_norm": 0.5811865925788879, "learning_rate": 0.00025917350922062185, "loss": 4.3256, "step": 1403 }, { "epoch": 1.79616, "grad_norm": 0.5557198524475098, "learning_rate": 0.0002591331269349845, "loss": 4.2975, "step": 1404 }, { "epoch": 1.79744, "grad_norm": 0.5800006985664368, "learning_rate": 0.0002590927446493471, "loss": 4.3556, "step": 1405 }, { "epoch": 1.7987199999999999, "grad_norm": 0.5096418857574463, "learning_rate": 0.0002590523623637098, "loss": 4.2946, "step": 1406 }, { "epoch": 1.8, "grad_norm": 0.5896133780479431, "learning_rate": 0.00025901198007807243, "loss": 4.3412, "step": 1407 }, { "epoch": 1.80128, "grad_norm": 0.5542340874671936, "learning_rate": 0.000258971597792435, "loss": 4.3673, "step": 1408 }, { "epoch": 1.8025600000000002, "grad_norm": 0.5914231538772583, "learning_rate": 0.00025893121550679763, "loss": 4.3599, "step": 1409 }, { "epoch": 1.80384, "grad_norm": 0.6253953576087952, "learning_rate": 0.0002588908332211603, "loss": 4.2901, "step": 1410 }, { "epoch": 1.80512, "grad_norm": 0.5754669904708862, "learning_rate": 0.00025885045093552295, "loss": 4.3587, "step": 1411 }, { "epoch": 1.8064, "grad_norm": 0.5589056015014648, "learning_rate": 0.0002588100686498856, "loss": 4.2848, "step": 1412 }, { "epoch": 1.80768, "grad_norm": 0.6159608960151672, "learning_rate": 0.0002587696863642482, "loss": 4.2993, "step": 1413 }, { "epoch": 1.80896, "grad_norm": 0.5682271718978882, "learning_rate": 0.0002587293040786108, "loss": 4.3867, "step": 1414 }, { "epoch": 1.8102399999999998, "grad_norm": 0.5960492491722107, "learning_rate": 0.00025868892179297347, "loss": 4.2558, "step": 1415 }, { "epoch": 1.81152, "grad_norm": 0.6439085006713867, "learning_rate": 0.0002586485395073361, "loss": 4.3216, "step": 1416 }, { "epoch": 1.8128, "grad_norm": 0.5476023554801941, "learning_rate": 0.00025860815722169873, "loss": 4.34, "step": 1417 }, { "epoch": 1.8140800000000001, "grad_norm": 0.6070132851600647, "learning_rate": 0.00025856777493606136, "loss": 4.3153, "step": 1418 }, { "epoch": 1.81536, "grad_norm": 0.6208356618881226, "learning_rate": 0.000258527392650424, "loss": 4.4003, "step": 1419 }, { "epoch": 1.81664, "grad_norm": 0.553810715675354, "learning_rate": 0.0002584870103647866, "loss": 4.321, "step": 1420 }, { "epoch": 1.81792, "grad_norm": 0.6032571196556091, "learning_rate": 0.00025844662807914925, "loss": 4.3312, "step": 1421 }, { "epoch": 1.8192, "grad_norm": 0.5861594676971436, "learning_rate": 0.0002584062457935119, "loss": 4.3138, "step": 1422 }, { "epoch": 1.8204799999999999, "grad_norm": 0.5738282203674316, "learning_rate": 0.00025836586350787456, "loss": 4.2894, "step": 1423 }, { "epoch": 1.82176, "grad_norm": 0.5541486144065857, "learning_rate": 0.00025832548122223714, "loss": 4.3835, "step": 1424 }, { "epoch": 1.82304, "grad_norm": 0.5869741439819336, "learning_rate": 0.00025828509893659977, "loss": 4.2246, "step": 1425 }, { "epoch": 1.8243200000000002, "grad_norm": 0.5501008033752441, "learning_rate": 0.0002582447166509624, "loss": 4.3119, "step": 1426 }, { "epoch": 1.8256000000000001, "grad_norm": 0.5797874927520752, "learning_rate": 0.00025820433436532503, "loss": 4.2885, "step": 1427 }, { "epoch": 1.82688, "grad_norm": 0.5502588748931885, "learning_rate": 0.0002581639520796877, "loss": 4.2423, "step": 1428 }, { "epoch": 1.82816, "grad_norm": 0.6173840761184692, "learning_rate": 0.00025812356979405034, "loss": 4.259, "step": 1429 }, { "epoch": 1.82944, "grad_norm": 0.5949298143386841, "learning_rate": 0.000258083187508413, "loss": 4.3307, "step": 1430 }, { "epoch": 1.83072, "grad_norm": 0.5942235589027405, "learning_rate": 0.00025804280522277555, "loss": 4.279, "step": 1431 }, { "epoch": 1.8319999999999999, "grad_norm": 0.6103411912918091, "learning_rate": 0.00025800242293713823, "loss": 4.2624, "step": 1432 }, { "epoch": 1.83328, "grad_norm": 0.5956135392189026, "learning_rate": 0.00025796204065150086, "loss": 4.3028, "step": 1433 }, { "epoch": 1.83456, "grad_norm": 0.5850538015365601, "learning_rate": 0.0002579216583658635, "loss": 4.2829, "step": 1434 }, { "epoch": 1.8358400000000001, "grad_norm": 0.5809298157691956, "learning_rate": 0.0002578812760802261, "loss": 4.332, "step": 1435 }, { "epoch": 1.83712, "grad_norm": 0.8085721135139465, "learning_rate": 0.00025784089379458875, "loss": 4.3069, "step": 1436 }, { "epoch": 1.8384, "grad_norm": 0.5805991888046265, "learning_rate": 0.0002578005115089514, "loss": 4.3018, "step": 1437 }, { "epoch": 1.83968, "grad_norm": 0.5863122344017029, "learning_rate": 0.000257760129223314, "loss": 4.275, "step": 1438 }, { "epoch": 1.84096, "grad_norm": 0.573593258857727, "learning_rate": 0.00025771974693767664, "loss": 4.2953, "step": 1439 }, { "epoch": 1.8422399999999999, "grad_norm": 0.6307305097579956, "learning_rate": 0.0002576793646520393, "loss": 4.3356, "step": 1440 }, { "epoch": 1.84352, "grad_norm": 0.5647289752960205, "learning_rate": 0.0002576389823664019, "loss": 4.3151, "step": 1441 }, { "epoch": 1.8448, "grad_norm": 0.640838086605072, "learning_rate": 0.00025759860008076453, "loss": 4.3466, "step": 1442 }, { "epoch": 1.8460800000000002, "grad_norm": 0.5609426498413086, "learning_rate": 0.00025755821779512716, "loss": 4.3062, "step": 1443 }, { "epoch": 1.8473600000000001, "grad_norm": 0.6041327118873596, "learning_rate": 0.0002575178355094898, "loss": 4.3406, "step": 1444 }, { "epoch": 1.84864, "grad_norm": 0.5379366874694824, "learning_rate": 0.0002574774532238525, "loss": 4.2807, "step": 1445 }, { "epoch": 1.84992, "grad_norm": 0.582633376121521, "learning_rate": 0.0002574370709382151, "loss": 4.316, "step": 1446 }, { "epoch": 1.8512, "grad_norm": 0.6131902933120728, "learning_rate": 0.0002573966886525777, "loss": 4.2406, "step": 1447 }, { "epoch": 1.85248, "grad_norm": 0.6240634322166443, "learning_rate": 0.0002573563063669403, "loss": 4.307, "step": 1448 }, { "epoch": 1.8537599999999999, "grad_norm": 0.5122919678688049, "learning_rate": 0.000257315924081303, "loss": 4.253, "step": 1449 }, { "epoch": 1.85504, "grad_norm": 0.6418374180793762, "learning_rate": 0.00025727554179566563, "loss": 4.3222, "step": 1450 }, { "epoch": 1.85632, "grad_norm": 0.5671072602272034, "learning_rate": 0.00025723515951002826, "loss": 4.3006, "step": 1451 }, { "epoch": 1.8576000000000001, "grad_norm": 0.5645801424980164, "learning_rate": 0.0002571947772243909, "loss": 4.3623, "step": 1452 }, { "epoch": 1.85888, "grad_norm": 0.5495478510856628, "learning_rate": 0.0002571543949387535, "loss": 4.2478, "step": 1453 }, { "epoch": 1.86016, "grad_norm": 0.5628487467765808, "learning_rate": 0.00025711401265311615, "loss": 4.2659, "step": 1454 }, { "epoch": 1.86144, "grad_norm": 0.5409569144248962, "learning_rate": 0.0002570736303674788, "loss": 4.2707, "step": 1455 }, { "epoch": 1.86272, "grad_norm": 0.5503236651420593, "learning_rate": 0.0002570332480818414, "loss": 4.2997, "step": 1456 }, { "epoch": 1.8639999999999999, "grad_norm": 0.5647298693656921, "learning_rate": 0.00025699286579620404, "loss": 4.3171, "step": 1457 }, { "epoch": 1.86528, "grad_norm": 0.5661958456039429, "learning_rate": 0.00025695248351056667, "loss": 4.2884, "step": 1458 }, { "epoch": 1.86656, "grad_norm": 0.5522183775901794, "learning_rate": 0.0002569121012249293, "loss": 4.1818, "step": 1459 }, { "epoch": 1.86784, "grad_norm": 0.5519942045211792, "learning_rate": 0.00025687171893929193, "loss": 4.3356, "step": 1460 }, { "epoch": 1.8691200000000001, "grad_norm": 0.5674711465835571, "learning_rate": 0.00025683133665365456, "loss": 4.3287, "step": 1461 }, { "epoch": 1.8704, "grad_norm": 0.6130083799362183, "learning_rate": 0.00025679095436801724, "loss": 4.3046, "step": 1462 }, { "epoch": 1.87168, "grad_norm": 0.5580954551696777, "learning_rate": 0.0002567505720823798, "loss": 4.2846, "step": 1463 }, { "epoch": 1.87296, "grad_norm": 0.6269332766532898, "learning_rate": 0.00025671018979674245, "loss": 4.3118, "step": 1464 }, { "epoch": 1.87424, "grad_norm": 0.5813364386558533, "learning_rate": 0.0002566698075111051, "loss": 4.2828, "step": 1465 }, { "epoch": 1.8755199999999999, "grad_norm": 0.584498941898346, "learning_rate": 0.00025662942522546776, "loss": 4.2625, "step": 1466 }, { "epoch": 1.8768, "grad_norm": 0.6057561635971069, "learning_rate": 0.0002565890429398304, "loss": 4.3507, "step": 1467 }, { "epoch": 1.87808, "grad_norm": 0.5964322686195374, "learning_rate": 0.000256548660654193, "loss": 4.3682, "step": 1468 }, { "epoch": 1.8793600000000001, "grad_norm": 0.6251275539398193, "learning_rate": 0.00025650827836855565, "loss": 4.3605, "step": 1469 }, { "epoch": 1.88064, "grad_norm": 0.6410034894943237, "learning_rate": 0.00025646789608291823, "loss": 4.3105, "step": 1470 }, { "epoch": 1.88192, "grad_norm": 0.6155179738998413, "learning_rate": 0.0002564275137972809, "loss": 4.2392, "step": 1471 }, { "epoch": 1.8832, "grad_norm": 0.5981379747390747, "learning_rate": 0.00025638713151164355, "loss": 4.3841, "step": 1472 }, { "epoch": 1.88448, "grad_norm": 0.637157142162323, "learning_rate": 0.0002563467492260062, "loss": 4.2601, "step": 1473 }, { "epoch": 1.8857599999999999, "grad_norm": 0.5592889189720154, "learning_rate": 0.0002563063669403688, "loss": 4.3205, "step": 1474 }, { "epoch": 1.88704, "grad_norm": 0.6076372265815735, "learning_rate": 0.00025626598465473144, "loss": 4.2291, "step": 1475 }, { "epoch": 1.88832, "grad_norm": 0.5905406475067139, "learning_rate": 0.00025622560236909407, "loss": 4.2718, "step": 1476 }, { "epoch": 1.8896, "grad_norm": 0.5950998663902283, "learning_rate": 0.0002561852200834567, "loss": 4.3082, "step": 1477 }, { "epoch": 1.8908800000000001, "grad_norm": 0.6349307298660278, "learning_rate": 0.0002561448377978193, "loss": 4.3038, "step": 1478 }, { "epoch": 1.89216, "grad_norm": 0.5779610276222229, "learning_rate": 0.000256104455512182, "loss": 4.2575, "step": 1479 }, { "epoch": 1.89344, "grad_norm": 0.5922044515609741, "learning_rate": 0.0002560640732265446, "loss": 4.2998, "step": 1480 }, { "epoch": 1.89472, "grad_norm": 0.6293798089027405, "learning_rate": 0.0002560236909409072, "loss": 4.2816, "step": 1481 }, { "epoch": 1.896, "grad_norm": 0.542755126953125, "learning_rate": 0.00025598330865526985, "loss": 4.3644, "step": 1482 }, { "epoch": 1.8972799999999999, "grad_norm": 0.591120719909668, "learning_rate": 0.0002559429263696325, "loss": 4.277, "step": 1483 }, { "epoch": 1.89856, "grad_norm": 0.5428628325462341, "learning_rate": 0.00025590254408399516, "loss": 4.1826, "step": 1484 }, { "epoch": 1.89984, "grad_norm": 0.5935090780258179, "learning_rate": 0.0002558621617983578, "loss": 4.265, "step": 1485 }, { "epoch": 1.9011200000000001, "grad_norm": 0.532969057559967, "learning_rate": 0.00025582177951272037, "loss": 4.3217, "step": 1486 }, { "epoch": 1.9024, "grad_norm": 0.5747160315513611, "learning_rate": 0.000255781397227083, "loss": 4.2958, "step": 1487 }, { "epoch": 1.90368, "grad_norm": 0.5506500005722046, "learning_rate": 0.0002557410149414457, "loss": 4.3303, "step": 1488 }, { "epoch": 1.90496, "grad_norm": 0.5252817869186401, "learning_rate": 0.0002557006326558083, "loss": 4.2845, "step": 1489 }, { "epoch": 1.90624, "grad_norm": 0.5319386124610901, "learning_rate": 0.00025566025037017094, "loss": 4.2375, "step": 1490 }, { "epoch": 1.9075199999999999, "grad_norm": 0.5286985635757446, "learning_rate": 0.00025561986808453357, "loss": 4.3033, "step": 1491 }, { "epoch": 1.9088, "grad_norm": 0.6041566133499146, "learning_rate": 0.0002555794857988962, "loss": 4.2996, "step": 1492 }, { "epoch": 1.91008, "grad_norm": 0.563486635684967, "learning_rate": 0.00025553910351325883, "loss": 4.3267, "step": 1493 }, { "epoch": 1.91136, "grad_norm": 0.5622044801712036, "learning_rate": 0.00025549872122762146, "loss": 4.2366, "step": 1494 }, { "epoch": 1.9126400000000001, "grad_norm": 0.5659383535385132, "learning_rate": 0.0002554583389419841, "loss": 4.3028, "step": 1495 }, { "epoch": 1.91392, "grad_norm": 0.6131373047828674, "learning_rate": 0.0002554179566563467, "loss": 4.2772, "step": 1496 }, { "epoch": 1.9152, "grad_norm": 0.5460529923439026, "learning_rate": 0.00025537757437070935, "loss": 4.2903, "step": 1497 }, { "epoch": 1.91648, "grad_norm": 0.6126199960708618, "learning_rate": 0.000255337192085072, "loss": 4.3442, "step": 1498 }, { "epoch": 1.91776, "grad_norm": 0.5673563480377197, "learning_rate": 0.0002552968097994346, "loss": 4.2208, "step": 1499 }, { "epoch": 1.9190399999999999, "grad_norm": 0.5467031002044678, "learning_rate": 0.00025525642751379724, "loss": 4.287, "step": 1500 }, { "epoch": 1.92032, "grad_norm": 0.5507038235664368, "learning_rate": 0.0002552160452281599, "loss": 4.2693, "step": 1501 }, { "epoch": 1.9216, "grad_norm": 0.5434231758117676, "learning_rate": 0.00025517566294252256, "loss": 4.2913, "step": 1502 }, { "epoch": 1.9228800000000001, "grad_norm": 0.5921497941017151, "learning_rate": 0.00025513528065688513, "loss": 4.3204, "step": 1503 }, { "epoch": 1.92416, "grad_norm": 0.5684370398521423, "learning_rate": 0.00025509489837124776, "loss": 4.2581, "step": 1504 }, { "epoch": 1.92544, "grad_norm": 0.5858400464057922, "learning_rate": 0.00025505451608561045, "loss": 4.2407, "step": 1505 }, { "epoch": 1.92672, "grad_norm": 0.5300204157829285, "learning_rate": 0.0002550141337999731, "loss": 4.3043, "step": 1506 }, { "epoch": 1.928, "grad_norm": 0.5366335511207581, "learning_rate": 0.0002549737515143357, "loss": 4.2903, "step": 1507 }, { "epoch": 1.9292799999999999, "grad_norm": 0.5699880719184875, "learning_rate": 0.00025493336922869834, "loss": 4.2495, "step": 1508 }, { "epoch": 1.93056, "grad_norm": 0.5679522752761841, "learning_rate": 0.00025489298694306097, "loss": 4.2577, "step": 1509 }, { "epoch": 1.93184, "grad_norm": 0.5857032537460327, "learning_rate": 0.0002548526046574236, "loss": 4.273, "step": 1510 }, { "epoch": 1.93312, "grad_norm": 0.574712872505188, "learning_rate": 0.0002548122223717862, "loss": 4.3117, "step": 1511 }, { "epoch": 1.9344000000000001, "grad_norm": 0.5547901391983032, "learning_rate": 0.00025477184008614886, "loss": 4.3168, "step": 1512 }, { "epoch": 1.93568, "grad_norm": 0.557061493396759, "learning_rate": 0.0002547314578005115, "loss": 4.2803, "step": 1513 }, { "epoch": 1.93696, "grad_norm": 0.5565205216407776, "learning_rate": 0.0002546910755148741, "loss": 4.2377, "step": 1514 }, { "epoch": 1.93824, "grad_norm": 0.5655121803283691, "learning_rate": 0.00025465069322923675, "loss": 4.2526, "step": 1515 }, { "epoch": 1.93952, "grad_norm": 0.6136040091514587, "learning_rate": 0.0002546103109435994, "loss": 4.3138, "step": 1516 }, { "epoch": 1.9407999999999999, "grad_norm": 0.5623380541801453, "learning_rate": 0.000254569928657962, "loss": 4.1775, "step": 1517 }, { "epoch": 1.94208, "grad_norm": 0.5378047823905945, "learning_rate": 0.0002545295463723247, "loss": 4.2809, "step": 1518 }, { "epoch": 1.94336, "grad_norm": 0.6011268496513367, "learning_rate": 0.00025448916408668727, "loss": 4.2017, "step": 1519 }, { "epoch": 1.9446400000000001, "grad_norm": 0.5707088708877563, "learning_rate": 0.0002544487818010499, "loss": 4.22, "step": 1520 }, { "epoch": 1.94592, "grad_norm": 0.5897967219352722, "learning_rate": 0.00025440839951541253, "loss": 4.2578, "step": 1521 }, { "epoch": 1.9472, "grad_norm": 0.5613348484039307, "learning_rate": 0.00025436801722977516, "loss": 4.2774, "step": 1522 }, { "epoch": 1.94848, "grad_norm": 0.5693078637123108, "learning_rate": 0.00025432763494413784, "loss": 4.2862, "step": 1523 }, { "epoch": 1.94976, "grad_norm": 0.6159646511077881, "learning_rate": 0.00025428725265850047, "loss": 4.3357, "step": 1524 }, { "epoch": 1.9510399999999999, "grad_norm": 0.5490036010742188, "learning_rate": 0.0002542468703728631, "loss": 4.2645, "step": 1525 }, { "epoch": 1.95232, "grad_norm": 0.6109630465507507, "learning_rate": 0.0002542064880872257, "loss": 4.2504, "step": 1526 }, { "epoch": 1.9536, "grad_norm": 0.5659199953079224, "learning_rate": 0.00025416610580158836, "loss": 4.2713, "step": 1527 }, { "epoch": 1.95488, "grad_norm": 0.5810695886611938, "learning_rate": 0.000254125723515951, "loss": 4.2639, "step": 1528 }, { "epoch": 1.9561600000000001, "grad_norm": 0.5711521506309509, "learning_rate": 0.0002540853412303136, "loss": 4.2478, "step": 1529 }, { "epoch": 1.95744, "grad_norm": 0.571063756942749, "learning_rate": 0.00025404495894467625, "loss": 4.2389, "step": 1530 }, { "epoch": 1.95872, "grad_norm": 0.5734847784042358, "learning_rate": 0.0002540045766590389, "loss": 4.216, "step": 1531 }, { "epoch": 1.96, "grad_norm": 0.5617849230766296, "learning_rate": 0.0002539641943734015, "loss": 4.2903, "step": 1532 }, { "epoch": 1.96128, "grad_norm": 0.5797948241233826, "learning_rate": 0.00025392381208776414, "loss": 4.2572, "step": 1533 }, { "epoch": 1.9625599999999999, "grad_norm": 0.5373856425285339, "learning_rate": 0.00025388342980212677, "loss": 4.2827, "step": 1534 }, { "epoch": 1.96384, "grad_norm": 0.5615948438644409, "learning_rate": 0.0002538430475164894, "loss": 4.2297, "step": 1535 }, { "epoch": 1.96512, "grad_norm": 0.5433390140533447, "learning_rate": 0.00025380266523085203, "loss": 4.3284, "step": 1536 }, { "epoch": 1.9664000000000001, "grad_norm": 0.604446530342102, "learning_rate": 0.00025376228294521466, "loss": 4.2762, "step": 1537 }, { "epoch": 1.96768, "grad_norm": 0.5676113367080688, "learning_rate": 0.0002537219006595773, "loss": 4.174, "step": 1538 }, { "epoch": 1.96896, "grad_norm": 0.5773786902427673, "learning_rate": 0.0002536815183739399, "loss": 4.227, "step": 1539 }, { "epoch": 1.97024, "grad_norm": 0.5377183556556702, "learning_rate": 0.0002536411360883026, "loss": 4.232, "step": 1540 }, { "epoch": 1.97152, "grad_norm": 0.5951273441314697, "learning_rate": 0.00025360075380266524, "loss": 4.2514, "step": 1541 }, { "epoch": 1.9727999999999999, "grad_norm": 0.5194257497787476, "learning_rate": 0.0002535603715170278, "loss": 4.2341, "step": 1542 }, { "epoch": 1.9740799999999998, "grad_norm": 0.5489000678062439, "learning_rate": 0.00025351998923139044, "loss": 4.2023, "step": 1543 }, { "epoch": 1.97536, "grad_norm": 0.5540454983711243, "learning_rate": 0.00025347960694575313, "loss": 4.2174, "step": 1544 }, { "epoch": 1.97664, "grad_norm": 0.5399606823921204, "learning_rate": 0.00025343922466011576, "loss": 4.335, "step": 1545 }, { "epoch": 1.9779200000000001, "grad_norm": 0.5686156749725342, "learning_rate": 0.0002533988423744784, "loss": 4.2687, "step": 1546 }, { "epoch": 1.9792, "grad_norm": 0.528323769569397, "learning_rate": 0.000253358460088841, "loss": 4.3328, "step": 1547 }, { "epoch": 1.98048, "grad_norm": 0.5400658845901489, "learning_rate": 0.00025331807780320365, "loss": 4.2065, "step": 1548 }, { "epoch": 1.98176, "grad_norm": 0.5493596792221069, "learning_rate": 0.0002532776955175663, "loss": 4.2173, "step": 1549 }, { "epoch": 1.98304, "grad_norm": 0.5500946044921875, "learning_rate": 0.0002532373132319289, "loss": 4.2876, "step": 1550 }, { "epoch": 1.9843199999999999, "grad_norm": 0.5746954679489136, "learning_rate": 0.00025319693094629154, "loss": 4.3206, "step": 1551 }, { "epoch": 1.9856, "grad_norm": 0.5554184913635254, "learning_rate": 0.00025315654866065417, "loss": 4.2171, "step": 1552 }, { "epoch": 1.98688, "grad_norm": 0.5702386498451233, "learning_rate": 0.0002531161663750168, "loss": 4.2557, "step": 1553 }, { "epoch": 1.9881600000000001, "grad_norm": 0.5968025922775269, "learning_rate": 0.00025307578408937943, "loss": 4.2913, "step": 1554 }, { "epoch": 1.98944, "grad_norm": 0.5453073382377625, "learning_rate": 0.00025303540180374206, "loss": 4.2114, "step": 1555 }, { "epoch": 1.99072, "grad_norm": 0.5790675282478333, "learning_rate": 0.0002529950195181047, "loss": 4.2186, "step": 1556 }, { "epoch": 1.992, "grad_norm": 0.5652514100074768, "learning_rate": 0.00025295463723246737, "loss": 4.2595, "step": 1557 }, { "epoch": 1.99328, "grad_norm": 0.5572407245635986, "learning_rate": 0.00025291425494682995, "loss": 4.2544, "step": 1558 }, { "epoch": 1.9945599999999999, "grad_norm": 0.5312286019325256, "learning_rate": 0.0002528738726611926, "loss": 4.2369, "step": 1559 }, { "epoch": 1.9958399999999998, "grad_norm": 0.5568103194236755, "learning_rate": 0.0002528334903755552, "loss": 4.3173, "step": 1560 }, { "epoch": 1.99712, "grad_norm": 0.5829612612724304, "learning_rate": 0.00025279310808991784, "loss": 4.2247, "step": 1561 }, { "epoch": 1.9984, "grad_norm": 0.5639691352844238, "learning_rate": 0.0002527527258042805, "loss": 4.2355, "step": 1562 }, { "epoch": 1.9996800000000001, "grad_norm": 0.5624785423278809, "learning_rate": 0.00025271234351864315, "loss": 4.2893, "step": 1563 }, { "epoch": 2.0, "grad_norm": 0.9181849360466003, "learning_rate": 0.0002526719612330058, "loss": 4.2339, "step": 1564 }, { "epoch": 2.00128, "grad_norm": 0.6469334959983826, "learning_rate": 0.00025263157894736836, "loss": 4.1552, "step": 1565 }, { "epoch": 2.00256, "grad_norm": 0.607097327709198, "learning_rate": 0.00025259119666173104, "loss": 4.1764, "step": 1566 }, { "epoch": 2.00384, "grad_norm": 0.5565678477287292, "learning_rate": 0.0002525508143760937, "loss": 4.1168, "step": 1567 }, { "epoch": 2.00512, "grad_norm": 0.5955238342285156, "learning_rate": 0.0002525104320904563, "loss": 4.1852, "step": 1568 }, { "epoch": 2.0064, "grad_norm": 0.5889109969139099, "learning_rate": 0.00025247004980481893, "loss": 4.1784, "step": 1569 }, { "epoch": 2.00768, "grad_norm": 0.6503939032554626, "learning_rate": 0.00025242966751918156, "loss": 4.1564, "step": 1570 }, { "epoch": 2.00896, "grad_norm": 0.5937455296516418, "learning_rate": 0.0002523892852335442, "loss": 4.2143, "step": 1571 }, { "epoch": 2.01024, "grad_norm": 0.6147340536117554, "learning_rate": 0.0002523489029479068, "loss": 4.1123, "step": 1572 }, { "epoch": 2.01152, "grad_norm": 0.5893435478210449, "learning_rate": 0.00025230852066226945, "loss": 4.0739, "step": 1573 }, { "epoch": 2.0128, "grad_norm": 0.584883451461792, "learning_rate": 0.00025226813837663214, "loss": 4.2348, "step": 1574 }, { "epoch": 2.01408, "grad_norm": 0.5527332425117493, "learning_rate": 0.0002522277560909947, "loss": 4.205, "step": 1575 }, { "epoch": 2.01536, "grad_norm": 0.5769275426864624, "learning_rate": 0.00025218737380535734, "loss": 4.1406, "step": 1576 }, { "epoch": 2.01664, "grad_norm": 0.5587900280952454, "learning_rate": 0.00025214699151972, "loss": 4.1601, "step": 1577 }, { "epoch": 2.01792, "grad_norm": 0.5883750319480896, "learning_rate": 0.0002521066092340826, "loss": 4.1505, "step": 1578 }, { "epoch": 2.0192, "grad_norm": 0.5383011102676392, "learning_rate": 0.0002520662269484453, "loss": 4.1801, "step": 1579 }, { "epoch": 2.02048, "grad_norm": 0.5970503687858582, "learning_rate": 0.0002520258446628079, "loss": 4.1597, "step": 1580 }, { "epoch": 2.02176, "grad_norm": 0.5630191564559937, "learning_rate": 0.00025198546237717055, "loss": 4.1925, "step": 1581 }, { "epoch": 2.02304, "grad_norm": 0.5271741151809692, "learning_rate": 0.0002519450800915331, "loss": 4.2025, "step": 1582 }, { "epoch": 2.02432, "grad_norm": 0.5406798124313354, "learning_rate": 0.0002519046978058958, "loss": 4.1666, "step": 1583 }, { "epoch": 2.0256, "grad_norm": 0.5316964387893677, "learning_rate": 0.00025186431552025844, "loss": 4.1439, "step": 1584 }, { "epoch": 2.02688, "grad_norm": 0.5242227911949158, "learning_rate": 0.00025182393323462107, "loss": 4.1849, "step": 1585 }, { "epoch": 2.02816, "grad_norm": 0.516384482383728, "learning_rate": 0.0002517835509489837, "loss": 4.1493, "step": 1586 }, { "epoch": 2.02944, "grad_norm": 0.5419257879257202, "learning_rate": 0.00025174316866334633, "loss": 4.1466, "step": 1587 }, { "epoch": 2.03072, "grad_norm": 0.5190739631652832, "learning_rate": 0.00025170278637770896, "loss": 4.2076, "step": 1588 }, { "epoch": 2.032, "grad_norm": 0.5474783778190613, "learning_rate": 0.0002516624040920716, "loss": 4.1463, "step": 1589 }, { "epoch": 2.03328, "grad_norm": 0.5086979269981384, "learning_rate": 0.0002516220218064342, "loss": 4.1831, "step": 1590 }, { "epoch": 2.03456, "grad_norm": 0.5824074149131775, "learning_rate": 0.00025158163952079685, "loss": 4.1523, "step": 1591 }, { "epoch": 2.03584, "grad_norm": 0.523669958114624, "learning_rate": 0.0002515412572351595, "loss": 4.1735, "step": 1592 }, { "epoch": 2.03712, "grad_norm": 0.5100351572036743, "learning_rate": 0.0002515008749495221, "loss": 4.1265, "step": 1593 }, { "epoch": 2.0384, "grad_norm": 0.5400765538215637, "learning_rate": 0.00025146049266388474, "loss": 4.1038, "step": 1594 }, { "epoch": 2.03968, "grad_norm": 0.5364828705787659, "learning_rate": 0.00025142011037824737, "loss": 4.1539, "step": 1595 }, { "epoch": 2.04096, "grad_norm": 0.5193732976913452, "learning_rate": 0.00025137972809261005, "loss": 4.1766, "step": 1596 }, { "epoch": 2.04224, "grad_norm": 0.5233401656150818, "learning_rate": 0.0002513393458069727, "loss": 4.1535, "step": 1597 }, { "epoch": 2.04352, "grad_norm": 0.5625421404838562, "learning_rate": 0.00025129896352133526, "loss": 4.1345, "step": 1598 }, { "epoch": 2.0448, "grad_norm": 0.5566902756690979, "learning_rate": 0.0002512585812356979, "loss": 4.1438, "step": 1599 }, { "epoch": 2.04608, "grad_norm": 0.5623239874839783, "learning_rate": 0.0002512181989500606, "loss": 4.1251, "step": 1600 }, { "epoch": 2.04736, "grad_norm": 0.578046441078186, "learning_rate": 0.0002511778166644232, "loss": 4.0756, "step": 1601 }, { "epoch": 2.04864, "grad_norm": 0.5461819171905518, "learning_rate": 0.00025113743437878583, "loss": 4.1906, "step": 1602 }, { "epoch": 2.04992, "grad_norm": 0.5973131656646729, "learning_rate": 0.00025109705209314846, "loss": 4.1781, "step": 1603 }, { "epoch": 2.0512, "grad_norm": 0.5380710959434509, "learning_rate": 0.0002510566698075111, "loss": 4.1321, "step": 1604 }, { "epoch": 2.05248, "grad_norm": 0.5838267207145691, "learning_rate": 0.0002510162875218737, "loss": 4.1575, "step": 1605 }, { "epoch": 2.05376, "grad_norm": 0.5240848660469055, "learning_rate": 0.00025097590523623635, "loss": 4.0436, "step": 1606 }, { "epoch": 2.05504, "grad_norm": 0.5829635858535767, "learning_rate": 0.000250935522950599, "loss": 4.1312, "step": 1607 }, { "epoch": 2.05632, "grad_norm": 0.5419280529022217, "learning_rate": 0.0002508951406649616, "loss": 4.1518, "step": 1608 }, { "epoch": 2.0576, "grad_norm": 0.5682242512702942, "learning_rate": 0.00025085475837932424, "loss": 4.2161, "step": 1609 }, { "epoch": 2.05888, "grad_norm": 0.5263741612434387, "learning_rate": 0.0002508143760936869, "loss": 4.1819, "step": 1610 }, { "epoch": 2.06016, "grad_norm": 0.5568137168884277, "learning_rate": 0.0002507739938080495, "loss": 4.0907, "step": 1611 }, { "epoch": 2.06144, "grad_norm": 0.5448589324951172, "learning_rate": 0.00025073361152241214, "loss": 4.171, "step": 1612 }, { "epoch": 2.06272, "grad_norm": 0.5299394726753235, "learning_rate": 0.0002506932292367748, "loss": 4.1997, "step": 1613 }, { "epoch": 2.064, "grad_norm": 0.5272650122642517, "learning_rate": 0.0002506528469511374, "loss": 4.0966, "step": 1614 }, { "epoch": 2.06528, "grad_norm": 0.5520989894866943, "learning_rate": 0.0002506124646655, "loss": 4.1154, "step": 1615 }, { "epoch": 2.06656, "grad_norm": 0.5448061227798462, "learning_rate": 0.00025057208237986266, "loss": 4.096, "step": 1616 }, { "epoch": 2.06784, "grad_norm": 0.5410912036895752, "learning_rate": 0.0002505317000942253, "loss": 4.1071, "step": 1617 }, { "epoch": 2.06912, "grad_norm": 0.5440801978111267, "learning_rate": 0.00025049131780858797, "loss": 4.2481, "step": 1618 }, { "epoch": 2.0704, "grad_norm": 0.5587443709373474, "learning_rate": 0.0002504509355229506, "loss": 4.1345, "step": 1619 }, { "epoch": 2.07168, "grad_norm": 0.5543466210365295, "learning_rate": 0.00025041055323731323, "loss": 4.2023, "step": 1620 }, { "epoch": 2.07296, "grad_norm": 0.5711597800254822, "learning_rate": 0.0002503701709516758, "loss": 4.1222, "step": 1621 }, { "epoch": 2.07424, "grad_norm": 0.5779085159301758, "learning_rate": 0.0002503297886660385, "loss": 4.1628, "step": 1622 }, { "epoch": 2.07552, "grad_norm": 0.5895453691482544, "learning_rate": 0.0002502894063804011, "loss": 4.1175, "step": 1623 }, { "epoch": 2.0768, "grad_norm": 0.5698716640472412, "learning_rate": 0.00025024902409476375, "loss": 4.0683, "step": 1624 }, { "epoch": 2.07808, "grad_norm": 0.6177151799201965, "learning_rate": 0.0002502086418091264, "loss": 4.2269, "step": 1625 }, { "epoch": 2.07936, "grad_norm": 0.5702541470527649, "learning_rate": 0.000250168259523489, "loss": 4.1249, "step": 1626 }, { "epoch": 2.08064, "grad_norm": 0.5631515383720398, "learning_rate": 0.00025012787723785164, "loss": 4.0755, "step": 1627 }, { "epoch": 2.08192, "grad_norm": 0.5638665556907654, "learning_rate": 0.00025008749495221427, "loss": 4.1912, "step": 1628 }, { "epoch": 2.0832, "grad_norm": 0.5572354793548584, "learning_rate": 0.0002500471126665769, "loss": 4.1421, "step": 1629 }, { "epoch": 2.08448, "grad_norm": 0.5218389630317688, "learning_rate": 0.00025000673038093953, "loss": 4.1606, "step": 1630 }, { "epoch": 2.08576, "grad_norm": 0.543541669845581, "learning_rate": 0.00024996634809530216, "loss": 4.2001, "step": 1631 }, { "epoch": 2.08704, "grad_norm": 0.5522968173027039, "learning_rate": 0.0002499259658096648, "loss": 4.1279, "step": 1632 }, { "epoch": 2.08832, "grad_norm": 0.5614569783210754, "learning_rate": 0.0002498855835240274, "loss": 4.057, "step": 1633 }, { "epoch": 2.0896, "grad_norm": 0.5415524244308472, "learning_rate": 0.00024984520123839005, "loss": 4.2062, "step": 1634 }, { "epoch": 2.09088, "grad_norm": 0.5276180505752563, "learning_rate": 0.00024980481895275274, "loss": 4.1219, "step": 1635 }, { "epoch": 2.09216, "grad_norm": 0.5661503076553345, "learning_rate": 0.00024976443666711537, "loss": 4.0845, "step": 1636 }, { "epoch": 2.09344, "grad_norm": 0.5094005465507507, "learning_rate": 0.00024972405438147794, "loss": 4.1161, "step": 1637 }, { "epoch": 2.09472, "grad_norm": 0.5565866231918335, "learning_rate": 0.00024968367209584057, "loss": 4.1302, "step": 1638 }, { "epoch": 2.096, "grad_norm": 0.5100904107093811, "learning_rate": 0.00024964328981020326, "loss": 4.189, "step": 1639 }, { "epoch": 2.09728, "grad_norm": 0.5307742953300476, "learning_rate": 0.0002496029075245659, "loss": 4.0909, "step": 1640 }, { "epoch": 2.09856, "grad_norm": 0.5522825717926025, "learning_rate": 0.0002495625252389285, "loss": 4.086, "step": 1641 }, { "epoch": 2.09984, "grad_norm": 0.5281661152839661, "learning_rate": 0.00024952214295329115, "loss": 4.0938, "step": 1642 }, { "epoch": 2.10112, "grad_norm": 0.5300430655479431, "learning_rate": 0.0002494817606676538, "loss": 4.2113, "step": 1643 }, { "epoch": 2.1024, "grad_norm": 0.5836507081985474, "learning_rate": 0.0002494413783820164, "loss": 4.1544, "step": 1644 }, { "epoch": 2.1036799999999998, "grad_norm": 0.5055437088012695, "learning_rate": 0.00024940099609637904, "loss": 4.1888, "step": 1645 }, { "epoch": 2.10496, "grad_norm": 0.5788672566413879, "learning_rate": 0.00024936061381074167, "loss": 4.1254, "step": 1646 }, { "epoch": 2.10624, "grad_norm": 0.5161530375480652, "learning_rate": 0.0002493202315251043, "loss": 4.1339, "step": 1647 }, { "epoch": 2.10752, "grad_norm": 0.526464581489563, "learning_rate": 0.0002492798492394669, "loss": 4.0877, "step": 1648 }, { "epoch": 2.1088, "grad_norm": 0.5108904242515564, "learning_rate": 0.00024923946695382956, "loss": 4.1818, "step": 1649 }, { "epoch": 2.11008, "grad_norm": 0.5350533723831177, "learning_rate": 0.0002491990846681922, "loss": 4.1912, "step": 1650 }, { "epoch": 2.11136, "grad_norm": 0.552305281162262, "learning_rate": 0.0002491587023825548, "loss": 4.0919, "step": 1651 }, { "epoch": 2.11264, "grad_norm": 0.5402622818946838, "learning_rate": 0.0002491183200969175, "loss": 4.1347, "step": 1652 }, { "epoch": 2.11392, "grad_norm": 0.5491945147514343, "learning_rate": 0.00024907793781128013, "loss": 4.1458, "step": 1653 }, { "epoch": 2.1152, "grad_norm": 0.5369943380355835, "learning_rate": 0.0002490375555256427, "loss": 4.1845, "step": 1654 }, { "epoch": 2.11648, "grad_norm": 0.5386082530021667, "learning_rate": 0.00024899717324000534, "loss": 4.1702, "step": 1655 }, { "epoch": 2.11776, "grad_norm": 0.5493932366371155, "learning_rate": 0.00024895679095436797, "loss": 4.1743, "step": 1656 }, { "epoch": 2.11904, "grad_norm": 0.5019392967224121, "learning_rate": 0.00024891640866873065, "loss": 4.0747, "step": 1657 }, { "epoch": 2.12032, "grad_norm": 0.556978166103363, "learning_rate": 0.0002488760263830933, "loss": 4.1214, "step": 1658 }, { "epoch": 2.1216, "grad_norm": 0.5479657053947449, "learning_rate": 0.0002488356440974559, "loss": 4.1133, "step": 1659 }, { "epoch": 2.12288, "grad_norm": 0.5306862592697144, "learning_rate": 0.0002487952618118185, "loss": 4.1571, "step": 1660 }, { "epoch": 2.12416, "grad_norm": 0.5123226046562195, "learning_rate": 0.00024875487952618117, "loss": 4.1351, "step": 1661 }, { "epoch": 2.12544, "grad_norm": 0.5251739621162415, "learning_rate": 0.0002487144972405438, "loss": 4.1093, "step": 1662 }, { "epoch": 2.12672, "grad_norm": 0.5215139389038086, "learning_rate": 0.00024867411495490643, "loss": 4.1659, "step": 1663 }, { "epoch": 2.128, "grad_norm": 0.579883873462677, "learning_rate": 0.00024863373266926906, "loss": 4.153, "step": 1664 }, { "epoch": 2.12928, "grad_norm": 0.5173061490058899, "learning_rate": 0.0002485933503836317, "loss": 4.1132, "step": 1665 }, { "epoch": 2.13056, "grad_norm": 0.5922846794128418, "learning_rate": 0.0002485529680979943, "loss": 4.149, "step": 1666 }, { "epoch": 2.13184, "grad_norm": 0.5453259348869324, "learning_rate": 0.00024851258581235695, "loss": 4.1019, "step": 1667 }, { "epoch": 2.13312, "grad_norm": 0.5758420825004578, "learning_rate": 0.0002484722035267196, "loss": 4.1267, "step": 1668 }, { "epoch": 2.1344, "grad_norm": 0.5560485124588013, "learning_rate": 0.0002484318212410822, "loss": 4.1299, "step": 1669 }, { "epoch": 2.13568, "grad_norm": 0.5476799011230469, "learning_rate": 0.00024839143895544484, "loss": 4.0798, "step": 1670 }, { "epoch": 2.13696, "grad_norm": 0.5895335078239441, "learning_rate": 0.00024835105666980747, "loss": 4.1217, "step": 1671 }, { "epoch": 2.13824, "grad_norm": 0.5773458480834961, "learning_rate": 0.0002483106743841701, "loss": 4.1645, "step": 1672 }, { "epoch": 2.13952, "grad_norm": 0.5620774626731873, "learning_rate": 0.00024827029209853273, "loss": 4.1691, "step": 1673 }, { "epoch": 2.1408, "grad_norm": 0.5326398015022278, "learning_rate": 0.0002482299098128954, "loss": 4.1493, "step": 1674 }, { "epoch": 2.14208, "grad_norm": 0.5294104218482971, "learning_rate": 0.00024818952752725805, "loss": 4.0135, "step": 1675 }, { "epoch": 2.14336, "grad_norm": 0.6189349293708801, "learning_rate": 0.0002481491452416207, "loss": 4.1086, "step": 1676 }, { "epoch": 2.14464, "grad_norm": 0.5574718713760376, "learning_rate": 0.00024810876295598325, "loss": 4.0684, "step": 1677 }, { "epoch": 2.14592, "grad_norm": 0.5645029544830322, "learning_rate": 0.00024806838067034594, "loss": 4.0929, "step": 1678 }, { "epoch": 2.1471999999999998, "grad_norm": 0.5400739312171936, "learning_rate": 0.00024802799838470857, "loss": 4.0444, "step": 1679 }, { "epoch": 2.14848, "grad_norm": 0.5390148162841797, "learning_rate": 0.0002479876160990712, "loss": 4.1553, "step": 1680 }, { "epoch": 2.14976, "grad_norm": 0.520582377910614, "learning_rate": 0.00024794723381343383, "loss": 4.0592, "step": 1681 }, { "epoch": 2.15104, "grad_norm": 0.5372869968414307, "learning_rate": 0.00024790685152779646, "loss": 4.1065, "step": 1682 }, { "epoch": 2.15232, "grad_norm": 0.510082483291626, "learning_rate": 0.0002478664692421591, "loss": 4.1625, "step": 1683 }, { "epoch": 2.1536, "grad_norm": 0.5741236805915833, "learning_rate": 0.0002478260869565217, "loss": 4.0973, "step": 1684 }, { "epoch": 2.15488, "grad_norm": 0.5604896545410156, "learning_rate": 0.00024778570467088435, "loss": 4.0886, "step": 1685 }, { "epoch": 2.15616, "grad_norm": 0.5670208930969238, "learning_rate": 0.000247745322385247, "loss": 4.1643, "step": 1686 }, { "epoch": 2.15744, "grad_norm": 0.5674409866333008, "learning_rate": 0.0002477049400996096, "loss": 4.1747, "step": 1687 }, { "epoch": 2.15872, "grad_norm": 0.5394083857536316, "learning_rate": 0.00024766455781397224, "loss": 4.1235, "step": 1688 }, { "epoch": 2.16, "grad_norm": 0.5426709651947021, "learning_rate": 0.00024762417552833487, "loss": 4.1005, "step": 1689 }, { "epoch": 2.16128, "grad_norm": 0.5474469661712646, "learning_rate": 0.0002475837932426975, "loss": 4.1152, "step": 1690 }, { "epoch": 2.16256, "grad_norm": 0.6519491076469421, "learning_rate": 0.0002475434109570602, "loss": 4.0955, "step": 1691 }, { "epoch": 2.16384, "grad_norm": 0.5159724354743958, "learning_rate": 0.0002475030286714228, "loss": 4.1734, "step": 1692 }, { "epoch": 2.16512, "grad_norm": 0.5642454028129578, "learning_rate": 0.0002474626463857854, "loss": 4.0931, "step": 1693 }, { "epoch": 2.1664, "grad_norm": 0.5765841603279114, "learning_rate": 0.000247422264100148, "loss": 4.1322, "step": 1694 }, { "epoch": 2.16768, "grad_norm": 0.5622481107711792, "learning_rate": 0.0002473818818145107, "loss": 4.1613, "step": 1695 }, { "epoch": 2.16896, "grad_norm": 0.5255911350250244, "learning_rate": 0.00024734149952887333, "loss": 4.1129, "step": 1696 }, { "epoch": 2.17024, "grad_norm": 0.5299469232559204, "learning_rate": 0.00024730111724323596, "loss": 4.0843, "step": 1697 }, { "epoch": 2.17152, "grad_norm": 0.5808050632476807, "learning_rate": 0.0002472607349575986, "loss": 4.1286, "step": 1698 }, { "epoch": 2.1728, "grad_norm": 0.563703179359436, "learning_rate": 0.0002472203526719612, "loss": 4.1928, "step": 1699 }, { "epoch": 2.17408, "grad_norm": 0.5682193636894226, "learning_rate": 0.00024717997038632385, "loss": 4.1669, "step": 1700 }, { "epoch": 2.17536, "grad_norm": 0.5745776295661926, "learning_rate": 0.0002471395881006865, "loss": 4.1129, "step": 1701 }, { "epoch": 2.17664, "grad_norm": 0.5572461485862732, "learning_rate": 0.0002470992058150491, "loss": 4.1546, "step": 1702 }, { "epoch": 2.17792, "grad_norm": 0.5437954664230347, "learning_rate": 0.00024705882352941174, "loss": 4.159, "step": 1703 }, { "epoch": 2.1792, "grad_norm": 0.536284327507019, "learning_rate": 0.0002470184412437744, "loss": 4.1498, "step": 1704 }, { "epoch": 2.18048, "grad_norm": 0.5438897609710693, "learning_rate": 0.000246978058958137, "loss": 4.0687, "step": 1705 }, { "epoch": 2.18176, "grad_norm": 0.5237944722175598, "learning_rate": 0.00024693767667249963, "loss": 4.1393, "step": 1706 }, { "epoch": 2.18304, "grad_norm": 0.5314120054244995, "learning_rate": 0.00024689729438686226, "loss": 4.1041, "step": 1707 }, { "epoch": 2.18432, "grad_norm": 0.5380268096923828, "learning_rate": 0.00024685691210122495, "loss": 4.1046, "step": 1708 }, { "epoch": 2.1856, "grad_norm": 0.5044991970062256, "learning_rate": 0.0002468165298155875, "loss": 4.0293, "step": 1709 }, { "epoch": 2.18688, "grad_norm": 0.5603588223457336, "learning_rate": 0.00024677614752995015, "loss": 4.1426, "step": 1710 }, { "epoch": 2.18816, "grad_norm": 0.5293896794319153, "learning_rate": 0.0002467357652443128, "loss": 4.0825, "step": 1711 }, { "epoch": 2.18944, "grad_norm": 0.5398738980293274, "learning_rate": 0.0002466953829586754, "loss": 4.0774, "step": 1712 }, { "epoch": 2.19072, "grad_norm": 0.5111704468727112, "learning_rate": 0.0002466550006730381, "loss": 4.0992, "step": 1713 }, { "epoch": 2.192, "grad_norm": 0.5483059287071228, "learning_rate": 0.00024661461838740073, "loss": 4.0721, "step": 1714 }, { "epoch": 2.19328, "grad_norm": 0.5132558345794678, "learning_rate": 0.00024657423610176336, "loss": 4.1077, "step": 1715 }, { "epoch": 2.19456, "grad_norm": 0.5515888929367065, "learning_rate": 0.00024653385381612593, "loss": 4.0439, "step": 1716 }, { "epoch": 2.19584, "grad_norm": 0.5087373852729797, "learning_rate": 0.0002464934715304886, "loss": 4.1587, "step": 1717 }, { "epoch": 2.19712, "grad_norm": 0.5192369222640991, "learning_rate": 0.00024645308924485125, "loss": 4.0419, "step": 1718 }, { "epoch": 2.1984, "grad_norm": 0.5411146283149719, "learning_rate": 0.0002464127069592139, "loss": 4.1674, "step": 1719 }, { "epoch": 2.19968, "grad_norm": 0.5327039957046509, "learning_rate": 0.0002463723246735765, "loss": 4.1486, "step": 1720 }, { "epoch": 2.20096, "grad_norm": 0.5581017732620239, "learning_rate": 0.00024633194238793914, "loss": 4.1585, "step": 1721 }, { "epoch": 2.20224, "grad_norm": 0.5399036407470703, "learning_rate": 0.00024629156010230177, "loss": 4.1631, "step": 1722 }, { "epoch": 2.20352, "grad_norm": 0.5582420229911804, "learning_rate": 0.0002462511778166644, "loss": 4.0919, "step": 1723 }, { "epoch": 2.2048, "grad_norm": 0.5208436846733093, "learning_rate": 0.00024621079553102703, "loss": 4.1291, "step": 1724 }, { "epoch": 2.20608, "grad_norm": 0.5868030190467834, "learning_rate": 0.00024617041324538966, "loss": 4.0925, "step": 1725 }, { "epoch": 2.20736, "grad_norm": 0.5274373292922974, "learning_rate": 0.0002461300309597523, "loss": 4.1289, "step": 1726 }, { "epoch": 2.20864, "grad_norm": 0.5727094411849976, "learning_rate": 0.0002460896486741149, "loss": 4.1407, "step": 1727 }, { "epoch": 2.20992, "grad_norm": 0.5434361100196838, "learning_rate": 0.00024604926638847755, "loss": 4.1471, "step": 1728 }, { "epoch": 2.2112, "grad_norm": 0.5813825726509094, "learning_rate": 0.0002460088841028402, "loss": 4.1329, "step": 1729 }, { "epoch": 2.2124800000000002, "grad_norm": 0.5424474477767944, "learning_rate": 0.00024596850181720286, "loss": 4.0815, "step": 1730 }, { "epoch": 2.21376, "grad_norm": 0.546615481376648, "learning_rate": 0.0002459281195315655, "loss": 4.1526, "step": 1731 }, { "epoch": 2.21504, "grad_norm": 0.5413877964019775, "learning_rate": 0.00024588773724592807, "loss": 4.104, "step": 1732 }, { "epoch": 2.21632, "grad_norm": 0.5641849040985107, "learning_rate": 0.0002458473549602907, "loss": 4.0903, "step": 1733 }, { "epoch": 2.2176, "grad_norm": 0.6042072176933289, "learning_rate": 0.0002458069726746534, "loss": 4.1078, "step": 1734 }, { "epoch": 2.21888, "grad_norm": 0.5650026202201843, "learning_rate": 0.000245766590389016, "loss": 4.1688, "step": 1735 }, { "epoch": 2.22016, "grad_norm": 0.5781028866767883, "learning_rate": 0.00024572620810337864, "loss": 4.1266, "step": 1736 }, { "epoch": 2.22144, "grad_norm": 0.5524300932884216, "learning_rate": 0.0002456858258177413, "loss": 4.1045, "step": 1737 }, { "epoch": 2.22272, "grad_norm": 0.5217704772949219, "learning_rate": 0.0002456454435321039, "loss": 4.1377, "step": 1738 }, { "epoch": 2.224, "grad_norm": 0.5524937510490417, "learning_rate": 0.00024560506124646653, "loss": 4.1003, "step": 1739 }, { "epoch": 2.22528, "grad_norm": 0.5031275153160095, "learning_rate": 0.00024556467896082916, "loss": 4.1322, "step": 1740 }, { "epoch": 2.22656, "grad_norm": 0.5842592716217041, "learning_rate": 0.0002455242966751918, "loss": 4.1308, "step": 1741 }, { "epoch": 2.22784, "grad_norm": 0.5045456290245056, "learning_rate": 0.0002454839143895544, "loss": 4.1172, "step": 1742 }, { "epoch": 2.22912, "grad_norm": 0.5572609901428223, "learning_rate": 0.00024544353210391705, "loss": 4.0625, "step": 1743 }, { "epoch": 2.2304, "grad_norm": 0.5589746236801147, "learning_rate": 0.0002454031498182797, "loss": 4.0665, "step": 1744 }, { "epoch": 2.23168, "grad_norm": 0.5695862174034119, "learning_rate": 0.0002453627675326423, "loss": 4.0537, "step": 1745 }, { "epoch": 2.23296, "grad_norm": 0.5234766006469727, "learning_rate": 0.00024532238524700494, "loss": 4.1759, "step": 1746 }, { "epoch": 2.23424, "grad_norm": 0.5754182934761047, "learning_rate": 0.00024528200296136763, "loss": 4.1404, "step": 1747 }, { "epoch": 2.23552, "grad_norm": 0.5176681280136108, "learning_rate": 0.00024524162067573026, "loss": 4.1085, "step": 1748 }, { "epoch": 2.2368, "grad_norm": 0.5456019043922424, "learning_rate": 0.00024520123839009283, "loss": 4.1433, "step": 1749 }, { "epoch": 2.23808, "grad_norm": 0.5013742446899414, "learning_rate": 0.00024516085610445547, "loss": 4.1624, "step": 1750 }, { "epoch": 2.23936, "grad_norm": 0.5228570103645325, "learning_rate": 0.0002451204738188181, "loss": 4.0994, "step": 1751 }, { "epoch": 2.24064, "grad_norm": 0.5208329558372498, "learning_rate": 0.0002450800915331808, "loss": 4.1814, "step": 1752 }, { "epoch": 2.24192, "grad_norm": 0.5297362804412842, "learning_rate": 0.0002450397092475434, "loss": 4.147, "step": 1753 }, { "epoch": 2.2432, "grad_norm": 0.5192136764526367, "learning_rate": 0.00024499932696190604, "loss": 4.175, "step": 1754 }, { "epoch": 2.24448, "grad_norm": 0.5594687461853027, "learning_rate": 0.00024495894467626867, "loss": 4.1391, "step": 1755 }, { "epoch": 2.24576, "grad_norm": 0.5374172329902649, "learning_rate": 0.0002449185623906313, "loss": 4.1404, "step": 1756 }, { "epoch": 2.24704, "grad_norm": 0.5575613975524902, "learning_rate": 0.00024487818010499393, "loss": 4.0406, "step": 1757 }, { "epoch": 2.24832, "grad_norm": 0.507201075553894, "learning_rate": 0.00024483779781935656, "loss": 4.1444, "step": 1758 }, { "epoch": 2.2496, "grad_norm": 0.5673658847808838, "learning_rate": 0.0002447974155337192, "loss": 4.0135, "step": 1759 }, { "epoch": 2.25088, "grad_norm": 0.5208572149276733, "learning_rate": 0.0002447570332480818, "loss": 4.1486, "step": 1760 }, { "epoch": 2.25216, "grad_norm": 0.5534422397613525, "learning_rate": 0.00024471665096244445, "loss": 4.0913, "step": 1761 }, { "epoch": 2.25344, "grad_norm": 0.5177516937255859, "learning_rate": 0.0002446762686768071, "loss": 4.131, "step": 1762 }, { "epoch": 2.25472, "grad_norm": 0.5717141032218933, "learning_rate": 0.0002446358863911697, "loss": 4.0954, "step": 1763 }, { "epoch": 2.2560000000000002, "grad_norm": 0.5581514835357666, "learning_rate": 0.00024459550410553234, "loss": 4.1365, "step": 1764 }, { "epoch": 2.25728, "grad_norm": 0.5282269716262817, "learning_rate": 0.00024455512181989497, "loss": 4.0941, "step": 1765 }, { "epoch": 2.25856, "grad_norm": 0.5452956557273865, "learning_rate": 0.0002445147395342576, "loss": 4.1019, "step": 1766 }, { "epoch": 2.25984, "grad_norm": 0.5826310515403748, "learning_rate": 0.00024447435724862023, "loss": 4.0641, "step": 1767 }, { "epoch": 2.26112, "grad_norm": 0.5480028986930847, "learning_rate": 0.00024443397496298286, "loss": 4.096, "step": 1768 }, { "epoch": 2.2624, "grad_norm": 0.5349941849708557, "learning_rate": 0.00024439359267734554, "loss": 4.088, "step": 1769 }, { "epoch": 2.26368, "grad_norm": 0.5315197706222534, "learning_rate": 0.0002443532103917082, "loss": 4.1428, "step": 1770 }, { "epoch": 2.26496, "grad_norm": 0.5485361218452454, "learning_rate": 0.0002443128281060708, "loss": 4.0701, "step": 1771 }, { "epoch": 2.26624, "grad_norm": 0.49806830286979675, "learning_rate": 0.0002442724458204334, "loss": 4.0907, "step": 1772 }, { "epoch": 2.26752, "grad_norm": 0.5614639520645142, "learning_rate": 0.00024423206353479607, "loss": 4.1116, "step": 1773 }, { "epoch": 2.2688, "grad_norm": 0.5616679191589355, "learning_rate": 0.0002441916812491587, "loss": 4.1148, "step": 1774 }, { "epoch": 2.27008, "grad_norm": 0.5605160593986511, "learning_rate": 0.0002441512989635213, "loss": 4.0161, "step": 1775 }, { "epoch": 2.27136, "grad_norm": 0.5229089856147766, "learning_rate": 0.00024411091667788393, "loss": 4.1171, "step": 1776 }, { "epoch": 2.27264, "grad_norm": 0.5960175395011902, "learning_rate": 0.00024407053439224656, "loss": 4.1207, "step": 1777 }, { "epoch": 2.27392, "grad_norm": 0.5144199132919312, "learning_rate": 0.00024403015210660922, "loss": 4.162, "step": 1778 }, { "epoch": 2.2752, "grad_norm": 0.5757052302360535, "learning_rate": 0.00024398976982097185, "loss": 4.129, "step": 1779 }, { "epoch": 2.27648, "grad_norm": 0.5588791966438293, "learning_rate": 0.00024394938753533448, "loss": 4.0564, "step": 1780 }, { "epoch": 2.27776, "grad_norm": 0.5604344606399536, "learning_rate": 0.0002439090052496971, "loss": 4.0195, "step": 1781 }, { "epoch": 2.27904, "grad_norm": 0.5186293721199036, "learning_rate": 0.00024386862296405976, "loss": 4.0465, "step": 1782 }, { "epoch": 2.28032, "grad_norm": 0.5566515922546387, "learning_rate": 0.0002438282406784224, "loss": 4.1026, "step": 1783 }, { "epoch": 2.2816, "grad_norm": 0.5155622363090515, "learning_rate": 0.000243787858392785, "loss": 4.096, "step": 1784 }, { "epoch": 2.28288, "grad_norm": 0.5197412967681885, "learning_rate": 0.00024374747610714763, "loss": 4.0739, "step": 1785 }, { "epoch": 2.28416, "grad_norm": 0.5129163861274719, "learning_rate": 0.00024370709382151028, "loss": 4.054, "step": 1786 }, { "epoch": 2.28544, "grad_norm": 0.4941207468509674, "learning_rate": 0.0002436667115358729, "loss": 4.0845, "step": 1787 }, { "epoch": 2.28672, "grad_norm": 0.5023811459541321, "learning_rate": 0.00024362632925023554, "loss": 4.1381, "step": 1788 }, { "epoch": 2.288, "grad_norm": 0.507765531539917, "learning_rate": 0.00024358594696459817, "loss": 4.0451, "step": 1789 }, { "epoch": 2.2892799999999998, "grad_norm": 0.4993181526660919, "learning_rate": 0.0002435455646789608, "loss": 4.0173, "step": 1790 }, { "epoch": 2.29056, "grad_norm": 0.5167925953865051, "learning_rate": 0.00024350518239332346, "loss": 4.1821, "step": 1791 }, { "epoch": 2.29184, "grad_norm": 0.4934568703174591, "learning_rate": 0.00024346480010768606, "loss": 4.1292, "step": 1792 }, { "epoch": 2.29312, "grad_norm": 0.5314911007881165, "learning_rate": 0.0002434244178220487, "loss": 4.1175, "step": 1793 }, { "epoch": 2.2944, "grad_norm": 0.5263243913650513, "learning_rate": 0.00024338403553641132, "loss": 4.0788, "step": 1794 }, { "epoch": 2.29568, "grad_norm": 0.5136982202529907, "learning_rate": 0.00024334365325077398, "loss": 4.0407, "step": 1795 }, { "epoch": 2.29696, "grad_norm": 0.53715580701828, "learning_rate": 0.0002433032709651366, "loss": 4.0982, "step": 1796 }, { "epoch": 2.29824, "grad_norm": 0.537679135799408, "learning_rate": 0.00024326288867949924, "loss": 4.0186, "step": 1797 }, { "epoch": 2.2995200000000002, "grad_norm": 0.5566760301589966, "learning_rate": 0.00024322250639386187, "loss": 4.0081, "step": 1798 }, { "epoch": 2.3008, "grad_norm": 0.5105803608894348, "learning_rate": 0.00024318212410822453, "loss": 4.1103, "step": 1799 }, { "epoch": 2.30208, "grad_norm": 0.5302501916885376, "learning_rate": 0.00024314174182258713, "loss": 4.0884, "step": 1800 }, { "epoch": 2.30336, "grad_norm": 0.5275909900665283, "learning_rate": 0.00024310135953694976, "loss": 4.1443, "step": 1801 }, { "epoch": 2.30464, "grad_norm": 0.529685378074646, "learning_rate": 0.0002430609772513124, "loss": 4.0931, "step": 1802 }, { "epoch": 2.30592, "grad_norm": 0.5178261995315552, "learning_rate": 0.00024302059496567502, "loss": 4.0161, "step": 1803 }, { "epoch": 2.3072, "grad_norm": 0.5614741444587708, "learning_rate": 0.00024298021268003768, "loss": 4.0476, "step": 1804 }, { "epoch": 2.30848, "grad_norm": 0.5194922685623169, "learning_rate": 0.0002429398303944003, "loss": 4.0889, "step": 1805 }, { "epoch": 2.30976, "grad_norm": 0.5637295842170715, "learning_rate": 0.00024289944810876294, "loss": 4.1037, "step": 1806 }, { "epoch": 2.31104, "grad_norm": 0.5420023202896118, "learning_rate": 0.00024285906582312554, "loss": 4.1075, "step": 1807 }, { "epoch": 2.31232, "grad_norm": 0.5864022374153137, "learning_rate": 0.0002428186835374882, "loss": 4.0976, "step": 1808 }, { "epoch": 2.3136, "grad_norm": 0.5702569484710693, "learning_rate": 0.00024277830125185083, "loss": 4.0542, "step": 1809 }, { "epoch": 2.31488, "grad_norm": 0.5524796843528748, "learning_rate": 0.00024273791896621346, "loss": 4.0888, "step": 1810 }, { "epoch": 2.31616, "grad_norm": 0.5307409167289734, "learning_rate": 0.0002426975366805761, "loss": 4.1252, "step": 1811 }, { "epoch": 2.31744, "grad_norm": 0.543082594871521, "learning_rate": 0.00024265715439493875, "loss": 4.1532, "step": 1812 }, { "epoch": 2.31872, "grad_norm": 0.5330712199211121, "learning_rate": 0.00024261677210930138, "loss": 4.1552, "step": 1813 }, { "epoch": 2.32, "grad_norm": 0.5282273292541504, "learning_rate": 0.000242576389823664, "loss": 4.1571, "step": 1814 }, { "epoch": 2.32128, "grad_norm": 0.5447540283203125, "learning_rate": 0.0002425360075380266, "loss": 4.1582, "step": 1815 }, { "epoch": 2.32256, "grad_norm": 0.5365498065948486, "learning_rate": 0.0002424956252523893, "loss": 4.1033, "step": 1816 }, { "epoch": 2.32384, "grad_norm": 0.5478320121765137, "learning_rate": 0.0002424552429667519, "loss": 4.1227, "step": 1817 }, { "epoch": 2.32512, "grad_norm": 0.5289247035980225, "learning_rate": 0.00024241486068111453, "loss": 4.1199, "step": 1818 }, { "epoch": 2.3264, "grad_norm": 0.5543386340141296, "learning_rate": 0.00024237447839547716, "loss": 4.0492, "step": 1819 }, { "epoch": 2.32768, "grad_norm": 0.5105769634246826, "learning_rate": 0.0002423340961098398, "loss": 4.0887, "step": 1820 }, { "epoch": 2.32896, "grad_norm": 0.5353228449821472, "learning_rate": 0.00024229371382420244, "loss": 3.9982, "step": 1821 }, { "epoch": 2.33024, "grad_norm": 0.4944192171096802, "learning_rate": 0.00024225333153856507, "loss": 4.0972, "step": 1822 }, { "epoch": 2.33152, "grad_norm": 0.5060207843780518, "learning_rate": 0.00024221294925292768, "loss": 4.0681, "step": 1823 }, { "epoch": 2.3327999999999998, "grad_norm": 0.503966212272644, "learning_rate": 0.0002421725669672903, "loss": 4.095, "step": 1824 }, { "epoch": 2.33408, "grad_norm": 0.5177589654922485, "learning_rate": 0.00024213218468165296, "loss": 4.048, "step": 1825 }, { "epoch": 2.33536, "grad_norm": 0.513486921787262, "learning_rate": 0.0002420918023960156, "loss": 4.1617, "step": 1826 }, { "epoch": 2.33664, "grad_norm": 0.5109826326370239, "learning_rate": 0.00024205142011037822, "loss": 4.1553, "step": 1827 }, { "epoch": 2.33792, "grad_norm": 0.5415542125701904, "learning_rate": 0.00024201103782474085, "loss": 4.0252, "step": 1828 }, { "epoch": 2.3392, "grad_norm": 0.5250722169876099, "learning_rate": 0.0002419706555391035, "loss": 4.1262, "step": 1829 }, { "epoch": 2.34048, "grad_norm": 0.490961492061615, "learning_rate": 0.00024193027325346614, "loss": 4.1085, "step": 1830 }, { "epoch": 2.34176, "grad_norm": 0.5075300931930542, "learning_rate": 0.00024188989096782877, "loss": 4.0679, "step": 1831 }, { "epoch": 2.3430400000000002, "grad_norm": 0.5052638053894043, "learning_rate": 0.00024184950868219138, "loss": 4.0838, "step": 1832 }, { "epoch": 2.34432, "grad_norm": 0.5473994016647339, "learning_rate": 0.000241809126396554, "loss": 4.0862, "step": 1833 }, { "epoch": 2.3456, "grad_norm": 0.502493143081665, "learning_rate": 0.00024176874411091666, "loss": 4.0654, "step": 1834 }, { "epoch": 2.34688, "grad_norm": 0.5568253397941589, "learning_rate": 0.0002417283618252793, "loss": 4.072, "step": 1835 }, { "epoch": 2.34816, "grad_norm": 0.535667359828949, "learning_rate": 0.00024168797953964192, "loss": 4.1611, "step": 1836 }, { "epoch": 2.34944, "grad_norm": 0.5129615068435669, "learning_rate": 0.00024164759725400455, "loss": 4.0566, "step": 1837 }, { "epoch": 2.35072, "grad_norm": 0.5144100785255432, "learning_rate": 0.0002416072149683672, "loss": 4.2091, "step": 1838 }, { "epoch": 2.352, "grad_norm": 0.500097393989563, "learning_rate": 0.00024156683268272984, "loss": 4.1066, "step": 1839 }, { "epoch": 2.35328, "grad_norm": 0.520469069480896, "learning_rate": 0.00024152645039709244, "loss": 4.0778, "step": 1840 }, { "epoch": 2.35456, "grad_norm": 0.5127030611038208, "learning_rate": 0.00024148606811145507, "loss": 4.1135, "step": 1841 }, { "epoch": 2.35584, "grad_norm": 0.540470540523529, "learning_rate": 0.00024144568582581773, "loss": 4.0279, "step": 1842 }, { "epoch": 2.35712, "grad_norm": 0.5427557826042175, "learning_rate": 0.00024140530354018036, "loss": 4.0734, "step": 1843 }, { "epoch": 2.3584, "grad_norm": 0.5099870562553406, "learning_rate": 0.000241364921254543, "loss": 4.0705, "step": 1844 }, { "epoch": 2.35968, "grad_norm": 0.5234814882278442, "learning_rate": 0.00024132453896890562, "loss": 4.0138, "step": 1845 }, { "epoch": 2.36096, "grad_norm": 0.5056787729263306, "learning_rate": 0.00024128415668326825, "loss": 4.1004, "step": 1846 }, { "epoch": 2.36224, "grad_norm": 0.5264895558357239, "learning_rate": 0.0002412437743976309, "loss": 4.0975, "step": 1847 }, { "epoch": 2.36352, "grad_norm": 0.5280450582504272, "learning_rate": 0.0002412033921119935, "loss": 4.0805, "step": 1848 }, { "epoch": 2.3648, "grad_norm": 0.509524941444397, "learning_rate": 0.00024116300982635614, "loss": 4.0108, "step": 1849 }, { "epoch": 2.36608, "grad_norm": 0.5179848074913025, "learning_rate": 0.00024112262754071877, "loss": 4.0785, "step": 1850 }, { "epoch": 2.36736, "grad_norm": 0.5557752847671509, "learning_rate": 0.00024108224525508143, "loss": 4.1493, "step": 1851 }, { "epoch": 2.36864, "grad_norm": 0.5200245976448059, "learning_rate": 0.00024104186296944406, "loss": 4.0801, "step": 1852 }, { "epoch": 2.36992, "grad_norm": 0.5246480703353882, "learning_rate": 0.0002410014806838067, "loss": 4.0722, "step": 1853 }, { "epoch": 2.3712, "grad_norm": 0.5433456301689148, "learning_rate": 0.00024096109839816932, "loss": 4.1251, "step": 1854 }, { "epoch": 2.37248, "grad_norm": 0.5170203447341919, "learning_rate": 0.00024092071611253198, "loss": 4.0856, "step": 1855 }, { "epoch": 2.37376, "grad_norm": 0.5600740313529968, "learning_rate": 0.00024088033382689458, "loss": 4.0519, "step": 1856 }, { "epoch": 2.37504, "grad_norm": 0.549588680267334, "learning_rate": 0.0002408399515412572, "loss": 4.0641, "step": 1857 }, { "epoch": 2.3763199999999998, "grad_norm": 0.5169657468795776, "learning_rate": 0.00024079956925561984, "loss": 4.0794, "step": 1858 }, { "epoch": 2.3776, "grad_norm": 0.5614109039306641, "learning_rate": 0.00024075918696998247, "loss": 4.1284, "step": 1859 }, { "epoch": 2.37888, "grad_norm": 0.5183959603309631, "learning_rate": 0.00024071880468434513, "loss": 4.133, "step": 1860 }, { "epoch": 2.38016, "grad_norm": 0.584951639175415, "learning_rate": 0.00024067842239870776, "loss": 4.0667, "step": 1861 }, { "epoch": 2.38144, "grad_norm": 0.5184288620948792, "learning_rate": 0.00024063804011307039, "loss": 4.0535, "step": 1862 }, { "epoch": 2.38272, "grad_norm": 0.558650016784668, "learning_rate": 0.000240597657827433, "loss": 4.0802, "step": 1863 }, { "epoch": 2.384, "grad_norm": 0.5584487318992615, "learning_rate": 0.00024055727554179565, "loss": 4.1414, "step": 1864 }, { "epoch": 2.38528, "grad_norm": 0.5305871963500977, "learning_rate": 0.00024051689325615828, "loss": 4.0945, "step": 1865 }, { "epoch": 2.3865600000000002, "grad_norm": 0.5361427664756775, "learning_rate": 0.0002404765109705209, "loss": 4.0935, "step": 1866 }, { "epoch": 2.38784, "grad_norm": 0.49007391929626465, "learning_rate": 0.00024043612868488354, "loss": 4.1032, "step": 1867 }, { "epoch": 2.38912, "grad_norm": 0.5409270524978638, "learning_rate": 0.0002403957463992462, "loss": 4.0758, "step": 1868 }, { "epoch": 2.3904, "grad_norm": 0.5215279459953308, "learning_rate": 0.00024035536411360882, "loss": 4.0599, "step": 1869 }, { "epoch": 2.39168, "grad_norm": 0.5199880599975586, "learning_rate": 0.00024031498182797145, "loss": 4.0554, "step": 1870 }, { "epoch": 2.39296, "grad_norm": 0.5253363251686096, "learning_rate": 0.00024027459954233406, "loss": 4.1072, "step": 1871 }, { "epoch": 2.39424, "grad_norm": 0.5469797253608704, "learning_rate": 0.0002402342172566967, "loss": 4.0496, "step": 1872 }, { "epoch": 2.39552, "grad_norm": 0.5029854774475098, "learning_rate": 0.00024019383497105934, "loss": 4.0267, "step": 1873 }, { "epoch": 2.3968, "grad_norm": 0.5808607339859009, "learning_rate": 0.00024015345268542197, "loss": 4.0903, "step": 1874 }, { "epoch": 2.39808, "grad_norm": 0.4929969906806946, "learning_rate": 0.0002401130703997846, "loss": 4.0763, "step": 1875 }, { "epoch": 2.39936, "grad_norm": 0.5704619288444519, "learning_rate": 0.00024007268811414723, "loss": 4.0923, "step": 1876 }, { "epoch": 2.40064, "grad_norm": 0.5273481607437134, "learning_rate": 0.0002400323058285099, "loss": 4.0345, "step": 1877 }, { "epoch": 2.40192, "grad_norm": 0.5336358547210693, "learning_rate": 0.00023999192354287252, "loss": 4.1127, "step": 1878 }, { "epoch": 2.4032, "grad_norm": 0.5384494662284851, "learning_rate": 0.00023995154125723512, "loss": 4.131, "step": 1879 }, { "epoch": 2.40448, "grad_norm": 0.5182468891143799, "learning_rate": 0.00023991115897159775, "loss": 4.088, "step": 1880 }, { "epoch": 2.40576, "grad_norm": 0.54859459400177, "learning_rate": 0.0002398707766859604, "loss": 4.0991, "step": 1881 }, { "epoch": 2.40704, "grad_norm": 0.5565007925033569, "learning_rate": 0.00023983039440032304, "loss": 4.0784, "step": 1882 }, { "epoch": 2.40832, "grad_norm": 0.5592086315155029, "learning_rate": 0.00023979001211468567, "loss": 4.126, "step": 1883 }, { "epoch": 2.4096, "grad_norm": 0.5493931174278259, "learning_rate": 0.0002397496298290483, "loss": 4.1278, "step": 1884 }, { "epoch": 2.41088, "grad_norm": 0.5680795311927795, "learning_rate": 0.00023970924754341093, "loss": 4.1366, "step": 1885 }, { "epoch": 2.41216, "grad_norm": 0.5174584984779358, "learning_rate": 0.0002396688652577736, "loss": 4.0928, "step": 1886 }, { "epoch": 2.41344, "grad_norm": 0.5714235305786133, "learning_rate": 0.0002396284829721362, "loss": 4.1146, "step": 1887 }, { "epoch": 2.41472, "grad_norm": 0.5287262201309204, "learning_rate": 0.00023958810068649882, "loss": 4.1175, "step": 1888 }, { "epoch": 2.416, "grad_norm": 0.5177103877067566, "learning_rate": 0.00023954771840086145, "loss": 4.0145, "step": 1889 }, { "epoch": 2.41728, "grad_norm": 0.5252452492713928, "learning_rate": 0.0002395073361152241, "loss": 4.0529, "step": 1890 }, { "epoch": 2.41856, "grad_norm": 0.5262645483016968, "learning_rate": 0.00023946695382958674, "loss": 4.0403, "step": 1891 }, { "epoch": 2.4198399999999998, "grad_norm": 0.5197089910507202, "learning_rate": 0.00023942657154394937, "loss": 4.0744, "step": 1892 }, { "epoch": 2.42112, "grad_norm": 0.512195885181427, "learning_rate": 0.000239386189258312, "loss": 4.0726, "step": 1893 }, { "epoch": 2.4224, "grad_norm": 0.5221033692359924, "learning_rate": 0.00023934580697267466, "loss": 4.1091, "step": 1894 }, { "epoch": 2.42368, "grad_norm": 0.5204744338989258, "learning_rate": 0.00023930542468703726, "loss": 4.0276, "step": 1895 }, { "epoch": 2.42496, "grad_norm": 0.5527137517929077, "learning_rate": 0.0002392650424013999, "loss": 4.0472, "step": 1896 }, { "epoch": 2.42624, "grad_norm": 0.5223075151443481, "learning_rate": 0.00023922466011576252, "loss": 4.0678, "step": 1897 }, { "epoch": 2.42752, "grad_norm": 0.5136579871177673, "learning_rate": 0.00023918427783012515, "loss": 3.968, "step": 1898 }, { "epoch": 2.4288, "grad_norm": 0.5277352929115295, "learning_rate": 0.0002391438955444878, "loss": 4.0798, "step": 1899 }, { "epoch": 2.4300800000000002, "grad_norm": 0.524479329586029, "learning_rate": 0.00023910351325885044, "loss": 4.0096, "step": 1900 }, { "epoch": 2.43136, "grad_norm": 0.5202826261520386, "learning_rate": 0.00023906313097321307, "loss": 4.0303, "step": 1901 }, { "epoch": 2.43264, "grad_norm": 0.5311558842658997, "learning_rate": 0.00023902274868757567, "loss": 4.1015, "step": 1902 }, { "epoch": 2.43392, "grad_norm": 0.5515623092651367, "learning_rate": 0.00023898236640193835, "loss": 4.0423, "step": 1903 }, { "epoch": 2.4352, "grad_norm": 0.55174320936203, "learning_rate": 0.00023894198411630096, "loss": 4.0474, "step": 1904 }, { "epoch": 2.43648, "grad_norm": 0.5991604924201965, "learning_rate": 0.0002389016018306636, "loss": 4.1282, "step": 1905 }, { "epoch": 2.43776, "grad_norm": 0.5626751780509949, "learning_rate": 0.00023886121954502622, "loss": 4.0855, "step": 1906 }, { "epoch": 2.43904, "grad_norm": 0.5392490029335022, "learning_rate": 0.00023882083725938887, "loss": 4.1173, "step": 1907 }, { "epoch": 2.44032, "grad_norm": 0.5552417635917664, "learning_rate": 0.0002387804549737515, "loss": 4.123, "step": 1908 }, { "epoch": 2.4416, "grad_norm": 0.5797650218009949, "learning_rate": 0.00023874007268811413, "loss": 4.0547, "step": 1909 }, { "epoch": 2.44288, "grad_norm": 0.5046030282974243, "learning_rate": 0.00023869969040247674, "loss": 4.1031, "step": 1910 }, { "epoch": 2.44416, "grad_norm": 0.6118117570877075, "learning_rate": 0.00023865930811683937, "loss": 4.0797, "step": 1911 }, { "epoch": 2.44544, "grad_norm": 0.5165206789970398, "learning_rate": 0.00023861892583120203, "loss": 4.0243, "step": 1912 }, { "epoch": 2.44672, "grad_norm": 0.5483618378639221, "learning_rate": 0.00023857854354556466, "loss": 4.0234, "step": 1913 }, { "epoch": 2.448, "grad_norm": 0.5128400325775146, "learning_rate": 0.00023853816125992729, "loss": 3.9816, "step": 1914 }, { "epoch": 2.44928, "grad_norm": 0.5472554564476013, "learning_rate": 0.00023849777897428992, "loss": 4.0127, "step": 1915 }, { "epoch": 2.45056, "grad_norm": 0.5363740921020508, "learning_rate": 0.00023845739668865257, "loss": 4.1075, "step": 1916 }, { "epoch": 2.45184, "grad_norm": 0.5734500288963318, "learning_rate": 0.0002384170144030152, "loss": 4.0291, "step": 1917 }, { "epoch": 2.45312, "grad_norm": 0.5291422009468079, "learning_rate": 0.00023837663211737783, "loss": 4.119, "step": 1918 }, { "epoch": 2.4544, "grad_norm": 0.541768491268158, "learning_rate": 0.00023833624983174044, "loss": 4.0881, "step": 1919 }, { "epoch": 2.45568, "grad_norm": 0.5272345542907715, "learning_rate": 0.0002382958675461031, "loss": 4.0666, "step": 1920 }, { "epoch": 2.45696, "grad_norm": 0.5372911095619202, "learning_rate": 0.00023825548526046572, "loss": 4.0557, "step": 1921 }, { "epoch": 2.45824, "grad_norm": 0.5213048458099365, "learning_rate": 0.00023821510297482835, "loss": 4.0735, "step": 1922 }, { "epoch": 2.45952, "grad_norm": 0.5202589631080627, "learning_rate": 0.00023817472068919098, "loss": 4.0209, "step": 1923 }, { "epoch": 2.4608, "grad_norm": 0.4860571026802063, "learning_rate": 0.0002381343384035536, "loss": 4.1663, "step": 1924 }, { "epoch": 2.46208, "grad_norm": 0.5281109809875488, "learning_rate": 0.00023809395611791627, "loss": 4.0593, "step": 1925 }, { "epoch": 2.4633599999999998, "grad_norm": 0.508752167224884, "learning_rate": 0.0002380535738322789, "loss": 4.0568, "step": 1926 }, { "epoch": 2.46464, "grad_norm": 0.5302470326423645, "learning_rate": 0.0002380131915466415, "loss": 4.0717, "step": 1927 }, { "epoch": 2.46592, "grad_norm": 0.5096002817153931, "learning_rate": 0.00023797280926100413, "loss": 4.0652, "step": 1928 }, { "epoch": 2.4672, "grad_norm": 0.5053885579109192, "learning_rate": 0.0002379324269753668, "loss": 4.1062, "step": 1929 }, { "epoch": 2.46848, "grad_norm": 0.5277340412139893, "learning_rate": 0.00023789204468972942, "loss": 4.1013, "step": 1930 }, { "epoch": 2.46976, "grad_norm": 0.5312148928642273, "learning_rate": 0.00023785166240409205, "loss": 4.0802, "step": 1931 }, { "epoch": 2.47104, "grad_norm": 0.5116835832595825, "learning_rate": 0.00023781128011845468, "loss": 3.9968, "step": 1932 }, { "epoch": 2.47232, "grad_norm": 0.5169728398323059, "learning_rate": 0.00023777089783281734, "loss": 4.0566, "step": 1933 }, { "epoch": 2.4736000000000002, "grad_norm": 0.5306958556175232, "learning_rate": 0.00023773051554717997, "loss": 4.0178, "step": 1934 }, { "epoch": 2.47488, "grad_norm": 0.5176283717155457, "learning_rate": 0.00023769013326154257, "loss": 4.0897, "step": 1935 }, { "epoch": 2.47616, "grad_norm": 0.5138524174690247, "learning_rate": 0.0002376497509759052, "loss": 4.0544, "step": 1936 }, { "epoch": 2.47744, "grad_norm": 0.5273920297622681, "learning_rate": 0.00023760936869026783, "loss": 4.0539, "step": 1937 }, { "epoch": 2.47872, "grad_norm": 0.5098421573638916, "learning_rate": 0.0002375689864046305, "loss": 4.0386, "step": 1938 }, { "epoch": 2.48, "grad_norm": 0.5550551414489746, "learning_rate": 0.00023752860411899312, "loss": 4.0598, "step": 1939 }, { "epoch": 2.48128, "grad_norm": 0.5031646490097046, "learning_rate": 0.00023748822183335575, "loss": 4.1084, "step": 1940 }, { "epoch": 2.48256, "grad_norm": 0.5071708559989929, "learning_rate": 0.00023744783954771838, "loss": 4.07, "step": 1941 }, { "epoch": 2.48384, "grad_norm": 0.5383012890815735, "learning_rate": 0.00023740745726208104, "loss": 4.0044, "step": 1942 }, { "epoch": 2.48512, "grad_norm": 0.5164763927459717, "learning_rate": 0.00023736707497644364, "loss": 4.1217, "step": 1943 }, { "epoch": 2.4864, "grad_norm": 0.531086266040802, "learning_rate": 0.00023732669269080627, "loss": 4.055, "step": 1944 }, { "epoch": 2.48768, "grad_norm": 0.5626212358474731, "learning_rate": 0.0002372863104051689, "loss": 3.9839, "step": 1945 }, { "epoch": 2.48896, "grad_norm": 0.48967134952545166, "learning_rate": 0.00023724592811953156, "loss": 4.0099, "step": 1946 }, { "epoch": 2.49024, "grad_norm": 0.4974314272403717, "learning_rate": 0.00023720554583389419, "loss": 4.0544, "step": 1947 }, { "epoch": 2.49152, "grad_norm": 0.5294041037559509, "learning_rate": 0.00023716516354825682, "loss": 4.0783, "step": 1948 }, { "epoch": 2.4928, "grad_norm": 0.5068480372428894, "learning_rate": 0.00023712478126261945, "loss": 4.0757, "step": 1949 }, { "epoch": 2.49408, "grad_norm": 0.4977387487888336, "learning_rate": 0.0002370843989769821, "loss": 4.1196, "step": 1950 }, { "epoch": 2.49536, "grad_norm": 0.4864615797996521, "learning_rate": 0.0002370440166913447, "loss": 4.0737, "step": 1951 }, { "epoch": 2.49664, "grad_norm": 0.5214549899101257, "learning_rate": 0.00023700363440570734, "loss": 4.0846, "step": 1952 }, { "epoch": 2.49792, "grad_norm": 0.538516640663147, "learning_rate": 0.00023696325212006997, "loss": 4.0236, "step": 1953 }, { "epoch": 2.4992, "grad_norm": 0.5184758901596069, "learning_rate": 0.0002369228698344326, "loss": 4.0424, "step": 1954 }, { "epoch": 2.50048, "grad_norm": 0.512209415435791, "learning_rate": 0.00023688248754879525, "loss": 4.0921, "step": 1955 }, { "epoch": 2.50176, "grad_norm": 0.5255154371261597, "learning_rate": 0.00023684210526315788, "loss": 4.1313, "step": 1956 }, { "epoch": 2.50304, "grad_norm": 0.5283589959144592, "learning_rate": 0.00023680172297752051, "loss": 4.0266, "step": 1957 }, { "epoch": 2.50432, "grad_norm": 0.5340486764907837, "learning_rate": 0.00023676134069188312, "loss": 4.1258, "step": 1958 }, { "epoch": 2.5056000000000003, "grad_norm": 0.5446499586105347, "learning_rate": 0.00023672095840624577, "loss": 4.0665, "step": 1959 }, { "epoch": 2.5068799999999998, "grad_norm": 0.4942466914653778, "learning_rate": 0.0002366805761206084, "loss": 4.0474, "step": 1960 }, { "epoch": 2.50816, "grad_norm": 0.5173544883728027, "learning_rate": 0.00023664019383497103, "loss": 3.9846, "step": 1961 }, { "epoch": 2.50944, "grad_norm": 0.5310766100883484, "learning_rate": 0.00023659981154933366, "loss": 4.0044, "step": 1962 }, { "epoch": 2.51072, "grad_norm": 0.5134363770484924, "learning_rate": 0.00023655942926369632, "loss": 4.0734, "step": 1963 }, { "epoch": 2.512, "grad_norm": 0.5086606740951538, "learning_rate": 0.00023651904697805895, "loss": 4.0758, "step": 1964 }, { "epoch": 2.51328, "grad_norm": 0.5497402548789978, "learning_rate": 0.00023647866469242158, "loss": 4.075, "step": 1965 }, { "epoch": 2.51456, "grad_norm": 0.5068183541297913, "learning_rate": 0.00023643828240678418, "loss": 4.0666, "step": 1966 }, { "epoch": 2.51584, "grad_norm": 0.5550326704978943, "learning_rate": 0.00023639790012114681, "loss": 4.0539, "step": 1967 }, { "epoch": 2.5171200000000002, "grad_norm": 0.49992436170578003, "learning_rate": 0.00023635751783550947, "loss": 4.0962, "step": 1968 }, { "epoch": 2.5183999999999997, "grad_norm": 0.5428693294525146, "learning_rate": 0.0002363171355498721, "loss": 3.9748, "step": 1969 }, { "epoch": 2.51968, "grad_norm": 0.5194831490516663, "learning_rate": 0.00023627675326423473, "loss": 4.1121, "step": 1970 }, { "epoch": 2.52096, "grad_norm": 0.5650715827941895, "learning_rate": 0.00023623637097859736, "loss": 4.0938, "step": 1971 }, { "epoch": 2.52224, "grad_norm": 0.5177878141403198, "learning_rate": 0.00023619598869296002, "loss": 4.0946, "step": 1972 }, { "epoch": 2.52352, "grad_norm": 0.5797203183174133, "learning_rate": 0.00023615560640732265, "loss": 4.0559, "step": 1973 }, { "epoch": 2.5248, "grad_norm": 0.5099804997444153, "learning_rate": 0.00023611522412168525, "loss": 4.0196, "step": 1974 }, { "epoch": 2.52608, "grad_norm": 0.5663504004478455, "learning_rate": 0.00023607484183604788, "loss": 4.0268, "step": 1975 }, { "epoch": 2.52736, "grad_norm": 0.4963954985141754, "learning_rate": 0.00023603445955041054, "loss": 4.0723, "step": 1976 }, { "epoch": 2.52864, "grad_norm": 0.5366328954696655, "learning_rate": 0.00023599407726477317, "loss": 4.0549, "step": 1977 }, { "epoch": 2.5299199999999997, "grad_norm": 0.515116274356842, "learning_rate": 0.0002359536949791358, "loss": 4.0863, "step": 1978 }, { "epoch": 2.5312, "grad_norm": 0.5633923411369324, "learning_rate": 0.00023591331269349843, "loss": 4.0656, "step": 1979 }, { "epoch": 2.53248, "grad_norm": 0.515352725982666, "learning_rate": 0.00023587293040786106, "loss": 4.0995, "step": 1980 }, { "epoch": 2.53376, "grad_norm": 0.5454360842704773, "learning_rate": 0.00023583254812222372, "loss": 4.0187, "step": 1981 }, { "epoch": 2.53504, "grad_norm": 0.5400582551956177, "learning_rate": 0.00023579216583658632, "loss": 4.0149, "step": 1982 }, { "epoch": 2.53632, "grad_norm": 0.5306532382965088, "learning_rate": 0.00023575178355094895, "loss": 4.0082, "step": 1983 }, { "epoch": 2.5376, "grad_norm": 0.535999059677124, "learning_rate": 0.00023571140126531158, "loss": 4.0181, "step": 1984 }, { "epoch": 2.53888, "grad_norm": 0.5338032245635986, "learning_rate": 0.00023567101897967424, "loss": 3.9664, "step": 1985 }, { "epoch": 2.54016, "grad_norm": 0.5602062344551086, "learning_rate": 0.00023563063669403687, "loss": 4.0308, "step": 1986 }, { "epoch": 2.54144, "grad_norm": 0.5280570387840271, "learning_rate": 0.0002355902544083995, "loss": 4.0465, "step": 1987 }, { "epoch": 2.54272, "grad_norm": 0.5154047608375549, "learning_rate": 0.00023554987212276213, "loss": 4.0587, "step": 1988 }, { "epoch": 2.544, "grad_norm": 0.551994800567627, "learning_rate": 0.00023550948983712478, "loss": 4.0683, "step": 1989 }, { "epoch": 2.54528, "grad_norm": 0.5081275105476379, "learning_rate": 0.00023546910755148741, "loss": 4.0972, "step": 1990 }, { "epoch": 2.54656, "grad_norm": 0.5598514676094055, "learning_rate": 0.00023542872526585002, "loss": 4.0323, "step": 1991 }, { "epoch": 2.54784, "grad_norm": 0.48178520798683167, "learning_rate": 0.00023538834298021265, "loss": 3.9906, "step": 1992 }, { "epoch": 2.5491200000000003, "grad_norm": 0.558735191822052, "learning_rate": 0.00023534796069457528, "loss": 4.0245, "step": 1993 }, { "epoch": 2.5504, "grad_norm": 0.540560781955719, "learning_rate": 0.00023530757840893794, "loss": 4.1077, "step": 1994 }, { "epoch": 2.55168, "grad_norm": 0.5557510852813721, "learning_rate": 0.00023526719612330057, "loss": 4.0786, "step": 1995 }, { "epoch": 2.55296, "grad_norm": 0.5289320945739746, "learning_rate": 0.0002352268138376632, "loss": 4.0445, "step": 1996 }, { "epoch": 2.55424, "grad_norm": 0.5117323398590088, "learning_rate": 0.0002351864315520258, "loss": 4.0146, "step": 1997 }, { "epoch": 2.55552, "grad_norm": 0.525393009185791, "learning_rate": 0.00023514604926638848, "loss": 4.1132, "step": 1998 }, { "epoch": 2.5568, "grad_norm": 0.5251143574714661, "learning_rate": 0.00023510566698075109, "loss": 4.0507, "step": 1999 }, { "epoch": 2.55808, "grad_norm": 0.5183683633804321, "learning_rate": 0.00023506528469511372, "loss": 4.0713, "step": 2000 }, { "epoch": 2.55936, "grad_norm": 0.5445199012756348, "learning_rate": 0.00023502490240947635, "loss": 4.0467, "step": 2001 }, { "epoch": 2.5606400000000002, "grad_norm": 0.5163613557815552, "learning_rate": 0.000234984520123839, "loss": 4.0313, "step": 2002 }, { "epoch": 2.5619199999999998, "grad_norm": 0.5514837503433228, "learning_rate": 0.00023494413783820163, "loss": 4.03, "step": 2003 }, { "epoch": 2.5632, "grad_norm": 0.527066707611084, "learning_rate": 0.00023490375555256426, "loss": 4.097, "step": 2004 }, { "epoch": 2.56448, "grad_norm": 0.5349636673927307, "learning_rate": 0.0002348633732669269, "loss": 4.019, "step": 2005 }, { "epoch": 2.56576, "grad_norm": 0.5117310285568237, "learning_rate": 0.0002348229909812895, "loss": 3.9889, "step": 2006 }, { "epoch": 2.56704, "grad_norm": 0.5110161900520325, "learning_rate": 0.00023478260869565215, "loss": 4.0419, "step": 2007 }, { "epoch": 2.56832, "grad_norm": 0.5449533462524414, "learning_rate": 0.00023474222641001478, "loss": 4.1393, "step": 2008 }, { "epoch": 2.5696, "grad_norm": 0.5280770063400269, "learning_rate": 0.0002347018441243774, "loss": 4.0109, "step": 2009 }, { "epoch": 2.57088, "grad_norm": 0.5067214369773865, "learning_rate": 0.00023466146183874004, "loss": 4.0415, "step": 2010 }, { "epoch": 2.5721600000000002, "grad_norm": 0.5153365135192871, "learning_rate": 0.0002346210795531027, "loss": 4.0349, "step": 2011 }, { "epoch": 2.5734399999999997, "grad_norm": 0.509453296661377, "learning_rate": 0.00023458069726746533, "loss": 4.0768, "step": 2012 }, { "epoch": 2.57472, "grad_norm": 0.5531152486801147, "learning_rate": 0.00023454031498182796, "loss": 4.0643, "step": 2013 }, { "epoch": 2.576, "grad_norm": 0.5219248533248901, "learning_rate": 0.00023449993269619056, "loss": 4.0365, "step": 2014 }, { "epoch": 2.57728, "grad_norm": 0.5232629776000977, "learning_rate": 0.00023445955041055322, "loss": 4.0989, "step": 2015 }, { "epoch": 2.57856, "grad_norm": 0.5008588433265686, "learning_rate": 0.00023441916812491585, "loss": 3.9987, "step": 2016 }, { "epoch": 2.57984, "grad_norm": 0.5122629404067993, "learning_rate": 0.00023437878583927848, "loss": 4.0479, "step": 2017 }, { "epoch": 2.58112, "grad_norm": 0.48956623673439026, "learning_rate": 0.0002343384035536411, "loss": 4.1008, "step": 2018 }, { "epoch": 2.5824, "grad_norm": 0.5428857207298279, "learning_rate": 0.00023429802126800374, "loss": 4.0692, "step": 2019 }, { "epoch": 2.58368, "grad_norm": 0.5066841840744019, "learning_rate": 0.0002342576389823664, "loss": 4.0896, "step": 2020 }, { "epoch": 2.58496, "grad_norm": 0.5314661860466003, "learning_rate": 0.00023421725669672903, "loss": 3.9449, "step": 2021 }, { "epoch": 2.58624, "grad_norm": 0.5035257935523987, "learning_rate": 0.00023417687441109163, "loss": 4.0175, "step": 2022 }, { "epoch": 2.58752, "grad_norm": 0.4997243583202362, "learning_rate": 0.00023413649212545426, "loss": 4.1032, "step": 2023 }, { "epoch": 2.5888, "grad_norm": 0.4923264980316162, "learning_rate": 0.00023409610983981692, "loss": 4.0363, "step": 2024 }, { "epoch": 2.59008, "grad_norm": 0.5122047662734985, "learning_rate": 0.00023405572755417955, "loss": 4.0297, "step": 2025 }, { "epoch": 2.59136, "grad_norm": 0.5043531060218811, "learning_rate": 0.00023401534526854218, "loss": 4.0732, "step": 2026 }, { "epoch": 2.59264, "grad_norm": 0.5018818974494934, "learning_rate": 0.0002339749629829048, "loss": 4.0188, "step": 2027 }, { "epoch": 2.59392, "grad_norm": 0.4968951940536499, "learning_rate": 0.00023393458069726747, "loss": 4.0574, "step": 2028 }, { "epoch": 2.5952, "grad_norm": 0.5588295459747314, "learning_rate": 0.0002338941984116301, "loss": 4.0049, "step": 2029 }, { "epoch": 2.59648, "grad_norm": 0.5243644714355469, "learning_rate": 0.0002338538161259927, "loss": 3.8995, "step": 2030 }, { "epoch": 2.59776, "grad_norm": 0.5623689293861389, "learning_rate": 0.00023381343384035533, "loss": 4.0505, "step": 2031 }, { "epoch": 2.59904, "grad_norm": 0.5318677425384521, "learning_rate": 0.00023377305155471796, "loss": 3.9892, "step": 2032 }, { "epoch": 2.60032, "grad_norm": 0.5533915758132935, "learning_rate": 0.00023373266926908062, "loss": 4.0111, "step": 2033 }, { "epoch": 2.6016, "grad_norm": 0.5214724540710449, "learning_rate": 0.00023369228698344325, "loss": 4.0176, "step": 2034 }, { "epoch": 2.60288, "grad_norm": 0.550571620464325, "learning_rate": 0.00023365190469780588, "loss": 4.0881, "step": 2035 }, { "epoch": 2.6041600000000003, "grad_norm": 0.5152320265769958, "learning_rate": 0.0002336115224121685, "loss": 4.043, "step": 2036 }, { "epoch": 2.6054399999999998, "grad_norm": 0.5005114078521729, "learning_rate": 0.00023357114012653116, "loss": 3.9851, "step": 2037 }, { "epoch": 2.60672, "grad_norm": 0.5171644687652588, "learning_rate": 0.00023353075784089377, "loss": 3.9649, "step": 2038 }, { "epoch": 2.608, "grad_norm": 0.5362170338630676, "learning_rate": 0.0002334903755552564, "loss": 4.0048, "step": 2039 }, { "epoch": 2.60928, "grad_norm": 0.5056260824203491, "learning_rate": 0.00023344999326961903, "loss": 4.056, "step": 2040 }, { "epoch": 2.61056, "grad_norm": 0.5133628249168396, "learning_rate": 0.00023340961098398168, "loss": 4.0981, "step": 2041 }, { "epoch": 2.61184, "grad_norm": 0.5374043583869934, "learning_rate": 0.00023336922869834431, "loss": 3.9999, "step": 2042 }, { "epoch": 2.61312, "grad_norm": 0.4886053800582886, "learning_rate": 0.00023332884641270694, "loss": 4.1064, "step": 2043 }, { "epoch": 2.6144, "grad_norm": 0.5053584575653076, "learning_rate": 0.00023328846412706957, "loss": 3.9454, "step": 2044 }, { "epoch": 2.6156800000000002, "grad_norm": 0.4969666004180908, "learning_rate": 0.00023324808184143218, "loss": 4.146, "step": 2045 }, { "epoch": 2.6169599999999997, "grad_norm": 0.5076320767402649, "learning_rate": 0.00023320769955579483, "loss": 4.0381, "step": 2046 }, { "epoch": 2.61824, "grad_norm": 0.4875280261039734, "learning_rate": 0.00023316731727015746, "loss": 4.0423, "step": 2047 }, { "epoch": 2.61952, "grad_norm": 0.5112111568450928, "learning_rate": 0.0002331269349845201, "loss": 4.0637, "step": 2048 }, { "epoch": 2.6208, "grad_norm": 0.5074511170387268, "learning_rate": 0.00023308655269888272, "loss": 3.9859, "step": 2049 }, { "epoch": 2.62208, "grad_norm": 0.5409026145935059, "learning_rate": 0.00023304617041324538, "loss": 4.1341, "step": 2050 }, { "epoch": 2.62336, "grad_norm": 0.5368846654891968, "learning_rate": 0.000233005788127608, "loss": 3.9908, "step": 2051 }, { "epoch": 2.62464, "grad_norm": 0.5240885615348816, "learning_rate": 0.00023296540584197064, "loss": 4.0777, "step": 2052 }, { "epoch": 2.62592, "grad_norm": 0.5103102326393127, "learning_rate": 0.00023292502355633325, "loss": 4.0188, "step": 2053 }, { "epoch": 2.6272, "grad_norm": 0.5250452756881714, "learning_rate": 0.0002328846412706959, "loss": 4.0489, "step": 2054 }, { "epoch": 2.62848, "grad_norm": 0.5160833597183228, "learning_rate": 0.00023284425898505853, "loss": 4.0773, "step": 2055 }, { "epoch": 2.62976, "grad_norm": 0.5143999457359314, "learning_rate": 0.00023280387669942116, "loss": 4.0191, "step": 2056 }, { "epoch": 2.63104, "grad_norm": 0.5295541286468506, "learning_rate": 0.0002327634944137838, "loss": 4.0769, "step": 2057 }, { "epoch": 2.63232, "grad_norm": 0.5218490958213806, "learning_rate": 0.00023272311212814642, "loss": 3.9913, "step": 2058 }, { "epoch": 2.6336, "grad_norm": 0.488608717918396, "learning_rate": 0.00023268272984250908, "loss": 4.0126, "step": 2059 }, { "epoch": 2.63488, "grad_norm": 0.5450034141540527, "learning_rate": 0.0002326423475568717, "loss": 4.0593, "step": 2060 }, { "epoch": 2.63616, "grad_norm": 0.4820656478404999, "learning_rate": 0.0002326019652712343, "loss": 4.0532, "step": 2061 }, { "epoch": 2.63744, "grad_norm": 0.5224117636680603, "learning_rate": 0.00023256158298559694, "loss": 4.0053, "step": 2062 }, { "epoch": 2.63872, "grad_norm": 0.4709533154964447, "learning_rate": 0.0002325212006999596, "loss": 3.9811, "step": 2063 }, { "epoch": 2.64, "grad_norm": 0.5281003713607788, "learning_rate": 0.00023248081841432223, "loss": 4.0737, "step": 2064 }, { "epoch": 2.64128, "grad_norm": 0.49592503905296326, "learning_rate": 0.00023244043612868486, "loss": 4.0537, "step": 2065 }, { "epoch": 2.64256, "grad_norm": 0.5004292726516724, "learning_rate": 0.0002324000538430475, "loss": 4.0797, "step": 2066 }, { "epoch": 2.64384, "grad_norm": 0.5057252049446106, "learning_rate": 0.00023235967155741015, "loss": 4.0881, "step": 2067 }, { "epoch": 2.64512, "grad_norm": 0.511310875415802, "learning_rate": 0.00023231928927177278, "loss": 4.0383, "step": 2068 }, { "epoch": 2.6464, "grad_norm": 0.5321905016899109, "learning_rate": 0.00023227890698613538, "loss": 4.0814, "step": 2069 }, { "epoch": 2.6476800000000003, "grad_norm": 0.5654548406600952, "learning_rate": 0.000232238524700498, "loss": 4.1224, "step": 2070 }, { "epoch": 2.6489599999999998, "grad_norm": 0.527220606803894, "learning_rate": 0.00023219814241486067, "loss": 4.0622, "step": 2071 }, { "epoch": 2.65024, "grad_norm": 0.5580291152000427, "learning_rate": 0.0002321577601292233, "loss": 4.0797, "step": 2072 }, { "epoch": 2.65152, "grad_norm": 0.545524001121521, "learning_rate": 0.00023211737784358593, "loss": 4.0666, "step": 2073 }, { "epoch": 2.6528, "grad_norm": 0.5084550380706787, "learning_rate": 0.00023207699555794856, "loss": 4.0051, "step": 2074 }, { "epoch": 2.65408, "grad_norm": 0.5336723923683167, "learning_rate": 0.0002320366132723112, "loss": 4.0471, "step": 2075 }, { "epoch": 2.65536, "grad_norm": 0.5522205829620361, "learning_rate": 0.00023199623098667385, "loss": 4.0846, "step": 2076 }, { "epoch": 2.65664, "grad_norm": 0.5465154051780701, "learning_rate": 0.00023195584870103645, "loss": 3.9559, "step": 2077 }, { "epoch": 2.65792, "grad_norm": 0.5187036395072937, "learning_rate": 0.00023191546641539908, "loss": 3.9902, "step": 2078 }, { "epoch": 2.6592000000000002, "grad_norm": 0.5408727526664734, "learning_rate": 0.0002318750841297617, "loss": 4.0133, "step": 2079 }, { "epoch": 2.6604799999999997, "grad_norm": 0.5757235288619995, "learning_rate": 0.00023183470184412437, "loss": 4.0698, "step": 2080 }, { "epoch": 2.66176, "grad_norm": 0.5221036076545715, "learning_rate": 0.000231794319558487, "loss": 4.1101, "step": 2081 }, { "epoch": 2.66304, "grad_norm": 0.5259095430374146, "learning_rate": 0.00023175393727284963, "loss": 3.9804, "step": 2082 }, { "epoch": 2.66432, "grad_norm": 0.520976185798645, "learning_rate": 0.00023171355498721226, "loss": 4.02, "step": 2083 }, { "epoch": 2.6656, "grad_norm": 0.5386399626731873, "learning_rate": 0.0002316731727015749, "loss": 4.1377, "step": 2084 }, { "epoch": 2.66688, "grad_norm": 0.5121170282363892, "learning_rate": 0.00023163279041593754, "loss": 4.0627, "step": 2085 }, { "epoch": 2.66816, "grad_norm": 0.5606420636177063, "learning_rate": 0.00023159240813030015, "loss": 4.0281, "step": 2086 }, { "epoch": 2.66944, "grad_norm": 0.5451529622077942, "learning_rate": 0.00023155202584466278, "loss": 3.9724, "step": 2087 }, { "epoch": 2.67072, "grad_norm": 0.5385946035385132, "learning_rate": 0.0002315116435590254, "loss": 4.0462, "step": 2088 }, { "epoch": 2.672, "grad_norm": 0.5405113101005554, "learning_rate": 0.00023147126127338806, "loss": 3.9622, "step": 2089 }, { "epoch": 2.67328, "grad_norm": 0.5446247458457947, "learning_rate": 0.0002314308789877507, "loss": 4.0747, "step": 2090 }, { "epoch": 2.67456, "grad_norm": 0.5345394611358643, "learning_rate": 0.00023139049670211332, "loss": 3.9045, "step": 2091 }, { "epoch": 2.67584, "grad_norm": 0.5287266969680786, "learning_rate": 0.00023135011441647595, "loss": 4.0296, "step": 2092 }, { "epoch": 2.67712, "grad_norm": 0.4985159933567047, "learning_rate": 0.0002313097321308386, "loss": 4.0478, "step": 2093 }, { "epoch": 2.6784, "grad_norm": 0.5355061292648315, "learning_rate": 0.00023126934984520121, "loss": 4.0843, "step": 2094 }, { "epoch": 2.67968, "grad_norm": 0.4830259382724762, "learning_rate": 0.00023122896755956384, "loss": 4.0583, "step": 2095 }, { "epoch": 2.68096, "grad_norm": 0.5162714719772339, "learning_rate": 0.00023118858527392647, "loss": 3.967, "step": 2096 }, { "epoch": 2.68224, "grad_norm": 0.5130695700645447, "learning_rate": 0.00023114820298828913, "loss": 4.0423, "step": 2097 }, { "epoch": 2.68352, "grad_norm": 0.507447361946106, "learning_rate": 0.00023110782070265176, "loss": 4.0212, "step": 2098 }, { "epoch": 2.6848, "grad_norm": 0.6723561882972717, "learning_rate": 0.0002310674384170144, "loss": 4.0209, "step": 2099 }, { "epoch": 2.68608, "grad_norm": 0.49744725227355957, "learning_rate": 0.00023102705613137702, "loss": 3.9865, "step": 2100 }, { "epoch": 2.68736, "grad_norm": 0.5632631778717041, "learning_rate": 0.00023098667384573962, "loss": 4.0426, "step": 2101 }, { "epoch": 2.68864, "grad_norm": 0.5552526712417603, "learning_rate": 0.00023094629156010228, "loss": 4.0527, "step": 2102 }, { "epoch": 2.68992, "grad_norm": 0.5476700663566589, "learning_rate": 0.0002309059092744649, "loss": 4.0647, "step": 2103 }, { "epoch": 2.6912000000000003, "grad_norm": 0.545758068561554, "learning_rate": 0.00023086552698882754, "loss": 4.0247, "step": 2104 }, { "epoch": 2.6924799999999998, "grad_norm": 0.5172504186630249, "learning_rate": 0.00023082514470319017, "loss": 4.0629, "step": 2105 }, { "epoch": 2.69376, "grad_norm": 0.5142618417739868, "learning_rate": 0.00023078476241755283, "loss": 3.9725, "step": 2106 }, { "epoch": 2.69504, "grad_norm": 0.49800601601600647, "learning_rate": 0.00023074438013191546, "loss": 3.9891, "step": 2107 }, { "epoch": 2.69632, "grad_norm": 0.5199533700942993, "learning_rate": 0.0002307039978462781, "loss": 4.0479, "step": 2108 }, { "epoch": 2.6976, "grad_norm": 0.4997861981391907, "learning_rate": 0.0002306636155606407, "loss": 3.9831, "step": 2109 }, { "epoch": 2.69888, "grad_norm": 0.5149181485176086, "learning_rate": 0.00023062323327500335, "loss": 4.0014, "step": 2110 }, { "epoch": 2.70016, "grad_norm": 0.5101584792137146, "learning_rate": 0.00023058285098936598, "loss": 4.0213, "step": 2111 }, { "epoch": 2.70144, "grad_norm": 0.4832100570201874, "learning_rate": 0.0002305424687037286, "loss": 4.0012, "step": 2112 }, { "epoch": 2.7027200000000002, "grad_norm": 0.527621328830719, "learning_rate": 0.00023050208641809124, "loss": 4.0418, "step": 2113 }, { "epoch": 2.7039999999999997, "grad_norm": 0.4823199510574341, "learning_rate": 0.00023046170413245387, "loss": 4.0175, "step": 2114 }, { "epoch": 2.70528, "grad_norm": 0.5674161911010742, "learning_rate": 0.00023042132184681653, "loss": 4.0703, "step": 2115 }, { "epoch": 2.70656, "grad_norm": 0.5498639345169067, "learning_rate": 0.00023038093956117916, "loss": 4.0389, "step": 2116 }, { "epoch": 2.70784, "grad_norm": 0.5504740476608276, "learning_rate": 0.00023034055727554176, "loss": 4.0138, "step": 2117 }, { "epoch": 2.70912, "grad_norm": 0.5479692816734314, "learning_rate": 0.0002303001749899044, "loss": 4.0722, "step": 2118 }, { "epoch": 2.7104, "grad_norm": 0.4745061993598938, "learning_rate": 0.00023025979270426705, "loss": 4.0672, "step": 2119 }, { "epoch": 2.71168, "grad_norm": 0.5192096829414368, "learning_rate": 0.00023021941041862968, "loss": 3.9691, "step": 2120 }, { "epoch": 2.71296, "grad_norm": 0.5102022290229797, "learning_rate": 0.0002301790281329923, "loss": 4.0599, "step": 2121 }, { "epoch": 2.71424, "grad_norm": 0.49399974942207336, "learning_rate": 0.00023013864584735494, "loss": 3.9948, "step": 2122 }, { "epoch": 2.71552, "grad_norm": 0.5214704871177673, "learning_rate": 0.0002300982635617176, "loss": 4.0376, "step": 2123 }, { "epoch": 2.7168, "grad_norm": 0.5198361873626709, "learning_rate": 0.00023005788127608022, "loss": 4.0227, "step": 2124 }, { "epoch": 2.71808, "grad_norm": 0.5009457468986511, "learning_rate": 0.00023001749899044283, "loss": 4.0152, "step": 2125 }, { "epoch": 2.71936, "grad_norm": 0.5245611667633057, "learning_rate": 0.00022997711670480546, "loss": 4.056, "step": 2126 }, { "epoch": 2.72064, "grad_norm": 0.4951135814189911, "learning_rate": 0.0002299367344191681, "loss": 4.0599, "step": 2127 }, { "epoch": 2.72192, "grad_norm": 0.5234236717224121, "learning_rate": 0.00022989635213353074, "loss": 4.0412, "step": 2128 }, { "epoch": 2.7232, "grad_norm": 0.5335028171539307, "learning_rate": 0.00022985596984789337, "loss": 3.9789, "step": 2129 }, { "epoch": 2.72448, "grad_norm": 0.527600109577179, "learning_rate": 0.000229815587562256, "loss": 4.0591, "step": 2130 }, { "epoch": 2.72576, "grad_norm": 0.5804891586303711, "learning_rate": 0.00022977520527661864, "loss": 4.029, "step": 2131 }, { "epoch": 2.72704, "grad_norm": 0.5268876552581787, "learning_rate": 0.0002297348229909813, "loss": 3.9655, "step": 2132 }, { "epoch": 2.72832, "grad_norm": 0.5200521349906921, "learning_rate": 0.0002296944407053439, "loss": 3.9752, "step": 2133 }, { "epoch": 2.7296, "grad_norm": 0.5403656363487244, "learning_rate": 0.00022965405841970653, "loss": 4.0228, "step": 2134 }, { "epoch": 2.73088, "grad_norm": 0.528051495552063, "learning_rate": 0.00022961367613406916, "loss": 3.9931, "step": 2135 }, { "epoch": 2.73216, "grad_norm": 0.5134397745132446, "learning_rate": 0.0002295732938484318, "loss": 4.0045, "step": 2136 }, { "epoch": 2.73344, "grad_norm": 0.5310769081115723, "learning_rate": 0.00022953291156279444, "loss": 4.0, "step": 2137 }, { "epoch": 2.7347200000000003, "grad_norm": 0.5168265700340271, "learning_rate": 0.00022949252927715707, "loss": 4.0438, "step": 2138 }, { "epoch": 2.7359999999999998, "grad_norm": 0.5118173360824585, "learning_rate": 0.0002294521469915197, "loss": 4.0294, "step": 2139 }, { "epoch": 2.73728, "grad_norm": 0.5167641043663025, "learning_rate": 0.0002294117647058823, "loss": 3.9984, "step": 2140 }, { "epoch": 2.73856, "grad_norm": 0.48330435156822205, "learning_rate": 0.00022937138242024496, "loss": 4.076, "step": 2141 }, { "epoch": 2.73984, "grad_norm": 0.5215317606925964, "learning_rate": 0.0002293310001346076, "loss": 4.0358, "step": 2142 }, { "epoch": 2.74112, "grad_norm": 0.5229592323303223, "learning_rate": 0.00022929061784897022, "loss": 3.9641, "step": 2143 }, { "epoch": 2.7424, "grad_norm": 0.49107488989830017, "learning_rate": 0.00022925023556333285, "loss": 3.9869, "step": 2144 }, { "epoch": 2.74368, "grad_norm": 0.5166721343994141, "learning_rate": 0.0002292098532776955, "loss": 3.9876, "step": 2145 }, { "epoch": 2.74496, "grad_norm": 0.5030099749565125, "learning_rate": 0.00022916947099205814, "loss": 3.9987, "step": 2146 }, { "epoch": 2.7462400000000002, "grad_norm": 0.5043811202049255, "learning_rate": 0.00022912908870642077, "loss": 4.0329, "step": 2147 }, { "epoch": 2.7475199999999997, "grad_norm": 0.5001031756401062, "learning_rate": 0.00022908870642078337, "loss": 4.0079, "step": 2148 }, { "epoch": 2.7488, "grad_norm": 0.5411626100540161, "learning_rate": 0.00022904832413514603, "loss": 4.0499, "step": 2149 }, { "epoch": 2.75008, "grad_norm": 0.529230535030365, "learning_rate": 0.00022900794184950866, "loss": 4.0719, "step": 2150 }, { "epoch": 2.75136, "grad_norm": 0.5309909582138062, "learning_rate": 0.0002289675595638713, "loss": 3.9798, "step": 2151 }, { "epoch": 2.75264, "grad_norm": 0.5642079710960388, "learning_rate": 0.00022892717727823392, "loss": 4.0106, "step": 2152 }, { "epoch": 2.75392, "grad_norm": 0.5178594589233398, "learning_rate": 0.00022888679499259655, "loss": 4.057, "step": 2153 }, { "epoch": 2.7552, "grad_norm": 0.5369943380355835, "learning_rate": 0.0002288464127069592, "loss": 3.996, "step": 2154 }, { "epoch": 2.75648, "grad_norm": 0.5157840251922607, "learning_rate": 0.00022880603042132184, "loss": 4.038, "step": 2155 }, { "epoch": 2.75776, "grad_norm": 0.5400508642196655, "learning_rate": 0.00022876564813568444, "loss": 3.9986, "step": 2156 }, { "epoch": 2.75904, "grad_norm": 0.49471211433410645, "learning_rate": 0.00022872526585004707, "loss": 4.0016, "step": 2157 }, { "epoch": 2.76032, "grad_norm": 0.5369953513145447, "learning_rate": 0.00022868488356440973, "loss": 4.0411, "step": 2158 }, { "epoch": 2.7616, "grad_norm": 0.5281562209129333, "learning_rate": 0.00022864450127877236, "loss": 4.0303, "step": 2159 }, { "epoch": 2.76288, "grad_norm": 0.5228774547576904, "learning_rate": 0.000228604118993135, "loss": 4.0643, "step": 2160 }, { "epoch": 2.76416, "grad_norm": 0.5000044107437134, "learning_rate": 0.00022856373670749762, "loss": 4.0538, "step": 2161 }, { "epoch": 2.76544, "grad_norm": 0.505082905292511, "learning_rate": 0.00022852335442186028, "loss": 4.0106, "step": 2162 }, { "epoch": 2.76672, "grad_norm": 0.5144949555397034, "learning_rate": 0.0002284829721362229, "loss": 3.9746, "step": 2163 }, { "epoch": 2.768, "grad_norm": 0.5088104605674744, "learning_rate": 0.0002284425898505855, "loss": 3.9718, "step": 2164 }, { "epoch": 2.76928, "grad_norm": 0.5328196287155151, "learning_rate": 0.00022840220756494814, "loss": 4.0054, "step": 2165 }, { "epoch": 2.77056, "grad_norm": 0.5345327854156494, "learning_rate": 0.00022836182527931077, "loss": 4.0231, "step": 2166 }, { "epoch": 2.77184, "grad_norm": 0.49523457884788513, "learning_rate": 0.00022832144299367343, "loss": 3.9831, "step": 2167 }, { "epoch": 2.77312, "grad_norm": 0.5508283376693726, "learning_rate": 0.00022828106070803606, "loss": 3.9685, "step": 2168 }, { "epoch": 2.7744, "grad_norm": 0.5366639494895935, "learning_rate": 0.00022824067842239869, "loss": 4.0006, "step": 2169 }, { "epoch": 2.77568, "grad_norm": 0.5522692799568176, "learning_rate": 0.00022820029613676132, "loss": 3.9667, "step": 2170 }, { "epoch": 2.77696, "grad_norm": 0.4968757927417755, "learning_rate": 0.00022815991385112397, "loss": 4.0107, "step": 2171 }, { "epoch": 2.7782400000000003, "grad_norm": 0.5238058567047119, "learning_rate": 0.0002281195315654866, "loss": 3.9716, "step": 2172 }, { "epoch": 2.7795199999999998, "grad_norm": 0.5009492039680481, "learning_rate": 0.0002280791492798492, "loss": 4.0209, "step": 2173 }, { "epoch": 2.7808, "grad_norm": 0.5323073863983154, "learning_rate": 0.00022803876699421184, "loss": 4.0487, "step": 2174 }, { "epoch": 2.78208, "grad_norm": 0.5168647170066833, "learning_rate": 0.0002279983847085745, "loss": 4.0385, "step": 2175 }, { "epoch": 2.78336, "grad_norm": 0.5620954632759094, "learning_rate": 0.00022795800242293712, "loss": 4.0084, "step": 2176 }, { "epoch": 2.78464, "grad_norm": 0.5150801539421082, "learning_rate": 0.00022791762013729975, "loss": 3.9903, "step": 2177 }, { "epoch": 2.78592, "grad_norm": 0.5225955843925476, "learning_rate": 0.00022787723785166238, "loss": 3.9493, "step": 2178 }, { "epoch": 2.7872, "grad_norm": 0.5136468410491943, "learning_rate": 0.000227836855566025, "loss": 4.0297, "step": 2179 }, { "epoch": 2.78848, "grad_norm": 0.5070092082023621, "learning_rate": 0.00022779647328038767, "loss": 3.9916, "step": 2180 }, { "epoch": 2.7897600000000002, "grad_norm": 0.48506057262420654, "learning_rate": 0.00022775609099475027, "loss": 4.049, "step": 2181 }, { "epoch": 2.7910399999999997, "grad_norm": 0.5056405067443848, "learning_rate": 0.0002277157087091129, "loss": 4.0344, "step": 2182 }, { "epoch": 2.79232, "grad_norm": 0.5065452456474304, "learning_rate": 0.00022767532642347553, "loss": 4.0575, "step": 2183 }, { "epoch": 2.7936, "grad_norm": 0.4931611716747284, "learning_rate": 0.0002276349441378382, "loss": 3.9662, "step": 2184 }, { "epoch": 2.79488, "grad_norm": 0.5401006937026978, "learning_rate": 0.00022759456185220082, "loss": 3.9968, "step": 2185 }, { "epoch": 2.79616, "grad_norm": 0.48308640718460083, "learning_rate": 0.00022755417956656345, "loss": 3.9867, "step": 2186 }, { "epoch": 2.79744, "grad_norm": 0.5414093732833862, "learning_rate": 0.00022751379728092608, "loss": 3.938, "step": 2187 }, { "epoch": 2.79872, "grad_norm": 0.47990182042121887, "learning_rate": 0.00022747341499528874, "loss": 4.1135, "step": 2188 }, { "epoch": 2.8, "grad_norm": 0.5447584986686707, "learning_rate": 0.00022743303270965134, "loss": 4.1674, "step": 2189 }, { "epoch": 2.80128, "grad_norm": 0.4829598665237427, "learning_rate": 0.00022739265042401397, "loss": 4.0011, "step": 2190 }, { "epoch": 2.80256, "grad_norm": 0.5174542665481567, "learning_rate": 0.0002273522681383766, "loss": 4.0109, "step": 2191 }, { "epoch": 2.80384, "grad_norm": 0.5266432166099548, "learning_rate": 0.00022731188585273926, "loss": 3.9463, "step": 2192 }, { "epoch": 2.80512, "grad_norm": 0.5114743709564209, "learning_rate": 0.0002272715035671019, "loss": 4.0119, "step": 2193 }, { "epoch": 2.8064, "grad_norm": 0.5330787301063538, "learning_rate": 0.00022723112128146452, "loss": 4.0184, "step": 2194 }, { "epoch": 2.80768, "grad_norm": 0.49550044536590576, "learning_rate": 0.00022719073899582715, "loss": 4.0103, "step": 2195 }, { "epoch": 2.80896, "grad_norm": 0.5248696804046631, "learning_rate": 0.00022715035671018975, "loss": 4.0211, "step": 2196 }, { "epoch": 2.81024, "grad_norm": 0.528124213218689, "learning_rate": 0.0002271099744245524, "loss": 3.9816, "step": 2197 }, { "epoch": 2.81152, "grad_norm": 0.498807817697525, "learning_rate": 0.00022706959213891504, "loss": 3.9876, "step": 2198 }, { "epoch": 2.8128, "grad_norm": 0.5002254247665405, "learning_rate": 0.00022702920985327767, "loss": 3.9442, "step": 2199 }, { "epoch": 2.81408, "grad_norm": 0.5136492848396301, "learning_rate": 0.0002269888275676403, "loss": 4.0369, "step": 2200 }, { "epoch": 2.81536, "grad_norm": 0.5032553672790527, "learning_rate": 0.00022694844528200296, "loss": 4.0838, "step": 2201 }, { "epoch": 2.81664, "grad_norm": 0.4921877086162567, "learning_rate": 0.0002269080629963656, "loss": 4.0399, "step": 2202 }, { "epoch": 2.81792, "grad_norm": 0.48802658915519714, "learning_rate": 0.00022686768071072822, "loss": 3.9631, "step": 2203 }, { "epoch": 2.8192, "grad_norm": 0.5272952914237976, "learning_rate": 0.00022682729842509082, "loss": 4.0163, "step": 2204 }, { "epoch": 2.82048, "grad_norm": 0.5049785375595093, "learning_rate": 0.00022678691613945348, "loss": 3.9296, "step": 2205 }, { "epoch": 2.8217600000000003, "grad_norm": 0.512064516544342, "learning_rate": 0.0002267465338538161, "loss": 3.9826, "step": 2206 }, { "epoch": 2.8230399999999998, "grad_norm": 0.49538886547088623, "learning_rate": 0.00022670615156817874, "loss": 3.9296, "step": 2207 }, { "epoch": 2.82432, "grad_norm": 0.5314378142356873, "learning_rate": 0.00022666576928254137, "loss": 4.0114, "step": 2208 }, { "epoch": 2.8256, "grad_norm": 0.5182934403419495, "learning_rate": 0.000226625386996904, "loss": 4.0499, "step": 2209 }, { "epoch": 2.82688, "grad_norm": 0.5282546877861023, "learning_rate": 0.00022658500471126665, "loss": 4.0127, "step": 2210 }, { "epoch": 2.82816, "grad_norm": 0.49509358406066895, "learning_rate": 0.00022654462242562929, "loss": 3.9841, "step": 2211 }, { "epoch": 2.82944, "grad_norm": 0.5452840328216553, "learning_rate": 0.0002265042401399919, "loss": 4.0112, "step": 2212 }, { "epoch": 2.83072, "grad_norm": 0.5101568102836609, "learning_rate": 0.00022646385785435452, "loss": 3.9766, "step": 2213 }, { "epoch": 2.832, "grad_norm": 0.5204622149467468, "learning_rate": 0.00022642347556871718, "loss": 4.0391, "step": 2214 }, { "epoch": 2.8332800000000002, "grad_norm": 0.49549055099487305, "learning_rate": 0.0002263830932830798, "loss": 3.9876, "step": 2215 }, { "epoch": 2.8345599999999997, "grad_norm": 0.5057578682899475, "learning_rate": 0.00022634271099744244, "loss": 4.0252, "step": 2216 }, { "epoch": 2.83584, "grad_norm": 0.5117456912994385, "learning_rate": 0.00022630232871180507, "loss": 4.0438, "step": 2217 }, { "epoch": 2.83712, "grad_norm": 0.49402594566345215, "learning_rate": 0.00022626194642616772, "loss": 4.0064, "step": 2218 }, { "epoch": 2.8384, "grad_norm": 0.49484384059906006, "learning_rate": 0.00022622156414053035, "loss": 3.9281, "step": 2219 }, { "epoch": 2.83968, "grad_norm": 0.49207979440689087, "learning_rate": 0.00022618118185489296, "loss": 4.0203, "step": 2220 }, { "epoch": 2.84096, "grad_norm": 0.5029460191726685, "learning_rate": 0.00022614079956925559, "loss": 4.0214, "step": 2221 }, { "epoch": 2.84224, "grad_norm": 0.49335840344429016, "learning_rate": 0.00022610041728361822, "loss": 4.0127, "step": 2222 }, { "epoch": 2.84352, "grad_norm": 0.5236724019050598, "learning_rate": 0.00022606003499798087, "loss": 4.0602, "step": 2223 }, { "epoch": 2.8448, "grad_norm": 0.5051988363265991, "learning_rate": 0.0002260196527123435, "loss": 3.9739, "step": 2224 }, { "epoch": 2.84608, "grad_norm": 0.48310649394989014, "learning_rate": 0.00022597927042670613, "loss": 4.0336, "step": 2225 }, { "epoch": 2.84736, "grad_norm": 0.49196314811706543, "learning_rate": 0.00022593888814106876, "loss": 4.068, "step": 2226 }, { "epoch": 2.84864, "grad_norm": 0.48503825068473816, "learning_rate": 0.00022589850585543142, "loss": 4.0305, "step": 2227 }, { "epoch": 2.84992, "grad_norm": 0.5028344988822937, "learning_rate": 0.00022585812356979402, "loss": 3.9993, "step": 2228 }, { "epoch": 2.8512, "grad_norm": 0.4741714298725128, "learning_rate": 0.00022581774128415665, "loss": 4.0975, "step": 2229 }, { "epoch": 2.85248, "grad_norm": 0.5247095227241516, "learning_rate": 0.00022577735899851928, "loss": 4.0058, "step": 2230 }, { "epoch": 2.85376, "grad_norm": 0.5030754804611206, "learning_rate": 0.00022573697671288194, "loss": 3.968, "step": 2231 }, { "epoch": 2.85504, "grad_norm": 0.5085668563842773, "learning_rate": 0.00022569659442724457, "loss": 4.0359, "step": 2232 }, { "epoch": 2.85632, "grad_norm": 0.5207344889640808, "learning_rate": 0.0002256562121416072, "loss": 3.9844, "step": 2233 }, { "epoch": 2.8576, "grad_norm": 0.5112427473068237, "learning_rate": 0.00022561582985596983, "loss": 4.0718, "step": 2234 }, { "epoch": 2.85888, "grad_norm": 0.49677979946136475, "learning_rate": 0.00022557544757033243, "loss": 3.9815, "step": 2235 }, { "epoch": 2.86016, "grad_norm": 0.5005208253860474, "learning_rate": 0.0002255350652846951, "loss": 4.0531, "step": 2236 }, { "epoch": 2.86144, "grad_norm": 0.4932452142238617, "learning_rate": 0.00022549468299905772, "loss": 4.0379, "step": 2237 }, { "epoch": 2.86272, "grad_norm": 0.510642945766449, "learning_rate": 0.00022545430071342035, "loss": 4.0408, "step": 2238 }, { "epoch": 2.864, "grad_norm": 0.49646908044815063, "learning_rate": 0.00022541391842778298, "loss": 4.0044, "step": 2239 }, { "epoch": 2.8652800000000003, "grad_norm": 0.5001757144927979, "learning_rate": 0.00022537353614214564, "loss": 4.0446, "step": 2240 }, { "epoch": 2.8665599999999998, "grad_norm": 0.5238614678382874, "learning_rate": 0.00022533315385650827, "loss": 3.9808, "step": 2241 }, { "epoch": 2.86784, "grad_norm": 0.5164071917533875, "learning_rate": 0.0002252927715708709, "loss": 4.0207, "step": 2242 }, { "epoch": 2.86912, "grad_norm": 0.5213832855224609, "learning_rate": 0.0002252523892852335, "loss": 4.0449, "step": 2243 }, { "epoch": 2.8704, "grad_norm": 0.4973851144313812, "learning_rate": 0.00022521200699959619, "loss": 3.9746, "step": 2244 }, { "epoch": 2.87168, "grad_norm": 0.479270875453949, "learning_rate": 0.0002251716247139588, "loss": 3.9801, "step": 2245 }, { "epoch": 2.87296, "grad_norm": 0.48888400197029114, "learning_rate": 0.00022513124242832142, "loss": 4.0178, "step": 2246 }, { "epoch": 2.87424, "grad_norm": 0.48972752690315247, "learning_rate": 0.00022509086014268405, "loss": 4.0051, "step": 2247 }, { "epoch": 2.87552, "grad_norm": 0.4787149429321289, "learning_rate": 0.00022505047785704668, "loss": 3.8699, "step": 2248 }, { "epoch": 2.8768000000000002, "grad_norm": 0.49900394678115845, "learning_rate": 0.00022501009557140934, "loss": 3.9537, "step": 2249 }, { "epoch": 2.8780799999999997, "grad_norm": 0.48665642738342285, "learning_rate": 0.00022496971328577197, "loss": 3.9841, "step": 2250 }, { "epoch": 2.87936, "grad_norm": 0.48270756006240845, "learning_rate": 0.00022492933100013457, "loss": 4.0203, "step": 2251 }, { "epoch": 2.88064, "grad_norm": 0.48800358176231384, "learning_rate": 0.0002248889487144972, "loss": 4.0187, "step": 2252 }, { "epoch": 2.88192, "grad_norm": 0.5024043321609497, "learning_rate": 0.00022484856642885986, "loss": 3.968, "step": 2253 }, { "epoch": 2.8832, "grad_norm": 0.5062508583068848, "learning_rate": 0.0002248081841432225, "loss": 4.0056, "step": 2254 }, { "epoch": 2.88448, "grad_norm": 0.4831932485103607, "learning_rate": 0.00022476780185758512, "loss": 4.0342, "step": 2255 }, { "epoch": 2.88576, "grad_norm": 0.4784030020236969, "learning_rate": 0.00022472741957194775, "loss": 4.0064, "step": 2256 }, { "epoch": 2.88704, "grad_norm": 0.49033915996551514, "learning_rate": 0.0002246870372863104, "loss": 4.0344, "step": 2257 }, { "epoch": 2.88832, "grad_norm": 0.505535364151001, "learning_rate": 0.00022464665500067303, "loss": 3.9952, "step": 2258 }, { "epoch": 2.8895999999999997, "grad_norm": 0.5267712473869324, "learning_rate": 0.00022460627271503566, "loss": 3.9862, "step": 2259 }, { "epoch": 2.89088, "grad_norm": 0.5031368136405945, "learning_rate": 0.00022456589042939827, "loss": 4.0297, "step": 2260 }, { "epoch": 2.89216, "grad_norm": 0.5366697907447815, "learning_rate": 0.0002245255081437609, "loss": 4.037, "step": 2261 }, { "epoch": 2.89344, "grad_norm": 0.5078505873680115, "learning_rate": 0.00022448512585812355, "loss": 3.9513, "step": 2262 }, { "epoch": 2.89472, "grad_norm": 0.5012052655220032, "learning_rate": 0.00022444474357248618, "loss": 3.9266, "step": 2263 }, { "epoch": 2.896, "grad_norm": 0.5153883695602417, "learning_rate": 0.00022440436128684881, "loss": 3.9948, "step": 2264 }, { "epoch": 2.89728, "grad_norm": 0.5152749419212341, "learning_rate": 0.00022436397900121144, "loss": 3.9879, "step": 2265 }, { "epoch": 2.89856, "grad_norm": 0.5094638466835022, "learning_rate": 0.0002243235967155741, "loss": 3.9949, "step": 2266 }, { "epoch": 2.89984, "grad_norm": 0.489305317401886, "learning_rate": 0.00022428321442993673, "loss": 3.9818, "step": 2267 }, { "epoch": 2.90112, "grad_norm": 0.4954513907432556, "learning_rate": 0.00022424283214429933, "loss": 3.9221, "step": 2268 }, { "epoch": 2.9024, "grad_norm": 0.5154023170471191, "learning_rate": 0.00022420244985866196, "loss": 4.0049, "step": 2269 }, { "epoch": 2.90368, "grad_norm": 0.5139758586883545, "learning_rate": 0.00022416206757302462, "loss": 3.9232, "step": 2270 }, { "epoch": 2.90496, "grad_norm": 0.5243357419967651, "learning_rate": 0.00022412168528738725, "loss": 4.0215, "step": 2271 }, { "epoch": 2.90624, "grad_norm": 0.5350946187973022, "learning_rate": 0.00022408130300174988, "loss": 3.9502, "step": 2272 }, { "epoch": 2.90752, "grad_norm": 0.5469184517860413, "learning_rate": 0.0002240409207161125, "loss": 3.9617, "step": 2273 }, { "epoch": 2.9088000000000003, "grad_norm": 0.5040557980537415, "learning_rate": 0.00022400053843047514, "loss": 3.9336, "step": 2274 }, { "epoch": 2.91008, "grad_norm": 0.5210381150245667, "learning_rate": 0.0002239601561448378, "loss": 3.9707, "step": 2275 }, { "epoch": 2.91136, "grad_norm": 0.5240786075592041, "learning_rate": 0.0002239197738592004, "loss": 3.9758, "step": 2276 }, { "epoch": 2.91264, "grad_norm": 0.5405933260917664, "learning_rate": 0.00022387939157356303, "loss": 4.036, "step": 2277 }, { "epoch": 2.91392, "grad_norm": 0.5046157240867615, "learning_rate": 0.00022383900928792566, "loss": 3.9727, "step": 2278 }, { "epoch": 2.9152, "grad_norm": 0.49960970878601074, "learning_rate": 0.00022379862700228832, "loss": 4.0635, "step": 2279 }, { "epoch": 2.91648, "grad_norm": 0.514678418636322, "learning_rate": 0.00022375824471665095, "loss": 4.01, "step": 2280 }, { "epoch": 2.91776, "grad_norm": 0.5296341180801392, "learning_rate": 0.00022371786243101358, "loss": 3.9588, "step": 2281 }, { "epoch": 2.91904, "grad_norm": 0.49709784984588623, "learning_rate": 0.0002236774801453762, "loss": 3.9068, "step": 2282 }, { "epoch": 2.9203200000000002, "grad_norm": 0.5464451909065247, "learning_rate": 0.00022363709785973887, "loss": 4.0025, "step": 2283 }, { "epoch": 2.9215999999999998, "grad_norm": 0.5068522095680237, "learning_rate": 0.00022359671557410147, "loss": 3.9793, "step": 2284 }, { "epoch": 2.92288, "grad_norm": 0.5234406590461731, "learning_rate": 0.0002235563332884641, "loss": 3.9384, "step": 2285 }, { "epoch": 2.92416, "grad_norm": 0.5124624967575073, "learning_rate": 0.00022351595100282673, "loss": 3.9735, "step": 2286 }, { "epoch": 2.92544, "grad_norm": 0.5246328115463257, "learning_rate": 0.00022347556871718936, "loss": 4.0739, "step": 2287 }, { "epoch": 2.92672, "grad_norm": 0.5275157690048218, "learning_rate": 0.00022343518643155202, "loss": 3.9339, "step": 2288 }, { "epoch": 2.928, "grad_norm": 0.5206854939460754, "learning_rate": 0.00022339480414591465, "loss": 3.9908, "step": 2289 }, { "epoch": 2.92928, "grad_norm": 0.50496506690979, "learning_rate": 0.00022335442186027728, "loss": 4.0184, "step": 2290 }, { "epoch": 2.93056, "grad_norm": 0.5281472206115723, "learning_rate": 0.00022331403957463988, "loss": 4.0205, "step": 2291 }, { "epoch": 2.9318400000000002, "grad_norm": 0.5035421252250671, "learning_rate": 0.00022327365728900254, "loss": 4.016, "step": 2292 }, { "epoch": 2.9331199999999997, "grad_norm": 0.5101969242095947, "learning_rate": 0.00022323327500336517, "loss": 3.9902, "step": 2293 }, { "epoch": 2.9344, "grad_norm": 0.49704429507255554, "learning_rate": 0.0002231928927177278, "loss": 3.9911, "step": 2294 }, { "epoch": 2.93568, "grad_norm": 0.5134301781654358, "learning_rate": 0.00022315251043209043, "loss": 3.9861, "step": 2295 }, { "epoch": 2.93696, "grad_norm": 0.5616884231567383, "learning_rate": 0.00022311212814645309, "loss": 4.0493, "step": 2296 }, { "epoch": 2.93824, "grad_norm": 0.4961213767528534, "learning_rate": 0.00022307174586081572, "loss": 3.9555, "step": 2297 }, { "epoch": 2.93952, "grad_norm": 0.5437341928482056, "learning_rate": 0.00022303136357517835, "loss": 3.9674, "step": 2298 }, { "epoch": 2.9408, "grad_norm": 0.5309537053108215, "learning_rate": 0.00022299098128954095, "loss": 4.0694, "step": 2299 }, { "epoch": 2.94208, "grad_norm": 0.5304505825042725, "learning_rate": 0.00022295059900390358, "loss": 3.9685, "step": 2300 }, { "epoch": 2.94336, "grad_norm": 0.5457247495651245, "learning_rate": 0.00022291021671826624, "loss": 3.9616, "step": 2301 }, { "epoch": 2.94464, "grad_norm": 0.5207186341285706, "learning_rate": 0.00022286983443262887, "loss": 3.9636, "step": 2302 }, { "epoch": 2.94592, "grad_norm": 0.511628270149231, "learning_rate": 0.0002228294521469915, "loss": 4.0361, "step": 2303 }, { "epoch": 2.9472, "grad_norm": 0.5089471340179443, "learning_rate": 0.00022278906986135413, "loss": 3.9737, "step": 2304 }, { "epoch": 2.94848, "grad_norm": 0.5251920819282532, "learning_rate": 0.00022274868757571678, "loss": 4.0431, "step": 2305 }, { "epoch": 2.94976, "grad_norm": 0.528783917427063, "learning_rate": 0.0002227083052900794, "loss": 3.9471, "step": 2306 }, { "epoch": 2.95104, "grad_norm": 0.507645845413208, "learning_rate": 0.00022266792300444202, "loss": 4.0272, "step": 2307 }, { "epoch": 2.9523200000000003, "grad_norm": 0.5167163014411926, "learning_rate": 0.00022262754071880465, "loss": 3.9864, "step": 2308 }, { "epoch": 2.9536, "grad_norm": 0.48903942108154297, "learning_rate": 0.0002225871584331673, "loss": 4.0455, "step": 2309 }, { "epoch": 2.95488, "grad_norm": 0.5110606551170349, "learning_rate": 0.00022254677614752993, "loss": 3.9581, "step": 2310 }, { "epoch": 2.95616, "grad_norm": 0.5128504633903503, "learning_rate": 0.00022250639386189256, "loss": 3.9707, "step": 2311 }, { "epoch": 2.95744, "grad_norm": 0.5019478797912598, "learning_rate": 0.0002224660115762552, "loss": 4.0202, "step": 2312 }, { "epoch": 2.95872, "grad_norm": 0.4747721254825592, "learning_rate": 0.00022242562929061785, "loss": 4.0257, "step": 2313 }, { "epoch": 2.96, "grad_norm": 0.4858660399913788, "learning_rate": 0.00022238524700498048, "loss": 3.9312, "step": 2314 }, { "epoch": 2.96128, "grad_norm": 0.49668505787849426, "learning_rate": 0.00022234486471934308, "loss": 4.0515, "step": 2315 }, { "epoch": 2.96256, "grad_norm": 0.5372991561889648, "learning_rate": 0.00022230448243370571, "loss": 3.8684, "step": 2316 }, { "epoch": 2.9638400000000003, "grad_norm": 0.49556052684783936, "learning_rate": 0.00022226410014806834, "loss": 4.0073, "step": 2317 }, { "epoch": 2.9651199999999998, "grad_norm": 0.5234814882278442, "learning_rate": 0.000222223717862431, "loss": 3.9878, "step": 2318 }, { "epoch": 2.9664, "grad_norm": 0.49563363194465637, "learning_rate": 0.00022218333557679363, "loss": 4.0238, "step": 2319 }, { "epoch": 2.96768, "grad_norm": 0.5069983005523682, "learning_rate": 0.00022214295329115626, "loss": 3.9518, "step": 2320 }, { "epoch": 2.96896, "grad_norm": 0.5253263711929321, "learning_rate": 0.0002221025710055189, "loss": 4.012, "step": 2321 }, { "epoch": 2.97024, "grad_norm": 0.5037320256233215, "learning_rate": 0.00022206218871988155, "loss": 3.965, "step": 2322 }, { "epoch": 2.97152, "grad_norm": 0.5014638900756836, "learning_rate": 0.00022202180643424415, "loss": 3.9803, "step": 2323 }, { "epoch": 2.9728, "grad_norm": 0.5158342719078064, "learning_rate": 0.00022198142414860678, "loss": 3.9949, "step": 2324 }, { "epoch": 2.97408, "grad_norm": 0.5213910937309265, "learning_rate": 0.0002219410418629694, "loss": 3.9926, "step": 2325 }, { "epoch": 2.9753600000000002, "grad_norm": 0.5123266577720642, "learning_rate": 0.00022190065957733207, "loss": 3.957, "step": 2326 }, { "epoch": 2.9766399999999997, "grad_norm": 0.5103859305381775, "learning_rate": 0.0002218602772916947, "loss": 4.0097, "step": 2327 }, { "epoch": 2.97792, "grad_norm": 0.5117716789245605, "learning_rate": 0.00022181989500605733, "loss": 3.9832, "step": 2328 }, { "epoch": 2.9792, "grad_norm": 0.48839670419692993, "learning_rate": 0.00022177951272041996, "loss": 4.0224, "step": 2329 }, { "epoch": 2.98048, "grad_norm": 0.5360726714134216, "learning_rate": 0.00022173913043478256, "loss": 3.9817, "step": 2330 }, { "epoch": 2.98176, "grad_norm": 0.4825299382209778, "learning_rate": 0.00022169874814914525, "loss": 3.9989, "step": 2331 }, { "epoch": 2.98304, "grad_norm": 0.5100297331809998, "learning_rate": 0.00022165836586350785, "loss": 3.9841, "step": 2332 }, { "epoch": 2.98432, "grad_norm": 0.4855094850063324, "learning_rate": 0.00022161798357787048, "loss": 3.963, "step": 2333 }, { "epoch": 2.9856, "grad_norm": 0.48381808400154114, "learning_rate": 0.0002215776012922331, "loss": 3.9805, "step": 2334 }, { "epoch": 2.98688, "grad_norm": 0.4909382462501526, "learning_rate": 0.00022153721900659577, "loss": 3.9692, "step": 2335 }, { "epoch": 2.98816, "grad_norm": 0.4872201085090637, "learning_rate": 0.0002214968367209584, "loss": 3.9689, "step": 2336 }, { "epoch": 2.98944, "grad_norm": 0.470969557762146, "learning_rate": 0.00022145645443532103, "loss": 3.9348, "step": 2337 }, { "epoch": 2.99072, "grad_norm": 0.4978778064250946, "learning_rate": 0.00022141607214968363, "loss": 3.9148, "step": 2338 }, { "epoch": 2.992, "grad_norm": 0.48822498321533203, "learning_rate": 0.00022137568986404631, "loss": 3.858, "step": 2339 }, { "epoch": 2.99328, "grad_norm": 0.4724681079387665, "learning_rate": 0.00022133530757840892, "loss": 3.9882, "step": 2340 }, { "epoch": 2.99456, "grad_norm": 0.5068411231040955, "learning_rate": 0.00022129492529277155, "loss": 3.9354, "step": 2341 }, { "epoch": 2.99584, "grad_norm": 0.49231788516044617, "learning_rate": 0.00022125454300713418, "loss": 3.9973, "step": 2342 }, { "epoch": 2.99712, "grad_norm": 0.5040780901908875, "learning_rate": 0.0002212141607214968, "loss": 3.9624, "step": 2343 }, { "epoch": 2.9984, "grad_norm": 0.5048054456710815, "learning_rate": 0.00022117377843585946, "loss": 4.0559, "step": 2344 }, { "epoch": 2.99968, "grad_norm": 0.5000688433647156, "learning_rate": 0.0002211333961502221, "loss": 4.0228, "step": 2345 }, { "epoch": 3.0, "grad_norm": 0.9295437932014465, "learning_rate": 0.00022109301386458472, "loss": 3.9067, "step": 2346 }, { "epoch": 3.00128, "grad_norm": 0.5954660177230835, "learning_rate": 0.00022105263157894733, "loss": 3.7776, "step": 2347 }, { "epoch": 3.00256, "grad_norm": 0.5306954979896545, "learning_rate": 0.00022101224929330998, "loss": 3.8486, "step": 2348 }, { "epoch": 3.00384, "grad_norm": 0.5421395897865295, "learning_rate": 0.00022097186700767261, "loss": 3.8924, "step": 2349 }, { "epoch": 3.00512, "grad_norm": 0.5273167490959167, "learning_rate": 0.00022093148472203524, "loss": 3.8585, "step": 2350 }, { "epoch": 3.0064, "grad_norm": 0.5256027579307556, "learning_rate": 0.00022089110243639788, "loss": 3.8755, "step": 2351 }, { "epoch": 3.00768, "grad_norm": 0.5198791027069092, "learning_rate": 0.00022085072015076053, "loss": 3.8962, "step": 2352 }, { "epoch": 3.00896, "grad_norm": 0.5103099346160889, "learning_rate": 0.00022081033786512316, "loss": 3.8984, "step": 2353 }, { "epoch": 3.01024, "grad_norm": 0.48842713236808777, "learning_rate": 0.0002207699555794858, "loss": 3.9612, "step": 2354 }, { "epoch": 3.01152, "grad_norm": 0.5094363689422607, "learning_rate": 0.0002207295732938484, "loss": 3.8044, "step": 2355 }, { "epoch": 3.0128, "grad_norm": 0.537132740020752, "learning_rate": 0.00022068919100821103, "loss": 3.9496, "step": 2356 }, { "epoch": 3.01408, "grad_norm": 0.5116694569587708, "learning_rate": 0.00022064880872257368, "loss": 3.8491, "step": 2357 }, { "epoch": 3.01536, "grad_norm": 0.5091819763183594, "learning_rate": 0.0002206084264369363, "loss": 3.8916, "step": 2358 }, { "epoch": 3.01664, "grad_norm": 0.5170096158981323, "learning_rate": 0.00022056804415129894, "loss": 3.9251, "step": 2359 }, { "epoch": 3.01792, "grad_norm": 0.4943462014198303, "learning_rate": 0.00022052766186566157, "loss": 3.9045, "step": 2360 }, { "epoch": 3.0192, "grad_norm": 0.4762275815010071, "learning_rate": 0.00022048727958002423, "loss": 3.8835, "step": 2361 }, { "epoch": 3.02048, "grad_norm": 0.5517547130584717, "learning_rate": 0.00022044689729438686, "loss": 3.8825, "step": 2362 }, { "epoch": 3.02176, "grad_norm": 0.5428676605224609, "learning_rate": 0.00022040651500874946, "loss": 3.8153, "step": 2363 }, { "epoch": 3.02304, "grad_norm": 0.5272209644317627, "learning_rate": 0.0002203661327231121, "loss": 3.8748, "step": 2364 }, { "epoch": 3.02432, "grad_norm": 0.5124238729476929, "learning_rate": 0.00022032575043747475, "loss": 3.9815, "step": 2365 }, { "epoch": 3.0256, "grad_norm": 0.5289603471755981, "learning_rate": 0.00022028536815183738, "loss": 3.8756, "step": 2366 }, { "epoch": 3.02688, "grad_norm": 0.5256252884864807, "learning_rate": 0.0002202449858662, "loss": 3.8492, "step": 2367 }, { "epoch": 3.02816, "grad_norm": 0.5126022696495056, "learning_rate": 0.00022020460358056264, "loss": 3.814, "step": 2368 }, { "epoch": 3.02944, "grad_norm": 0.5239128470420837, "learning_rate": 0.00022016422129492527, "loss": 3.8932, "step": 2369 }, { "epoch": 3.03072, "grad_norm": 0.49754399061203003, "learning_rate": 0.00022012383900928793, "loss": 3.8759, "step": 2370 }, { "epoch": 3.032, "grad_norm": 0.49979376792907715, "learning_rate": 0.00022008345672365053, "loss": 3.9152, "step": 2371 }, { "epoch": 3.03328, "grad_norm": 0.48074454069137573, "learning_rate": 0.00022004307443801316, "loss": 3.8479, "step": 2372 }, { "epoch": 3.03456, "grad_norm": 0.5178848505020142, "learning_rate": 0.0002200026921523758, "loss": 3.8689, "step": 2373 }, { "epoch": 3.03584, "grad_norm": 0.5065923929214478, "learning_rate": 0.00021996230986673845, "loss": 3.868, "step": 2374 }, { "epoch": 3.03712, "grad_norm": 0.4748089015483856, "learning_rate": 0.00021992192758110108, "loss": 3.8268, "step": 2375 }, { "epoch": 3.0384, "grad_norm": 0.4894319474697113, "learning_rate": 0.0002198815452954637, "loss": 3.7976, "step": 2376 }, { "epoch": 3.03968, "grad_norm": 0.5088182687759399, "learning_rate": 0.00021984116300982634, "loss": 3.8785, "step": 2377 }, { "epoch": 3.04096, "grad_norm": 0.48660656809806824, "learning_rate": 0.000219800780724189, "loss": 3.8136, "step": 2378 }, { "epoch": 3.04224, "grad_norm": 0.5052852034568787, "learning_rate": 0.0002197603984385516, "loss": 3.8922, "step": 2379 }, { "epoch": 3.04352, "grad_norm": 0.49681487679481506, "learning_rate": 0.00021972001615291423, "loss": 3.8612, "step": 2380 }, { "epoch": 3.0448, "grad_norm": 0.4781433045864105, "learning_rate": 0.00021967963386727686, "loss": 3.844, "step": 2381 }, { "epoch": 3.04608, "grad_norm": 0.5147704482078552, "learning_rate": 0.0002196392515816395, "loss": 3.8779, "step": 2382 }, { "epoch": 3.04736, "grad_norm": 0.49647682905197144, "learning_rate": 0.00021959886929600215, "loss": 3.8326, "step": 2383 }, { "epoch": 3.04864, "grad_norm": 0.4942707121372223, "learning_rate": 0.00021955848701036478, "loss": 3.8618, "step": 2384 }, { "epoch": 3.04992, "grad_norm": 0.5220282077789307, "learning_rate": 0.0002195181047247274, "loss": 3.9195, "step": 2385 }, { "epoch": 3.0512, "grad_norm": 0.4788862466812134, "learning_rate": 0.00021947772243909, "loss": 3.8758, "step": 2386 }, { "epoch": 3.05248, "grad_norm": 0.4899081289768219, "learning_rate": 0.00021943734015345267, "loss": 3.8233, "step": 2387 }, { "epoch": 3.05376, "grad_norm": 0.4979023337364197, "learning_rate": 0.0002193969578678153, "loss": 3.8196, "step": 2388 }, { "epoch": 3.05504, "grad_norm": 0.48769399523735046, "learning_rate": 0.00021935657558217793, "loss": 3.85, "step": 2389 }, { "epoch": 3.05632, "grad_norm": 0.5051479935646057, "learning_rate": 0.00021931619329654056, "loss": 3.8659, "step": 2390 }, { "epoch": 3.0576, "grad_norm": 0.508083701133728, "learning_rate": 0.00021927581101090321, "loss": 3.8542, "step": 2391 }, { "epoch": 3.05888, "grad_norm": 0.5128031373023987, "learning_rate": 0.00021923542872526584, "loss": 3.9097, "step": 2392 }, { "epoch": 3.06016, "grad_norm": 0.49602705240249634, "learning_rate": 0.00021919504643962847, "loss": 3.8487, "step": 2393 }, { "epoch": 3.06144, "grad_norm": 0.5159900188446045, "learning_rate": 0.00021915466415399108, "loss": 3.7835, "step": 2394 }, { "epoch": 3.06272, "grad_norm": 0.4902365803718567, "learning_rate": 0.0002191142818683537, "loss": 3.8462, "step": 2395 }, { "epoch": 3.064, "grad_norm": 0.5062222480773926, "learning_rate": 0.00021907389958271636, "loss": 3.8156, "step": 2396 }, { "epoch": 3.06528, "grad_norm": 0.5004411339759827, "learning_rate": 0.000219033517297079, "loss": 3.7797, "step": 2397 }, { "epoch": 3.06656, "grad_norm": 0.5420650839805603, "learning_rate": 0.00021899313501144162, "loss": 3.9092, "step": 2398 }, { "epoch": 3.06784, "grad_norm": 0.5353574752807617, "learning_rate": 0.00021895275272580425, "loss": 3.8596, "step": 2399 }, { "epoch": 3.06912, "grad_norm": 0.4925222396850586, "learning_rate": 0.0002189123704401669, "loss": 3.8678, "step": 2400 }, { "epoch": 3.0704, "grad_norm": 0.54100501537323, "learning_rate": 0.00021887198815452954, "loss": 3.8494, "step": 2401 }, { "epoch": 3.07168, "grad_norm": 0.5081486105918884, "learning_rate": 0.00021883160586889214, "loss": 3.8965, "step": 2402 }, { "epoch": 3.07296, "grad_norm": 0.5015511512756348, "learning_rate": 0.00021879122358325477, "loss": 3.9272, "step": 2403 }, { "epoch": 3.07424, "grad_norm": 0.5019526481628418, "learning_rate": 0.00021875084129761743, "loss": 3.8459, "step": 2404 }, { "epoch": 3.07552, "grad_norm": 0.491059273481369, "learning_rate": 0.00021871045901198006, "loss": 3.8815, "step": 2405 }, { "epoch": 3.0768, "grad_norm": 0.5351020097732544, "learning_rate": 0.0002186700767263427, "loss": 3.8489, "step": 2406 }, { "epoch": 3.07808, "grad_norm": 0.5229183435440063, "learning_rate": 0.00021862969444070532, "loss": 3.8421, "step": 2407 }, { "epoch": 3.07936, "grad_norm": 0.507849395275116, "learning_rate": 0.00021858931215506795, "loss": 3.8573, "step": 2408 }, { "epoch": 3.08064, "grad_norm": 0.5348861813545227, "learning_rate": 0.0002185489298694306, "loss": 3.8958, "step": 2409 }, { "epoch": 3.08192, "grad_norm": 0.5243686437606812, "learning_rate": 0.0002185085475837932, "loss": 3.8338, "step": 2410 }, { "epoch": 3.0832, "grad_norm": 0.5294802784919739, "learning_rate": 0.00021846816529815584, "loss": 3.9149, "step": 2411 }, { "epoch": 3.08448, "grad_norm": 0.5236945748329163, "learning_rate": 0.00021842778301251847, "loss": 3.848, "step": 2412 }, { "epoch": 3.08576, "grad_norm": 0.5086913704872131, "learning_rate": 0.00021838740072688113, "loss": 3.794, "step": 2413 }, { "epoch": 3.08704, "grad_norm": 0.5362405776977539, "learning_rate": 0.00021834701844124376, "loss": 3.9174, "step": 2414 }, { "epoch": 3.08832, "grad_norm": 0.5077964067459106, "learning_rate": 0.0002183066361556064, "loss": 3.8253, "step": 2415 }, { "epoch": 3.0896, "grad_norm": 0.5096865296363831, "learning_rate": 0.00021826625386996902, "loss": 3.8887, "step": 2416 }, { "epoch": 3.09088, "grad_norm": 0.5185666084289551, "learning_rate": 0.00021822587158433168, "loss": 3.8292, "step": 2417 }, { "epoch": 3.09216, "grad_norm": 0.4858289659023285, "learning_rate": 0.0002181854892986943, "loss": 3.8525, "step": 2418 }, { "epoch": 3.09344, "grad_norm": 0.5283902287483215, "learning_rate": 0.0002181451070130569, "loss": 3.8476, "step": 2419 }, { "epoch": 3.09472, "grad_norm": 0.5762797594070435, "learning_rate": 0.00021810472472741954, "loss": 3.8591, "step": 2420 }, { "epoch": 3.096, "grad_norm": 0.5110985636711121, "learning_rate": 0.00021806434244178217, "loss": 3.7624, "step": 2421 }, { "epoch": 3.09728, "grad_norm": 0.5029181241989136, "learning_rate": 0.00021802396015614483, "loss": 3.8236, "step": 2422 }, { "epoch": 3.09856, "grad_norm": 0.5185674428939819, "learning_rate": 0.00021798357787050746, "loss": 3.8156, "step": 2423 }, { "epoch": 3.09984, "grad_norm": 0.5150001645088196, "learning_rate": 0.0002179431955848701, "loss": 3.8182, "step": 2424 }, { "epoch": 3.10112, "grad_norm": 0.4945181608200073, "learning_rate": 0.0002179028132992327, "loss": 3.8906, "step": 2425 }, { "epoch": 3.1024, "grad_norm": 0.5029820799827576, "learning_rate": 0.00021786243101359537, "loss": 3.8907, "step": 2426 }, { "epoch": 3.1036799999999998, "grad_norm": 0.5240580439567566, "learning_rate": 0.00021782204872795798, "loss": 3.911, "step": 2427 }, { "epoch": 3.10496, "grad_norm": 0.526394248008728, "learning_rate": 0.0002177816664423206, "loss": 3.928, "step": 2428 }, { "epoch": 3.10624, "grad_norm": 0.517666757106781, "learning_rate": 0.00021774128415668324, "loss": 3.8165, "step": 2429 }, { "epoch": 3.10752, "grad_norm": 0.5141002535820007, "learning_rate": 0.0002177009018710459, "loss": 3.7981, "step": 2430 }, { "epoch": 3.1088, "grad_norm": 0.4856560230255127, "learning_rate": 0.00021766051958540853, "loss": 3.8666, "step": 2431 }, { "epoch": 3.11008, "grad_norm": 0.5310648083686829, "learning_rate": 0.00021762013729977116, "loss": 3.8851, "step": 2432 }, { "epoch": 3.11136, "grad_norm": 0.5003724098205566, "learning_rate": 0.00021757975501413379, "loss": 3.8379, "step": 2433 }, { "epoch": 3.11264, "grad_norm": 0.5054803490638733, "learning_rate": 0.0002175393727284964, "loss": 3.7736, "step": 2434 }, { "epoch": 3.11392, "grad_norm": 0.4943746328353882, "learning_rate": 0.00021749899044285905, "loss": 3.8826, "step": 2435 }, { "epoch": 3.1152, "grad_norm": 0.49656054377555847, "learning_rate": 0.00021745860815722168, "loss": 3.838, "step": 2436 }, { "epoch": 3.11648, "grad_norm": 0.5456673502922058, "learning_rate": 0.0002174182258715843, "loss": 3.9265, "step": 2437 }, { "epoch": 3.11776, "grad_norm": 0.4923163056373596, "learning_rate": 0.00021737784358594694, "loss": 3.9262, "step": 2438 }, { "epoch": 3.11904, "grad_norm": 0.5190243124961853, "learning_rate": 0.0002173374613003096, "loss": 3.8364, "step": 2439 }, { "epoch": 3.12032, "grad_norm": 0.47707489132881165, "learning_rate": 0.00021729707901467222, "loss": 3.8502, "step": 2440 }, { "epoch": 3.1216, "grad_norm": 0.5326244235038757, "learning_rate": 0.00021725669672903485, "loss": 3.8078, "step": 2441 }, { "epoch": 3.12288, "grad_norm": 0.5211552381515503, "learning_rate": 0.00021721631444339746, "loss": 3.8304, "step": 2442 }, { "epoch": 3.12416, "grad_norm": 0.524936854839325, "learning_rate": 0.0002171759321577601, "loss": 3.7785, "step": 2443 }, { "epoch": 3.12544, "grad_norm": 0.5139725804328918, "learning_rate": 0.00021713554987212274, "loss": 3.8666, "step": 2444 }, { "epoch": 3.12672, "grad_norm": 0.5270793437957764, "learning_rate": 0.00021709516758648537, "loss": 3.8508, "step": 2445 }, { "epoch": 3.128, "grad_norm": 0.5018444657325745, "learning_rate": 0.000217054785300848, "loss": 3.8951, "step": 2446 }, { "epoch": 3.12928, "grad_norm": 0.5170176029205322, "learning_rate": 0.00021701440301521066, "loss": 3.8977, "step": 2447 }, { "epoch": 3.13056, "grad_norm": 0.49703750014305115, "learning_rate": 0.0002169740207295733, "loss": 3.7843, "step": 2448 }, { "epoch": 3.13184, "grad_norm": 0.5318528413772583, "learning_rate": 0.00021693363844393592, "loss": 3.9147, "step": 2449 }, { "epoch": 3.13312, "grad_norm": 0.5124686360359192, "learning_rate": 0.00021689325615829852, "loss": 3.8174, "step": 2450 }, { "epoch": 3.1344, "grad_norm": 0.5229965448379517, "learning_rate": 0.00021685287387266115, "loss": 3.9171, "step": 2451 }, { "epoch": 3.13568, "grad_norm": 0.5190241932868958, "learning_rate": 0.0002168124915870238, "loss": 3.8315, "step": 2452 }, { "epoch": 3.13696, "grad_norm": 0.5506916642189026, "learning_rate": 0.00021677210930138644, "loss": 3.8351, "step": 2453 }, { "epoch": 3.13824, "grad_norm": 0.5469547510147095, "learning_rate": 0.00021673172701574907, "loss": 3.9098, "step": 2454 }, { "epoch": 3.13952, "grad_norm": 0.5336132049560547, "learning_rate": 0.0002166913447301117, "loss": 3.8963, "step": 2455 }, { "epoch": 3.1408, "grad_norm": 0.549925684928894, "learning_rate": 0.00021665096244447436, "loss": 3.8541, "step": 2456 }, { "epoch": 3.14208, "grad_norm": 0.5148488283157349, "learning_rate": 0.000216610580158837, "loss": 3.8654, "step": 2457 }, { "epoch": 3.14336, "grad_norm": 0.5574371218681335, "learning_rate": 0.0002165701978731996, "loss": 3.9102, "step": 2458 }, { "epoch": 3.14464, "grad_norm": 0.5164331793785095, "learning_rate": 0.00021652981558756222, "loss": 3.7154, "step": 2459 }, { "epoch": 3.14592, "grad_norm": 0.5179708003997803, "learning_rate": 0.00021648943330192488, "loss": 3.9097, "step": 2460 }, { "epoch": 3.1471999999999998, "grad_norm": 0.5360696315765381, "learning_rate": 0.0002164490510162875, "loss": 3.8359, "step": 2461 }, { "epoch": 3.14848, "grad_norm": 0.5113580822944641, "learning_rate": 0.00021640866873065014, "loss": 3.911, "step": 2462 }, { "epoch": 3.14976, "grad_norm": 0.5048407316207886, "learning_rate": 0.00021636828644501277, "loss": 3.8197, "step": 2463 }, { "epoch": 3.15104, "grad_norm": 0.5186136960983276, "learning_rate": 0.0002163279041593754, "loss": 3.8374, "step": 2464 }, { "epoch": 3.15232, "grad_norm": 0.4925592243671417, "learning_rate": 0.00021628752187373806, "loss": 3.8343, "step": 2465 }, { "epoch": 3.1536, "grad_norm": 0.49732711911201477, "learning_rate": 0.00021624713958810066, "loss": 3.8915, "step": 2466 }, { "epoch": 3.15488, "grad_norm": 0.5022438764572144, "learning_rate": 0.0002162067573024633, "loss": 3.8251, "step": 2467 }, { "epoch": 3.15616, "grad_norm": 0.5367256999015808, "learning_rate": 0.00021616637501682592, "loss": 3.8802, "step": 2468 }, { "epoch": 3.15744, "grad_norm": 0.4930475950241089, "learning_rate": 0.00021612599273118858, "loss": 3.8364, "step": 2469 }, { "epoch": 3.15872, "grad_norm": 0.5071558356285095, "learning_rate": 0.0002160856104455512, "loss": 3.828, "step": 2470 }, { "epoch": 3.16, "grad_norm": 0.5163344740867615, "learning_rate": 0.00021604522815991384, "loss": 3.7945, "step": 2471 }, { "epoch": 3.16128, "grad_norm": 0.5042238831520081, "learning_rate": 0.00021600484587427647, "loss": 3.8724, "step": 2472 }, { "epoch": 3.16256, "grad_norm": 0.5206556916236877, "learning_rate": 0.00021596446358863912, "loss": 3.8445, "step": 2473 }, { "epoch": 3.16384, "grad_norm": 0.4869347810745239, "learning_rate": 0.00021592408130300173, "loss": 3.8982, "step": 2474 }, { "epoch": 3.16512, "grad_norm": 0.4939110279083252, "learning_rate": 0.00021588369901736436, "loss": 3.8215, "step": 2475 }, { "epoch": 3.1664, "grad_norm": 0.4792523980140686, "learning_rate": 0.000215843316731727, "loss": 3.8195, "step": 2476 }, { "epoch": 3.16768, "grad_norm": 0.4833280146121979, "learning_rate": 0.00021580293444608962, "loss": 3.878, "step": 2477 }, { "epoch": 3.16896, "grad_norm": 0.48897233605384827, "learning_rate": 0.00021576255216045227, "loss": 3.8418, "step": 2478 }, { "epoch": 3.17024, "grad_norm": 0.5081768035888672, "learning_rate": 0.0002157221698748149, "loss": 3.8296, "step": 2479 }, { "epoch": 3.17152, "grad_norm": 0.499881386756897, "learning_rate": 0.00021568178758917753, "loss": 3.9065, "step": 2480 }, { "epoch": 3.1728, "grad_norm": 0.5089867115020752, "learning_rate": 0.00021564140530354014, "loss": 3.9007, "step": 2481 }, { "epoch": 3.17408, "grad_norm": 0.490253746509552, "learning_rate": 0.0002156010230179028, "loss": 3.9089, "step": 2482 }, { "epoch": 3.17536, "grad_norm": 0.49601680040359497, "learning_rate": 0.00021556064073226542, "loss": 3.8876, "step": 2483 }, { "epoch": 3.17664, "grad_norm": 0.5113343000411987, "learning_rate": 0.00021552025844662805, "loss": 3.8393, "step": 2484 }, { "epoch": 3.17792, "grad_norm": 0.47494059801101685, "learning_rate": 0.00021547987616099068, "loss": 3.8461, "step": 2485 }, { "epoch": 3.1792, "grad_norm": 0.5105437636375427, "learning_rate": 0.00021543949387535334, "loss": 3.8837, "step": 2486 }, { "epoch": 3.18048, "grad_norm": 0.5032381415367126, "learning_rate": 0.00021539911158971597, "loss": 3.8457, "step": 2487 }, { "epoch": 3.18176, "grad_norm": 0.49342939257621765, "learning_rate": 0.0002153587293040786, "loss": 3.819, "step": 2488 }, { "epoch": 3.18304, "grad_norm": 0.49922072887420654, "learning_rate": 0.0002153183470184412, "loss": 3.792, "step": 2489 }, { "epoch": 3.18432, "grad_norm": 0.49167224764823914, "learning_rate": 0.00021527796473280384, "loss": 3.8155, "step": 2490 }, { "epoch": 3.1856, "grad_norm": 0.4955747425556183, "learning_rate": 0.0002152375824471665, "loss": 3.9052, "step": 2491 }, { "epoch": 3.18688, "grad_norm": 0.513163149356842, "learning_rate": 0.00021519720016152912, "loss": 3.946, "step": 2492 }, { "epoch": 3.18816, "grad_norm": 0.4796956181526184, "learning_rate": 0.00021515681787589175, "loss": 3.8418, "step": 2493 }, { "epoch": 3.18944, "grad_norm": 0.5129488110542297, "learning_rate": 0.00021511643559025438, "loss": 3.9219, "step": 2494 }, { "epoch": 3.19072, "grad_norm": 0.49274012446403503, "learning_rate": 0.00021507605330461704, "loss": 3.9179, "step": 2495 }, { "epoch": 3.192, "grad_norm": 0.5269827246665955, "learning_rate": 0.00021503567101897967, "loss": 3.8845, "step": 2496 }, { "epoch": 3.19328, "grad_norm": 0.5039204955101013, "learning_rate": 0.00021499528873334227, "loss": 3.8432, "step": 2497 }, { "epoch": 3.19456, "grad_norm": 0.5350068807601929, "learning_rate": 0.0002149549064477049, "loss": 3.7202, "step": 2498 }, { "epoch": 3.19584, "grad_norm": 0.49248334765434265, "learning_rate": 0.00021491452416206756, "loss": 3.848, "step": 2499 }, { "epoch": 3.19712, "grad_norm": 0.5159714221954346, "learning_rate": 0.0002148741418764302, "loss": 3.8145, "step": 2500 }, { "epoch": 3.1984, "grad_norm": 0.4729326367378235, "learning_rate": 0.00021483375959079282, "loss": 3.9304, "step": 2501 }, { "epoch": 3.19968, "grad_norm": 0.5187593698501587, "learning_rate": 0.00021479337730515545, "loss": 3.8462, "step": 2502 }, { "epoch": 3.20096, "grad_norm": 0.521251380443573, "learning_rate": 0.00021475299501951808, "loss": 3.8268, "step": 2503 }, { "epoch": 3.20224, "grad_norm": 0.4812679886817932, "learning_rate": 0.00021471261273388074, "loss": 3.8409, "step": 2504 }, { "epoch": 3.20352, "grad_norm": 0.4940718114376068, "learning_rate": 0.00021467223044824337, "loss": 3.8029, "step": 2505 }, { "epoch": 3.2048, "grad_norm": 0.5103253126144409, "learning_rate": 0.00021463184816260597, "loss": 3.8354, "step": 2506 }, { "epoch": 3.20608, "grad_norm": 0.5237296223640442, "learning_rate": 0.0002145914658769686, "loss": 3.8567, "step": 2507 }, { "epoch": 3.20736, "grad_norm": 0.5030867457389832, "learning_rate": 0.00021455108359133126, "loss": 3.8715, "step": 2508 }, { "epoch": 3.20864, "grad_norm": 0.49784404039382935, "learning_rate": 0.0002145107013056939, "loss": 3.7889, "step": 2509 }, { "epoch": 3.20992, "grad_norm": 0.5086187124252319, "learning_rate": 0.00021447031902005652, "loss": 3.9156, "step": 2510 }, { "epoch": 3.2112, "grad_norm": 0.5107704401016235, "learning_rate": 0.00021442993673441915, "loss": 3.7905, "step": 2511 }, { "epoch": 3.2124800000000002, "grad_norm": 0.4935329258441925, "learning_rate": 0.0002143895544487818, "loss": 3.8257, "step": 2512 }, { "epoch": 3.21376, "grad_norm": 0.5265173316001892, "learning_rate": 0.00021434917216314444, "loss": 3.8725, "step": 2513 }, { "epoch": 3.21504, "grad_norm": 0.5278881192207336, "learning_rate": 0.00021430878987750704, "loss": 3.8997, "step": 2514 }, { "epoch": 3.21632, "grad_norm": 0.5073565244674683, "learning_rate": 0.00021426840759186967, "loss": 3.8788, "step": 2515 }, { "epoch": 3.2176, "grad_norm": 0.5007202625274658, "learning_rate": 0.0002142280253062323, "loss": 3.8289, "step": 2516 }, { "epoch": 3.21888, "grad_norm": 0.5268493294715881, "learning_rate": 0.00021418764302059496, "loss": 3.9057, "step": 2517 }, { "epoch": 3.22016, "grad_norm": 0.5133515000343323, "learning_rate": 0.00021414726073495759, "loss": 3.9227, "step": 2518 }, { "epoch": 3.22144, "grad_norm": 0.5341963171958923, "learning_rate": 0.00021410687844932022, "loss": 3.8918, "step": 2519 }, { "epoch": 3.22272, "grad_norm": 0.488156795501709, "learning_rate": 0.00021406649616368285, "loss": 3.9192, "step": 2520 }, { "epoch": 3.224, "grad_norm": 0.5290320515632629, "learning_rate": 0.0002140261138780455, "loss": 3.7908, "step": 2521 }, { "epoch": 3.22528, "grad_norm": 0.510662853717804, "learning_rate": 0.0002139857315924081, "loss": 3.837, "step": 2522 }, { "epoch": 3.22656, "grad_norm": 0.5320321321487427, "learning_rate": 0.00021394534930677074, "loss": 3.8889, "step": 2523 }, { "epoch": 3.22784, "grad_norm": 0.5298739075660706, "learning_rate": 0.00021390496702113337, "loss": 3.9002, "step": 2524 }, { "epoch": 3.22912, "grad_norm": 0.5062413215637207, "learning_rate": 0.00021386458473549602, "loss": 3.8557, "step": 2525 }, { "epoch": 3.2304, "grad_norm": 0.5112326145172119, "learning_rate": 0.00021382420244985865, "loss": 3.8719, "step": 2526 }, { "epoch": 3.23168, "grad_norm": 0.48387032747268677, "learning_rate": 0.00021378382016422128, "loss": 3.8151, "step": 2527 }, { "epoch": 3.23296, "grad_norm": 0.49385952949523926, "learning_rate": 0.0002137434378785839, "loss": 3.8642, "step": 2528 }, { "epoch": 3.23424, "grad_norm": 0.5138410925865173, "learning_rate": 0.00021370305559294652, "loss": 3.9133, "step": 2529 }, { "epoch": 3.23552, "grad_norm": 0.5019318461418152, "learning_rate": 0.00021366267330730917, "loss": 3.8908, "step": 2530 }, { "epoch": 3.2368, "grad_norm": 0.4930397868156433, "learning_rate": 0.0002136222910216718, "loss": 3.8062, "step": 2531 }, { "epoch": 3.23808, "grad_norm": 0.5121169090270996, "learning_rate": 0.00021358190873603443, "loss": 3.7471, "step": 2532 }, { "epoch": 3.23936, "grad_norm": 0.4978577494621277, "learning_rate": 0.00021354152645039706, "loss": 3.9114, "step": 2533 }, { "epoch": 3.24064, "grad_norm": 0.5160923004150391, "learning_rate": 0.00021350114416475972, "loss": 3.832, "step": 2534 }, { "epoch": 3.24192, "grad_norm": 0.49270156025886536, "learning_rate": 0.00021346076187912235, "loss": 3.8092, "step": 2535 }, { "epoch": 3.2432, "grad_norm": 0.5121193528175354, "learning_rate": 0.00021342037959348498, "loss": 3.7557, "step": 2536 }, { "epoch": 3.24448, "grad_norm": 0.518182098865509, "learning_rate": 0.00021337999730784758, "loss": 3.8582, "step": 2537 }, { "epoch": 3.24576, "grad_norm": 0.5109360814094543, "learning_rate": 0.00021333961502221024, "loss": 3.8122, "step": 2538 }, { "epoch": 3.24704, "grad_norm": 0.5071660280227661, "learning_rate": 0.00021329923273657287, "loss": 3.8727, "step": 2539 }, { "epoch": 3.24832, "grad_norm": 0.5436646342277527, "learning_rate": 0.0002132588504509355, "loss": 3.903, "step": 2540 }, { "epoch": 3.2496, "grad_norm": 0.5585988163948059, "learning_rate": 0.00021321846816529813, "loss": 3.8499, "step": 2541 }, { "epoch": 3.25088, "grad_norm": 0.5021224617958069, "learning_rate": 0.00021317808587966076, "loss": 3.8658, "step": 2542 }, { "epoch": 3.25216, "grad_norm": 0.532137393951416, "learning_rate": 0.00021313770359402342, "loss": 3.8951, "step": 2543 }, { "epoch": 3.25344, "grad_norm": 0.5324034690856934, "learning_rate": 0.00021309732130838605, "loss": 3.8472, "step": 2544 }, { "epoch": 3.25472, "grad_norm": 0.5090122222900391, "learning_rate": 0.00021305693902274865, "loss": 3.8011, "step": 2545 }, { "epoch": 3.2560000000000002, "grad_norm": 0.5446542501449585, "learning_rate": 0.00021301655673711128, "loss": 3.8029, "step": 2546 }, { "epoch": 3.25728, "grad_norm": 0.5329357385635376, "learning_rate": 0.00021297617445147394, "loss": 3.7843, "step": 2547 }, { "epoch": 3.25856, "grad_norm": 0.552749752998352, "learning_rate": 0.00021293579216583657, "loss": 3.8847, "step": 2548 }, { "epoch": 3.25984, "grad_norm": 0.529541552066803, "learning_rate": 0.0002128954098801992, "loss": 3.9156, "step": 2549 }, { "epoch": 3.26112, "grad_norm": 0.5321997404098511, "learning_rate": 0.00021285502759456183, "loss": 3.8645, "step": 2550 }, { "epoch": 3.2624, "grad_norm": 0.5043097138404846, "learning_rate": 0.0002128146453089245, "loss": 3.8073, "step": 2551 }, { "epoch": 3.26368, "grad_norm": 0.5317241549491882, "learning_rate": 0.00021277426302328712, "loss": 3.8946, "step": 2552 }, { "epoch": 3.26496, "grad_norm": 0.5115827918052673, "learning_rate": 0.00021273388073764972, "loss": 3.8361, "step": 2553 }, { "epoch": 3.26624, "grad_norm": 0.5349282622337341, "learning_rate": 0.00021269349845201235, "loss": 3.8482, "step": 2554 }, { "epoch": 3.26752, "grad_norm": 0.5121451616287231, "learning_rate": 0.00021265311616637498, "loss": 3.8416, "step": 2555 }, { "epoch": 3.2688, "grad_norm": 0.4995376765727997, "learning_rate": 0.00021261273388073764, "loss": 3.848, "step": 2556 }, { "epoch": 3.27008, "grad_norm": 0.5239928960800171, "learning_rate": 0.00021257235159510027, "loss": 3.8595, "step": 2557 }, { "epoch": 3.27136, "grad_norm": 0.5018921494483948, "learning_rate": 0.0002125319693094629, "loss": 3.8748, "step": 2558 }, { "epoch": 3.27264, "grad_norm": 0.5001853108406067, "learning_rate": 0.00021249158702382553, "loss": 3.7935, "step": 2559 }, { "epoch": 3.27392, "grad_norm": 0.5090051889419556, "learning_rate": 0.00021245120473818818, "loss": 3.8594, "step": 2560 }, { "epoch": 3.2752, "grad_norm": 0.491327702999115, "learning_rate": 0.0002124108224525508, "loss": 3.8575, "step": 2561 }, { "epoch": 3.27648, "grad_norm": 0.516203761100769, "learning_rate": 0.00021237044016691342, "loss": 3.8331, "step": 2562 }, { "epoch": 3.27776, "grad_norm": 0.5098548531532288, "learning_rate": 0.00021233005788127605, "loss": 3.8881, "step": 2563 }, { "epoch": 3.27904, "grad_norm": 0.5232280492782593, "learning_rate": 0.0002122896755956387, "loss": 3.8788, "step": 2564 }, { "epoch": 3.28032, "grad_norm": 0.5226551294326782, "learning_rate": 0.00021224929331000133, "loss": 3.7725, "step": 2565 }, { "epoch": 3.2816, "grad_norm": 0.5192921161651611, "learning_rate": 0.00021220891102436396, "loss": 3.9056, "step": 2566 }, { "epoch": 3.28288, "grad_norm": 0.5096680521965027, "learning_rate": 0.0002121685287387266, "loss": 3.8403, "step": 2567 }, { "epoch": 3.28416, "grad_norm": 0.4832972586154938, "learning_rate": 0.00021212814645308925, "loss": 3.8747, "step": 2568 }, { "epoch": 3.28544, "grad_norm": 0.5360098481178284, "learning_rate": 0.00021208776416745185, "loss": 3.8489, "step": 2569 }, { "epoch": 3.28672, "grad_norm": 0.5098423361778259, "learning_rate": 0.00021204738188181448, "loss": 3.8063, "step": 2570 }, { "epoch": 3.288, "grad_norm": 0.5014640092849731, "learning_rate": 0.00021200699959617712, "loss": 3.8698, "step": 2571 }, { "epoch": 3.2892799999999998, "grad_norm": 0.531622052192688, "learning_rate": 0.00021196661731053975, "loss": 4.001, "step": 2572 }, { "epoch": 3.29056, "grad_norm": 0.5045267939567566, "learning_rate": 0.0002119262350249024, "loss": 3.8148, "step": 2573 }, { "epoch": 3.29184, "grad_norm": 0.5068945288658142, "learning_rate": 0.00021188585273926503, "loss": 3.8387, "step": 2574 }, { "epoch": 3.29312, "grad_norm": 0.5441416501998901, "learning_rate": 0.00021184547045362766, "loss": 3.8937, "step": 2575 }, { "epoch": 3.2944, "grad_norm": 0.5135897397994995, "learning_rate": 0.00021180508816799027, "loss": 3.8456, "step": 2576 }, { "epoch": 3.29568, "grad_norm": 0.5423394441604614, "learning_rate": 0.00021176470588235295, "loss": 3.9034, "step": 2577 }, { "epoch": 3.29696, "grad_norm": 0.5269873738288879, "learning_rate": 0.00021172432359671555, "loss": 3.8488, "step": 2578 }, { "epoch": 3.29824, "grad_norm": 0.57085782289505, "learning_rate": 0.00021168394131107818, "loss": 3.8606, "step": 2579 }, { "epoch": 3.2995200000000002, "grad_norm": 0.5277544260025024, "learning_rate": 0.0002116435590254408, "loss": 3.8475, "step": 2580 }, { "epoch": 3.3008, "grad_norm": 0.5475723743438721, "learning_rate": 0.00021160317673980347, "loss": 3.8538, "step": 2581 }, { "epoch": 3.30208, "grad_norm": 0.5339505076408386, "learning_rate": 0.0002115627944541661, "loss": 3.8168, "step": 2582 }, { "epoch": 3.30336, "grad_norm": 0.561708927154541, "learning_rate": 0.00021152241216852873, "loss": 3.8303, "step": 2583 }, { "epoch": 3.30464, "grad_norm": 0.507378339767456, "learning_rate": 0.00021148202988289133, "loss": 3.8375, "step": 2584 }, { "epoch": 3.30592, "grad_norm": 0.5297276377677917, "learning_rate": 0.00021144164759725396, "loss": 3.8747, "step": 2585 }, { "epoch": 3.3072, "grad_norm": 0.5358178615570068, "learning_rate": 0.00021140126531161662, "loss": 3.8599, "step": 2586 }, { "epoch": 3.30848, "grad_norm": 0.5373551845550537, "learning_rate": 0.00021136088302597925, "loss": 3.8097, "step": 2587 }, { "epoch": 3.30976, "grad_norm": 0.5402244925498962, "learning_rate": 0.00021132050074034188, "loss": 3.8853, "step": 2588 }, { "epoch": 3.31104, "grad_norm": 0.5271474123001099, "learning_rate": 0.0002112801184547045, "loss": 3.7922, "step": 2589 }, { "epoch": 3.31232, "grad_norm": 0.5495550632476807, "learning_rate": 0.00021123973616906717, "loss": 3.8268, "step": 2590 }, { "epoch": 3.3136, "grad_norm": 0.5179243683815002, "learning_rate": 0.0002111993538834298, "loss": 3.9258, "step": 2591 }, { "epoch": 3.31488, "grad_norm": 0.5357790589332581, "learning_rate": 0.00021115897159779243, "loss": 3.8901, "step": 2592 }, { "epoch": 3.31616, "grad_norm": 0.4818624258041382, "learning_rate": 0.00021111858931215503, "loss": 3.8431, "step": 2593 }, { "epoch": 3.31744, "grad_norm": 0.5085586905479431, "learning_rate": 0.0002110782070265177, "loss": 3.8345, "step": 2594 }, { "epoch": 3.31872, "grad_norm": 0.518665611743927, "learning_rate": 0.00021103782474088032, "loss": 3.7475, "step": 2595 }, { "epoch": 3.32, "grad_norm": 0.5138918161392212, "learning_rate": 0.00021099744245524295, "loss": 3.8736, "step": 2596 }, { "epoch": 3.32128, "grad_norm": 0.5055665373802185, "learning_rate": 0.00021095706016960558, "loss": 3.8456, "step": 2597 }, { "epoch": 3.32256, "grad_norm": 0.4695318937301636, "learning_rate": 0.0002109166778839682, "loss": 3.8635, "step": 2598 }, { "epoch": 3.32384, "grad_norm": 0.47867250442504883, "learning_rate": 0.00021087629559833087, "loss": 3.8418, "step": 2599 }, { "epoch": 3.32512, "grad_norm": 0.5042849779129028, "learning_rate": 0.0002108359133126935, "loss": 3.8351, "step": 2600 }, { "epoch": 3.3264, "grad_norm": 0.5028448700904846, "learning_rate": 0.0002107955310270561, "loss": 3.8349, "step": 2601 }, { "epoch": 3.32768, "grad_norm": 0.5034447312355042, "learning_rate": 0.00021075514874141873, "loss": 3.8095, "step": 2602 }, { "epoch": 3.32896, "grad_norm": 0.4787198007106781, "learning_rate": 0.00021071476645578139, "loss": 3.7805, "step": 2603 }, { "epoch": 3.33024, "grad_norm": 0.535429835319519, "learning_rate": 0.00021067438417014402, "loss": 3.8945, "step": 2604 }, { "epoch": 3.33152, "grad_norm": 0.5033429265022278, "learning_rate": 0.00021063400188450665, "loss": 3.7983, "step": 2605 }, { "epoch": 3.3327999999999998, "grad_norm": 0.4938407242298126, "learning_rate": 0.00021059361959886928, "loss": 3.8798, "step": 2606 }, { "epoch": 3.33408, "grad_norm": 0.5288311839103699, "learning_rate": 0.00021055323731323193, "loss": 3.833, "step": 2607 }, { "epoch": 3.33536, "grad_norm": 0.5072165131568909, "learning_rate": 0.00021051285502759456, "loss": 3.8302, "step": 2608 }, { "epoch": 3.33664, "grad_norm": 0.5025217533111572, "learning_rate": 0.00021047247274195717, "loss": 3.8571, "step": 2609 }, { "epoch": 3.33792, "grad_norm": 0.5042093396186829, "learning_rate": 0.0002104320904563198, "loss": 3.8965, "step": 2610 }, { "epoch": 3.3392, "grad_norm": 0.5024438500404358, "learning_rate": 0.00021039170817068243, "loss": 3.8112, "step": 2611 }, { "epoch": 3.34048, "grad_norm": 0.4909932613372803, "learning_rate": 0.00021035132588504508, "loss": 3.8429, "step": 2612 }, { "epoch": 3.34176, "grad_norm": 0.505427896976471, "learning_rate": 0.00021031094359940771, "loss": 3.9004, "step": 2613 }, { "epoch": 3.3430400000000002, "grad_norm": 0.5072449445724487, "learning_rate": 0.00021027056131377034, "loss": 3.824, "step": 2614 }, { "epoch": 3.34432, "grad_norm": 0.5105025172233582, "learning_rate": 0.00021023017902813297, "loss": 3.8643, "step": 2615 }, { "epoch": 3.3456, "grad_norm": 0.5096602439880371, "learning_rate": 0.00021018979674249563, "loss": 3.8363, "step": 2616 }, { "epoch": 3.34688, "grad_norm": 0.4964136779308319, "learning_rate": 0.00021014941445685823, "loss": 3.8228, "step": 2617 }, { "epoch": 3.34816, "grad_norm": 0.510119616985321, "learning_rate": 0.00021010903217122086, "loss": 3.8846, "step": 2618 }, { "epoch": 3.34944, "grad_norm": 0.504747211933136, "learning_rate": 0.0002100686498855835, "loss": 3.8775, "step": 2619 }, { "epoch": 3.35072, "grad_norm": 0.496619313955307, "learning_rate": 0.00021002826759994615, "loss": 3.8405, "step": 2620 }, { "epoch": 3.352, "grad_norm": 0.5143598914146423, "learning_rate": 0.00020998788531430878, "loss": 3.8538, "step": 2621 }, { "epoch": 3.35328, "grad_norm": 0.49931085109710693, "learning_rate": 0.0002099475030286714, "loss": 3.8484, "step": 2622 }, { "epoch": 3.35456, "grad_norm": 0.49670740962028503, "learning_rate": 0.00020990712074303404, "loss": 3.8549, "step": 2623 }, { "epoch": 3.35584, "grad_norm": 0.5194510221481323, "learning_rate": 0.00020986673845739664, "loss": 3.9321, "step": 2624 }, { "epoch": 3.35712, "grad_norm": 0.4933635890483856, "learning_rate": 0.0002098263561717593, "loss": 3.9203, "step": 2625 }, { "epoch": 3.3584, "grad_norm": 0.5265275835990906, "learning_rate": 0.00020978597388612193, "loss": 3.7854, "step": 2626 }, { "epoch": 3.35968, "grad_norm": 0.5067412853240967, "learning_rate": 0.00020974559160048456, "loss": 3.8297, "step": 2627 }, { "epoch": 3.36096, "grad_norm": 0.5101473927497864, "learning_rate": 0.0002097052093148472, "loss": 3.8358, "step": 2628 }, { "epoch": 3.36224, "grad_norm": 0.5028688311576843, "learning_rate": 0.00020966482702920985, "loss": 3.8475, "step": 2629 }, { "epoch": 3.36352, "grad_norm": 0.5051252245903015, "learning_rate": 0.00020962444474357248, "loss": 3.8307, "step": 2630 }, { "epoch": 3.3648, "grad_norm": 0.490156888961792, "learning_rate": 0.0002095840624579351, "loss": 3.843, "step": 2631 }, { "epoch": 3.36608, "grad_norm": 0.49856409430503845, "learning_rate": 0.0002095436801722977, "loss": 3.8606, "step": 2632 }, { "epoch": 3.36736, "grad_norm": 0.47945141792297363, "learning_rate": 0.00020950329788666037, "loss": 3.8623, "step": 2633 }, { "epoch": 3.36864, "grad_norm": 0.5224535465240479, "learning_rate": 0.000209462915601023, "loss": 3.8351, "step": 2634 }, { "epoch": 3.36992, "grad_norm": 0.49152296781539917, "learning_rate": 0.00020942253331538563, "loss": 3.812, "step": 2635 }, { "epoch": 3.3712, "grad_norm": 0.4920552968978882, "learning_rate": 0.00020938215102974826, "loss": 3.8568, "step": 2636 }, { "epoch": 3.37248, "grad_norm": 0.4960016906261444, "learning_rate": 0.0002093417687441109, "loss": 3.8487, "step": 2637 }, { "epoch": 3.37376, "grad_norm": 0.5091875195503235, "learning_rate": 0.00020930138645847355, "loss": 3.8627, "step": 2638 }, { "epoch": 3.37504, "grad_norm": 0.4994186758995056, "learning_rate": 0.00020926100417283618, "loss": 3.8548, "step": 2639 }, { "epoch": 3.3763199999999998, "grad_norm": 0.49067172408103943, "learning_rate": 0.00020922062188719878, "loss": 3.8619, "step": 2640 }, { "epoch": 3.3776, "grad_norm": 0.4815825819969177, "learning_rate": 0.0002091802396015614, "loss": 3.8202, "step": 2641 }, { "epoch": 3.37888, "grad_norm": 0.501998245716095, "learning_rate": 0.00020913985731592407, "loss": 3.8243, "step": 2642 }, { "epoch": 3.38016, "grad_norm": 0.49832189083099365, "learning_rate": 0.0002090994750302867, "loss": 3.8018, "step": 2643 }, { "epoch": 3.38144, "grad_norm": 0.5382563471794128, "learning_rate": 0.00020905909274464933, "loss": 3.8121, "step": 2644 }, { "epoch": 3.38272, "grad_norm": 0.5151340961456299, "learning_rate": 0.00020901871045901196, "loss": 3.8461, "step": 2645 }, { "epoch": 3.384, "grad_norm": 0.5148212909698486, "learning_rate": 0.00020897832817337461, "loss": 3.9021, "step": 2646 }, { "epoch": 3.38528, "grad_norm": 0.49888908863067627, "learning_rate": 0.00020893794588773724, "loss": 3.9323, "step": 2647 }, { "epoch": 3.3865600000000002, "grad_norm": 0.5344513058662415, "learning_rate": 0.00020889756360209985, "loss": 3.8421, "step": 2648 }, { "epoch": 3.38784, "grad_norm": 0.5026503205299377, "learning_rate": 0.00020885718131646248, "loss": 3.7643, "step": 2649 }, { "epoch": 3.38912, "grad_norm": 0.4816747009754181, "learning_rate": 0.0002088167990308251, "loss": 3.7415, "step": 2650 }, { "epoch": 3.3904, "grad_norm": 0.5043998956680298, "learning_rate": 0.00020877641674518777, "loss": 3.8523, "step": 2651 }, { "epoch": 3.39168, "grad_norm": 0.5302547812461853, "learning_rate": 0.0002087360344595504, "loss": 3.8868, "step": 2652 }, { "epoch": 3.39296, "grad_norm": 0.47889089584350586, "learning_rate": 0.00020869565217391303, "loss": 3.7999, "step": 2653 }, { "epoch": 3.39424, "grad_norm": 0.5710118412971497, "learning_rate": 0.00020865526988827566, "loss": 3.8922, "step": 2654 }, { "epoch": 3.39552, "grad_norm": 0.504432201385498, "learning_rate": 0.0002086148876026383, "loss": 3.8663, "step": 2655 }, { "epoch": 3.3968, "grad_norm": 0.5317485928535461, "learning_rate": 0.00020857450531700092, "loss": 3.8964, "step": 2656 }, { "epoch": 3.39808, "grad_norm": 0.5189566612243652, "learning_rate": 0.00020853412303136355, "loss": 3.8165, "step": 2657 }, { "epoch": 3.39936, "grad_norm": 0.5216357111930847, "learning_rate": 0.00020849374074572618, "loss": 3.8114, "step": 2658 }, { "epoch": 3.40064, "grad_norm": 0.517992377281189, "learning_rate": 0.00020845335846008883, "loss": 3.8755, "step": 2659 }, { "epoch": 3.40192, "grad_norm": 0.5476480722427368, "learning_rate": 0.00020841297617445146, "loss": 3.8006, "step": 2660 }, { "epoch": 3.4032, "grad_norm": 0.5155659914016724, "learning_rate": 0.0002083725938888141, "loss": 3.7955, "step": 2661 }, { "epoch": 3.40448, "grad_norm": 0.5250527262687683, "learning_rate": 0.00020833221160317672, "loss": 3.9009, "step": 2662 }, { "epoch": 3.40576, "grad_norm": 0.5005857944488525, "learning_rate": 0.00020829182931753933, "loss": 3.8746, "step": 2663 }, { "epoch": 3.40704, "grad_norm": 0.5127919316291809, "learning_rate": 0.000208251447031902, "loss": 3.8685, "step": 2664 }, { "epoch": 3.40832, "grad_norm": 0.49947747588157654, "learning_rate": 0.0002082110647462646, "loss": 3.8362, "step": 2665 }, { "epoch": 3.4096, "grad_norm": 0.48780733346939087, "learning_rate": 0.00020817068246062724, "loss": 3.821, "step": 2666 }, { "epoch": 3.41088, "grad_norm": 0.5010076761245728, "learning_rate": 0.00020813030017498987, "loss": 3.7895, "step": 2667 }, { "epoch": 3.41216, "grad_norm": 0.5000413060188293, "learning_rate": 0.00020808991788935253, "loss": 3.8729, "step": 2668 }, { "epoch": 3.41344, "grad_norm": 0.5171912312507629, "learning_rate": 0.00020804953560371516, "loss": 3.8008, "step": 2669 }, { "epoch": 3.41472, "grad_norm": 0.49489569664001465, "learning_rate": 0.0002080091533180778, "loss": 3.8762, "step": 2670 }, { "epoch": 3.416, "grad_norm": 0.49014776945114136, "learning_rate": 0.0002079687710324404, "loss": 3.7536, "step": 2671 }, { "epoch": 3.41728, "grad_norm": 0.5062114596366882, "learning_rate": 0.00020792838874680308, "loss": 3.8479, "step": 2672 }, { "epoch": 3.41856, "grad_norm": 0.49555784463882446, "learning_rate": 0.00020788800646116568, "loss": 3.851, "step": 2673 }, { "epoch": 3.4198399999999998, "grad_norm": 0.48227742314338684, "learning_rate": 0.0002078476241755283, "loss": 3.8026, "step": 2674 }, { "epoch": 3.42112, "grad_norm": 0.49328896403312683, "learning_rate": 0.00020780724188989094, "loss": 3.8565, "step": 2675 }, { "epoch": 3.4224, "grad_norm": 0.4922971725463867, "learning_rate": 0.00020776685960425357, "loss": 3.8081, "step": 2676 }, { "epoch": 3.42368, "grad_norm": 0.5213683843612671, "learning_rate": 0.00020772647731861623, "loss": 3.7913, "step": 2677 }, { "epoch": 3.42496, "grad_norm": 0.4958494007587433, "learning_rate": 0.00020768609503297886, "loss": 3.8678, "step": 2678 }, { "epoch": 3.42624, "grad_norm": 0.4969683885574341, "learning_rate": 0.0002076457127473415, "loss": 3.7767, "step": 2679 }, { "epoch": 3.42752, "grad_norm": 0.5043670535087585, "learning_rate": 0.0002076053304617041, "loss": 3.8217, "step": 2680 }, { "epoch": 3.4288, "grad_norm": 0.4834843575954437, "learning_rate": 0.00020756494817606675, "loss": 3.7935, "step": 2681 }, { "epoch": 3.4300800000000002, "grad_norm": 0.5033546686172485, "learning_rate": 0.00020752456589042938, "loss": 3.8507, "step": 2682 }, { "epoch": 3.43136, "grad_norm": 0.4869885742664337, "learning_rate": 0.000207484183604792, "loss": 3.7742, "step": 2683 }, { "epoch": 3.43264, "grad_norm": 0.5016533732414246, "learning_rate": 0.00020744380131915464, "loss": 3.8161, "step": 2684 }, { "epoch": 3.43392, "grad_norm": 0.4721163809299469, "learning_rate": 0.0002074034190335173, "loss": 3.8743, "step": 2685 }, { "epoch": 3.4352, "grad_norm": 0.5129477381706238, "learning_rate": 0.00020736303674787993, "loss": 3.9012, "step": 2686 }, { "epoch": 3.43648, "grad_norm": 0.5066404938697815, "learning_rate": 0.00020732265446224256, "loss": 3.8459, "step": 2687 }, { "epoch": 3.43776, "grad_norm": 0.5318638682365417, "learning_rate": 0.00020728227217660516, "loss": 3.744, "step": 2688 }, { "epoch": 3.43904, "grad_norm": 0.497341513633728, "learning_rate": 0.00020724188989096782, "loss": 3.8856, "step": 2689 }, { "epoch": 3.44032, "grad_norm": 0.5572753548622131, "learning_rate": 0.00020720150760533045, "loss": 3.7982, "step": 2690 }, { "epoch": 3.4416, "grad_norm": 0.4999849498271942, "learning_rate": 0.00020716112531969308, "loss": 3.7728, "step": 2691 }, { "epoch": 3.44288, "grad_norm": 0.5322354435920715, "learning_rate": 0.0002071207430340557, "loss": 3.9, "step": 2692 }, { "epoch": 3.44416, "grad_norm": 0.5226873755455017, "learning_rate": 0.00020708036074841834, "loss": 3.8428, "step": 2693 }, { "epoch": 3.44544, "grad_norm": 0.5086526870727539, "learning_rate": 0.000207039978462781, "loss": 3.8667, "step": 2694 }, { "epoch": 3.44672, "grad_norm": 0.4990271329879761, "learning_rate": 0.00020699959617714362, "loss": 3.8685, "step": 2695 }, { "epoch": 3.448, "grad_norm": 0.5120396018028259, "learning_rate": 0.00020695921389150623, "loss": 3.8694, "step": 2696 }, { "epoch": 3.44928, "grad_norm": 0.5174608826637268, "learning_rate": 0.00020691883160586886, "loss": 3.8495, "step": 2697 }, { "epoch": 3.45056, "grad_norm": 0.5169567465782166, "learning_rate": 0.00020687844932023151, "loss": 3.8418, "step": 2698 }, { "epoch": 3.45184, "grad_norm": 0.5117883682250977, "learning_rate": 0.00020683806703459414, "loss": 3.8707, "step": 2699 }, { "epoch": 3.45312, "grad_norm": 0.5186121463775635, "learning_rate": 0.00020679768474895677, "loss": 3.8601, "step": 2700 }, { "epoch": 3.4544, "grad_norm": 0.5014731884002686, "learning_rate": 0.0002067573024633194, "loss": 3.8657, "step": 2701 }, { "epoch": 3.45568, "grad_norm": 0.4928348958492279, "learning_rate": 0.00020671692017768206, "loss": 3.7956, "step": 2702 }, { "epoch": 3.45696, "grad_norm": 0.4971044063568115, "learning_rate": 0.0002066765378920447, "loss": 3.9533, "step": 2703 }, { "epoch": 3.45824, "grad_norm": 0.5187544226646423, "learning_rate": 0.0002066361556064073, "loss": 3.847, "step": 2704 }, { "epoch": 3.45952, "grad_norm": 0.4827615022659302, "learning_rate": 0.00020659577332076992, "loss": 3.8399, "step": 2705 }, { "epoch": 3.4608, "grad_norm": 0.5062766671180725, "learning_rate": 0.00020655539103513255, "loss": 3.9054, "step": 2706 }, { "epoch": 3.46208, "grad_norm": 0.49423202872276306, "learning_rate": 0.0002065150087494952, "loss": 3.8656, "step": 2707 }, { "epoch": 3.4633599999999998, "grad_norm": 0.5079230666160583, "learning_rate": 0.00020647462646385784, "loss": 3.7847, "step": 2708 }, { "epoch": 3.46464, "grad_norm": 0.5124461650848389, "learning_rate": 0.00020643424417822047, "loss": 3.7327, "step": 2709 }, { "epoch": 3.46592, "grad_norm": 0.5108417868614197, "learning_rate": 0.0002063938618925831, "loss": 3.848, "step": 2710 }, { "epoch": 3.4672, "grad_norm": 0.5090741515159607, "learning_rate": 0.00020635347960694576, "loss": 3.7348, "step": 2711 }, { "epoch": 3.46848, "grad_norm": 0.4850289523601532, "learning_rate": 0.00020631309732130836, "loss": 3.8364, "step": 2712 }, { "epoch": 3.46976, "grad_norm": 0.5068538188934326, "learning_rate": 0.000206272715035671, "loss": 3.8469, "step": 2713 }, { "epoch": 3.47104, "grad_norm": 0.5119503140449524, "learning_rate": 0.00020623233275003362, "loss": 3.9369, "step": 2714 }, { "epoch": 3.47232, "grad_norm": 0.49882522225379944, "learning_rate": 0.00020619195046439628, "loss": 3.8124, "step": 2715 }, { "epoch": 3.4736000000000002, "grad_norm": 0.5284246802330017, "learning_rate": 0.0002061515681787589, "loss": 3.7402, "step": 2716 }, { "epoch": 3.47488, "grad_norm": 0.5059674382209778, "learning_rate": 0.00020611118589312154, "loss": 3.8813, "step": 2717 }, { "epoch": 3.47616, "grad_norm": 0.5152750015258789, "learning_rate": 0.00020607080360748417, "loss": 3.8808, "step": 2718 }, { "epoch": 3.47744, "grad_norm": 0.5104813575744629, "learning_rate": 0.00020603042132184677, "loss": 3.7924, "step": 2719 }, { "epoch": 3.47872, "grad_norm": 0.500645637512207, "learning_rate": 0.00020599003903620943, "loss": 3.8189, "step": 2720 }, { "epoch": 3.48, "grad_norm": 0.511396050453186, "learning_rate": 0.00020594965675057206, "loss": 3.8472, "step": 2721 }, { "epoch": 3.48128, "grad_norm": 0.5173667073249817, "learning_rate": 0.0002059092744649347, "loss": 3.8637, "step": 2722 }, { "epoch": 3.48256, "grad_norm": 0.49966612458229065, "learning_rate": 0.00020586889217929732, "loss": 3.8889, "step": 2723 }, { "epoch": 3.48384, "grad_norm": 0.4898408055305481, "learning_rate": 0.00020582850989365998, "loss": 3.8718, "step": 2724 }, { "epoch": 3.48512, "grad_norm": 0.5065445303916931, "learning_rate": 0.0002057881276080226, "loss": 3.8627, "step": 2725 }, { "epoch": 3.4864, "grad_norm": 0.485765278339386, "learning_rate": 0.00020574774532238524, "loss": 3.7733, "step": 2726 }, { "epoch": 3.48768, "grad_norm": 0.5096103549003601, "learning_rate": 0.00020570736303674784, "loss": 3.7589, "step": 2727 }, { "epoch": 3.48896, "grad_norm": 0.5073655843734741, "learning_rate": 0.0002056669807511105, "loss": 3.8593, "step": 2728 }, { "epoch": 3.49024, "grad_norm": 0.48420706391334534, "learning_rate": 0.00020562659846547313, "loss": 3.7242, "step": 2729 }, { "epoch": 3.49152, "grad_norm": 0.5061640739440918, "learning_rate": 0.00020558621617983576, "loss": 3.9017, "step": 2730 }, { "epoch": 3.4928, "grad_norm": 0.5040037035942078, "learning_rate": 0.0002055458338941984, "loss": 3.8566, "step": 2731 }, { "epoch": 3.49408, "grad_norm": 0.4816730320453644, "learning_rate": 0.00020550545160856102, "loss": 3.8164, "step": 2732 }, { "epoch": 3.49536, "grad_norm": 0.5050880908966064, "learning_rate": 0.00020546506932292368, "loss": 3.899, "step": 2733 }, { "epoch": 3.49664, "grad_norm": 0.47876450419425964, "learning_rate": 0.0002054246870372863, "loss": 3.8351, "step": 2734 }, { "epoch": 3.49792, "grad_norm": 0.5011029243469238, "learning_rate": 0.0002053843047516489, "loss": 3.7863, "step": 2735 }, { "epoch": 3.4992, "grad_norm": 0.509458065032959, "learning_rate": 0.00020534392246601154, "loss": 3.889, "step": 2736 }, { "epoch": 3.50048, "grad_norm": 0.4975719749927521, "learning_rate": 0.0002053035401803742, "loss": 3.9014, "step": 2737 }, { "epoch": 3.50176, "grad_norm": 0.49265387654304504, "learning_rate": 0.00020526315789473683, "loss": 3.839, "step": 2738 }, { "epoch": 3.50304, "grad_norm": 0.5019585490226746, "learning_rate": 0.00020522277560909946, "loss": 3.8466, "step": 2739 }, { "epoch": 3.50432, "grad_norm": 0.49570876359939575, "learning_rate": 0.00020518239332346209, "loss": 3.8707, "step": 2740 }, { "epoch": 3.5056000000000003, "grad_norm": 0.5000244975090027, "learning_rate": 0.00020514201103782474, "loss": 3.7987, "step": 2741 }, { "epoch": 3.5068799999999998, "grad_norm": 0.5056788325309753, "learning_rate": 0.00020510162875218737, "loss": 3.8735, "step": 2742 }, { "epoch": 3.50816, "grad_norm": 0.5059694647789001, "learning_rate": 0.00020506124646654998, "loss": 3.8705, "step": 2743 }, { "epoch": 3.50944, "grad_norm": 0.47248658537864685, "learning_rate": 0.0002050208641809126, "loss": 3.8288, "step": 2744 }, { "epoch": 3.51072, "grad_norm": 0.5001699328422546, "learning_rate": 0.00020498048189527524, "loss": 3.8767, "step": 2745 }, { "epoch": 3.512, "grad_norm": 0.5233591794967651, "learning_rate": 0.0002049400996096379, "loss": 3.8299, "step": 2746 }, { "epoch": 3.51328, "grad_norm": 0.5596291422843933, "learning_rate": 0.00020489971732400052, "loss": 3.9301, "step": 2747 }, { "epoch": 3.51456, "grad_norm": 0.5503137111663818, "learning_rate": 0.00020485933503836315, "loss": 3.8602, "step": 2748 }, { "epoch": 3.51584, "grad_norm": 0.48577359318733215, "learning_rate": 0.00020481895275272578, "loss": 3.7755, "step": 2749 }, { "epoch": 3.5171200000000002, "grad_norm": 0.5109073519706726, "learning_rate": 0.00020477857046708844, "loss": 3.8155, "step": 2750 }, { "epoch": 3.5183999999999997, "grad_norm": 0.5062686800956726, "learning_rate": 0.00020473818818145104, "loss": 3.819, "step": 2751 }, { "epoch": 3.51968, "grad_norm": 0.5061745047569275, "learning_rate": 0.00020469780589581367, "loss": 3.892, "step": 2752 }, { "epoch": 3.52096, "grad_norm": 0.49665218591690063, "learning_rate": 0.0002046574236101763, "loss": 3.8608, "step": 2753 }, { "epoch": 3.52224, "grad_norm": 0.511926531791687, "learning_rate": 0.00020461704132453896, "loss": 3.7848, "step": 2754 }, { "epoch": 3.52352, "grad_norm": 0.4888019859790802, "learning_rate": 0.0002045766590389016, "loss": 3.7409, "step": 2755 }, { "epoch": 3.5248, "grad_norm": 0.4757629334926605, "learning_rate": 0.00020453627675326422, "loss": 3.8083, "step": 2756 }, { "epoch": 3.52608, "grad_norm": 0.48579922318458557, "learning_rate": 0.00020449589446762685, "loss": 3.8436, "step": 2757 }, { "epoch": 3.52736, "grad_norm": 0.47741061449050903, "learning_rate": 0.00020445551218198945, "loss": 3.7938, "step": 2758 }, { "epoch": 3.52864, "grad_norm": 0.5141867995262146, "learning_rate": 0.00020441512989635214, "loss": 3.8169, "step": 2759 }, { "epoch": 3.5299199999999997, "grad_norm": 0.4805345833301544, "learning_rate": 0.00020437474761071474, "loss": 3.8487, "step": 2760 }, { "epoch": 3.5312, "grad_norm": 0.5061795711517334, "learning_rate": 0.00020433436532507737, "loss": 3.811, "step": 2761 }, { "epoch": 3.53248, "grad_norm": 0.48661288619041443, "learning_rate": 0.00020429398303944, "loss": 3.7974, "step": 2762 }, { "epoch": 3.53376, "grad_norm": 0.5097336173057556, "learning_rate": 0.00020425360075380266, "loss": 3.8154, "step": 2763 }, { "epoch": 3.53504, "grad_norm": 0.4881674349308014, "learning_rate": 0.0002042132184681653, "loss": 3.8714, "step": 2764 }, { "epoch": 3.53632, "grad_norm": 0.5308511853218079, "learning_rate": 0.00020417283618252792, "loss": 3.786, "step": 2765 }, { "epoch": 3.5376, "grad_norm": 0.48169809579849243, "learning_rate": 0.00020413245389689055, "loss": 3.8127, "step": 2766 }, { "epoch": 3.53888, "grad_norm": 0.5140031576156616, "learning_rate": 0.0002040920716112532, "loss": 3.8552, "step": 2767 }, { "epoch": 3.54016, "grad_norm": 0.4895153343677521, "learning_rate": 0.0002040516893256158, "loss": 3.8319, "step": 2768 }, { "epoch": 3.54144, "grad_norm": 0.5244706869125366, "learning_rate": 0.00020401130703997844, "loss": 3.7663, "step": 2769 }, { "epoch": 3.54272, "grad_norm": 0.5048360228538513, "learning_rate": 0.00020397092475434107, "loss": 3.8417, "step": 2770 }, { "epoch": 3.544, "grad_norm": 0.515129029750824, "learning_rate": 0.0002039305424687037, "loss": 3.9146, "step": 2771 }, { "epoch": 3.54528, "grad_norm": 0.5083556771278381, "learning_rate": 0.00020389016018306636, "loss": 3.8344, "step": 2772 }, { "epoch": 3.54656, "grad_norm": 0.4877746105194092, "learning_rate": 0.000203849777897429, "loss": 3.8346, "step": 2773 }, { "epoch": 3.54784, "grad_norm": 0.508374035358429, "learning_rate": 0.00020380939561179162, "loss": 3.7713, "step": 2774 }, { "epoch": 3.5491200000000003, "grad_norm": 0.496378093957901, "learning_rate": 0.00020376901332615422, "loss": 3.8809, "step": 2775 }, { "epoch": 3.5504, "grad_norm": 0.5039408206939697, "learning_rate": 0.00020372863104051688, "loss": 3.8309, "step": 2776 }, { "epoch": 3.55168, "grad_norm": 0.496198445558548, "learning_rate": 0.0002036882487548795, "loss": 3.8634, "step": 2777 }, { "epoch": 3.55296, "grad_norm": 0.48462727665901184, "learning_rate": 0.00020364786646924214, "loss": 3.8348, "step": 2778 }, { "epoch": 3.55424, "grad_norm": 0.48560038208961487, "learning_rate": 0.00020360748418360477, "loss": 3.7579, "step": 2779 }, { "epoch": 3.55552, "grad_norm": 0.502811074256897, "learning_rate": 0.00020356710189796742, "loss": 3.8567, "step": 2780 }, { "epoch": 3.5568, "grad_norm": 0.4968845546245575, "learning_rate": 0.00020352671961233005, "loss": 3.8431, "step": 2781 }, { "epoch": 3.55808, "grad_norm": 0.48642468452453613, "learning_rate": 0.00020348633732669268, "loss": 3.7647, "step": 2782 }, { "epoch": 3.55936, "grad_norm": 0.4881822168827057, "learning_rate": 0.0002034459550410553, "loss": 3.792, "step": 2783 }, { "epoch": 3.5606400000000002, "grad_norm": 0.49411314725875854, "learning_rate": 0.00020340557275541792, "loss": 3.794, "step": 2784 }, { "epoch": 3.5619199999999998, "grad_norm": 0.48321542143821716, "learning_rate": 0.00020336519046978057, "loss": 3.8337, "step": 2785 }, { "epoch": 3.5632, "grad_norm": 0.4860212504863739, "learning_rate": 0.0002033248081841432, "loss": 3.8145, "step": 2786 }, { "epoch": 3.56448, "grad_norm": 0.4979793429374695, "learning_rate": 0.00020328442589850583, "loss": 3.8073, "step": 2787 }, { "epoch": 3.56576, "grad_norm": 0.4866310954093933, "learning_rate": 0.00020324404361286846, "loss": 3.7945, "step": 2788 }, { "epoch": 3.56704, "grad_norm": 0.516071617603302, "learning_rate": 0.00020320366132723112, "loss": 3.8237, "step": 2789 }, { "epoch": 3.56832, "grad_norm": 0.5022051930427551, "learning_rate": 0.00020316327904159375, "loss": 3.8451, "step": 2790 }, { "epoch": 3.5696, "grad_norm": 0.5036980509757996, "learning_rate": 0.00020312289675595636, "loss": 3.7898, "step": 2791 }, { "epoch": 3.57088, "grad_norm": 0.5079161524772644, "learning_rate": 0.00020308251447031899, "loss": 3.7985, "step": 2792 }, { "epoch": 3.5721600000000002, "grad_norm": 0.48699140548706055, "learning_rate": 0.00020304213218468164, "loss": 3.8116, "step": 2793 }, { "epoch": 3.5734399999999997, "grad_norm": 0.5011927485466003, "learning_rate": 0.00020300174989904427, "loss": 3.9555, "step": 2794 }, { "epoch": 3.57472, "grad_norm": 0.5062665939331055, "learning_rate": 0.0002029613676134069, "loss": 3.8183, "step": 2795 }, { "epoch": 3.576, "grad_norm": 0.49101898074150085, "learning_rate": 0.00020292098532776953, "loss": 3.8428, "step": 2796 }, { "epoch": 3.57728, "grad_norm": 0.5135859251022339, "learning_rate": 0.00020288060304213216, "loss": 3.8469, "step": 2797 }, { "epoch": 3.57856, "grad_norm": 0.5152158141136169, "learning_rate": 0.00020284022075649482, "loss": 3.8385, "step": 2798 }, { "epoch": 3.57984, "grad_norm": 0.4942494034767151, "learning_rate": 0.00020279983847085742, "loss": 3.9062, "step": 2799 }, { "epoch": 3.58112, "grad_norm": 0.5052478313446045, "learning_rate": 0.00020275945618522005, "loss": 3.7949, "step": 2800 }, { "epoch": 3.5824, "grad_norm": 0.5218126177787781, "learning_rate": 0.00020271907389958268, "loss": 3.8128, "step": 2801 }, { "epoch": 3.58368, "grad_norm": 0.5123757719993591, "learning_rate": 0.00020267869161394534, "loss": 3.8098, "step": 2802 }, { "epoch": 3.58496, "grad_norm": 0.48139896988868713, "learning_rate": 0.00020263830932830797, "loss": 3.8333, "step": 2803 }, { "epoch": 3.58624, "grad_norm": 0.494536817073822, "learning_rate": 0.0002025979270426706, "loss": 3.7192, "step": 2804 }, { "epoch": 3.58752, "grad_norm": 0.4789034128189087, "learning_rate": 0.00020255754475703323, "loss": 3.7481, "step": 2805 }, { "epoch": 3.5888, "grad_norm": 0.4984082877635956, "learning_rate": 0.0002025171624713959, "loss": 3.8158, "step": 2806 }, { "epoch": 3.59008, "grad_norm": 0.48809587955474854, "learning_rate": 0.0002024767801857585, "loss": 3.8401, "step": 2807 }, { "epoch": 3.59136, "grad_norm": 0.5058606863021851, "learning_rate": 0.00020243639790012112, "loss": 3.8788, "step": 2808 }, { "epoch": 3.59264, "grad_norm": 0.5134665369987488, "learning_rate": 0.00020239601561448375, "loss": 3.8255, "step": 2809 }, { "epoch": 3.59392, "grad_norm": 0.4707070589065552, "learning_rate": 0.0002023556333288464, "loss": 3.7997, "step": 2810 }, { "epoch": 3.5952, "grad_norm": 0.4775642454624176, "learning_rate": 0.00020231525104320904, "loss": 3.8048, "step": 2811 }, { "epoch": 3.59648, "grad_norm": 0.5038156509399414, "learning_rate": 0.00020227486875757167, "loss": 3.8541, "step": 2812 }, { "epoch": 3.59776, "grad_norm": 0.5030291676521301, "learning_rate": 0.0002022344864719343, "loss": 3.8111, "step": 2813 }, { "epoch": 3.59904, "grad_norm": 0.5007965564727783, "learning_rate": 0.0002021941041862969, "loss": 3.7762, "step": 2814 }, { "epoch": 3.60032, "grad_norm": 0.49890565872192383, "learning_rate": 0.00020215372190065956, "loss": 3.7919, "step": 2815 }, { "epoch": 3.6016, "grad_norm": 0.48519954085350037, "learning_rate": 0.0002021133396150222, "loss": 3.8374, "step": 2816 }, { "epoch": 3.60288, "grad_norm": 0.5019754767417908, "learning_rate": 0.00020207295732938482, "loss": 3.7988, "step": 2817 }, { "epoch": 3.6041600000000003, "grad_norm": 0.4958573877811432, "learning_rate": 0.00020203257504374745, "loss": 3.8074, "step": 2818 }, { "epoch": 3.6054399999999998, "grad_norm": 0.5017514228820801, "learning_rate": 0.0002019921927581101, "loss": 3.819, "step": 2819 }, { "epoch": 3.60672, "grad_norm": 0.5000422596931458, "learning_rate": 0.00020195181047247274, "loss": 3.8453, "step": 2820 }, { "epoch": 3.608, "grad_norm": 0.4973451793193817, "learning_rate": 0.00020191142818683537, "loss": 3.7835, "step": 2821 }, { "epoch": 3.60928, "grad_norm": 0.4728100001811981, "learning_rate": 0.00020187104590119797, "loss": 3.8969, "step": 2822 }, { "epoch": 3.61056, "grad_norm": 0.5011451840400696, "learning_rate": 0.00020183066361556063, "loss": 3.8835, "step": 2823 }, { "epoch": 3.61184, "grad_norm": 0.5048329830169678, "learning_rate": 0.00020179028132992326, "loss": 3.8225, "step": 2824 }, { "epoch": 3.61312, "grad_norm": 0.5037258863449097, "learning_rate": 0.00020174989904428589, "loss": 3.7936, "step": 2825 }, { "epoch": 3.6144, "grad_norm": 0.4970305263996124, "learning_rate": 0.00020170951675864852, "loss": 3.8651, "step": 2826 }, { "epoch": 3.6156800000000002, "grad_norm": 0.5086222290992737, "learning_rate": 0.00020166913447301115, "loss": 3.7828, "step": 2827 }, { "epoch": 3.6169599999999997, "grad_norm": 0.4862697720527649, "learning_rate": 0.0002016287521873738, "loss": 3.7793, "step": 2828 }, { "epoch": 3.61824, "grad_norm": 0.5067240595817566, "learning_rate": 0.00020158836990173643, "loss": 3.8363, "step": 2829 }, { "epoch": 3.61952, "grad_norm": 0.5017095804214478, "learning_rate": 0.00020154798761609904, "loss": 3.8429, "step": 2830 }, { "epoch": 3.6208, "grad_norm": 0.4909166991710663, "learning_rate": 0.00020150760533046167, "loss": 3.8017, "step": 2831 }, { "epoch": 3.62208, "grad_norm": 0.5125616192817688, "learning_rate": 0.00020146722304482432, "loss": 3.7553, "step": 2832 }, { "epoch": 3.62336, "grad_norm": 0.48391321301460266, "learning_rate": 0.00020142684075918695, "loss": 3.8041, "step": 2833 }, { "epoch": 3.62464, "grad_norm": 0.4965222477912903, "learning_rate": 0.00020138645847354958, "loss": 3.8016, "step": 2834 }, { "epoch": 3.62592, "grad_norm": 0.4799792170524597, "learning_rate": 0.00020134607618791221, "loss": 3.8301, "step": 2835 }, { "epoch": 3.6272, "grad_norm": 0.5186593532562256, "learning_rate": 0.00020130569390227487, "loss": 3.7594, "step": 2836 }, { "epoch": 3.62848, "grad_norm": 0.4939570426940918, "learning_rate": 0.0002012653116166375, "loss": 3.8343, "step": 2837 }, { "epoch": 3.62976, "grad_norm": 0.4945909082889557, "learning_rate": 0.0002012249293310001, "loss": 3.8158, "step": 2838 }, { "epoch": 3.63104, "grad_norm": 0.5133264660835266, "learning_rate": 0.00020118454704536273, "loss": 3.8507, "step": 2839 }, { "epoch": 3.63232, "grad_norm": 0.4962891936302185, "learning_rate": 0.00020114416475972536, "loss": 3.8135, "step": 2840 }, { "epoch": 3.6336, "grad_norm": 0.5103831887245178, "learning_rate": 0.00020110378247408802, "loss": 3.7948, "step": 2841 }, { "epoch": 3.63488, "grad_norm": 0.5109889507293701, "learning_rate": 0.00020106340018845065, "loss": 3.8751, "step": 2842 }, { "epoch": 3.63616, "grad_norm": 0.4803555905818939, "learning_rate": 0.00020102301790281328, "loss": 3.7989, "step": 2843 }, { "epoch": 3.63744, "grad_norm": 0.494994193315506, "learning_rate": 0.0002009826356171759, "loss": 3.8933, "step": 2844 }, { "epoch": 3.63872, "grad_norm": 0.4815881848335266, "learning_rate": 0.00020094225333153857, "loss": 3.7667, "step": 2845 }, { "epoch": 3.64, "grad_norm": 0.5187839865684509, "learning_rate": 0.0002009018710459012, "loss": 3.8322, "step": 2846 }, { "epoch": 3.64128, "grad_norm": 0.5170251727104187, "learning_rate": 0.0002008614887602638, "loss": 3.8624, "step": 2847 }, { "epoch": 3.64256, "grad_norm": 0.4987940490245819, "learning_rate": 0.00020082110647462643, "loss": 3.8021, "step": 2848 }, { "epoch": 3.64384, "grad_norm": 0.5137864351272583, "learning_rate": 0.0002007807241889891, "loss": 3.8342, "step": 2849 }, { "epoch": 3.64512, "grad_norm": 0.4848984479904175, "learning_rate": 0.00020074034190335172, "loss": 3.7829, "step": 2850 }, { "epoch": 3.6464, "grad_norm": 0.5238636136054993, "learning_rate": 0.00020069995961771435, "loss": 3.8299, "step": 2851 }, { "epoch": 3.6476800000000003, "grad_norm": 0.49876147508621216, "learning_rate": 0.00020065957733207698, "loss": 3.7739, "step": 2852 }, { "epoch": 3.6489599999999998, "grad_norm": 0.5346967577934265, "learning_rate": 0.00020061919504643958, "loss": 3.8934, "step": 2853 }, { "epoch": 3.65024, "grad_norm": 0.4966239631175995, "learning_rate": 0.00020057881276080227, "loss": 3.9085, "step": 2854 }, { "epoch": 3.65152, "grad_norm": 0.5036845207214355, "learning_rate": 0.00020053843047516487, "loss": 3.7837, "step": 2855 }, { "epoch": 3.6528, "grad_norm": 0.49921709299087524, "learning_rate": 0.0002004980481895275, "loss": 3.8029, "step": 2856 }, { "epoch": 3.65408, "grad_norm": 0.5502278804779053, "learning_rate": 0.00020045766590389013, "loss": 3.8748, "step": 2857 }, { "epoch": 3.65536, "grad_norm": 0.4963222146034241, "learning_rate": 0.0002004172836182528, "loss": 3.7893, "step": 2858 }, { "epoch": 3.65664, "grad_norm": 0.5209027528762817, "learning_rate": 0.00020037690133261542, "loss": 3.7979, "step": 2859 }, { "epoch": 3.65792, "grad_norm": 0.4983651340007782, "learning_rate": 0.00020033651904697805, "loss": 3.8321, "step": 2860 }, { "epoch": 3.6592000000000002, "grad_norm": 0.6087284088134766, "learning_rate": 0.00020029613676134068, "loss": 3.8507, "step": 2861 }, { "epoch": 3.6604799999999997, "grad_norm": 0.5027658343315125, "learning_rate": 0.00020025575447570333, "loss": 3.8565, "step": 2862 }, { "epoch": 3.66176, "grad_norm": 0.5117068886756897, "learning_rate": 0.00020021537219006594, "loss": 3.825, "step": 2863 }, { "epoch": 3.66304, "grad_norm": 0.5000763535499573, "learning_rate": 0.00020017498990442857, "loss": 3.8242, "step": 2864 }, { "epoch": 3.66432, "grad_norm": 0.5197402834892273, "learning_rate": 0.0002001346076187912, "loss": 3.8651, "step": 2865 }, { "epoch": 3.6656, "grad_norm": 0.5055534839630127, "learning_rate": 0.00020009422533315383, "loss": 3.8355, "step": 2866 }, { "epoch": 3.66688, "grad_norm": 0.5054891705513, "learning_rate": 0.00020005384304751648, "loss": 3.8619, "step": 2867 }, { "epoch": 3.66816, "grad_norm": 0.47256743907928467, "learning_rate": 0.00020001346076187911, "loss": 3.7752, "step": 2868 }, { "epoch": 3.66944, "grad_norm": 0.48075199127197266, "learning_rate": 0.00019997307847624174, "loss": 3.8054, "step": 2869 }, { "epoch": 3.67072, "grad_norm": 0.4974772036075592, "learning_rate": 0.00019993269619060435, "loss": 3.7665, "step": 2870 }, { "epoch": 3.672, "grad_norm": 0.48151201009750366, "learning_rate": 0.000199892313904967, "loss": 3.8581, "step": 2871 }, { "epoch": 3.67328, "grad_norm": 0.5086482763290405, "learning_rate": 0.00019985193161932964, "loss": 3.8571, "step": 2872 }, { "epoch": 3.67456, "grad_norm": 0.4769311845302582, "learning_rate": 0.00019981154933369227, "loss": 3.8258, "step": 2873 }, { "epoch": 3.67584, "grad_norm": 0.5033696889877319, "learning_rate": 0.0001997711670480549, "loss": 3.8217, "step": 2874 }, { "epoch": 3.67712, "grad_norm": 0.4959774613380432, "learning_rate": 0.00019973078476241755, "loss": 3.8151, "step": 2875 }, { "epoch": 3.6784, "grad_norm": 0.4797199070453644, "learning_rate": 0.00019969040247678018, "loss": 3.8164, "step": 2876 }, { "epoch": 3.67968, "grad_norm": 0.5271828770637512, "learning_rate": 0.0001996500201911428, "loss": 3.7692, "step": 2877 }, { "epoch": 3.68096, "grad_norm": 0.489942729473114, "learning_rate": 0.00019960963790550542, "loss": 3.8024, "step": 2878 }, { "epoch": 3.68224, "grad_norm": 0.5125555992126465, "learning_rate": 0.00019956925561986805, "loss": 3.7912, "step": 2879 }, { "epoch": 3.68352, "grad_norm": 0.4915968179702759, "learning_rate": 0.0001995288733342307, "loss": 3.7815, "step": 2880 }, { "epoch": 3.6848, "grad_norm": 0.49331218004226685, "learning_rate": 0.00019948849104859333, "loss": 3.878, "step": 2881 }, { "epoch": 3.68608, "grad_norm": 0.5256069898605347, "learning_rate": 0.00019944810876295596, "loss": 3.8137, "step": 2882 }, { "epoch": 3.68736, "grad_norm": 0.4750528335571289, "learning_rate": 0.0001994077264773186, "loss": 3.7832, "step": 2883 }, { "epoch": 3.68864, "grad_norm": 0.5187920928001404, "learning_rate": 0.00019936734419168125, "loss": 3.8111, "step": 2884 }, { "epoch": 3.68992, "grad_norm": 0.4905349016189575, "learning_rate": 0.00019932696190604388, "loss": 3.8054, "step": 2885 }, { "epoch": 3.6912000000000003, "grad_norm": 0.5346660614013672, "learning_rate": 0.00019928657962040648, "loss": 3.8365, "step": 2886 }, { "epoch": 3.6924799999999998, "grad_norm": 0.5014554262161255, "learning_rate": 0.0001992461973347691, "loss": 3.8105, "step": 2887 }, { "epoch": 3.69376, "grad_norm": 0.4906976521015167, "learning_rate": 0.00019920581504913177, "loss": 3.7636, "step": 2888 }, { "epoch": 3.69504, "grad_norm": 0.5077499747276306, "learning_rate": 0.0001991654327634944, "loss": 3.7638, "step": 2889 }, { "epoch": 3.69632, "grad_norm": 0.48475882411003113, "learning_rate": 0.00019912505047785703, "loss": 3.8723, "step": 2890 }, { "epoch": 3.6976, "grad_norm": 0.509909451007843, "learning_rate": 0.00019908466819221966, "loss": 3.8396, "step": 2891 }, { "epoch": 3.69888, "grad_norm": 0.49699631333351135, "learning_rate": 0.0001990442859065823, "loss": 3.7865, "step": 2892 }, { "epoch": 3.70016, "grad_norm": 0.4960545301437378, "learning_rate": 0.00019900390362094495, "loss": 3.819, "step": 2893 }, { "epoch": 3.70144, "grad_norm": 0.5059762597084045, "learning_rate": 0.00019896352133530755, "loss": 3.8588, "step": 2894 }, { "epoch": 3.7027200000000002, "grad_norm": 0.503450870513916, "learning_rate": 0.00019892313904967018, "loss": 3.8109, "step": 2895 }, { "epoch": 3.7039999999999997, "grad_norm": 0.505183219909668, "learning_rate": 0.0001988827567640328, "loss": 3.7826, "step": 2896 }, { "epoch": 3.70528, "grad_norm": 0.5047440528869629, "learning_rate": 0.00019884237447839547, "loss": 3.8386, "step": 2897 }, { "epoch": 3.70656, "grad_norm": 0.5242089033126831, "learning_rate": 0.0001988019921927581, "loss": 3.7865, "step": 2898 }, { "epoch": 3.70784, "grad_norm": 0.48960578441619873, "learning_rate": 0.00019876160990712073, "loss": 3.8107, "step": 2899 }, { "epoch": 3.70912, "grad_norm": 0.5148612260818481, "learning_rate": 0.00019872122762148336, "loss": 3.7782, "step": 2900 }, { "epoch": 3.7104, "grad_norm": 0.4915827810764313, "learning_rate": 0.00019868084533584602, "loss": 3.8166, "step": 2901 }, { "epoch": 3.71168, "grad_norm": 0.4962410032749176, "learning_rate": 0.00019864046305020862, "loss": 3.7864, "step": 2902 }, { "epoch": 3.71296, "grad_norm": 0.50472092628479, "learning_rate": 0.00019860008076457125, "loss": 3.7607, "step": 2903 }, { "epoch": 3.71424, "grad_norm": 0.5142050981521606, "learning_rate": 0.00019855969847893388, "loss": 3.835, "step": 2904 }, { "epoch": 3.71552, "grad_norm": 0.5082858204841614, "learning_rate": 0.0001985193161932965, "loss": 3.903, "step": 2905 }, { "epoch": 3.7168, "grad_norm": 0.529228150844574, "learning_rate": 0.00019847893390765917, "loss": 3.7935, "step": 2906 }, { "epoch": 3.71808, "grad_norm": 0.515741765499115, "learning_rate": 0.0001984385516220218, "loss": 3.863, "step": 2907 }, { "epoch": 3.71936, "grad_norm": 0.5094571113586426, "learning_rate": 0.00019839816933638443, "loss": 3.8557, "step": 2908 }, { "epoch": 3.72064, "grad_norm": 0.5035596489906311, "learning_rate": 0.00019835778705074703, "loss": 3.8019, "step": 2909 }, { "epoch": 3.72192, "grad_norm": 0.5284061431884766, "learning_rate": 0.00019831740476510969, "loss": 3.8547, "step": 2910 }, { "epoch": 3.7232, "grad_norm": 0.5245686173439026, "learning_rate": 0.00019827702247947232, "loss": 3.7454, "step": 2911 }, { "epoch": 3.72448, "grad_norm": 0.514725923538208, "learning_rate": 0.00019823664019383495, "loss": 3.8237, "step": 2912 }, { "epoch": 3.72576, "grad_norm": 0.5170477628707886, "learning_rate": 0.00019819625790819758, "loss": 3.8264, "step": 2913 }, { "epoch": 3.72704, "grad_norm": 0.5244784951210022, "learning_rate": 0.00019815587562256023, "loss": 3.7872, "step": 2914 }, { "epoch": 3.72832, "grad_norm": 0.510220468044281, "learning_rate": 0.00019811549333692286, "loss": 3.8574, "step": 2915 }, { "epoch": 3.7296, "grad_norm": 0.5242193937301636, "learning_rate": 0.0001980751110512855, "loss": 3.8547, "step": 2916 }, { "epoch": 3.73088, "grad_norm": 0.48602530360221863, "learning_rate": 0.0001980347287656481, "loss": 3.7948, "step": 2917 }, { "epoch": 3.73216, "grad_norm": 0.4796367585659027, "learning_rate": 0.00019799434648001073, "loss": 3.793, "step": 2918 }, { "epoch": 3.73344, "grad_norm": 0.4868846535682678, "learning_rate": 0.00019795396419437338, "loss": 3.8138, "step": 2919 }, { "epoch": 3.7347200000000003, "grad_norm": 0.48275643587112427, "learning_rate": 0.00019791358190873601, "loss": 3.8185, "step": 2920 }, { "epoch": 3.7359999999999998, "grad_norm": 0.4956841468811035, "learning_rate": 0.00019787319962309864, "loss": 3.8374, "step": 2921 }, { "epoch": 3.73728, "grad_norm": 0.5024325251579285, "learning_rate": 0.00019783281733746127, "loss": 3.9032, "step": 2922 }, { "epoch": 3.73856, "grad_norm": 0.4799942970275879, "learning_rate": 0.00019779243505182393, "loss": 3.8317, "step": 2923 }, { "epoch": 3.73984, "grad_norm": 0.5100151300430298, "learning_rate": 0.00019775205276618656, "loss": 3.8394, "step": 2924 }, { "epoch": 3.74112, "grad_norm": 0.4840579926967621, "learning_rate": 0.00019771167048054916, "loss": 3.7803, "step": 2925 }, { "epoch": 3.7424, "grad_norm": 0.495714008808136, "learning_rate": 0.0001976712881949118, "loss": 3.7849, "step": 2926 }, { "epoch": 3.74368, "grad_norm": 0.49256429076194763, "learning_rate": 0.00019763090590927445, "loss": 3.7815, "step": 2927 }, { "epoch": 3.74496, "grad_norm": 0.4639676809310913, "learning_rate": 0.00019759052362363708, "loss": 3.8497, "step": 2928 }, { "epoch": 3.7462400000000002, "grad_norm": 0.486234575510025, "learning_rate": 0.0001975501413379997, "loss": 3.866, "step": 2929 }, { "epoch": 3.7475199999999997, "grad_norm": 0.4823131263256073, "learning_rate": 0.00019750975905236234, "loss": 3.7977, "step": 2930 }, { "epoch": 3.7488, "grad_norm": 0.4761921465396881, "learning_rate": 0.000197469376766725, "loss": 3.7816, "step": 2931 }, { "epoch": 3.75008, "grad_norm": 0.4939184784889221, "learning_rate": 0.00019742899448108763, "loss": 3.8268, "step": 2932 }, { "epoch": 3.75136, "grad_norm": 0.4557759761810303, "learning_rate": 0.00019738861219545026, "loss": 3.7831, "step": 2933 }, { "epoch": 3.75264, "grad_norm": 0.4687483012676239, "learning_rate": 0.00019734822990981286, "loss": 3.8796, "step": 2934 }, { "epoch": 3.75392, "grad_norm": 0.4676748514175415, "learning_rate": 0.0001973078476241755, "loss": 3.8334, "step": 2935 }, { "epoch": 3.7552, "grad_norm": 0.47445160150527954, "learning_rate": 0.00019726746533853815, "loss": 3.7532, "step": 2936 }, { "epoch": 3.75648, "grad_norm": 0.4851106107234955, "learning_rate": 0.00019722708305290078, "loss": 3.7783, "step": 2937 }, { "epoch": 3.75776, "grad_norm": 0.4669123888015747, "learning_rate": 0.0001971867007672634, "loss": 3.8144, "step": 2938 }, { "epoch": 3.75904, "grad_norm": 0.4675630033016205, "learning_rate": 0.00019714631848162604, "loss": 3.7909, "step": 2939 }, { "epoch": 3.76032, "grad_norm": 0.4890853762626648, "learning_rate": 0.0001971059361959887, "loss": 3.803, "step": 2940 }, { "epoch": 3.7616, "grad_norm": 0.4596586227416992, "learning_rate": 0.00019706555391035133, "loss": 3.7987, "step": 2941 }, { "epoch": 3.76288, "grad_norm": 0.5085187554359436, "learning_rate": 0.00019702517162471393, "loss": 3.7938, "step": 2942 }, { "epoch": 3.76416, "grad_norm": 0.4674232006072998, "learning_rate": 0.00019698478933907656, "loss": 3.9139, "step": 2943 }, { "epoch": 3.76544, "grad_norm": 0.4964526891708374, "learning_rate": 0.00019694440705343922, "loss": 3.805, "step": 2944 }, { "epoch": 3.76672, "grad_norm": 0.4870041310787201, "learning_rate": 0.00019690402476780185, "loss": 3.7573, "step": 2945 }, { "epoch": 3.768, "grad_norm": 0.4788266718387604, "learning_rate": 0.00019686364248216448, "loss": 3.8082, "step": 2946 }, { "epoch": 3.76928, "grad_norm": 0.5005785822868347, "learning_rate": 0.0001968232601965271, "loss": 3.7935, "step": 2947 }, { "epoch": 3.77056, "grad_norm": 0.48432236909866333, "learning_rate": 0.00019678287791088974, "loss": 3.8643, "step": 2948 }, { "epoch": 3.77184, "grad_norm": 0.4844403862953186, "learning_rate": 0.0001967424956252524, "loss": 3.8046, "step": 2949 }, { "epoch": 3.77312, "grad_norm": 0.49328166246414185, "learning_rate": 0.000196702113339615, "loss": 3.8041, "step": 2950 }, { "epoch": 3.7744, "grad_norm": 0.48709914088249207, "learning_rate": 0.00019666173105397763, "loss": 3.7822, "step": 2951 }, { "epoch": 3.77568, "grad_norm": 0.4914034903049469, "learning_rate": 0.00019662134876834026, "loss": 3.866, "step": 2952 }, { "epoch": 3.77696, "grad_norm": 0.5005134344100952, "learning_rate": 0.00019658096648270292, "loss": 3.7386, "step": 2953 }, { "epoch": 3.7782400000000003, "grad_norm": 0.4764963388442993, "learning_rate": 0.00019654058419706555, "loss": 3.8049, "step": 2954 }, { "epoch": 3.7795199999999998, "grad_norm": 0.4586517810821533, "learning_rate": 0.00019650020191142818, "loss": 3.8072, "step": 2955 }, { "epoch": 3.7808, "grad_norm": 0.4838949739933014, "learning_rate": 0.0001964598196257908, "loss": 3.8139, "step": 2956 }, { "epoch": 3.78208, "grad_norm": 0.48450782895088196, "learning_rate": 0.00019641943734015346, "loss": 3.8376, "step": 2957 }, { "epoch": 3.78336, "grad_norm": 0.5044925808906555, "learning_rate": 0.00019637905505451607, "loss": 3.7702, "step": 2958 }, { "epoch": 3.78464, "grad_norm": 0.4797843396663666, "learning_rate": 0.0001963386727688787, "loss": 3.7628, "step": 2959 }, { "epoch": 3.78592, "grad_norm": 0.4827995300292969, "learning_rate": 0.00019629829048324133, "loss": 3.7901, "step": 2960 }, { "epoch": 3.7872, "grad_norm": 0.4926610291004181, "learning_rate": 0.00019625790819760396, "loss": 3.8232, "step": 2961 }, { "epoch": 3.78848, "grad_norm": 0.46942880749702454, "learning_rate": 0.0001962175259119666, "loss": 3.7819, "step": 2962 }, { "epoch": 3.7897600000000002, "grad_norm": 0.5101065039634705, "learning_rate": 0.00019617714362632924, "loss": 3.8026, "step": 2963 }, { "epoch": 3.7910399999999997, "grad_norm": 0.4860995411872864, "learning_rate": 0.00019613676134069187, "loss": 3.8669, "step": 2964 }, { "epoch": 3.79232, "grad_norm": 0.49931204319000244, "learning_rate": 0.00019609637905505448, "loss": 3.8031, "step": 2965 }, { "epoch": 3.7936, "grad_norm": 0.4776234030723572, "learning_rate": 0.00019605599676941713, "loss": 3.8091, "step": 2966 }, { "epoch": 3.79488, "grad_norm": 0.5147337317466736, "learning_rate": 0.00019601561448377976, "loss": 3.8127, "step": 2967 }, { "epoch": 3.79616, "grad_norm": 0.49513697624206543, "learning_rate": 0.0001959752321981424, "loss": 3.9379, "step": 2968 }, { "epoch": 3.79744, "grad_norm": 0.5062114596366882, "learning_rate": 0.00019593484991250502, "loss": 3.833, "step": 2969 }, { "epoch": 3.79872, "grad_norm": 0.48605409264564514, "learning_rate": 0.00019589446762686768, "loss": 3.8156, "step": 2970 }, { "epoch": 3.8, "grad_norm": 0.48607662320137024, "learning_rate": 0.0001958540853412303, "loss": 3.8235, "step": 2971 }, { "epoch": 3.80128, "grad_norm": 0.48669230937957764, "learning_rate": 0.00019581370305559294, "loss": 3.7372, "step": 2972 }, { "epoch": 3.80256, "grad_norm": 0.48195433616638184, "learning_rate": 0.00019577332076995554, "loss": 3.798, "step": 2973 }, { "epoch": 3.80384, "grad_norm": 0.4917437434196472, "learning_rate": 0.00019573293848431817, "loss": 3.8529, "step": 2974 }, { "epoch": 3.80512, "grad_norm": 0.5031919479370117, "learning_rate": 0.00019569255619868083, "loss": 3.7061, "step": 2975 }, { "epoch": 3.8064, "grad_norm": 0.5153105854988098, "learning_rate": 0.00019565217391304346, "loss": 3.7935, "step": 2976 }, { "epoch": 3.80768, "grad_norm": 0.5166071057319641, "learning_rate": 0.0001956117916274061, "loss": 3.8835, "step": 2977 }, { "epoch": 3.80896, "grad_norm": 0.47903141379356384, "learning_rate": 0.00019557140934176872, "loss": 3.7476, "step": 2978 }, { "epoch": 3.81024, "grad_norm": 0.4941449761390686, "learning_rate": 0.00019553102705613138, "loss": 3.7796, "step": 2979 }, { "epoch": 3.81152, "grad_norm": 0.5048912763595581, "learning_rate": 0.000195490644770494, "loss": 3.7708, "step": 2980 }, { "epoch": 3.8128, "grad_norm": 0.4860134422779083, "learning_rate": 0.0001954502624848566, "loss": 3.8097, "step": 2981 }, { "epoch": 3.81408, "grad_norm": 0.48185980319976807, "learning_rate": 0.00019540988019921924, "loss": 3.8444, "step": 2982 }, { "epoch": 3.81536, "grad_norm": 0.4978376030921936, "learning_rate": 0.0001953694979135819, "loss": 3.7791, "step": 2983 }, { "epoch": 3.81664, "grad_norm": 0.5320491194725037, "learning_rate": 0.00019532911562794453, "loss": 3.7774, "step": 2984 }, { "epoch": 3.81792, "grad_norm": 0.4956910014152527, "learning_rate": 0.00019528873334230716, "loss": 3.8242, "step": 2985 }, { "epoch": 3.8192, "grad_norm": 0.4940122663974762, "learning_rate": 0.0001952483510566698, "loss": 3.7981, "step": 2986 }, { "epoch": 3.82048, "grad_norm": 0.49585092067718506, "learning_rate": 0.00019520796877103242, "loss": 3.7541, "step": 2987 }, { "epoch": 3.8217600000000003, "grad_norm": 0.49807533621788025, "learning_rate": 0.00019516758648539508, "loss": 3.7866, "step": 2988 }, { "epoch": 3.8230399999999998, "grad_norm": 0.5053405165672302, "learning_rate": 0.00019512720419975768, "loss": 3.8625, "step": 2989 }, { "epoch": 3.82432, "grad_norm": 0.5178548693656921, "learning_rate": 0.0001950868219141203, "loss": 3.7998, "step": 2990 }, { "epoch": 3.8256, "grad_norm": 0.5026320815086365, "learning_rate": 0.00019504643962848294, "loss": 3.815, "step": 2991 }, { "epoch": 3.82688, "grad_norm": 0.5041228532791138, "learning_rate": 0.0001950060573428456, "loss": 3.7673, "step": 2992 }, { "epoch": 3.82816, "grad_norm": 0.5110290050506592, "learning_rate": 0.00019496567505720823, "loss": 3.8342, "step": 2993 }, { "epoch": 3.82944, "grad_norm": 0.5149263739585876, "learning_rate": 0.00019492529277157086, "loss": 3.8922, "step": 2994 }, { "epoch": 3.83072, "grad_norm": 0.49787768721580505, "learning_rate": 0.0001948849104859335, "loss": 3.8788, "step": 2995 }, { "epoch": 3.832, "grad_norm": 0.48449409008026123, "learning_rate": 0.00019484452820029614, "loss": 3.7207, "step": 2996 }, { "epoch": 3.8332800000000002, "grad_norm": 0.49689674377441406, "learning_rate": 0.00019480414591465875, "loss": 3.7686, "step": 2997 }, { "epoch": 3.8345599999999997, "grad_norm": 0.533508837223053, "learning_rate": 0.00019476376362902138, "loss": 3.8948, "step": 2998 }, { "epoch": 3.83584, "grad_norm": 0.48604533076286316, "learning_rate": 0.000194723381343384, "loss": 3.7144, "step": 2999 }, { "epoch": 3.83712, "grad_norm": 0.5123369693756104, "learning_rate": 0.00019468299905774664, "loss": 3.7716, "step": 3000 }, { "epoch": 3.8384, "grad_norm": 0.4925461709499359, "learning_rate": 0.0001946426167721093, "loss": 3.862, "step": 3001 }, { "epoch": 3.83968, "grad_norm": 0.5041381120681763, "learning_rate": 0.00019460223448647192, "loss": 3.7986, "step": 3002 }, { "epoch": 3.84096, "grad_norm": 0.4902060031890869, "learning_rate": 0.00019456185220083455, "loss": 3.8608, "step": 3003 }, { "epoch": 3.84224, "grad_norm": 0.5032182931900024, "learning_rate": 0.00019452146991519716, "loss": 3.8192, "step": 3004 }, { "epoch": 3.84352, "grad_norm": 0.5030463337898254, "learning_rate": 0.00019448108762955984, "loss": 3.8128, "step": 3005 }, { "epoch": 3.8448, "grad_norm": 0.49824386835098267, "learning_rate": 0.00019444070534392244, "loss": 3.7264, "step": 3006 }, { "epoch": 3.84608, "grad_norm": 0.5038865208625793, "learning_rate": 0.00019440032305828507, "loss": 3.8521, "step": 3007 }, { "epoch": 3.84736, "grad_norm": 0.5093699097633362, "learning_rate": 0.0001943599407726477, "loss": 3.7095, "step": 3008 }, { "epoch": 3.84864, "grad_norm": 0.49875956773757935, "learning_rate": 0.00019431955848701036, "loss": 3.868, "step": 3009 }, { "epoch": 3.84992, "grad_norm": 0.5082076787948608, "learning_rate": 0.000194279176201373, "loss": 3.8264, "step": 3010 }, { "epoch": 3.8512, "grad_norm": 0.5088693499565125, "learning_rate": 0.00019423879391573562, "loss": 3.8094, "step": 3011 }, { "epoch": 3.85248, "grad_norm": 0.49538499116897583, "learning_rate": 0.00019419841163009823, "loss": 3.8007, "step": 3012 }, { "epoch": 3.85376, "grad_norm": 0.5211043953895569, "learning_rate": 0.00019415802934446086, "loss": 3.7566, "step": 3013 }, { "epoch": 3.85504, "grad_norm": 0.4987725019454956, "learning_rate": 0.0001941176470588235, "loss": 3.8489, "step": 3014 }, { "epoch": 3.85632, "grad_norm": 0.4974265992641449, "learning_rate": 0.00019407726477318614, "loss": 3.8121, "step": 3015 }, { "epoch": 3.8576, "grad_norm": 0.5079092383384705, "learning_rate": 0.00019403688248754877, "loss": 3.8206, "step": 3016 }, { "epoch": 3.85888, "grad_norm": 0.4906683564186096, "learning_rate": 0.0001939965002019114, "loss": 3.7863, "step": 3017 }, { "epoch": 3.86016, "grad_norm": 0.5003571510314941, "learning_rate": 0.00019395611791627406, "loss": 3.7905, "step": 3018 }, { "epoch": 3.86144, "grad_norm": 0.5029097199440002, "learning_rate": 0.0001939157356306367, "loss": 3.8112, "step": 3019 }, { "epoch": 3.86272, "grad_norm": 0.494642972946167, "learning_rate": 0.00019387535334499932, "loss": 3.7935, "step": 3020 }, { "epoch": 3.864, "grad_norm": 0.48063820600509644, "learning_rate": 0.00019383497105936192, "loss": 3.8126, "step": 3021 }, { "epoch": 3.8652800000000003, "grad_norm": 0.4792153835296631, "learning_rate": 0.00019379458877372458, "loss": 3.7361, "step": 3022 }, { "epoch": 3.8665599999999998, "grad_norm": 0.46592941880226135, "learning_rate": 0.0001937542064880872, "loss": 3.8774, "step": 3023 }, { "epoch": 3.86784, "grad_norm": 0.46901074051856995, "learning_rate": 0.00019371382420244984, "loss": 3.8471, "step": 3024 }, { "epoch": 3.86912, "grad_norm": 0.4709526300430298, "learning_rate": 0.00019367344191681247, "loss": 3.8575, "step": 3025 }, { "epoch": 3.8704, "grad_norm": 0.4765014946460724, "learning_rate": 0.0001936330596311751, "loss": 3.7684, "step": 3026 }, { "epoch": 3.87168, "grad_norm": 0.48558369278907776, "learning_rate": 0.00019359267734553776, "loss": 3.8208, "step": 3027 }, { "epoch": 3.87296, "grad_norm": 0.47557520866394043, "learning_rate": 0.0001935522950599004, "loss": 3.755, "step": 3028 }, { "epoch": 3.87424, "grad_norm": 0.4816844165325165, "learning_rate": 0.000193511912774263, "loss": 3.8242, "step": 3029 }, { "epoch": 3.87552, "grad_norm": 0.464916467666626, "learning_rate": 0.00019347153048862562, "loss": 3.8323, "step": 3030 }, { "epoch": 3.8768000000000002, "grad_norm": 0.4912608563899994, "learning_rate": 0.00019343114820298828, "loss": 3.833, "step": 3031 }, { "epoch": 3.8780799999999997, "grad_norm": 0.4688814580440521, "learning_rate": 0.0001933907659173509, "loss": 3.8551, "step": 3032 }, { "epoch": 3.87936, "grad_norm": 0.4833512306213379, "learning_rate": 0.00019335038363171354, "loss": 3.8419, "step": 3033 }, { "epoch": 3.88064, "grad_norm": 0.49540048837661743, "learning_rate": 0.00019331000134607617, "loss": 3.8368, "step": 3034 }, { "epoch": 3.88192, "grad_norm": 0.4662631154060364, "learning_rate": 0.00019326961906043883, "loss": 3.9145, "step": 3035 }, { "epoch": 3.8832, "grad_norm": 0.4789687693119049, "learning_rate": 0.00019322923677480146, "loss": 3.809, "step": 3036 }, { "epoch": 3.88448, "grad_norm": 0.47663864493370056, "learning_rate": 0.00019318885448916406, "loss": 3.7629, "step": 3037 }, { "epoch": 3.88576, "grad_norm": 0.46691396832466125, "learning_rate": 0.0001931484722035267, "loss": 3.7381, "step": 3038 }, { "epoch": 3.88704, "grad_norm": 0.5005871057510376, "learning_rate": 0.00019310808991788932, "loss": 3.8393, "step": 3039 }, { "epoch": 3.88832, "grad_norm": 0.4863739311695099, "learning_rate": 0.00019306770763225198, "loss": 3.7096, "step": 3040 }, { "epoch": 3.8895999999999997, "grad_norm": 0.49686795473098755, "learning_rate": 0.0001930273253466146, "loss": 3.74, "step": 3041 }, { "epoch": 3.89088, "grad_norm": 0.4751260578632355, "learning_rate": 0.00019298694306097724, "loss": 3.8243, "step": 3042 }, { "epoch": 3.89216, "grad_norm": 0.4915493428707123, "learning_rate": 0.00019294656077533987, "loss": 3.8116, "step": 3043 }, { "epoch": 3.89344, "grad_norm": 0.4819605350494385, "learning_rate": 0.00019290617848970252, "loss": 3.843, "step": 3044 }, { "epoch": 3.89472, "grad_norm": 0.4787873923778534, "learning_rate": 0.00019286579620406513, "loss": 3.7245, "step": 3045 }, { "epoch": 3.896, "grad_norm": 0.4805007874965668, "learning_rate": 0.00019282541391842776, "loss": 3.7573, "step": 3046 }, { "epoch": 3.89728, "grad_norm": 0.4945317804813385, "learning_rate": 0.00019278503163279039, "loss": 3.9226, "step": 3047 }, { "epoch": 3.89856, "grad_norm": 0.49421226978302, "learning_rate": 0.00019274464934715304, "loss": 3.7792, "step": 3048 }, { "epoch": 3.89984, "grad_norm": 0.47549110651016235, "learning_rate": 0.00019270426706151567, "loss": 3.8229, "step": 3049 }, { "epoch": 3.90112, "grad_norm": 0.5038889050483704, "learning_rate": 0.0001926638847758783, "loss": 3.8079, "step": 3050 }, { "epoch": 3.9024, "grad_norm": 0.4997271001338959, "learning_rate": 0.00019262350249024093, "loss": 3.8135, "step": 3051 }, { "epoch": 3.90368, "grad_norm": 0.4927690625190735, "learning_rate": 0.00019258312020460354, "loss": 3.8534, "step": 3052 }, { "epoch": 3.90496, "grad_norm": 0.4786875545978546, "learning_rate": 0.0001925427379189662, "loss": 3.7229, "step": 3053 }, { "epoch": 3.90624, "grad_norm": 0.5047836899757385, "learning_rate": 0.00019250235563332882, "loss": 3.8432, "step": 3054 }, { "epoch": 3.90752, "grad_norm": 0.5120929479598999, "learning_rate": 0.00019246197334769145, "loss": 3.777, "step": 3055 }, { "epoch": 3.9088000000000003, "grad_norm": 0.49564608931541443, "learning_rate": 0.00019242159106205408, "loss": 3.8209, "step": 3056 }, { "epoch": 3.91008, "grad_norm": 0.4866943657398224, "learning_rate": 0.00019238120877641674, "loss": 3.7899, "step": 3057 }, { "epoch": 3.91136, "grad_norm": 0.487051784992218, "learning_rate": 0.00019234082649077937, "loss": 3.799, "step": 3058 }, { "epoch": 3.91264, "grad_norm": 0.4829874634742737, "learning_rate": 0.000192300444205142, "loss": 3.7346, "step": 3059 }, { "epoch": 3.91392, "grad_norm": 0.4827929735183716, "learning_rate": 0.0001922600619195046, "loss": 3.7442, "step": 3060 }, { "epoch": 3.9152, "grad_norm": 0.4668867588043213, "learning_rate": 0.00019221967963386726, "loss": 3.7373, "step": 3061 }, { "epoch": 3.91648, "grad_norm": 0.4852744936943054, "learning_rate": 0.0001921792973482299, "loss": 3.7558, "step": 3062 }, { "epoch": 3.91776, "grad_norm": 0.4621356129646301, "learning_rate": 0.00019213891506259252, "loss": 3.7306, "step": 3063 }, { "epoch": 3.91904, "grad_norm": 0.47092723846435547, "learning_rate": 0.00019209853277695515, "loss": 3.7813, "step": 3064 }, { "epoch": 3.9203200000000002, "grad_norm": 0.48093530535697937, "learning_rate": 0.0001920581504913178, "loss": 3.7827, "step": 3065 }, { "epoch": 3.9215999999999998, "grad_norm": 0.4793148338794708, "learning_rate": 0.00019201776820568044, "loss": 3.7267, "step": 3066 }, { "epoch": 3.92288, "grad_norm": 0.4646039605140686, "learning_rate": 0.00019197738592004307, "loss": 3.7712, "step": 3067 }, { "epoch": 3.92416, "grad_norm": 0.47865545749664307, "learning_rate": 0.00019193700363440567, "loss": 3.7797, "step": 3068 }, { "epoch": 3.92544, "grad_norm": 0.4747565984725952, "learning_rate": 0.0001918966213487683, "loss": 3.7688, "step": 3069 }, { "epoch": 3.92672, "grad_norm": 0.4655320346355438, "learning_rate": 0.00019185623906313096, "loss": 3.8159, "step": 3070 }, { "epoch": 3.928, "grad_norm": 0.4775558114051819, "learning_rate": 0.0001918158567774936, "loss": 3.7575, "step": 3071 }, { "epoch": 3.92928, "grad_norm": 0.4783431589603424, "learning_rate": 0.00019177547449185622, "loss": 3.6739, "step": 3072 }, { "epoch": 3.93056, "grad_norm": 0.48486626148223877, "learning_rate": 0.00019173509220621885, "loss": 3.7762, "step": 3073 }, { "epoch": 3.9318400000000002, "grad_norm": 0.4990038275718689, "learning_rate": 0.0001916947099205815, "loss": 3.821, "step": 3074 }, { "epoch": 3.9331199999999997, "grad_norm": 0.4997273087501526, "learning_rate": 0.00019165432763494414, "loss": 3.8262, "step": 3075 }, { "epoch": 3.9344, "grad_norm": 0.48248130083084106, "learning_rate": 0.00019161394534930674, "loss": 3.8229, "step": 3076 }, { "epoch": 3.93568, "grad_norm": 0.4947473406791687, "learning_rate": 0.00019157356306366937, "loss": 3.8035, "step": 3077 }, { "epoch": 3.93696, "grad_norm": 0.49929845333099365, "learning_rate": 0.00019153318077803203, "loss": 3.7907, "step": 3078 }, { "epoch": 3.93824, "grad_norm": 0.5050824880599976, "learning_rate": 0.00019149279849239466, "loss": 3.7588, "step": 3079 }, { "epoch": 3.93952, "grad_norm": 0.5087640881538391, "learning_rate": 0.0001914524162067573, "loss": 3.8331, "step": 3080 }, { "epoch": 3.9408, "grad_norm": 0.4996209442615509, "learning_rate": 0.00019141203392111992, "loss": 3.7626, "step": 3081 }, { "epoch": 3.94208, "grad_norm": 0.49905964732170105, "learning_rate": 0.00019137165163548255, "loss": 3.7356, "step": 3082 }, { "epoch": 3.94336, "grad_norm": 0.5181260704994202, "learning_rate": 0.0001913312693498452, "loss": 3.8077, "step": 3083 }, { "epoch": 3.94464, "grad_norm": 0.5053640604019165, "learning_rate": 0.0001912908870642078, "loss": 3.8205, "step": 3084 }, { "epoch": 3.94592, "grad_norm": 0.5048377513885498, "learning_rate": 0.00019125050477857044, "loss": 3.7736, "step": 3085 }, { "epoch": 3.9472, "grad_norm": 0.4769393503665924, "learning_rate": 0.00019121012249293307, "loss": 3.7224, "step": 3086 }, { "epoch": 3.94848, "grad_norm": 0.5042276382446289, "learning_rate": 0.00019116974020729572, "loss": 3.7301, "step": 3087 }, { "epoch": 3.94976, "grad_norm": 0.4883654713630676, "learning_rate": 0.00019112935792165835, "loss": 3.7873, "step": 3088 }, { "epoch": 3.95104, "grad_norm": 0.46280887722969055, "learning_rate": 0.00019108897563602098, "loss": 3.8232, "step": 3089 }, { "epoch": 3.9523200000000003, "grad_norm": 0.47715169191360474, "learning_rate": 0.00019104859335038361, "loss": 3.8285, "step": 3090 }, { "epoch": 3.9536, "grad_norm": 0.4663173258304596, "learning_rate": 0.00019100821106474627, "loss": 3.7407, "step": 3091 }, { "epoch": 3.95488, "grad_norm": 0.4840726852416992, "learning_rate": 0.0001909678287791089, "loss": 3.7961, "step": 3092 }, { "epoch": 3.95616, "grad_norm": 0.4775456190109253, "learning_rate": 0.0001909274464934715, "loss": 3.7243, "step": 3093 }, { "epoch": 3.95744, "grad_norm": 0.506077229976654, "learning_rate": 0.00019088706420783414, "loss": 3.832, "step": 3094 }, { "epoch": 3.95872, "grad_norm": 0.4873894155025482, "learning_rate": 0.00019084668192219677, "loss": 3.8327, "step": 3095 }, { "epoch": 3.96, "grad_norm": 0.4892643690109253, "learning_rate": 0.00019080629963655942, "loss": 3.8131, "step": 3096 }, { "epoch": 3.96128, "grad_norm": 0.503563642501831, "learning_rate": 0.00019076591735092205, "loss": 3.8291, "step": 3097 }, { "epoch": 3.96256, "grad_norm": 0.4653085768222809, "learning_rate": 0.00019072553506528468, "loss": 3.7782, "step": 3098 }, { "epoch": 3.9638400000000003, "grad_norm": 0.46706753969192505, "learning_rate": 0.00019068515277964729, "loss": 3.7926, "step": 3099 }, { "epoch": 3.9651199999999998, "grad_norm": 0.4738360345363617, "learning_rate": 0.00019064477049400997, "loss": 3.7901, "step": 3100 }, { "epoch": 3.9664, "grad_norm": 0.48858094215393066, "learning_rate": 0.00019060438820837257, "loss": 3.7761, "step": 3101 }, { "epoch": 3.96768, "grad_norm": 0.4789169430732727, "learning_rate": 0.0001905640059227352, "loss": 3.8362, "step": 3102 }, { "epoch": 3.96896, "grad_norm": 0.4818405508995056, "learning_rate": 0.00019052362363709783, "loss": 3.7566, "step": 3103 }, { "epoch": 3.97024, "grad_norm": 0.4910163879394531, "learning_rate": 0.0001904832413514605, "loss": 3.8183, "step": 3104 }, { "epoch": 3.97152, "grad_norm": 0.47668853402137756, "learning_rate": 0.00019044285906582312, "loss": 3.7694, "step": 3105 }, { "epoch": 3.9728, "grad_norm": 0.4871366024017334, "learning_rate": 0.00019040247678018575, "loss": 3.8093, "step": 3106 }, { "epoch": 3.97408, "grad_norm": 0.4852496087551117, "learning_rate": 0.00019036209449454838, "loss": 3.8424, "step": 3107 }, { "epoch": 3.9753600000000002, "grad_norm": 0.49890244007110596, "learning_rate": 0.00019032171220891098, "loss": 3.7696, "step": 3108 }, { "epoch": 3.9766399999999997, "grad_norm": 0.49646714329719543, "learning_rate": 0.00019028132992327364, "loss": 3.8346, "step": 3109 }, { "epoch": 3.97792, "grad_norm": 0.48989424109458923, "learning_rate": 0.00019024094763763627, "loss": 3.7555, "step": 3110 }, { "epoch": 3.9792, "grad_norm": 0.4914839267730713, "learning_rate": 0.0001902005653519989, "loss": 3.8055, "step": 3111 }, { "epoch": 3.98048, "grad_norm": 0.48345229029655457, "learning_rate": 0.00019016018306636153, "loss": 3.8296, "step": 3112 }, { "epoch": 3.98176, "grad_norm": 0.4965314269065857, "learning_rate": 0.0001901198007807242, "loss": 3.8099, "step": 3113 }, { "epoch": 3.98304, "grad_norm": 0.47324666380882263, "learning_rate": 0.00019007941849508682, "loss": 3.8194, "step": 3114 }, { "epoch": 3.98432, "grad_norm": 0.48489847779273987, "learning_rate": 0.00019003903620944945, "loss": 3.792, "step": 3115 }, { "epoch": 3.9856, "grad_norm": 0.4774338901042938, "learning_rate": 0.00018999865392381205, "loss": 3.8118, "step": 3116 }, { "epoch": 3.98688, "grad_norm": 0.4876146912574768, "learning_rate": 0.0001899582716381747, "loss": 3.8224, "step": 3117 }, { "epoch": 3.98816, "grad_norm": 0.4737136662006378, "learning_rate": 0.00018991788935253734, "loss": 3.8342, "step": 3118 }, { "epoch": 3.98944, "grad_norm": 0.46729955077171326, "learning_rate": 0.00018987750706689997, "loss": 3.7307, "step": 3119 }, { "epoch": 3.99072, "grad_norm": 0.46377983689308167, "learning_rate": 0.0001898371247812626, "loss": 3.8138, "step": 3120 }, { "epoch": 3.992, "grad_norm": 0.4843078851699829, "learning_rate": 0.00018979674249562523, "loss": 3.8357, "step": 3121 }, { "epoch": 3.99328, "grad_norm": 0.47113025188446045, "learning_rate": 0.00018975636020998789, "loss": 3.7578, "step": 3122 }, { "epoch": 3.99456, "grad_norm": 0.5183811187744141, "learning_rate": 0.00018971597792435052, "loss": 3.7808, "step": 3123 }, { "epoch": 3.99584, "grad_norm": 0.4774457812309265, "learning_rate": 0.00018967559563871312, "loss": 3.7492, "step": 3124 }, { "epoch": 3.99712, "grad_norm": 0.5077742338180542, "learning_rate": 0.00018963521335307575, "loss": 3.6986, "step": 3125 }, { "epoch": 3.9984, "grad_norm": 0.489953875541687, "learning_rate": 0.0001895948310674384, "loss": 3.7662, "step": 3126 }, { "epoch": 3.99968, "grad_norm": 0.5075393319129944, "learning_rate": 0.00018955444878180104, "loss": 3.7844, "step": 3127 }, { "epoch": 4.0, "grad_norm": 0.8455784916877747, "learning_rate": 0.00018951406649616367, "loss": 3.6669, "step": 3128 }, { "epoch": 4.00128, "grad_norm": 0.5457375645637512, "learning_rate": 0.0001894736842105263, "loss": 3.696, "step": 3129 }, { "epoch": 4.00256, "grad_norm": 0.5012986063957214, "learning_rate": 0.00018943330192488895, "loss": 3.7483, "step": 3130 }, { "epoch": 4.00384, "grad_norm": 0.5168102979660034, "learning_rate": 0.00018939291963925158, "loss": 3.6833, "step": 3131 }, { "epoch": 4.00512, "grad_norm": 0.5085991024971008, "learning_rate": 0.0001893525373536142, "loss": 3.7756, "step": 3132 }, { "epoch": 4.0064, "grad_norm": 0.5034124255180359, "learning_rate": 0.00018931215506797682, "loss": 3.6705, "step": 3133 }, { "epoch": 4.00768, "grad_norm": 0.5143512487411499, "learning_rate": 0.00018927177278233945, "loss": 3.7017, "step": 3134 }, { "epoch": 4.00896, "grad_norm": 0.49328911304473877, "learning_rate": 0.0001892313904967021, "loss": 3.618, "step": 3135 }, { "epoch": 4.01024, "grad_norm": 0.49207302927970886, "learning_rate": 0.00018919100821106473, "loss": 3.6588, "step": 3136 }, { "epoch": 4.01152, "grad_norm": 0.5474900603294373, "learning_rate": 0.00018915062592542736, "loss": 3.7265, "step": 3137 }, { "epoch": 4.0128, "grad_norm": 0.48699188232421875, "learning_rate": 0.00018911024363979, "loss": 3.7186, "step": 3138 }, { "epoch": 4.01408, "grad_norm": 0.5149625539779663, "learning_rate": 0.00018906986135415265, "loss": 3.6609, "step": 3139 }, { "epoch": 4.01536, "grad_norm": 0.5144675374031067, "learning_rate": 0.00018902947906851525, "loss": 3.6982, "step": 3140 }, { "epoch": 4.01664, "grad_norm": 0.4917832612991333, "learning_rate": 0.00018898909678287788, "loss": 3.6243, "step": 3141 }, { "epoch": 4.01792, "grad_norm": 0.5480942726135254, "learning_rate": 0.00018894871449724051, "loss": 3.7372, "step": 3142 }, { "epoch": 4.0192, "grad_norm": 0.49381646513938904, "learning_rate": 0.00018890833221160317, "loss": 3.6939, "step": 3143 }, { "epoch": 4.02048, "grad_norm": 0.5019636750221252, "learning_rate": 0.0001888679499259658, "loss": 3.7502, "step": 3144 }, { "epoch": 4.0217600000000004, "grad_norm": 0.49695998430252075, "learning_rate": 0.00018882756764032843, "loss": 3.7423, "step": 3145 }, { "epoch": 4.02304, "grad_norm": 0.5167843103408813, "learning_rate": 0.00018878718535469106, "loss": 3.7029, "step": 3146 }, { "epoch": 4.02432, "grad_norm": 0.49102476239204407, "learning_rate": 0.00018874680306905366, "loss": 3.6505, "step": 3147 }, { "epoch": 4.0256, "grad_norm": 0.5086076855659485, "learning_rate": 0.00018870642078341632, "loss": 3.654, "step": 3148 }, { "epoch": 4.02688, "grad_norm": 0.5053331851959229, "learning_rate": 0.00018866603849777895, "loss": 3.6908, "step": 3149 }, { "epoch": 4.02816, "grad_norm": 0.5181117653846741, "learning_rate": 0.00018862565621214158, "loss": 3.6742, "step": 3150 }, { "epoch": 4.02944, "grad_norm": 0.4862511157989502, "learning_rate": 0.0001885852739265042, "loss": 3.6927, "step": 3151 }, { "epoch": 4.03072, "grad_norm": 0.5201453566551208, "learning_rate": 0.00018854489164086687, "loss": 3.7295, "step": 3152 }, { "epoch": 4.032, "grad_norm": 0.4780125319957733, "learning_rate": 0.0001885045093552295, "loss": 3.6679, "step": 3153 }, { "epoch": 4.03328, "grad_norm": 0.5376641750335693, "learning_rate": 0.00018846412706959213, "loss": 3.7114, "step": 3154 }, { "epoch": 4.03456, "grad_norm": 0.4939403235912323, "learning_rate": 0.00018842374478395473, "loss": 3.6443, "step": 3155 }, { "epoch": 4.03584, "grad_norm": 0.502256453037262, "learning_rate": 0.0001883833624983174, "loss": 3.7188, "step": 3156 }, { "epoch": 4.03712, "grad_norm": 0.49722784757614136, "learning_rate": 0.00018834298021268002, "loss": 3.6703, "step": 3157 }, { "epoch": 4.0384, "grad_norm": 0.4928574562072754, "learning_rate": 0.00018830259792704265, "loss": 3.6398, "step": 3158 }, { "epoch": 4.03968, "grad_norm": 0.5021003484725952, "learning_rate": 0.00018826221564140528, "loss": 3.6147, "step": 3159 }, { "epoch": 4.04096, "grad_norm": 0.49602240324020386, "learning_rate": 0.0001882218333557679, "loss": 3.6448, "step": 3160 }, { "epoch": 4.04224, "grad_norm": 0.5034336447715759, "learning_rate": 0.00018818145107013057, "loss": 3.7424, "step": 3161 }, { "epoch": 4.04352, "grad_norm": 0.4804299473762512, "learning_rate": 0.0001881410687844932, "loss": 3.6461, "step": 3162 }, { "epoch": 4.0448, "grad_norm": 0.509303092956543, "learning_rate": 0.0001881006864988558, "loss": 3.6735, "step": 3163 }, { "epoch": 4.04608, "grad_norm": 0.5042724609375, "learning_rate": 0.00018806030421321843, "loss": 3.6822, "step": 3164 }, { "epoch": 4.04736, "grad_norm": 0.49135807156562805, "learning_rate": 0.0001880199219275811, "loss": 3.6458, "step": 3165 }, { "epoch": 4.04864, "grad_norm": 0.486905038356781, "learning_rate": 0.00018797953964194372, "loss": 3.6209, "step": 3166 }, { "epoch": 4.04992, "grad_norm": 0.4827145040035248, "learning_rate": 0.00018793915735630635, "loss": 3.6738, "step": 3167 }, { "epoch": 4.0512, "grad_norm": 0.4779025912284851, "learning_rate": 0.00018789877507066898, "loss": 3.6753, "step": 3168 }, { "epoch": 4.05248, "grad_norm": 0.49523454904556274, "learning_rate": 0.00018785839278503163, "loss": 3.6973, "step": 3169 }, { "epoch": 4.05376, "grad_norm": 0.49853113293647766, "learning_rate": 0.00018781801049939426, "loss": 3.7392, "step": 3170 }, { "epoch": 4.05504, "grad_norm": 0.500675618648529, "learning_rate": 0.00018777762821375687, "loss": 3.6922, "step": 3171 }, { "epoch": 4.05632, "grad_norm": 0.49067091941833496, "learning_rate": 0.0001877372459281195, "loss": 3.6531, "step": 3172 }, { "epoch": 4.0576, "grad_norm": 0.4818088710308075, "learning_rate": 0.00018769686364248213, "loss": 3.7128, "step": 3173 }, { "epoch": 4.05888, "grad_norm": 0.5128709673881531, "learning_rate": 0.00018765648135684479, "loss": 3.7044, "step": 3174 }, { "epoch": 4.06016, "grad_norm": 0.5045835375785828, "learning_rate": 0.00018761609907120742, "loss": 3.6572, "step": 3175 }, { "epoch": 4.06144, "grad_norm": 0.4782851040363312, "learning_rate": 0.00018757571678557005, "loss": 3.7127, "step": 3176 }, { "epoch": 4.06272, "grad_norm": 0.4840090274810791, "learning_rate": 0.00018753533449993268, "loss": 3.6702, "step": 3177 }, { "epoch": 4.064, "grad_norm": 0.49672845005989075, "learning_rate": 0.00018749495221429533, "loss": 3.6537, "step": 3178 }, { "epoch": 4.06528, "grad_norm": 0.4983557462692261, "learning_rate": 0.00018745456992865796, "loss": 3.6794, "step": 3179 }, { "epoch": 4.06656, "grad_norm": 0.5004916191101074, "learning_rate": 0.00018741418764302057, "loss": 3.6771, "step": 3180 }, { "epoch": 4.06784, "grad_norm": 0.5043333768844604, "learning_rate": 0.0001873738053573832, "loss": 3.7056, "step": 3181 }, { "epoch": 4.06912, "grad_norm": 0.4966271221637726, "learning_rate": 0.00018733342307174585, "loss": 3.6367, "step": 3182 }, { "epoch": 4.0704, "grad_norm": 0.5312650203704834, "learning_rate": 0.00018729304078610848, "loss": 3.6806, "step": 3183 }, { "epoch": 4.07168, "grad_norm": 0.48910966515541077, "learning_rate": 0.0001872526585004711, "loss": 3.726, "step": 3184 }, { "epoch": 4.07296, "grad_norm": 0.5237113237380981, "learning_rate": 0.00018721227621483374, "loss": 3.6479, "step": 3185 }, { "epoch": 4.07424, "grad_norm": 0.48384127020835876, "learning_rate": 0.0001871718939291964, "loss": 3.7127, "step": 3186 }, { "epoch": 4.07552, "grad_norm": 0.5084925889968872, "learning_rate": 0.00018713151164355903, "loss": 3.6814, "step": 3187 }, { "epoch": 4.0768, "grad_norm": 0.5011091828346252, "learning_rate": 0.00018709112935792163, "loss": 3.6603, "step": 3188 }, { "epoch": 4.07808, "grad_norm": 0.4861987829208374, "learning_rate": 0.00018705074707228426, "loss": 3.7693, "step": 3189 }, { "epoch": 4.07936, "grad_norm": 0.5180191993713379, "learning_rate": 0.0001870103647866469, "loss": 3.6892, "step": 3190 }, { "epoch": 4.08064, "grad_norm": 0.48718804121017456, "learning_rate": 0.00018696998250100955, "loss": 3.6065, "step": 3191 }, { "epoch": 4.08192, "grad_norm": 0.520859956741333, "learning_rate": 0.00018692960021537218, "loss": 3.656, "step": 3192 }, { "epoch": 4.0832, "grad_norm": 0.49004432559013367, "learning_rate": 0.0001868892179297348, "loss": 3.6136, "step": 3193 }, { "epoch": 4.08448, "grad_norm": 0.5408604741096497, "learning_rate": 0.00018684883564409744, "loss": 3.6482, "step": 3194 }, { "epoch": 4.08576, "grad_norm": 0.4856225252151489, "learning_rate": 0.0001868084533584601, "loss": 3.6174, "step": 3195 }, { "epoch": 4.08704, "grad_norm": 0.5170305371284485, "learning_rate": 0.0001867680710728227, "loss": 3.7183, "step": 3196 }, { "epoch": 4.08832, "grad_norm": 0.5067542791366577, "learning_rate": 0.00018672768878718533, "loss": 3.6856, "step": 3197 }, { "epoch": 4.0896, "grad_norm": 0.4944978952407837, "learning_rate": 0.00018668730650154796, "loss": 3.6034, "step": 3198 }, { "epoch": 4.09088, "grad_norm": 0.5182904601097107, "learning_rate": 0.00018664692421591062, "loss": 3.7634, "step": 3199 }, { "epoch": 4.09216, "grad_norm": 0.5076016187667847, "learning_rate": 0.00018660654193027325, "loss": 3.6408, "step": 3200 }, { "epoch": 4.09344, "grad_norm": 0.4935440123081207, "learning_rate": 0.00018656615964463588, "loss": 3.6573, "step": 3201 }, { "epoch": 4.09472, "grad_norm": 0.5223529934883118, "learning_rate": 0.0001865257773589985, "loss": 3.6398, "step": 3202 }, { "epoch": 4.096, "grad_norm": 0.4967701733112335, "learning_rate": 0.0001864853950733611, "loss": 3.6774, "step": 3203 }, { "epoch": 4.09728, "grad_norm": 0.5154727697372437, "learning_rate": 0.00018644501278772377, "loss": 3.7467, "step": 3204 }, { "epoch": 4.09856, "grad_norm": 0.5095906257629395, "learning_rate": 0.0001864046305020864, "loss": 3.7121, "step": 3205 }, { "epoch": 4.09984, "grad_norm": 0.5027210116386414, "learning_rate": 0.00018636424821644903, "loss": 3.671, "step": 3206 }, { "epoch": 4.10112, "grad_norm": 0.537636399269104, "learning_rate": 0.00018632386593081166, "loss": 3.7246, "step": 3207 }, { "epoch": 4.1024, "grad_norm": 0.4799419641494751, "learning_rate": 0.00018628348364517432, "loss": 3.7003, "step": 3208 }, { "epoch": 4.10368, "grad_norm": 0.5242260098457336, "learning_rate": 0.00018624310135953695, "loss": 3.6924, "step": 3209 }, { "epoch": 4.10496, "grad_norm": 0.5360248684883118, "learning_rate": 0.00018620271907389958, "loss": 3.6041, "step": 3210 }, { "epoch": 4.10624, "grad_norm": 0.5119033455848694, "learning_rate": 0.00018616233678826218, "loss": 3.6326, "step": 3211 }, { "epoch": 4.10752, "grad_norm": 0.5700018405914307, "learning_rate": 0.00018612195450262484, "loss": 3.7075, "step": 3212 }, { "epoch": 4.1088, "grad_norm": 0.5085955262184143, "learning_rate": 0.00018608157221698747, "loss": 3.6672, "step": 3213 }, { "epoch": 4.11008, "grad_norm": 0.5086491703987122, "learning_rate": 0.0001860411899313501, "loss": 3.6875, "step": 3214 }, { "epoch": 4.11136, "grad_norm": 0.5247465968132019, "learning_rate": 0.00018600080764571273, "loss": 3.6448, "step": 3215 }, { "epoch": 4.11264, "grad_norm": 0.5041964054107666, "learning_rate": 0.00018596042536007536, "loss": 3.6232, "step": 3216 }, { "epoch": 4.11392, "grad_norm": 0.5098519325256348, "learning_rate": 0.00018592004307443801, "loss": 3.6918, "step": 3217 }, { "epoch": 4.1152, "grad_norm": 0.5236632227897644, "learning_rate": 0.00018587966078880064, "loss": 3.6658, "step": 3218 }, { "epoch": 4.11648, "grad_norm": 0.4816301465034485, "learning_rate": 0.00018583927850316325, "loss": 3.7479, "step": 3219 }, { "epoch": 4.11776, "grad_norm": 0.5393995642662048, "learning_rate": 0.00018579889621752588, "loss": 3.6477, "step": 3220 }, { "epoch": 4.11904, "grad_norm": 0.4957987666130066, "learning_rate": 0.00018575851393188853, "loss": 3.7106, "step": 3221 }, { "epoch": 4.12032, "grad_norm": 0.5068425536155701, "learning_rate": 0.00018571813164625116, "loss": 3.6857, "step": 3222 }, { "epoch": 4.1216, "grad_norm": 0.5265698432922363, "learning_rate": 0.0001856777493606138, "loss": 3.7724, "step": 3223 }, { "epoch": 4.12288, "grad_norm": 0.5055711269378662, "learning_rate": 0.00018563736707497642, "loss": 3.7152, "step": 3224 }, { "epoch": 4.12416, "grad_norm": 0.5185781717300415, "learning_rate": 0.00018559698478933908, "loss": 3.729, "step": 3225 }, { "epoch": 4.12544, "grad_norm": 0.5067470669746399, "learning_rate": 0.0001855566025037017, "loss": 3.7011, "step": 3226 }, { "epoch": 4.12672, "grad_norm": 0.5095682144165039, "learning_rate": 0.00018551622021806431, "loss": 3.6616, "step": 3227 }, { "epoch": 4.128, "grad_norm": 0.5128963589668274, "learning_rate": 0.00018547583793242694, "loss": 3.6418, "step": 3228 }, { "epoch": 4.12928, "grad_norm": 0.5066363215446472, "learning_rate": 0.00018543545564678957, "loss": 3.6522, "step": 3229 }, { "epoch": 4.13056, "grad_norm": 0.5100614428520203, "learning_rate": 0.00018539507336115223, "loss": 3.6805, "step": 3230 }, { "epoch": 4.13184, "grad_norm": 0.5063032507896423, "learning_rate": 0.00018535469107551486, "loss": 3.654, "step": 3231 }, { "epoch": 4.13312, "grad_norm": 0.5069268345832825, "learning_rate": 0.0001853143087898775, "loss": 3.7731, "step": 3232 }, { "epoch": 4.1344, "grad_norm": 0.5150942206382751, "learning_rate": 0.00018527392650424012, "loss": 3.7019, "step": 3233 }, { "epoch": 4.13568, "grad_norm": 0.4924696087837219, "learning_rate": 0.00018523354421860278, "loss": 3.7097, "step": 3234 }, { "epoch": 4.13696, "grad_norm": 0.5000399351119995, "learning_rate": 0.00018519316193296538, "loss": 3.6958, "step": 3235 }, { "epoch": 4.13824, "grad_norm": 0.468902051448822, "learning_rate": 0.000185152779647328, "loss": 3.6186, "step": 3236 }, { "epoch": 4.13952, "grad_norm": 0.49955734610557556, "learning_rate": 0.00018511239736169064, "loss": 3.7507, "step": 3237 }, { "epoch": 4.1408, "grad_norm": 0.5048097372055054, "learning_rate": 0.0001850720150760533, "loss": 3.6257, "step": 3238 }, { "epoch": 4.14208, "grad_norm": 0.4985654056072235, "learning_rate": 0.00018503163279041593, "loss": 3.6897, "step": 3239 }, { "epoch": 4.14336, "grad_norm": 0.4974263310432434, "learning_rate": 0.00018499125050477856, "loss": 3.7192, "step": 3240 }, { "epoch": 4.14464, "grad_norm": 0.5002862215042114, "learning_rate": 0.0001849508682191412, "loss": 3.694, "step": 3241 }, { "epoch": 4.14592, "grad_norm": 0.49395427107810974, "learning_rate": 0.0001849104859335038, "loss": 3.7136, "step": 3242 }, { "epoch": 4.1472, "grad_norm": 0.5043503046035767, "learning_rate": 0.00018487010364786645, "loss": 3.6868, "step": 3243 }, { "epoch": 4.14848, "grad_norm": 0.4990045726299286, "learning_rate": 0.00018482972136222908, "loss": 3.6802, "step": 3244 }, { "epoch": 4.14976, "grad_norm": 0.5021066069602966, "learning_rate": 0.0001847893390765917, "loss": 3.6486, "step": 3245 }, { "epoch": 4.15104, "grad_norm": 0.48476442694664, "learning_rate": 0.00018474895679095434, "loss": 3.6201, "step": 3246 }, { "epoch": 4.15232, "grad_norm": 0.5046278238296509, "learning_rate": 0.000184708574505317, "loss": 3.7327, "step": 3247 }, { "epoch": 4.1536, "grad_norm": 0.49205055832862854, "learning_rate": 0.00018466819221967963, "loss": 3.7463, "step": 3248 }, { "epoch": 4.15488, "grad_norm": 0.4804619252681732, "learning_rate": 0.00018462780993404226, "loss": 3.5683, "step": 3249 }, { "epoch": 4.15616, "grad_norm": 0.4981594979763031, "learning_rate": 0.00018458742764840486, "loss": 3.6402, "step": 3250 }, { "epoch": 4.15744, "grad_norm": 0.4988977313041687, "learning_rate": 0.00018454704536276755, "loss": 3.6497, "step": 3251 }, { "epoch": 4.15872, "grad_norm": 0.5172099471092224, "learning_rate": 0.00018450666307713015, "loss": 3.6574, "step": 3252 }, { "epoch": 4.16, "grad_norm": 0.5064343810081482, "learning_rate": 0.00018446628079149278, "loss": 3.6283, "step": 3253 }, { "epoch": 4.16128, "grad_norm": 0.47614553570747375, "learning_rate": 0.0001844258985058554, "loss": 3.6493, "step": 3254 }, { "epoch": 4.16256, "grad_norm": 0.48512086272239685, "learning_rate": 0.00018438551622021804, "loss": 3.6168, "step": 3255 }, { "epoch": 4.16384, "grad_norm": 0.5282117128372192, "learning_rate": 0.0001843451339345807, "loss": 3.7312, "step": 3256 }, { "epoch": 4.16512, "grad_norm": 0.4799935817718506, "learning_rate": 0.00018430475164894333, "loss": 3.6321, "step": 3257 }, { "epoch": 4.1664, "grad_norm": 0.5284485816955566, "learning_rate": 0.00018426436936330593, "loss": 3.6389, "step": 3258 }, { "epoch": 4.16768, "grad_norm": 0.49284806847572327, "learning_rate": 0.00018422398707766856, "loss": 3.7423, "step": 3259 }, { "epoch": 4.16896, "grad_norm": 0.5154812335968018, "learning_rate": 0.00018418360479203122, "loss": 3.6436, "step": 3260 }, { "epoch": 4.17024, "grad_norm": 0.49612143635749817, "learning_rate": 0.00018414322250639385, "loss": 3.6282, "step": 3261 }, { "epoch": 4.17152, "grad_norm": 0.5098751783370972, "learning_rate": 0.00018410284022075648, "loss": 3.7001, "step": 3262 }, { "epoch": 4.1728, "grad_norm": 0.5058940649032593, "learning_rate": 0.0001840624579351191, "loss": 3.7325, "step": 3263 }, { "epoch": 4.17408, "grad_norm": 0.4903821051120758, "learning_rate": 0.00018402207564948176, "loss": 3.7294, "step": 3264 }, { "epoch": 4.17536, "grad_norm": 0.5205444693565369, "learning_rate": 0.0001839816933638444, "loss": 3.748, "step": 3265 }, { "epoch": 4.17664, "grad_norm": 0.5151527523994446, "learning_rate": 0.00018394131107820702, "loss": 3.7253, "step": 3266 }, { "epoch": 4.17792, "grad_norm": 0.5021133422851562, "learning_rate": 0.00018390092879256963, "loss": 3.7138, "step": 3267 }, { "epoch": 4.1792, "grad_norm": 0.5128991007804871, "learning_rate": 0.00018386054650693226, "loss": 3.695, "step": 3268 }, { "epoch": 4.18048, "grad_norm": 0.4980866014957428, "learning_rate": 0.0001838201642212949, "loss": 3.6914, "step": 3269 }, { "epoch": 4.18176, "grad_norm": 0.4788151681423187, "learning_rate": 0.00018377978193565754, "loss": 3.7036, "step": 3270 }, { "epoch": 4.18304, "grad_norm": 0.5027557611465454, "learning_rate": 0.00018373939965002017, "loss": 3.6309, "step": 3271 }, { "epoch": 4.18432, "grad_norm": 0.4961322247982025, "learning_rate": 0.0001836990173643828, "loss": 3.7176, "step": 3272 }, { "epoch": 4.1856, "grad_norm": 0.48880982398986816, "learning_rate": 0.00018365863507874546, "loss": 3.6192, "step": 3273 }, { "epoch": 4.18688, "grad_norm": 0.5047913193702698, "learning_rate": 0.0001836182527931081, "loss": 3.6584, "step": 3274 }, { "epoch": 4.18816, "grad_norm": 0.5131552815437317, "learning_rate": 0.0001835778705074707, "loss": 3.6554, "step": 3275 }, { "epoch": 4.18944, "grad_norm": 0.4939292073249817, "learning_rate": 0.00018353748822183332, "loss": 3.6149, "step": 3276 }, { "epoch": 4.19072, "grad_norm": 0.5033706426620483, "learning_rate": 0.00018349710593619598, "loss": 3.704, "step": 3277 }, { "epoch": 4.192, "grad_norm": 0.4993079602718353, "learning_rate": 0.0001834567236505586, "loss": 3.7489, "step": 3278 }, { "epoch": 4.19328, "grad_norm": 0.49270492792129517, "learning_rate": 0.00018341634136492124, "loss": 3.6576, "step": 3279 }, { "epoch": 4.19456, "grad_norm": 0.513554036617279, "learning_rate": 0.00018337595907928387, "loss": 3.6928, "step": 3280 }, { "epoch": 4.19584, "grad_norm": 0.4733978509902954, "learning_rate": 0.0001833355767936465, "loss": 3.6538, "step": 3281 }, { "epoch": 4.19712, "grad_norm": 0.5104435086250305, "learning_rate": 0.00018329519450800916, "loss": 3.637, "step": 3282 }, { "epoch": 4.1984, "grad_norm": 0.49731189012527466, "learning_rate": 0.00018325481222237176, "loss": 3.7184, "step": 3283 }, { "epoch": 4.19968, "grad_norm": 0.5013545155525208, "learning_rate": 0.0001832144299367344, "loss": 3.6749, "step": 3284 }, { "epoch": 4.20096, "grad_norm": 0.5214062929153442, "learning_rate": 0.00018317404765109702, "loss": 3.6357, "step": 3285 }, { "epoch": 4.20224, "grad_norm": 0.4991886615753174, "learning_rate": 0.00018313366536545968, "loss": 3.6431, "step": 3286 }, { "epoch": 4.20352, "grad_norm": 0.5190424919128418, "learning_rate": 0.0001830932830798223, "loss": 3.6487, "step": 3287 }, { "epoch": 4.2048, "grad_norm": 0.4960169792175293, "learning_rate": 0.00018305290079418494, "loss": 3.7144, "step": 3288 }, { "epoch": 4.20608, "grad_norm": 0.49390795826911926, "learning_rate": 0.00018301251850854757, "loss": 3.6899, "step": 3289 }, { "epoch": 4.2073599999999995, "grad_norm": 0.5029337406158447, "learning_rate": 0.00018297213622291023, "loss": 3.7521, "step": 3290 }, { "epoch": 4.20864, "grad_norm": 0.48957720398902893, "learning_rate": 0.00018293175393727283, "loss": 3.7307, "step": 3291 }, { "epoch": 4.20992, "grad_norm": 0.48470643162727356, "learning_rate": 0.00018289137165163546, "loss": 3.6277, "step": 3292 }, { "epoch": 4.2112, "grad_norm": 0.4815894663333893, "learning_rate": 0.0001828509893659981, "loss": 3.7091, "step": 3293 }, { "epoch": 4.21248, "grad_norm": 0.49646440148353577, "learning_rate": 0.00018281060708036072, "loss": 3.7575, "step": 3294 }, { "epoch": 4.21376, "grad_norm": 0.4831089675426483, "learning_rate": 0.00018277022479472338, "loss": 3.745, "step": 3295 }, { "epoch": 4.21504, "grad_norm": 0.5089125633239746, "learning_rate": 0.000182729842509086, "loss": 3.7263, "step": 3296 }, { "epoch": 4.21632, "grad_norm": 0.4908083975315094, "learning_rate": 0.00018268946022344864, "loss": 3.6735, "step": 3297 }, { "epoch": 4.2176, "grad_norm": 0.46788400411605835, "learning_rate": 0.00018264907793781124, "loss": 3.6856, "step": 3298 }, { "epoch": 4.21888, "grad_norm": 0.4864565432071686, "learning_rate": 0.0001826086956521739, "loss": 3.6435, "step": 3299 }, { "epoch": 4.22016, "grad_norm": 0.5052434802055359, "learning_rate": 0.00018256831336653653, "loss": 3.6445, "step": 3300 }, { "epoch": 4.22144, "grad_norm": 0.47929632663726807, "learning_rate": 0.00018252793108089916, "loss": 3.6773, "step": 3301 }, { "epoch": 4.22272, "grad_norm": 0.522189736366272, "learning_rate": 0.0001824875487952618, "loss": 3.6605, "step": 3302 }, { "epoch": 4.224, "grad_norm": 0.5014151334762573, "learning_rate": 0.00018244716650962444, "loss": 3.6866, "step": 3303 }, { "epoch": 4.22528, "grad_norm": 0.49791914224624634, "learning_rate": 0.00018240678422398707, "loss": 3.7027, "step": 3304 }, { "epoch": 4.22656, "grad_norm": 0.5259125828742981, "learning_rate": 0.0001823664019383497, "loss": 3.7465, "step": 3305 }, { "epoch": 4.22784, "grad_norm": 0.49660128355026245, "learning_rate": 0.0001823260196527123, "loss": 3.7207, "step": 3306 }, { "epoch": 4.22912, "grad_norm": 0.5191150307655334, "learning_rate": 0.00018228563736707496, "loss": 3.8187, "step": 3307 }, { "epoch": 4.2304, "grad_norm": 0.5008911490440369, "learning_rate": 0.0001822452550814376, "loss": 3.6294, "step": 3308 }, { "epoch": 4.23168, "grad_norm": 0.4917088449001312, "learning_rate": 0.00018220487279580022, "loss": 3.7032, "step": 3309 }, { "epoch": 4.23296, "grad_norm": 0.5173110961914062, "learning_rate": 0.00018216449051016285, "loss": 3.7054, "step": 3310 }, { "epoch": 4.23424, "grad_norm": 0.5040363073348999, "learning_rate": 0.00018212410822452549, "loss": 3.7053, "step": 3311 }, { "epoch": 4.23552, "grad_norm": 0.48391327261924744, "learning_rate": 0.00018208372593888814, "loss": 3.6792, "step": 3312 }, { "epoch": 4.2368, "grad_norm": 0.5244686603546143, "learning_rate": 0.00018204334365325077, "loss": 3.6416, "step": 3313 }, { "epoch": 4.23808, "grad_norm": 0.5149886608123779, "learning_rate": 0.00018200296136761338, "loss": 3.7686, "step": 3314 }, { "epoch": 4.23936, "grad_norm": 0.5035302042961121, "learning_rate": 0.000181962579081976, "loss": 3.7187, "step": 3315 }, { "epoch": 4.24064, "grad_norm": 0.5213099122047424, "learning_rate": 0.00018192219679633866, "loss": 3.7372, "step": 3316 }, { "epoch": 4.24192, "grad_norm": 0.49687349796295166, "learning_rate": 0.0001818818145107013, "loss": 3.705, "step": 3317 }, { "epoch": 4.2432, "grad_norm": 0.49955183267593384, "learning_rate": 0.00018184143222506392, "loss": 3.6738, "step": 3318 }, { "epoch": 4.24448, "grad_norm": 0.49271631240844727, "learning_rate": 0.00018180104993942655, "loss": 3.671, "step": 3319 }, { "epoch": 4.24576, "grad_norm": 0.4815945625305176, "learning_rate": 0.0001817606676537892, "loss": 3.7131, "step": 3320 }, { "epoch": 4.24704, "grad_norm": 0.5155565142631531, "learning_rate": 0.00018172028536815184, "loss": 3.6281, "step": 3321 }, { "epoch": 4.24832, "grad_norm": 0.48677995800971985, "learning_rate": 0.00018167990308251444, "loss": 3.6267, "step": 3322 }, { "epoch": 4.2496, "grad_norm": 0.5277391076087952, "learning_rate": 0.00018163952079687707, "loss": 3.6703, "step": 3323 }, { "epoch": 4.25088, "grad_norm": 0.4805748164653778, "learning_rate": 0.0001815991385112397, "loss": 3.6601, "step": 3324 }, { "epoch": 4.25216, "grad_norm": 0.527442216873169, "learning_rate": 0.00018155875622560236, "loss": 3.7159, "step": 3325 }, { "epoch": 4.25344, "grad_norm": 0.48253729939460754, "learning_rate": 0.000181518373939965, "loss": 3.6895, "step": 3326 }, { "epoch": 4.25472, "grad_norm": 0.49663540720939636, "learning_rate": 0.00018147799165432762, "loss": 3.7617, "step": 3327 }, { "epoch": 4.256, "grad_norm": 0.48462435603141785, "learning_rate": 0.00018143760936869025, "loss": 3.6659, "step": 3328 }, { "epoch": 4.25728, "grad_norm": 0.4957021474838257, "learning_rate": 0.0001813972270830529, "loss": 3.6396, "step": 3329 }, { "epoch": 4.25856, "grad_norm": 0.4980667233467102, "learning_rate": 0.0001813568447974155, "loss": 3.6964, "step": 3330 }, { "epoch": 4.25984, "grad_norm": 0.47956860065460205, "learning_rate": 0.00018131646251177814, "loss": 3.6877, "step": 3331 }, { "epoch": 4.26112, "grad_norm": 0.4983195960521698, "learning_rate": 0.00018127608022614077, "loss": 3.6595, "step": 3332 }, { "epoch": 4.2624, "grad_norm": 0.5003688335418701, "learning_rate": 0.00018123569794050343, "loss": 3.7105, "step": 3333 }, { "epoch": 4.26368, "grad_norm": 0.4797384738922119, "learning_rate": 0.00018119531565486606, "loss": 3.7471, "step": 3334 }, { "epoch": 4.26496, "grad_norm": 0.5109444856643677, "learning_rate": 0.0001811549333692287, "loss": 3.6958, "step": 3335 }, { "epoch": 4.26624, "grad_norm": 0.4882170557975769, "learning_rate": 0.00018111455108359132, "loss": 3.6789, "step": 3336 }, { "epoch": 4.26752, "grad_norm": 0.5124632120132446, "learning_rate": 0.00018107416879795392, "loss": 3.702, "step": 3337 }, { "epoch": 4.2688, "grad_norm": 0.500720739364624, "learning_rate": 0.0001810337865123166, "loss": 3.6423, "step": 3338 }, { "epoch": 4.27008, "grad_norm": 0.49544718861579895, "learning_rate": 0.0001809934042266792, "loss": 3.7047, "step": 3339 }, { "epoch": 4.27136, "grad_norm": 0.5047063231468201, "learning_rate": 0.00018095302194104184, "loss": 3.7542, "step": 3340 }, { "epoch": 4.27264, "grad_norm": 0.4655781388282776, "learning_rate": 0.00018091263965540447, "loss": 3.7705, "step": 3341 }, { "epoch": 4.27392, "grad_norm": 0.4966285228729248, "learning_rate": 0.00018087225736976713, "loss": 3.6768, "step": 3342 }, { "epoch": 4.2752, "grad_norm": 0.4942556321620941, "learning_rate": 0.00018083187508412976, "loss": 3.593, "step": 3343 }, { "epoch": 4.27648, "grad_norm": 0.517810046672821, "learning_rate": 0.00018079149279849239, "loss": 3.6881, "step": 3344 }, { "epoch": 4.27776, "grad_norm": 0.4855895936489105, "learning_rate": 0.000180751110512855, "loss": 3.6054, "step": 3345 }, { "epoch": 4.27904, "grad_norm": 0.5209439396858215, "learning_rate": 0.00018071072822721767, "loss": 3.6611, "step": 3346 }, { "epoch": 4.28032, "grad_norm": 0.5114684700965881, "learning_rate": 0.00018067034594158028, "loss": 3.6896, "step": 3347 }, { "epoch": 4.2816, "grad_norm": 0.49628081917762756, "learning_rate": 0.0001806299636559429, "loss": 3.7266, "step": 3348 }, { "epoch": 4.2828800000000005, "grad_norm": 0.4987524747848511, "learning_rate": 0.00018058958137030554, "loss": 3.6737, "step": 3349 }, { "epoch": 4.28416, "grad_norm": 0.4812726676464081, "learning_rate": 0.00018054919908466817, "loss": 3.6078, "step": 3350 }, { "epoch": 4.28544, "grad_norm": 0.5009292960166931, "learning_rate": 0.00018050881679903082, "loss": 3.7269, "step": 3351 }, { "epoch": 4.28672, "grad_norm": 0.5155140161514282, "learning_rate": 0.00018046843451339345, "loss": 3.6942, "step": 3352 }, { "epoch": 4.288, "grad_norm": 0.5101702809333801, "learning_rate": 0.00018042805222775608, "loss": 3.7346, "step": 3353 }, { "epoch": 4.28928, "grad_norm": 0.49652808904647827, "learning_rate": 0.0001803876699421187, "loss": 3.6923, "step": 3354 }, { "epoch": 4.29056, "grad_norm": 0.5246519446372986, "learning_rate": 0.00018034728765648134, "loss": 3.6841, "step": 3355 }, { "epoch": 4.29184, "grad_norm": 0.49491071701049805, "learning_rate": 0.00018030690537084397, "loss": 3.649, "step": 3356 }, { "epoch": 4.29312, "grad_norm": 0.5071698427200317, "learning_rate": 0.0001802665230852066, "loss": 3.7119, "step": 3357 }, { "epoch": 4.2943999999999996, "grad_norm": 0.48967668414115906, "learning_rate": 0.00018022614079956923, "loss": 3.6469, "step": 3358 }, { "epoch": 4.29568, "grad_norm": 0.502606987953186, "learning_rate": 0.0001801857585139319, "loss": 3.7297, "step": 3359 }, { "epoch": 4.29696, "grad_norm": 0.5163201689720154, "learning_rate": 0.00018014537622829452, "loss": 3.7605, "step": 3360 }, { "epoch": 4.29824, "grad_norm": 0.500606894493103, "learning_rate": 0.00018010499394265715, "loss": 3.7258, "step": 3361 }, { "epoch": 4.29952, "grad_norm": 0.4911852180957794, "learning_rate": 0.00018006461165701975, "loss": 3.7299, "step": 3362 }, { "epoch": 4.3008, "grad_norm": 0.494475781917572, "learning_rate": 0.00018002422937138238, "loss": 3.6581, "step": 3363 }, { "epoch": 4.30208, "grad_norm": 0.49037453532218933, "learning_rate": 0.00017998384708574504, "loss": 3.7097, "step": 3364 }, { "epoch": 4.30336, "grad_norm": 0.4940754175186157, "learning_rate": 0.00017994346480010767, "loss": 3.7747, "step": 3365 }, { "epoch": 4.30464, "grad_norm": 0.4911755323410034, "learning_rate": 0.0001799030825144703, "loss": 3.7565, "step": 3366 }, { "epoch": 4.30592, "grad_norm": 0.473459392786026, "learning_rate": 0.00017986270022883293, "loss": 3.7478, "step": 3367 }, { "epoch": 4.3072, "grad_norm": 0.4972091317176819, "learning_rate": 0.0001798223179431956, "loss": 3.5687, "step": 3368 }, { "epoch": 4.30848, "grad_norm": 0.475479394197464, "learning_rate": 0.00017978193565755822, "loss": 3.6893, "step": 3369 }, { "epoch": 4.30976, "grad_norm": 0.4981231987476349, "learning_rate": 0.00017974155337192082, "loss": 3.6605, "step": 3370 }, { "epoch": 4.31104, "grad_norm": 0.4935183823108673, "learning_rate": 0.00017970117108628345, "loss": 3.6837, "step": 3371 }, { "epoch": 4.31232, "grad_norm": 0.491040974855423, "learning_rate": 0.0001796607888006461, "loss": 3.6831, "step": 3372 }, { "epoch": 4.3136, "grad_norm": 0.48840728402137756, "learning_rate": 0.00017962040651500874, "loss": 3.7186, "step": 3373 }, { "epoch": 4.31488, "grad_norm": 0.49155256152153015, "learning_rate": 0.00017958002422937137, "loss": 3.7328, "step": 3374 }, { "epoch": 4.31616, "grad_norm": 0.4962817132472992, "learning_rate": 0.000179539641943734, "loss": 3.7796, "step": 3375 }, { "epoch": 4.31744, "grad_norm": 0.47991764545440674, "learning_rate": 0.00017949925965809663, "loss": 3.6687, "step": 3376 }, { "epoch": 4.31872, "grad_norm": 0.4865191876888275, "learning_rate": 0.0001794588773724593, "loss": 3.6614, "step": 3377 }, { "epoch": 4.32, "grad_norm": 0.4849129617214203, "learning_rate": 0.0001794184950868219, "loss": 3.6637, "step": 3378 }, { "epoch": 4.32128, "grad_norm": 0.49385684728622437, "learning_rate": 0.00017937811280118452, "loss": 3.6891, "step": 3379 }, { "epoch": 4.32256, "grad_norm": 0.5029520988464355, "learning_rate": 0.00017933773051554715, "loss": 3.6901, "step": 3380 }, { "epoch": 4.32384, "grad_norm": 0.4933001399040222, "learning_rate": 0.0001792973482299098, "loss": 3.6471, "step": 3381 }, { "epoch": 4.32512, "grad_norm": 0.5136417746543884, "learning_rate": 0.00017925696594427244, "loss": 3.6926, "step": 3382 }, { "epoch": 4.3264, "grad_norm": 0.5117876529693604, "learning_rate": 0.00017921658365863507, "loss": 3.7587, "step": 3383 }, { "epoch": 4.32768, "grad_norm": 0.5118081569671631, "learning_rate": 0.0001791762013729977, "loss": 3.7112, "step": 3384 }, { "epoch": 4.32896, "grad_norm": 0.5017783641815186, "learning_rate": 0.00017913581908736035, "loss": 3.6705, "step": 3385 }, { "epoch": 4.33024, "grad_norm": 0.5022070407867432, "learning_rate": 0.00017909543680172296, "loss": 3.7245, "step": 3386 }, { "epoch": 4.33152, "grad_norm": 0.4982999265193939, "learning_rate": 0.0001790550545160856, "loss": 3.6153, "step": 3387 }, { "epoch": 4.3328, "grad_norm": 0.49681100249290466, "learning_rate": 0.00017901467223044822, "loss": 3.7025, "step": 3388 }, { "epoch": 4.33408, "grad_norm": 0.5171028971672058, "learning_rate": 0.00017897428994481085, "loss": 3.7245, "step": 3389 }, { "epoch": 4.33536, "grad_norm": 0.5349324345588684, "learning_rate": 0.0001789339076591735, "loss": 3.6531, "step": 3390 }, { "epoch": 4.33664, "grad_norm": 0.512718677520752, "learning_rate": 0.00017889352537353614, "loss": 3.709, "step": 3391 }, { "epoch": 4.33792, "grad_norm": 0.4975050687789917, "learning_rate": 0.00017885314308789877, "loss": 3.7275, "step": 3392 }, { "epoch": 4.3392, "grad_norm": 0.49178048968315125, "learning_rate": 0.00017881276080226137, "loss": 3.721, "step": 3393 }, { "epoch": 4.34048, "grad_norm": 0.5306468605995178, "learning_rate": 0.00017877237851662403, "loss": 3.7299, "step": 3394 }, { "epoch": 4.34176, "grad_norm": 0.5229880213737488, "learning_rate": 0.00017873199623098666, "loss": 3.694, "step": 3395 }, { "epoch": 4.34304, "grad_norm": 0.4882638156414032, "learning_rate": 0.00017869161394534929, "loss": 3.6635, "step": 3396 }, { "epoch": 4.34432, "grad_norm": 0.5192221403121948, "learning_rate": 0.00017865123165971192, "loss": 3.6414, "step": 3397 }, { "epoch": 4.3456, "grad_norm": 0.5011906027793884, "learning_rate": 0.00017861084937407457, "loss": 3.7079, "step": 3398 }, { "epoch": 4.34688, "grad_norm": 0.49621590971946716, "learning_rate": 0.0001785704670884372, "loss": 3.7085, "step": 3399 }, { "epoch": 4.34816, "grad_norm": 0.5203524827957153, "learning_rate": 0.00017853008480279983, "loss": 3.6853, "step": 3400 }, { "epoch": 4.3494399999999995, "grad_norm": 0.5072999596595764, "learning_rate": 0.00017848970251716244, "loss": 3.6291, "step": 3401 }, { "epoch": 4.35072, "grad_norm": 0.4849739670753479, "learning_rate": 0.00017844932023152507, "loss": 3.6109, "step": 3402 }, { "epoch": 4.352, "grad_norm": 0.510128915309906, "learning_rate": 0.00017840893794588772, "loss": 3.7104, "step": 3403 }, { "epoch": 4.35328, "grad_norm": 0.4891580045223236, "learning_rate": 0.00017836855566025035, "loss": 3.6278, "step": 3404 }, { "epoch": 4.35456, "grad_norm": 0.5085670948028564, "learning_rate": 0.00017832817337461298, "loss": 3.7082, "step": 3405 }, { "epoch": 4.35584, "grad_norm": 0.49909740686416626, "learning_rate": 0.0001782877910889756, "loss": 3.6073, "step": 3406 }, { "epoch": 4.35712, "grad_norm": 0.49702468514442444, "learning_rate": 0.00017824740880333827, "loss": 3.6564, "step": 3407 }, { "epoch": 4.3584, "grad_norm": 0.5047982335090637, "learning_rate": 0.0001782070265177009, "loss": 3.7321, "step": 3408 }, { "epoch": 4.35968, "grad_norm": 0.4968804717063904, "learning_rate": 0.0001781666442320635, "loss": 3.6783, "step": 3409 }, { "epoch": 4.36096, "grad_norm": 0.5167127251625061, "learning_rate": 0.00017812626194642613, "loss": 3.6759, "step": 3410 }, { "epoch": 4.36224, "grad_norm": 0.4954107403755188, "learning_rate": 0.0001780858796607888, "loss": 3.7047, "step": 3411 }, { "epoch": 4.36352, "grad_norm": 0.519900381565094, "learning_rate": 0.00017804549737515142, "loss": 3.629, "step": 3412 }, { "epoch": 4.3648, "grad_norm": 0.5000714659690857, "learning_rate": 0.00017800511508951405, "loss": 3.6723, "step": 3413 }, { "epoch": 4.36608, "grad_norm": 0.49087822437286377, "learning_rate": 0.00017796473280387668, "loss": 3.7102, "step": 3414 }, { "epoch": 4.36736, "grad_norm": 0.5202825665473938, "learning_rate": 0.0001779243505182393, "loss": 3.6964, "step": 3415 }, { "epoch": 4.36864, "grad_norm": 0.5085675716400146, "learning_rate": 0.00017788396823260197, "loss": 3.7475, "step": 3416 }, { "epoch": 4.3699200000000005, "grad_norm": 0.5006064176559448, "learning_rate": 0.00017784358594696457, "loss": 3.7466, "step": 3417 }, { "epoch": 4.3712, "grad_norm": 0.4939427375793457, "learning_rate": 0.0001778032036613272, "loss": 3.7078, "step": 3418 }, { "epoch": 4.37248, "grad_norm": 0.4983762204647064, "learning_rate": 0.00017776282137568983, "loss": 3.728, "step": 3419 }, { "epoch": 4.37376, "grad_norm": 0.48928341269493103, "learning_rate": 0.0001777224390900525, "loss": 3.6844, "step": 3420 }, { "epoch": 4.37504, "grad_norm": 0.47871965169906616, "learning_rate": 0.00017768205680441512, "loss": 3.7157, "step": 3421 }, { "epoch": 4.37632, "grad_norm": 0.5062506794929504, "learning_rate": 0.00017764167451877775, "loss": 3.6605, "step": 3422 }, { "epoch": 4.3776, "grad_norm": 0.47811704874038696, "learning_rate": 0.00017760129223314038, "loss": 3.6821, "step": 3423 }, { "epoch": 4.37888, "grad_norm": 0.510657787322998, "learning_rate": 0.00017756090994750304, "loss": 3.6808, "step": 3424 }, { "epoch": 4.38016, "grad_norm": 0.4923449754714966, "learning_rate": 0.00017752052766186564, "loss": 3.7051, "step": 3425 }, { "epoch": 4.38144, "grad_norm": 0.5173118710517883, "learning_rate": 0.00017748014537622827, "loss": 3.7312, "step": 3426 }, { "epoch": 4.38272, "grad_norm": 0.4835803508758545, "learning_rate": 0.0001774397630905909, "loss": 3.5541, "step": 3427 }, { "epoch": 4.384, "grad_norm": 0.5001106262207031, "learning_rate": 0.00017739938080495356, "loss": 3.6304, "step": 3428 }, { "epoch": 4.38528, "grad_norm": 0.4953722059726715, "learning_rate": 0.00017735899851931619, "loss": 3.7689, "step": 3429 }, { "epoch": 4.38656, "grad_norm": 0.4921940863132477, "learning_rate": 0.00017731861623367882, "loss": 3.7265, "step": 3430 }, { "epoch": 4.38784, "grad_norm": 0.5121296048164368, "learning_rate": 0.00017727823394804145, "loss": 3.7272, "step": 3431 }, { "epoch": 4.38912, "grad_norm": 0.48463988304138184, "learning_rate": 0.00017723785166240405, "loss": 3.6316, "step": 3432 }, { "epoch": 4.3904, "grad_norm": 0.48769861459732056, "learning_rate": 0.00017719746937676673, "loss": 3.6833, "step": 3433 }, { "epoch": 4.39168, "grad_norm": 0.5136172771453857, "learning_rate": 0.00017715708709112934, "loss": 3.6876, "step": 3434 }, { "epoch": 4.39296, "grad_norm": 0.4963926672935486, "learning_rate": 0.00017711670480549197, "loss": 3.7335, "step": 3435 }, { "epoch": 4.39424, "grad_norm": 0.49306657910346985, "learning_rate": 0.0001770763225198546, "loss": 3.7481, "step": 3436 }, { "epoch": 4.39552, "grad_norm": 0.5056197643280029, "learning_rate": 0.00017703594023421725, "loss": 3.6287, "step": 3437 }, { "epoch": 4.3968, "grad_norm": 0.47748926281929016, "learning_rate": 0.00017699555794857988, "loss": 3.6423, "step": 3438 }, { "epoch": 4.39808, "grad_norm": 0.5189681053161621, "learning_rate": 0.00017695517566294251, "loss": 3.6958, "step": 3439 }, { "epoch": 4.39936, "grad_norm": 0.48089975118637085, "learning_rate": 0.00017691479337730514, "loss": 3.6822, "step": 3440 }, { "epoch": 4.40064, "grad_norm": 0.4775058925151825, "learning_rate": 0.0001768744110916678, "loss": 3.7351, "step": 3441 }, { "epoch": 4.40192, "grad_norm": 0.5030752420425415, "learning_rate": 0.0001768340288060304, "loss": 3.6806, "step": 3442 }, { "epoch": 4.4032, "grad_norm": 0.4860950708389282, "learning_rate": 0.00017679364652039303, "loss": 3.6407, "step": 3443 }, { "epoch": 4.40448, "grad_norm": 0.4938212037086487, "learning_rate": 0.00017675326423475566, "loss": 3.6792, "step": 3444 }, { "epoch": 4.40576, "grad_norm": 0.5040499567985535, "learning_rate": 0.0001767128819491183, "loss": 3.6686, "step": 3445 }, { "epoch": 4.40704, "grad_norm": 0.4846273362636566, "learning_rate": 0.00017667249966348095, "loss": 3.6957, "step": 3446 }, { "epoch": 4.40832, "grad_norm": 0.49627983570098877, "learning_rate": 0.00017663211737784358, "loss": 3.7428, "step": 3447 }, { "epoch": 4.4096, "grad_norm": 0.4737420380115509, "learning_rate": 0.0001765917350922062, "loss": 3.6932, "step": 3448 }, { "epoch": 4.41088, "grad_norm": 0.5013241171836853, "learning_rate": 0.00017655135280656881, "loss": 3.7328, "step": 3449 }, { "epoch": 4.41216, "grad_norm": 0.46999281644821167, "learning_rate": 0.00017651097052093147, "loss": 3.7462, "step": 3450 }, { "epoch": 4.41344, "grad_norm": 0.5018439888954163, "learning_rate": 0.0001764705882352941, "loss": 3.684, "step": 3451 }, { "epoch": 4.41472, "grad_norm": 0.49331605434417725, "learning_rate": 0.00017643020594965673, "loss": 3.7176, "step": 3452 }, { "epoch": 4.416, "grad_norm": 0.48275622725486755, "learning_rate": 0.00017638982366401936, "loss": 3.6986, "step": 3453 }, { "epoch": 4.41728, "grad_norm": 0.48251670598983765, "learning_rate": 0.00017634944137838202, "loss": 3.6331, "step": 3454 }, { "epoch": 4.41856, "grad_norm": 0.487105131149292, "learning_rate": 0.00017630905909274465, "loss": 3.5988, "step": 3455 }, { "epoch": 4.41984, "grad_norm": 0.4950862526893616, "learning_rate": 0.00017626867680710728, "loss": 3.6718, "step": 3456 }, { "epoch": 4.42112, "grad_norm": 0.4823586344718933, "learning_rate": 0.00017622829452146988, "loss": 3.6558, "step": 3457 }, { "epoch": 4.4224, "grad_norm": 0.49396684765815735, "learning_rate": 0.0001761879122358325, "loss": 3.6592, "step": 3458 }, { "epoch": 4.42368, "grad_norm": 0.5019631385803223, "learning_rate": 0.00017614752995019517, "loss": 3.683, "step": 3459 }, { "epoch": 4.4249600000000004, "grad_norm": 0.4832041263580322, "learning_rate": 0.0001761071476645578, "loss": 3.6948, "step": 3460 }, { "epoch": 4.42624, "grad_norm": 0.5026285648345947, "learning_rate": 0.00017606676537892043, "loss": 3.7162, "step": 3461 }, { "epoch": 4.42752, "grad_norm": 0.478025883436203, "learning_rate": 0.00017602638309328306, "loss": 3.6677, "step": 3462 }, { "epoch": 4.4288, "grad_norm": 0.4910399913787842, "learning_rate": 0.00017598600080764572, "loss": 3.7119, "step": 3463 }, { "epoch": 4.43008, "grad_norm": 0.4960950016975403, "learning_rate": 0.00017594561852200835, "loss": 3.7209, "step": 3464 }, { "epoch": 4.43136, "grad_norm": 0.4814871549606323, "learning_rate": 0.00017590523623637095, "loss": 3.6908, "step": 3465 }, { "epoch": 4.43264, "grad_norm": 0.48105382919311523, "learning_rate": 0.00017586485395073358, "loss": 3.742, "step": 3466 }, { "epoch": 4.43392, "grad_norm": 0.4949689209461212, "learning_rate": 0.00017582447166509624, "loss": 3.6408, "step": 3467 }, { "epoch": 4.4352, "grad_norm": 0.4880666732788086, "learning_rate": 0.00017578408937945887, "loss": 3.6666, "step": 3468 }, { "epoch": 4.4364799999999995, "grad_norm": 0.4796549081802368, "learning_rate": 0.0001757437070938215, "loss": 3.6802, "step": 3469 }, { "epoch": 4.43776, "grad_norm": 0.5090510845184326, "learning_rate": 0.00017570332480818413, "loss": 3.727, "step": 3470 }, { "epoch": 4.43904, "grad_norm": 0.48752468824386597, "learning_rate": 0.00017566294252254676, "loss": 3.694, "step": 3471 }, { "epoch": 4.44032, "grad_norm": 0.49315810203552246, "learning_rate": 0.00017562256023690942, "loss": 3.6637, "step": 3472 }, { "epoch": 4.4416, "grad_norm": 0.4910982549190521, "learning_rate": 0.00017558217795127202, "loss": 3.697, "step": 3473 }, { "epoch": 4.44288, "grad_norm": 0.48540931940078735, "learning_rate": 0.00017554179566563465, "loss": 3.6457, "step": 3474 }, { "epoch": 4.44416, "grad_norm": 0.5137031674385071, "learning_rate": 0.00017550141337999728, "loss": 3.6628, "step": 3475 }, { "epoch": 4.44544, "grad_norm": 0.4871312379837036, "learning_rate": 0.00017546103109435994, "loss": 3.685, "step": 3476 }, { "epoch": 4.44672, "grad_norm": 0.48285388946533203, "learning_rate": 0.00017542064880872257, "loss": 3.655, "step": 3477 }, { "epoch": 4.448, "grad_norm": 0.49935588240623474, "learning_rate": 0.0001753802665230852, "loss": 3.6451, "step": 3478 }, { "epoch": 4.44928, "grad_norm": 0.49114716053009033, "learning_rate": 0.00017533988423744783, "loss": 3.6701, "step": 3479 }, { "epoch": 4.45056, "grad_norm": 0.4787088632583618, "learning_rate": 0.00017529950195181048, "loss": 3.6956, "step": 3480 }, { "epoch": 4.45184, "grad_norm": 0.48162198066711426, "learning_rate": 0.00017525911966617309, "loss": 3.6084, "step": 3481 }, { "epoch": 4.45312, "grad_norm": 0.49390703439712524, "learning_rate": 0.00017521873738053572, "loss": 3.6884, "step": 3482 }, { "epoch": 4.4544, "grad_norm": 0.4952353835105896, "learning_rate": 0.00017517835509489835, "loss": 3.6476, "step": 3483 }, { "epoch": 4.45568, "grad_norm": 0.48062050342559814, "learning_rate": 0.00017513797280926098, "loss": 3.6257, "step": 3484 }, { "epoch": 4.45696, "grad_norm": 0.5192123055458069, "learning_rate": 0.00017509759052362363, "loss": 3.6835, "step": 3485 }, { "epoch": 4.45824, "grad_norm": 0.4845411777496338, "learning_rate": 0.00017505720823798626, "loss": 3.6836, "step": 3486 }, { "epoch": 4.45952, "grad_norm": 0.47390422224998474, "learning_rate": 0.0001750168259523489, "loss": 3.6174, "step": 3487 }, { "epoch": 4.4608, "grad_norm": 0.5086140036582947, "learning_rate": 0.0001749764436667115, "loss": 3.7345, "step": 3488 }, { "epoch": 4.46208, "grad_norm": 0.49887514114379883, "learning_rate": 0.00017493606138107415, "loss": 3.7634, "step": 3489 }, { "epoch": 4.46336, "grad_norm": 0.5016082525253296, "learning_rate": 0.00017489567909543678, "loss": 3.753, "step": 3490 }, { "epoch": 4.46464, "grad_norm": 0.5016016960144043, "learning_rate": 0.00017485529680979941, "loss": 3.649, "step": 3491 }, { "epoch": 4.46592, "grad_norm": 0.5083345174789429, "learning_rate": 0.00017481491452416204, "loss": 3.7257, "step": 3492 }, { "epoch": 4.4672, "grad_norm": 0.4881742298603058, "learning_rate": 0.0001747745322385247, "loss": 3.7086, "step": 3493 }, { "epoch": 4.46848, "grad_norm": 0.5043129920959473, "learning_rate": 0.00017473414995288733, "loss": 3.7117, "step": 3494 }, { "epoch": 4.46976, "grad_norm": 0.5088573098182678, "learning_rate": 0.00017469376766724996, "loss": 3.6652, "step": 3495 }, { "epoch": 4.47104, "grad_norm": 0.5027455687522888, "learning_rate": 0.00017465338538161256, "loss": 3.7306, "step": 3496 }, { "epoch": 4.47232, "grad_norm": 0.49364176392555237, "learning_rate": 0.0001746130030959752, "loss": 3.7809, "step": 3497 }, { "epoch": 4.4736, "grad_norm": 0.4937020242214203, "learning_rate": 0.00017457262081033785, "loss": 3.7053, "step": 3498 }, { "epoch": 4.47488, "grad_norm": 0.47408318519592285, "learning_rate": 0.00017453223852470048, "loss": 3.6676, "step": 3499 }, { "epoch": 4.47616, "grad_norm": 0.4941287338733673, "learning_rate": 0.0001744918562390631, "loss": 3.6673, "step": 3500 }, { "epoch": 4.47744, "grad_norm": 0.4902247190475464, "learning_rate": 0.00017445147395342574, "loss": 3.6797, "step": 3501 }, { "epoch": 4.47872, "grad_norm": 0.4781825840473175, "learning_rate": 0.0001744110916677884, "loss": 3.767, "step": 3502 }, { "epoch": 4.48, "grad_norm": 0.5159146189689636, "learning_rate": 0.00017437070938215103, "loss": 3.6403, "step": 3503 }, { "epoch": 4.48128, "grad_norm": 0.49446627497673035, "learning_rate": 0.00017433032709651363, "loss": 3.6994, "step": 3504 }, { "epoch": 4.48256, "grad_norm": 0.49100860953330994, "learning_rate": 0.00017428994481087626, "loss": 3.6639, "step": 3505 }, { "epoch": 4.48384, "grad_norm": 0.495257705450058, "learning_rate": 0.00017424956252523892, "loss": 3.6325, "step": 3506 }, { "epoch": 4.48512, "grad_norm": 0.4940941333770752, "learning_rate": 0.00017420918023960155, "loss": 3.7, "step": 3507 }, { "epoch": 4.4864, "grad_norm": 0.4889221489429474, "learning_rate": 0.00017416879795396418, "loss": 3.6322, "step": 3508 }, { "epoch": 4.48768, "grad_norm": 0.49470090866088867, "learning_rate": 0.0001741284156683268, "loss": 3.6749, "step": 3509 }, { "epoch": 4.48896, "grad_norm": 0.48783206939697266, "learning_rate": 0.00017408803338268944, "loss": 3.6823, "step": 3510 }, { "epoch": 4.49024, "grad_norm": 0.47950899600982666, "learning_rate": 0.0001740476510970521, "loss": 3.7092, "step": 3511 }, { "epoch": 4.49152, "grad_norm": 0.49544283747673035, "learning_rate": 0.0001740072688114147, "loss": 3.6669, "step": 3512 }, { "epoch": 4.4928, "grad_norm": 0.5024276375770569, "learning_rate": 0.00017396688652577733, "loss": 3.7114, "step": 3513 }, { "epoch": 4.49408, "grad_norm": 0.49346235394477844, "learning_rate": 0.00017392650424013996, "loss": 3.684, "step": 3514 }, { "epoch": 4.49536, "grad_norm": 0.4775454103946686, "learning_rate": 0.00017388612195450262, "loss": 3.6177, "step": 3515 }, { "epoch": 4.49664, "grad_norm": 0.5153406858444214, "learning_rate": 0.00017384573966886525, "loss": 3.6474, "step": 3516 }, { "epoch": 4.49792, "grad_norm": 0.5022422075271606, "learning_rate": 0.00017380535738322788, "loss": 3.6711, "step": 3517 }, { "epoch": 4.4992, "grad_norm": 0.49757120013237, "learning_rate": 0.0001737649750975905, "loss": 3.6771, "step": 3518 }, { "epoch": 4.50048, "grad_norm": 0.49992772936820984, "learning_rate": 0.00017372459281195316, "loss": 3.5962, "step": 3519 }, { "epoch": 4.50176, "grad_norm": 0.5268425345420837, "learning_rate": 0.0001736842105263158, "loss": 3.7249, "step": 3520 }, { "epoch": 4.50304, "grad_norm": 0.4990698993206024, "learning_rate": 0.0001736438282406784, "loss": 3.6477, "step": 3521 }, { "epoch": 4.50432, "grad_norm": 0.49010327458381653, "learning_rate": 0.00017360344595504103, "loss": 3.6864, "step": 3522 }, { "epoch": 4.5056, "grad_norm": 0.5165103673934937, "learning_rate": 0.00017356306366940366, "loss": 3.8092, "step": 3523 }, { "epoch": 4.50688, "grad_norm": 0.48552533984184265, "learning_rate": 0.00017352268138376631, "loss": 3.6701, "step": 3524 }, { "epoch": 4.50816, "grad_norm": 0.501692533493042, "learning_rate": 0.00017348229909812894, "loss": 3.6105, "step": 3525 }, { "epoch": 4.50944, "grad_norm": 0.5025333762168884, "learning_rate": 0.00017344191681249157, "loss": 3.6874, "step": 3526 }, { "epoch": 4.51072, "grad_norm": 0.4903654456138611, "learning_rate": 0.00017340153452685418, "loss": 3.664, "step": 3527 }, { "epoch": 4.5120000000000005, "grad_norm": 0.4915325343608856, "learning_rate": 0.00017336115224121686, "loss": 3.7199, "step": 3528 }, { "epoch": 4.51328, "grad_norm": 0.5007879734039307, "learning_rate": 0.00017332076995557946, "loss": 3.6746, "step": 3529 }, { "epoch": 4.51456, "grad_norm": 0.48502570390701294, "learning_rate": 0.0001732803876699421, "loss": 3.6661, "step": 3530 }, { "epoch": 4.51584, "grad_norm": 0.5233677625656128, "learning_rate": 0.00017324000538430473, "loss": 3.6716, "step": 3531 }, { "epoch": 4.51712, "grad_norm": 0.5040356516838074, "learning_rate": 0.00017319962309866738, "loss": 3.656, "step": 3532 }, { "epoch": 4.5184, "grad_norm": 0.5123605132102966, "learning_rate": 0.00017315924081303, "loss": 3.6736, "step": 3533 }, { "epoch": 4.51968, "grad_norm": 0.48310691118240356, "learning_rate": 0.00017311885852739264, "loss": 3.667, "step": 3534 }, { "epoch": 4.52096, "grad_norm": 0.48786744475364685, "learning_rate": 0.00017307847624175527, "loss": 3.7278, "step": 3535 }, { "epoch": 4.52224, "grad_norm": 0.4786374270915985, "learning_rate": 0.00017303809395611788, "loss": 3.6351, "step": 3536 }, { "epoch": 4.5235199999999995, "grad_norm": 0.49251246452331543, "learning_rate": 0.00017299771167048053, "loss": 3.6494, "step": 3537 }, { "epoch": 4.5248, "grad_norm": 0.4909958243370056, "learning_rate": 0.00017295732938484316, "loss": 3.6081, "step": 3538 }, { "epoch": 4.52608, "grad_norm": 0.49989423155784607, "learning_rate": 0.0001729169470992058, "loss": 3.6365, "step": 3539 }, { "epoch": 4.52736, "grad_norm": 0.5115346312522888, "learning_rate": 0.00017287656481356842, "loss": 3.6791, "step": 3540 }, { "epoch": 4.52864, "grad_norm": 0.49784228205680847, "learning_rate": 0.00017283618252793108, "loss": 3.6783, "step": 3541 }, { "epoch": 4.52992, "grad_norm": 0.5055098533630371, "learning_rate": 0.0001727958002422937, "loss": 3.7022, "step": 3542 }, { "epoch": 4.5312, "grad_norm": 0.5355752110481262, "learning_rate": 0.00017275541795665634, "loss": 3.7565, "step": 3543 }, { "epoch": 4.53248, "grad_norm": 0.5136594176292419, "learning_rate": 0.00017271503567101894, "loss": 3.6853, "step": 3544 }, { "epoch": 4.53376, "grad_norm": 0.502863347530365, "learning_rate": 0.0001726746533853816, "loss": 3.7711, "step": 3545 }, { "epoch": 4.53504, "grad_norm": 0.5095115303993225, "learning_rate": 0.00017263427109974423, "loss": 3.6232, "step": 3546 }, { "epoch": 4.53632, "grad_norm": 0.5266593098640442, "learning_rate": 0.00017259388881410686, "loss": 3.7058, "step": 3547 }, { "epoch": 4.5376, "grad_norm": 0.5122666358947754, "learning_rate": 0.0001725535065284695, "loss": 3.6821, "step": 3548 }, { "epoch": 4.53888, "grad_norm": 0.5304042100906372, "learning_rate": 0.00017251312424283212, "loss": 3.7056, "step": 3549 }, { "epoch": 4.54016, "grad_norm": 0.495039165019989, "learning_rate": 0.00017247274195719478, "loss": 3.64, "step": 3550 }, { "epoch": 4.54144, "grad_norm": 0.5002861618995667, "learning_rate": 0.0001724323596715574, "loss": 3.628, "step": 3551 }, { "epoch": 4.54272, "grad_norm": 0.47562626004219055, "learning_rate": 0.00017239197738592, "loss": 3.6679, "step": 3552 }, { "epoch": 4.5440000000000005, "grad_norm": 0.4953966438770294, "learning_rate": 0.00017235159510028264, "loss": 3.6984, "step": 3553 }, { "epoch": 4.54528, "grad_norm": 0.4846183955669403, "learning_rate": 0.0001723112128146453, "loss": 3.7035, "step": 3554 }, { "epoch": 4.54656, "grad_norm": 0.4903351068496704, "learning_rate": 0.00017227083052900793, "loss": 3.6948, "step": 3555 }, { "epoch": 4.54784, "grad_norm": 0.5202940702438354, "learning_rate": 0.00017223044824337056, "loss": 3.7083, "step": 3556 }, { "epoch": 4.54912, "grad_norm": 0.47886836528778076, "learning_rate": 0.0001721900659577332, "loss": 3.7083, "step": 3557 }, { "epoch": 4.5504, "grad_norm": 0.47739362716674805, "learning_rate": 0.00017214968367209585, "loss": 3.6878, "step": 3558 }, { "epoch": 4.55168, "grad_norm": 0.4860458970069885, "learning_rate": 0.00017210930138645848, "loss": 3.6605, "step": 3559 }, { "epoch": 4.55296, "grad_norm": 0.48289328813552856, "learning_rate": 0.00017206891910082108, "loss": 3.681, "step": 3560 }, { "epoch": 4.55424, "grad_norm": 0.48003146052360535, "learning_rate": 0.0001720285368151837, "loss": 3.5914, "step": 3561 }, { "epoch": 4.55552, "grad_norm": 0.4859391152858734, "learning_rate": 0.00017198815452954637, "loss": 3.6338, "step": 3562 }, { "epoch": 4.5568, "grad_norm": 0.5082067251205444, "learning_rate": 0.000171947772243909, "loss": 3.6709, "step": 3563 }, { "epoch": 4.55808, "grad_norm": 0.5058284401893616, "learning_rate": 0.00017190738995827163, "loss": 3.7295, "step": 3564 }, { "epoch": 4.55936, "grad_norm": 0.4948490560054779, "learning_rate": 0.00017186700767263426, "loss": 3.7362, "step": 3565 }, { "epoch": 4.56064, "grad_norm": 0.48526522517204285, "learning_rate": 0.00017182662538699689, "loss": 3.6609, "step": 3566 }, { "epoch": 4.56192, "grad_norm": 0.49817943572998047, "learning_rate": 0.00017178624310135954, "loss": 3.7036, "step": 3567 }, { "epoch": 4.5632, "grad_norm": 0.4790593981742859, "learning_rate": 0.00017174586081572215, "loss": 3.6634, "step": 3568 }, { "epoch": 4.56448, "grad_norm": 0.47792062163352966, "learning_rate": 0.00017170547853008478, "loss": 3.688, "step": 3569 }, { "epoch": 4.56576, "grad_norm": 0.5085933208465576, "learning_rate": 0.0001716650962444474, "loss": 3.6739, "step": 3570 }, { "epoch": 4.56704, "grad_norm": 0.4828382730484009, "learning_rate": 0.00017162471395881006, "loss": 3.6292, "step": 3571 }, { "epoch": 4.56832, "grad_norm": 0.47902053594589233, "learning_rate": 0.0001715843316731727, "loss": 3.642, "step": 3572 }, { "epoch": 4.5696, "grad_norm": 0.48181986808776855, "learning_rate": 0.00017154394938753532, "loss": 3.7166, "step": 3573 }, { "epoch": 4.57088, "grad_norm": 0.49542367458343506, "learning_rate": 0.00017150356710189795, "loss": 3.6691, "step": 3574 }, { "epoch": 4.57216, "grad_norm": 0.48702624440193176, "learning_rate": 0.0001714631848162606, "loss": 3.6509, "step": 3575 }, { "epoch": 4.57344, "grad_norm": 0.4805475175380707, "learning_rate": 0.00017142280253062321, "loss": 3.7377, "step": 3576 }, { "epoch": 4.57472, "grad_norm": 0.4737749695777893, "learning_rate": 0.00017138242024498584, "loss": 3.7663, "step": 3577 }, { "epoch": 4.576, "grad_norm": 0.46877261996269226, "learning_rate": 0.00017134203795934847, "loss": 3.7753, "step": 3578 }, { "epoch": 4.57728, "grad_norm": 0.4928486943244934, "learning_rate": 0.0001713016556737111, "loss": 3.7, "step": 3579 }, { "epoch": 4.5785599999999995, "grad_norm": 0.4829467833042145, "learning_rate": 0.00017126127338807376, "loss": 3.6577, "step": 3580 }, { "epoch": 4.57984, "grad_norm": 0.4743864834308624, "learning_rate": 0.0001712208911024364, "loss": 3.7088, "step": 3581 }, { "epoch": 4.58112, "grad_norm": 0.4854520261287689, "learning_rate": 0.00017118050881679902, "loss": 3.6397, "step": 3582 }, { "epoch": 4.5824, "grad_norm": 0.48954036831855774, "learning_rate": 0.00017114012653116162, "loss": 3.7009, "step": 3583 }, { "epoch": 4.58368, "grad_norm": 0.4740862250328064, "learning_rate": 0.00017109974424552428, "loss": 3.6539, "step": 3584 }, { "epoch": 4.58496, "grad_norm": 0.5085862278938293, "learning_rate": 0.0001710593619598869, "loss": 3.7184, "step": 3585 }, { "epoch": 4.58624, "grad_norm": 0.48374059796333313, "learning_rate": 0.00017101897967424954, "loss": 3.6622, "step": 3586 }, { "epoch": 4.58752, "grad_norm": 0.49264445900917053, "learning_rate": 0.00017097859738861217, "loss": 3.7557, "step": 3587 }, { "epoch": 4.5888, "grad_norm": 0.5074787735939026, "learning_rate": 0.00017093821510297483, "loss": 3.7262, "step": 3588 }, { "epoch": 4.59008, "grad_norm": 0.4799599051475525, "learning_rate": 0.00017089783281733746, "loss": 3.642, "step": 3589 }, { "epoch": 4.59136, "grad_norm": 0.49617815017700195, "learning_rate": 0.0001708574505317001, "loss": 3.7215, "step": 3590 }, { "epoch": 4.59264, "grad_norm": 0.48311010003089905, "learning_rate": 0.0001708170682460627, "loss": 3.683, "step": 3591 }, { "epoch": 4.59392, "grad_norm": 0.48525270819664, "learning_rate": 0.00017077668596042532, "loss": 3.6323, "step": 3592 }, { "epoch": 4.5952, "grad_norm": 0.48173174262046814, "learning_rate": 0.00017073630367478798, "loss": 3.7176, "step": 3593 }, { "epoch": 4.59648, "grad_norm": 0.4832940101623535, "learning_rate": 0.0001706959213891506, "loss": 3.6404, "step": 3594 }, { "epoch": 4.59776, "grad_norm": 0.48117658495903015, "learning_rate": 0.00017065553910351324, "loss": 3.6617, "step": 3595 }, { "epoch": 4.5990400000000005, "grad_norm": 0.4833637475967407, "learning_rate": 0.00017061515681787587, "loss": 3.7209, "step": 3596 }, { "epoch": 4.60032, "grad_norm": 0.48473599553108215, "learning_rate": 0.00017057477453223853, "loss": 3.6349, "step": 3597 }, { "epoch": 4.6016, "grad_norm": 0.475893497467041, "learning_rate": 0.00017053439224660116, "loss": 3.6932, "step": 3598 }, { "epoch": 4.60288, "grad_norm": 0.49287232756614685, "learning_rate": 0.00017049400996096376, "loss": 3.6374, "step": 3599 }, { "epoch": 4.60416, "grad_norm": 0.47559598088264465, "learning_rate": 0.0001704536276753264, "loss": 3.6659, "step": 3600 }, { "epoch": 4.60544, "grad_norm": 0.4943821430206299, "learning_rate": 0.00017041324538968905, "loss": 3.6761, "step": 3601 }, { "epoch": 4.60672, "grad_norm": 0.5159769058227539, "learning_rate": 0.00017037286310405168, "loss": 3.6661, "step": 3602 }, { "epoch": 4.608, "grad_norm": 0.5051320791244507, "learning_rate": 0.0001703324808184143, "loss": 3.7369, "step": 3603 }, { "epoch": 4.60928, "grad_norm": 0.48964956402778625, "learning_rate": 0.00017029209853277694, "loss": 3.6809, "step": 3604 }, { "epoch": 4.6105599999999995, "grad_norm": 0.4939142167568207, "learning_rate": 0.00017025171624713957, "loss": 3.694, "step": 3605 }, { "epoch": 4.61184, "grad_norm": 0.5127921104431152, "learning_rate": 0.00017021133396150222, "loss": 3.7422, "step": 3606 }, { "epoch": 4.61312, "grad_norm": 0.5049188733100891, "learning_rate": 0.00017017095167586485, "loss": 3.6797, "step": 3607 }, { "epoch": 4.6144, "grad_norm": 0.5061559081077576, "learning_rate": 0.00017013056939022746, "loss": 3.6901, "step": 3608 }, { "epoch": 4.61568, "grad_norm": 0.48725587129592896, "learning_rate": 0.0001700901871045901, "loss": 3.7239, "step": 3609 }, { "epoch": 4.61696, "grad_norm": 0.48895296454429626, "learning_rate": 0.00017004980481895274, "loss": 3.73, "step": 3610 }, { "epoch": 4.61824, "grad_norm": 0.5009008646011353, "learning_rate": 0.00017000942253331538, "loss": 3.636, "step": 3611 }, { "epoch": 4.61952, "grad_norm": 0.4935823678970337, "learning_rate": 0.000169969040247678, "loss": 3.623, "step": 3612 }, { "epoch": 4.6208, "grad_norm": 0.4923403263092041, "learning_rate": 0.00016992865796204064, "loss": 3.7194, "step": 3613 }, { "epoch": 4.62208, "grad_norm": 0.4789236783981323, "learning_rate": 0.0001698882756764033, "loss": 3.7151, "step": 3614 }, { "epoch": 4.62336, "grad_norm": 0.5072107911109924, "learning_rate": 0.00016984789339076592, "loss": 3.7177, "step": 3615 }, { "epoch": 4.62464, "grad_norm": 0.4821150302886963, "learning_rate": 0.00016980751110512853, "loss": 3.6819, "step": 3616 }, { "epoch": 4.62592, "grad_norm": 0.48691630363464355, "learning_rate": 0.00016976712881949116, "loss": 3.6939, "step": 3617 }, { "epoch": 4.6272, "grad_norm": 0.4875396490097046, "learning_rate": 0.00016972674653385379, "loss": 3.7381, "step": 3618 }, { "epoch": 4.62848, "grad_norm": 0.4814916253089905, "learning_rate": 0.00016968636424821644, "loss": 3.6668, "step": 3619 }, { "epoch": 4.62976, "grad_norm": 0.5000630021095276, "learning_rate": 0.00016964598196257907, "loss": 3.7047, "step": 3620 }, { "epoch": 4.6310400000000005, "grad_norm": 0.4792519211769104, "learning_rate": 0.0001696055996769417, "loss": 3.6889, "step": 3621 }, { "epoch": 4.63232, "grad_norm": 0.4922233819961548, "learning_rate": 0.00016956521739130433, "loss": 3.6608, "step": 3622 }, { "epoch": 4.6336, "grad_norm": 0.49414339661598206, "learning_rate": 0.000169524835105667, "loss": 3.6623, "step": 3623 }, { "epoch": 4.63488, "grad_norm": 0.48598942160606384, "learning_rate": 0.0001694844528200296, "loss": 3.6867, "step": 3624 }, { "epoch": 4.63616, "grad_norm": 0.49352526664733887, "learning_rate": 0.00016944407053439222, "loss": 3.6493, "step": 3625 }, { "epoch": 4.63744, "grad_norm": 0.5058410167694092, "learning_rate": 0.00016940368824875485, "loss": 3.6692, "step": 3626 }, { "epoch": 4.63872, "grad_norm": 0.4912284016609192, "learning_rate": 0.0001693633059631175, "loss": 3.7055, "step": 3627 }, { "epoch": 4.64, "grad_norm": 0.4703737199306488, "learning_rate": 0.00016932292367748014, "loss": 3.6885, "step": 3628 }, { "epoch": 4.64128, "grad_norm": 0.4926270842552185, "learning_rate": 0.00016928254139184277, "loss": 3.6547, "step": 3629 }, { "epoch": 4.64256, "grad_norm": 0.48712384700775146, "learning_rate": 0.0001692421591062054, "loss": 3.7905, "step": 3630 }, { "epoch": 4.64384, "grad_norm": 0.5001680850982666, "learning_rate": 0.000169201776820568, "loss": 3.6893, "step": 3631 }, { "epoch": 4.64512, "grad_norm": 0.4809771180152893, "learning_rate": 0.00016916139453493066, "loss": 3.738, "step": 3632 }, { "epoch": 4.6464, "grad_norm": 0.4865325093269348, "learning_rate": 0.0001691210122492933, "loss": 3.6982, "step": 3633 }, { "epoch": 4.64768, "grad_norm": 0.48700863122940063, "learning_rate": 0.00016908062996365592, "loss": 3.681, "step": 3634 }, { "epoch": 4.64896, "grad_norm": 0.4818150997161865, "learning_rate": 0.00016904024767801855, "loss": 3.6595, "step": 3635 }, { "epoch": 4.65024, "grad_norm": 0.4853065609931946, "learning_rate": 0.0001689998653923812, "loss": 3.6553, "step": 3636 }, { "epoch": 4.65152, "grad_norm": 0.47990745306015015, "learning_rate": 0.00016895948310674384, "loss": 3.7326, "step": 3637 }, { "epoch": 4.6528, "grad_norm": 0.4638812839984894, "learning_rate": 0.00016891910082110647, "loss": 3.6394, "step": 3638 }, { "epoch": 4.65408, "grad_norm": 0.48503032326698303, "learning_rate": 0.00016887871853546907, "loss": 3.6976, "step": 3639 }, { "epoch": 4.65536, "grad_norm": 0.48162126541137695, "learning_rate": 0.00016883833624983173, "loss": 3.6568, "step": 3640 }, { "epoch": 4.65664, "grad_norm": 0.5152369737625122, "learning_rate": 0.00016879795396419436, "loss": 3.6476, "step": 3641 }, { "epoch": 4.65792, "grad_norm": 0.4773014783859253, "learning_rate": 0.000168757571678557, "loss": 3.6812, "step": 3642 }, { "epoch": 4.6592, "grad_norm": 0.5280107855796814, "learning_rate": 0.00016871718939291962, "loss": 3.6671, "step": 3643 }, { "epoch": 4.66048, "grad_norm": 0.49400994181632996, "learning_rate": 0.00016867680710728225, "loss": 3.6199, "step": 3644 }, { "epoch": 4.66176, "grad_norm": 0.5342155694961548, "learning_rate": 0.0001686364248216449, "loss": 3.6443, "step": 3645 }, { "epoch": 4.66304, "grad_norm": 0.500808835029602, "learning_rate": 0.00016859604253600754, "loss": 3.6912, "step": 3646 }, { "epoch": 4.66432, "grad_norm": 0.511306643486023, "learning_rate": 0.00016855566025037014, "loss": 3.6898, "step": 3647 }, { "epoch": 4.6655999999999995, "grad_norm": 0.48556506633758545, "learning_rate": 0.00016851527796473277, "loss": 3.6869, "step": 3648 }, { "epoch": 4.66688, "grad_norm": 0.48522791266441345, "learning_rate": 0.00016847489567909543, "loss": 3.7425, "step": 3649 }, { "epoch": 4.66816, "grad_norm": 0.4987458288669586, "learning_rate": 0.00016843451339345806, "loss": 3.6531, "step": 3650 }, { "epoch": 4.66944, "grad_norm": 0.5028944611549377, "learning_rate": 0.0001683941311078207, "loss": 3.6606, "step": 3651 }, { "epoch": 4.67072, "grad_norm": 0.4878036081790924, "learning_rate": 0.00016835374882218332, "loss": 3.6368, "step": 3652 }, { "epoch": 4.672, "grad_norm": 0.5253325700759888, "learning_rate": 0.00016831336653654597, "loss": 3.7309, "step": 3653 }, { "epoch": 4.67328, "grad_norm": 0.48149093985557556, "learning_rate": 0.0001682729842509086, "loss": 3.7106, "step": 3654 }, { "epoch": 4.67456, "grad_norm": 0.5140679478645325, "learning_rate": 0.0001682326019652712, "loss": 3.721, "step": 3655 }, { "epoch": 4.67584, "grad_norm": 0.49887320399284363, "learning_rate": 0.00016819221967963384, "loss": 3.6681, "step": 3656 }, { "epoch": 4.67712, "grad_norm": 0.47831347584724426, "learning_rate": 0.00016815183739399647, "loss": 3.6551, "step": 3657 }, { "epoch": 4.6784, "grad_norm": 0.5047725439071655, "learning_rate": 0.00016811145510835912, "loss": 3.6415, "step": 3658 }, { "epoch": 4.67968, "grad_norm": 0.4860306978225708, "learning_rate": 0.00016807107282272175, "loss": 3.6685, "step": 3659 }, { "epoch": 4.68096, "grad_norm": 0.4931819438934326, "learning_rate": 0.00016803069053708438, "loss": 3.6904, "step": 3660 }, { "epoch": 4.68224, "grad_norm": 0.5166349411010742, "learning_rate": 0.00016799030825144701, "loss": 3.727, "step": 3661 }, { "epoch": 4.68352, "grad_norm": 0.5045530796051025, "learning_rate": 0.00016794992596580967, "loss": 3.691, "step": 3662 }, { "epoch": 4.6848, "grad_norm": 0.49202415347099304, "learning_rate": 0.00016790954368017227, "loss": 3.6231, "step": 3663 }, { "epoch": 4.6860800000000005, "grad_norm": 0.4810471832752228, "learning_rate": 0.0001678691613945349, "loss": 3.6767, "step": 3664 }, { "epoch": 4.68736, "grad_norm": 0.514689028263092, "learning_rate": 0.00016782877910889753, "loss": 3.6977, "step": 3665 }, { "epoch": 4.68864, "grad_norm": 0.4772043526172638, "learning_rate": 0.0001677883968232602, "loss": 3.6731, "step": 3666 }, { "epoch": 4.68992, "grad_norm": 0.5057224035263062, "learning_rate": 0.00016774801453762282, "loss": 3.7433, "step": 3667 }, { "epoch": 4.6912, "grad_norm": 0.4916319251060486, "learning_rate": 0.00016770763225198545, "loss": 3.6379, "step": 3668 }, { "epoch": 4.69248, "grad_norm": 0.5023067593574524, "learning_rate": 0.00016766724996634808, "loss": 3.7392, "step": 3669 }, { "epoch": 4.69376, "grad_norm": 0.49110403656959534, "learning_rate": 0.00016762686768071069, "loss": 3.6819, "step": 3670 }, { "epoch": 4.69504, "grad_norm": 0.47794216871261597, "learning_rate": 0.00016758648539507334, "loss": 3.6164, "step": 3671 }, { "epoch": 4.69632, "grad_norm": 0.5044650435447693, "learning_rate": 0.00016754610310943597, "loss": 3.6418, "step": 3672 }, { "epoch": 4.6975999999999996, "grad_norm": 0.5018360614776611, "learning_rate": 0.0001675057208237986, "loss": 3.6579, "step": 3673 }, { "epoch": 4.69888, "grad_norm": 0.513712465763092, "learning_rate": 0.00016746533853816123, "loss": 3.7271, "step": 3674 }, { "epoch": 4.70016, "grad_norm": 0.5256522297859192, "learning_rate": 0.0001674249562525239, "loss": 3.6982, "step": 3675 }, { "epoch": 4.70144, "grad_norm": 0.5181000232696533, "learning_rate": 0.00016738457396688652, "loss": 3.6359, "step": 3676 }, { "epoch": 4.70272, "grad_norm": 0.5020421147346497, "learning_rate": 0.00016734419168124915, "loss": 3.7344, "step": 3677 }, { "epoch": 4.704, "grad_norm": 0.5058776140213013, "learning_rate": 0.00016730380939561175, "loss": 3.6141, "step": 3678 }, { "epoch": 4.70528, "grad_norm": 0.5099439024925232, "learning_rate": 0.00016726342710997444, "loss": 3.6955, "step": 3679 }, { "epoch": 4.70656, "grad_norm": 0.5178961753845215, "learning_rate": 0.00016722304482433704, "loss": 3.6717, "step": 3680 }, { "epoch": 4.70784, "grad_norm": 0.5064387917518616, "learning_rate": 0.00016718266253869967, "loss": 3.6438, "step": 3681 }, { "epoch": 4.70912, "grad_norm": 0.48872536420822144, "learning_rate": 0.0001671422802530623, "loss": 3.6893, "step": 3682 }, { "epoch": 4.7104, "grad_norm": 0.486896276473999, "learning_rate": 0.00016710189796742496, "loss": 3.6217, "step": 3683 }, { "epoch": 4.71168, "grad_norm": 0.49064919352531433, "learning_rate": 0.0001670615156817876, "loss": 3.693, "step": 3684 }, { "epoch": 4.71296, "grad_norm": 0.5264923572540283, "learning_rate": 0.00016702113339615022, "loss": 3.7152, "step": 3685 }, { "epoch": 4.71424, "grad_norm": 0.5014359354972839, "learning_rate": 0.00016698075111051282, "loss": 3.7828, "step": 3686 }, { "epoch": 4.71552, "grad_norm": 0.4952867329120636, "learning_rate": 0.00016694036882487545, "loss": 3.7426, "step": 3687 }, { "epoch": 4.7168, "grad_norm": 0.49848371744155884, "learning_rate": 0.0001668999865392381, "loss": 3.6818, "step": 3688 }, { "epoch": 4.7180800000000005, "grad_norm": 0.4835038185119629, "learning_rate": 0.00016685960425360074, "loss": 3.6319, "step": 3689 }, { "epoch": 4.71936, "grad_norm": 0.5079094171524048, "learning_rate": 0.00016681922196796337, "loss": 3.6938, "step": 3690 }, { "epoch": 4.7206399999999995, "grad_norm": 0.5054168701171875, "learning_rate": 0.000166778839682326, "loss": 3.7248, "step": 3691 }, { "epoch": 4.72192, "grad_norm": 0.4924614727497101, "learning_rate": 0.00016673845739668866, "loss": 3.7232, "step": 3692 }, { "epoch": 4.7232, "grad_norm": 0.5226231813430786, "learning_rate": 0.00016669807511105129, "loss": 3.6702, "step": 3693 }, { "epoch": 4.72448, "grad_norm": 0.5076504945755005, "learning_rate": 0.00016665769282541392, "loss": 3.7035, "step": 3694 }, { "epoch": 4.72576, "grad_norm": 0.5075111985206604, "learning_rate": 0.00016661731053977652, "loss": 3.6875, "step": 3695 }, { "epoch": 4.72704, "grad_norm": 0.5135008096694946, "learning_rate": 0.00016657692825413918, "loss": 3.6545, "step": 3696 }, { "epoch": 4.72832, "grad_norm": 0.5121733546257019, "learning_rate": 0.0001665365459685018, "loss": 3.6973, "step": 3697 }, { "epoch": 4.7296, "grad_norm": 0.4708007574081421, "learning_rate": 0.00016649616368286444, "loss": 3.6359, "step": 3698 }, { "epoch": 4.73088, "grad_norm": 0.5306479334831238, "learning_rate": 0.00016645578139722707, "loss": 3.7159, "step": 3699 }, { "epoch": 4.73216, "grad_norm": 0.4930853545665741, "learning_rate": 0.0001664153991115897, "loss": 3.6615, "step": 3700 }, { "epoch": 4.73344, "grad_norm": 0.5061611533164978, "learning_rate": 0.00016637501682595235, "loss": 3.7783, "step": 3701 }, { "epoch": 4.73472, "grad_norm": 0.4978872239589691, "learning_rate": 0.00016633463454031498, "loss": 3.6809, "step": 3702 }, { "epoch": 4.736, "grad_norm": 0.5093433856964111, "learning_rate": 0.00016629425225467759, "loss": 3.6233, "step": 3703 }, { "epoch": 4.73728, "grad_norm": 0.4769956171512604, "learning_rate": 0.00016625386996904022, "loss": 3.5826, "step": 3704 }, { "epoch": 4.73856, "grad_norm": 0.4818449020385742, "learning_rate": 0.00016621348768340287, "loss": 3.6416, "step": 3705 }, { "epoch": 4.73984, "grad_norm": 0.4861428439617157, "learning_rate": 0.0001661731053977655, "loss": 3.6334, "step": 3706 }, { "epoch": 4.7411200000000004, "grad_norm": 0.48706743121147156, "learning_rate": 0.00016613272311212813, "loss": 3.6829, "step": 3707 }, { "epoch": 4.7424, "grad_norm": 0.48839133977890015, "learning_rate": 0.00016609234082649076, "loss": 3.6995, "step": 3708 }, { "epoch": 4.74368, "grad_norm": 0.49105438590049744, "learning_rate": 0.00016605195854085342, "loss": 3.6665, "step": 3709 }, { "epoch": 4.74496, "grad_norm": 0.503555417060852, "learning_rate": 0.00016601157625521605, "loss": 3.7246, "step": 3710 }, { "epoch": 4.74624, "grad_norm": 0.4822165071964264, "learning_rate": 0.00016597119396957865, "loss": 3.6801, "step": 3711 }, { "epoch": 4.74752, "grad_norm": 0.4817500114440918, "learning_rate": 0.00016593081168394128, "loss": 3.575, "step": 3712 }, { "epoch": 4.7488, "grad_norm": 0.48095250129699707, "learning_rate": 0.00016589042939830391, "loss": 3.6716, "step": 3713 }, { "epoch": 4.75008, "grad_norm": 0.47289326786994934, "learning_rate": 0.00016585004711266657, "loss": 3.7431, "step": 3714 }, { "epoch": 4.75136, "grad_norm": 0.46789786219596863, "learning_rate": 0.0001658096648270292, "loss": 3.6603, "step": 3715 }, { "epoch": 4.7526399999999995, "grad_norm": 0.47495347261428833, "learning_rate": 0.00016576928254139183, "loss": 3.6394, "step": 3716 }, { "epoch": 4.75392, "grad_norm": 0.47441309690475464, "learning_rate": 0.00016572890025575446, "loss": 3.7065, "step": 3717 }, { "epoch": 4.7552, "grad_norm": 0.48996591567993164, "learning_rate": 0.00016568851797011712, "loss": 3.7313, "step": 3718 }, { "epoch": 4.75648, "grad_norm": 0.47536635398864746, "learning_rate": 0.00016564813568447972, "loss": 3.7061, "step": 3719 }, { "epoch": 4.75776, "grad_norm": 0.508357048034668, "learning_rate": 0.00016560775339884235, "loss": 3.7152, "step": 3720 }, { "epoch": 4.75904, "grad_norm": 0.4819643795490265, "learning_rate": 0.00016556737111320498, "loss": 3.6843, "step": 3721 }, { "epoch": 4.76032, "grad_norm": 0.48836493492126465, "learning_rate": 0.00016552698882756764, "loss": 3.6814, "step": 3722 }, { "epoch": 4.7616, "grad_norm": 0.4822399616241455, "learning_rate": 0.00016548660654193027, "loss": 3.6709, "step": 3723 }, { "epoch": 4.76288, "grad_norm": 0.4872921407222748, "learning_rate": 0.0001654462242562929, "loss": 3.6587, "step": 3724 }, { "epoch": 4.76416, "grad_norm": 0.486806184053421, "learning_rate": 0.00016540584197065553, "loss": 3.669, "step": 3725 }, { "epoch": 4.76544, "grad_norm": 0.49304312467575073, "learning_rate": 0.00016536545968501813, "loss": 3.7521, "step": 3726 }, { "epoch": 4.76672, "grad_norm": 0.4882862865924835, "learning_rate": 0.0001653250773993808, "loss": 3.7018, "step": 3727 }, { "epoch": 4.768, "grad_norm": 0.4865057170391083, "learning_rate": 0.00016528469511374342, "loss": 3.6966, "step": 3728 }, { "epoch": 4.76928, "grad_norm": 0.5051146745681763, "learning_rate": 0.00016524431282810605, "loss": 3.7802, "step": 3729 }, { "epoch": 4.77056, "grad_norm": 0.5136163830757141, "learning_rate": 0.00016520393054246868, "loss": 3.6855, "step": 3730 }, { "epoch": 4.77184, "grad_norm": 0.4789270758628845, "learning_rate": 0.00016516354825683134, "loss": 3.6787, "step": 3731 }, { "epoch": 4.7731200000000005, "grad_norm": 0.5074111223220825, "learning_rate": 0.00016512316597119397, "loss": 3.6951, "step": 3732 }, { "epoch": 4.7744, "grad_norm": 0.5275261998176575, "learning_rate": 0.0001650827836855566, "loss": 3.6443, "step": 3733 }, { "epoch": 4.77568, "grad_norm": 0.49672234058380127, "learning_rate": 0.0001650424013999192, "loss": 3.6661, "step": 3734 }, { "epoch": 4.77696, "grad_norm": 0.4923790693283081, "learning_rate": 0.00016500201911428186, "loss": 3.6796, "step": 3735 }, { "epoch": 4.77824, "grad_norm": 0.4953721761703491, "learning_rate": 0.0001649616368286445, "loss": 3.6743, "step": 3736 }, { "epoch": 4.77952, "grad_norm": 0.4764186441898346, "learning_rate": 0.00016492125454300712, "loss": 3.6672, "step": 3737 }, { "epoch": 4.7808, "grad_norm": 0.5113012790679932, "learning_rate": 0.00016488087225736975, "loss": 3.629, "step": 3738 }, { "epoch": 4.78208, "grad_norm": 0.48934149742126465, "learning_rate": 0.00016484048997173238, "loss": 3.7223, "step": 3739 }, { "epoch": 4.78336, "grad_norm": 0.5140074491500854, "learning_rate": 0.00016480010768609503, "loss": 3.7186, "step": 3740 }, { "epoch": 4.78464, "grad_norm": 0.4866582751274109, "learning_rate": 0.00016475972540045766, "loss": 3.62, "step": 3741 }, { "epoch": 4.78592, "grad_norm": 0.48126623034477234, "learning_rate": 0.00016471934311482027, "loss": 3.608, "step": 3742 }, { "epoch": 4.7872, "grad_norm": 0.48048514127731323, "learning_rate": 0.0001646789608291829, "loss": 3.6883, "step": 3743 }, { "epoch": 4.78848, "grad_norm": 0.4765174984931946, "learning_rate": 0.00016463857854354555, "loss": 3.7108, "step": 3744 }, { "epoch": 4.78976, "grad_norm": 0.4911894202232361, "learning_rate": 0.00016459819625790818, "loss": 3.6355, "step": 3745 }, { "epoch": 4.79104, "grad_norm": 0.5015087723731995, "learning_rate": 0.00016455781397227081, "loss": 3.6922, "step": 3746 }, { "epoch": 4.79232, "grad_norm": 0.4827320873737335, "learning_rate": 0.00016451743168663344, "loss": 3.6264, "step": 3747 }, { "epoch": 4.7936, "grad_norm": 0.4948306083679199, "learning_rate": 0.0001644770494009961, "loss": 3.7218, "step": 3748 }, { "epoch": 4.79488, "grad_norm": 0.48731568455696106, "learning_rate": 0.00016443666711535873, "loss": 3.6451, "step": 3749 }, { "epoch": 4.79616, "grad_norm": 0.4918150305747986, "learning_rate": 0.00016439628482972133, "loss": 3.7111, "step": 3750 }, { "epoch": 4.79744, "grad_norm": 0.4810066521167755, "learning_rate": 0.00016435590254408397, "loss": 3.6458, "step": 3751 }, { "epoch": 4.79872, "grad_norm": 0.49135395884513855, "learning_rate": 0.0001643155202584466, "loss": 3.6688, "step": 3752 }, { "epoch": 4.8, "grad_norm": 0.4967387020587921, "learning_rate": 0.00016427513797280925, "loss": 3.6786, "step": 3753 }, { "epoch": 4.80128, "grad_norm": 0.48914146423339844, "learning_rate": 0.00016423475568717188, "loss": 3.7037, "step": 3754 }, { "epoch": 4.80256, "grad_norm": 0.5048929452896118, "learning_rate": 0.0001641943734015345, "loss": 3.6746, "step": 3755 }, { "epoch": 4.80384, "grad_norm": 0.4980829954147339, "learning_rate": 0.00016415399111589714, "loss": 3.6226, "step": 3756 }, { "epoch": 4.80512, "grad_norm": 0.49464380741119385, "learning_rate": 0.0001641136088302598, "loss": 3.6642, "step": 3757 }, { "epoch": 4.8064, "grad_norm": 0.4930364489555359, "learning_rate": 0.0001640732265446224, "loss": 3.7118, "step": 3758 }, { "epoch": 4.8076799999999995, "grad_norm": 0.5031747221946716, "learning_rate": 0.00016403284425898503, "loss": 3.6796, "step": 3759 }, { "epoch": 4.80896, "grad_norm": 0.4856695830821991, "learning_rate": 0.00016399246197334766, "loss": 3.6577, "step": 3760 }, { "epoch": 4.81024, "grad_norm": 0.5000268816947937, "learning_rate": 0.00016395207968771032, "loss": 3.638, "step": 3761 }, { "epoch": 4.81152, "grad_norm": 0.5013075470924377, "learning_rate": 0.00016391169740207295, "loss": 3.7025, "step": 3762 }, { "epoch": 4.8128, "grad_norm": 0.49570322036743164, "learning_rate": 0.00016387131511643558, "loss": 3.6357, "step": 3763 }, { "epoch": 4.81408, "grad_norm": 0.4977611005306244, "learning_rate": 0.0001638309328307982, "loss": 3.671, "step": 3764 }, { "epoch": 4.81536, "grad_norm": 0.5036880970001221, "learning_rate": 0.0001637905505451608, "loss": 3.6263, "step": 3765 }, { "epoch": 4.81664, "grad_norm": 0.4825877249240875, "learning_rate": 0.0001637501682595235, "loss": 3.6794, "step": 3766 }, { "epoch": 4.81792, "grad_norm": 0.4834206998348236, "learning_rate": 0.0001637097859738861, "loss": 3.6166, "step": 3767 }, { "epoch": 4.8192, "grad_norm": 0.5228777527809143, "learning_rate": 0.00016366940368824873, "loss": 3.7228, "step": 3768 }, { "epoch": 4.82048, "grad_norm": 0.4963701367378235, "learning_rate": 0.00016362902140261136, "loss": 3.6723, "step": 3769 }, { "epoch": 4.82176, "grad_norm": 0.5132946968078613, "learning_rate": 0.00016358863911697402, "loss": 3.7093, "step": 3770 }, { "epoch": 4.82304, "grad_norm": 0.5134023427963257, "learning_rate": 0.00016354825683133665, "loss": 3.6616, "step": 3771 }, { "epoch": 4.82432, "grad_norm": 0.4977473318576813, "learning_rate": 0.00016350787454569928, "loss": 3.6783, "step": 3772 }, { "epoch": 4.8256, "grad_norm": 0.49358707666397095, "learning_rate": 0.00016346749226006188, "loss": 3.6623, "step": 3773 }, { "epoch": 4.82688, "grad_norm": 0.5281035304069519, "learning_rate": 0.00016342710997442457, "loss": 3.7313, "step": 3774 }, { "epoch": 4.8281600000000005, "grad_norm": 0.4807729125022888, "learning_rate": 0.00016338672768878717, "loss": 3.6286, "step": 3775 }, { "epoch": 4.82944, "grad_norm": 0.4929489195346832, "learning_rate": 0.0001633463454031498, "loss": 3.7261, "step": 3776 }, { "epoch": 4.83072, "grad_norm": 0.493528813123703, "learning_rate": 0.00016330596311751243, "loss": 3.6753, "step": 3777 }, { "epoch": 4.832, "grad_norm": 0.49640294909477234, "learning_rate": 0.00016326558083187506, "loss": 3.6256, "step": 3778 }, { "epoch": 4.83328, "grad_norm": 0.48569512367248535, "learning_rate": 0.00016322519854623772, "loss": 3.6442, "step": 3779 }, { "epoch": 4.83456, "grad_norm": 0.498219758272171, "learning_rate": 0.00016318481626060035, "loss": 3.6997, "step": 3780 }, { "epoch": 4.83584, "grad_norm": 0.5126128792762756, "learning_rate": 0.00016314443397496298, "loss": 3.7796, "step": 3781 }, { "epoch": 4.83712, "grad_norm": 0.4836713671684265, "learning_rate": 0.00016310405168932558, "loss": 3.6532, "step": 3782 }, { "epoch": 4.8384, "grad_norm": 0.5176852345466614, "learning_rate": 0.00016306366940368824, "loss": 3.6887, "step": 3783 }, { "epoch": 4.8396799999999995, "grad_norm": 0.4919340908527374, "learning_rate": 0.00016302328711805087, "loss": 3.7374, "step": 3784 }, { "epoch": 4.84096, "grad_norm": 0.5173082947731018, "learning_rate": 0.0001629829048324135, "loss": 3.6584, "step": 3785 }, { "epoch": 4.84224, "grad_norm": 0.4877385199069977, "learning_rate": 0.00016294252254677613, "loss": 3.7104, "step": 3786 }, { "epoch": 4.84352, "grad_norm": 0.5282792448997498, "learning_rate": 0.00016290214026113878, "loss": 3.7029, "step": 3787 }, { "epoch": 4.8448, "grad_norm": 0.4718897044658661, "learning_rate": 0.0001628617579755014, "loss": 3.5776, "step": 3788 }, { "epoch": 4.84608, "grad_norm": 0.4864949584007263, "learning_rate": 0.00016282137568986404, "loss": 3.6709, "step": 3789 }, { "epoch": 4.84736, "grad_norm": 0.4912480115890503, "learning_rate": 0.00016278099340422665, "loss": 3.6211, "step": 3790 }, { "epoch": 4.84864, "grad_norm": 0.5093181729316711, "learning_rate": 0.00016274061111858928, "loss": 3.8126, "step": 3791 }, { "epoch": 4.84992, "grad_norm": 0.4893922209739685, "learning_rate": 0.00016270022883295193, "loss": 3.6797, "step": 3792 }, { "epoch": 4.8512, "grad_norm": 0.4943704903125763, "learning_rate": 0.00016265984654731456, "loss": 3.6392, "step": 3793 }, { "epoch": 4.85248, "grad_norm": 0.5246706604957581, "learning_rate": 0.0001626194642616772, "loss": 3.685, "step": 3794 }, { "epoch": 4.85376, "grad_norm": 0.4894337058067322, "learning_rate": 0.00016257908197603982, "loss": 3.6819, "step": 3795 }, { "epoch": 4.85504, "grad_norm": 0.49564680457115173, "learning_rate": 0.00016253869969040248, "loss": 3.6284, "step": 3796 }, { "epoch": 4.85632, "grad_norm": 0.49830925464630127, "learning_rate": 0.0001624983174047651, "loss": 3.6549, "step": 3797 }, { "epoch": 4.8576, "grad_norm": 0.47998228669166565, "learning_rate": 0.00016245793511912771, "loss": 3.6778, "step": 3798 }, { "epoch": 4.85888, "grad_norm": 0.4880746006965637, "learning_rate": 0.00016241755283349034, "loss": 3.6366, "step": 3799 }, { "epoch": 4.8601600000000005, "grad_norm": 0.4946172535419464, "learning_rate": 0.000162377170547853, "loss": 3.7328, "step": 3800 }, { "epoch": 4.86144, "grad_norm": 0.48107582330703735, "learning_rate": 0.00016233678826221563, "loss": 3.6619, "step": 3801 }, { "epoch": 4.86272, "grad_norm": 0.47235268354415894, "learning_rate": 0.00016229640597657826, "loss": 3.6913, "step": 3802 }, { "epoch": 4.864, "grad_norm": 0.4809187948703766, "learning_rate": 0.0001622560236909409, "loss": 3.7307, "step": 3803 }, { "epoch": 4.86528, "grad_norm": 0.4874497950077057, "learning_rate": 0.00016221564140530355, "loss": 3.7355, "step": 3804 }, { "epoch": 4.86656, "grad_norm": 0.49071377515792847, "learning_rate": 0.00016217525911966618, "loss": 3.579, "step": 3805 }, { "epoch": 4.86784, "grad_norm": 0.4813006818294525, "learning_rate": 0.00016213487683402878, "loss": 3.6804, "step": 3806 }, { "epoch": 4.86912, "grad_norm": 0.4884811341762543, "learning_rate": 0.0001620944945483914, "loss": 3.7033, "step": 3807 }, { "epoch": 4.8704, "grad_norm": 0.5217158198356628, "learning_rate": 0.00016205411226275404, "loss": 3.6843, "step": 3808 }, { "epoch": 4.87168, "grad_norm": 0.48379671573638916, "learning_rate": 0.0001620137299771167, "loss": 3.7468, "step": 3809 }, { "epoch": 4.87296, "grad_norm": 0.484479159116745, "learning_rate": 0.00016197334769147933, "loss": 3.7231, "step": 3810 }, { "epoch": 4.87424, "grad_norm": 0.5106998085975647, "learning_rate": 0.00016193296540584196, "loss": 3.6457, "step": 3811 }, { "epoch": 4.87552, "grad_norm": 0.4962330162525177, "learning_rate": 0.0001618925831202046, "loss": 3.6255, "step": 3812 }, { "epoch": 4.8768, "grad_norm": 0.5050908923149109, "learning_rate": 0.00016185220083456725, "loss": 3.7097, "step": 3813 }, { "epoch": 4.87808, "grad_norm": 0.495358407497406, "learning_rate": 0.00016181181854892985, "loss": 3.7255, "step": 3814 }, { "epoch": 4.87936, "grad_norm": 0.495797336101532, "learning_rate": 0.00016177143626329248, "loss": 3.6743, "step": 3815 }, { "epoch": 4.88064, "grad_norm": 0.49159398674964905, "learning_rate": 0.0001617310539776551, "loss": 3.6507, "step": 3816 }, { "epoch": 4.88192, "grad_norm": 0.4823581874370575, "learning_rate": 0.00016169067169201777, "loss": 3.7253, "step": 3817 }, { "epoch": 4.8832, "grad_norm": 0.49354973435401917, "learning_rate": 0.0001616502894063804, "loss": 3.6406, "step": 3818 }, { "epoch": 4.88448, "grad_norm": 0.4922613799571991, "learning_rate": 0.00016160990712074303, "loss": 3.6345, "step": 3819 }, { "epoch": 4.88576, "grad_norm": 0.5070391893386841, "learning_rate": 0.00016156952483510566, "loss": 3.6943, "step": 3820 }, { "epoch": 4.88704, "grad_norm": 0.4859599173069, "learning_rate": 0.00016152914254946826, "loss": 3.6488, "step": 3821 }, { "epoch": 4.88832, "grad_norm": 0.4835364520549774, "learning_rate": 0.00016148876026383092, "loss": 3.6938, "step": 3822 }, { "epoch": 4.8896, "grad_norm": 0.4754778742790222, "learning_rate": 0.00016144837797819355, "loss": 3.6802, "step": 3823 }, { "epoch": 4.89088, "grad_norm": 0.508832573890686, "learning_rate": 0.00016140799569255618, "loss": 3.6323, "step": 3824 }, { "epoch": 4.89216, "grad_norm": 0.4894411861896515, "learning_rate": 0.0001613676134069188, "loss": 3.7234, "step": 3825 }, { "epoch": 4.89344, "grad_norm": 0.5034093260765076, "learning_rate": 0.00016132723112128146, "loss": 3.6997, "step": 3826 }, { "epoch": 4.8947199999999995, "grad_norm": 0.4902549684047699, "learning_rate": 0.0001612868488356441, "loss": 3.6539, "step": 3827 }, { "epoch": 4.896, "grad_norm": 0.49553415179252625, "learning_rate": 0.00016124646655000672, "loss": 3.6314, "step": 3828 }, { "epoch": 4.89728, "grad_norm": 0.5032349824905396, "learning_rate": 0.00016120608426436933, "loss": 3.6966, "step": 3829 }, { "epoch": 4.89856, "grad_norm": 0.5162962079048157, "learning_rate": 0.00016116570197873198, "loss": 3.7436, "step": 3830 }, { "epoch": 4.89984, "grad_norm": 0.49726447463035583, "learning_rate": 0.00016112531969309462, "loss": 3.6775, "step": 3831 }, { "epoch": 4.90112, "grad_norm": 0.47404661774635315, "learning_rate": 0.00016108493740745725, "loss": 3.7073, "step": 3832 }, { "epoch": 4.9024, "grad_norm": 0.48400676250457764, "learning_rate": 0.00016104455512181988, "loss": 3.6737, "step": 3833 }, { "epoch": 4.90368, "grad_norm": 0.48407965898513794, "learning_rate": 0.0001610041728361825, "loss": 3.6579, "step": 3834 }, { "epoch": 4.90496, "grad_norm": 0.47430333495140076, "learning_rate": 0.00016096379055054516, "loss": 3.6103, "step": 3835 }, { "epoch": 4.90624, "grad_norm": 0.49005916714668274, "learning_rate": 0.0001609234082649078, "loss": 3.5885, "step": 3836 }, { "epoch": 4.90752, "grad_norm": 0.472774863243103, "learning_rate": 0.0001608830259792704, "loss": 3.7037, "step": 3837 }, { "epoch": 4.9088, "grad_norm": 0.48804715275764465, "learning_rate": 0.00016084264369363303, "loss": 3.7332, "step": 3838 }, { "epoch": 4.91008, "grad_norm": 0.4966636002063751, "learning_rate": 0.00016080226140799568, "loss": 3.6174, "step": 3839 }, { "epoch": 4.91136, "grad_norm": 0.4942854642868042, "learning_rate": 0.0001607618791223583, "loss": 3.6865, "step": 3840 }, { "epoch": 4.91264, "grad_norm": 0.48685768246650696, "learning_rate": 0.00016072149683672094, "loss": 3.6556, "step": 3841 }, { "epoch": 4.91392, "grad_norm": 0.5361490845680237, "learning_rate": 0.00016068111455108357, "loss": 3.7219, "step": 3842 }, { "epoch": 4.9152000000000005, "grad_norm": 0.47551262378692627, "learning_rate": 0.00016064073226544623, "loss": 3.7257, "step": 3843 }, { "epoch": 4.91648, "grad_norm": 0.48748570680618286, "learning_rate": 0.00016060034997980886, "loss": 3.7161, "step": 3844 }, { "epoch": 4.91776, "grad_norm": 0.48473209142684937, "learning_rate": 0.00016055996769417146, "loss": 3.6119, "step": 3845 }, { "epoch": 4.91904, "grad_norm": 0.4792821407318115, "learning_rate": 0.0001605195854085341, "loss": 3.628, "step": 3846 }, { "epoch": 4.92032, "grad_norm": 0.5115758180618286, "learning_rate": 0.00016047920312289672, "loss": 3.7057, "step": 3847 }, { "epoch": 4.9216, "grad_norm": 0.4800952970981598, "learning_rate": 0.00016043882083725938, "loss": 3.6729, "step": 3848 }, { "epoch": 4.92288, "grad_norm": 0.48038557171821594, "learning_rate": 0.000160398438551622, "loss": 3.6507, "step": 3849 }, { "epoch": 4.92416, "grad_norm": 0.49345579743385315, "learning_rate": 0.00016035805626598464, "loss": 3.6492, "step": 3850 }, { "epoch": 4.92544, "grad_norm": 0.4855850338935852, "learning_rate": 0.00016031767398034727, "loss": 3.7056, "step": 3851 }, { "epoch": 4.9267199999999995, "grad_norm": 0.47242411971092224, "learning_rate": 0.00016027729169470993, "loss": 3.7455, "step": 3852 }, { "epoch": 4.928, "grad_norm": 0.4987708330154419, "learning_rate": 0.00016023690940907256, "loss": 3.7361, "step": 3853 }, { "epoch": 4.92928, "grad_norm": 0.4940935969352722, "learning_rate": 0.00016019652712343516, "loss": 3.6286, "step": 3854 }, { "epoch": 4.93056, "grad_norm": 0.4955693483352661, "learning_rate": 0.0001601561448377978, "loss": 3.6897, "step": 3855 }, { "epoch": 4.93184, "grad_norm": 0.4994462728500366, "learning_rate": 0.00016011576255216045, "loss": 3.6699, "step": 3856 }, { "epoch": 4.93312, "grad_norm": 0.5072110295295715, "learning_rate": 0.00016007538026652308, "loss": 3.7264, "step": 3857 }, { "epoch": 4.9344, "grad_norm": 0.49533572793006897, "learning_rate": 0.0001600349979808857, "loss": 3.6975, "step": 3858 }, { "epoch": 4.93568, "grad_norm": 0.512359619140625, "learning_rate": 0.00015999461569524834, "loss": 3.7374, "step": 3859 }, { "epoch": 4.93696, "grad_norm": 0.4889034628868103, "learning_rate": 0.00015995423340961094, "loss": 3.6486, "step": 3860 }, { "epoch": 4.93824, "grad_norm": 0.4879732131958008, "learning_rate": 0.00015991385112397363, "loss": 3.6722, "step": 3861 }, { "epoch": 4.93952, "grad_norm": 0.5002577900886536, "learning_rate": 0.00015987346883833623, "loss": 3.7448, "step": 3862 }, { "epoch": 4.9408, "grad_norm": 0.5065283179283142, "learning_rate": 0.00015983308655269886, "loss": 3.7383, "step": 3863 }, { "epoch": 4.94208, "grad_norm": 0.48911115527153015, "learning_rate": 0.0001597927042670615, "loss": 3.6939, "step": 3864 }, { "epoch": 4.94336, "grad_norm": 0.500379741191864, "learning_rate": 0.00015975232198142415, "loss": 3.6542, "step": 3865 }, { "epoch": 4.94464, "grad_norm": 0.47929057478904724, "learning_rate": 0.00015971193969578678, "loss": 3.7623, "step": 3866 }, { "epoch": 4.94592, "grad_norm": 0.4697692394256592, "learning_rate": 0.0001596715574101494, "loss": 3.6306, "step": 3867 }, { "epoch": 4.9472000000000005, "grad_norm": 0.4873841106891632, "learning_rate": 0.00015963117512451204, "loss": 3.5985, "step": 3868 }, { "epoch": 4.94848, "grad_norm": 0.49631455540657043, "learning_rate": 0.0001595907928388747, "loss": 3.7473, "step": 3869 }, { "epoch": 4.94976, "grad_norm": 0.48060086369514465, "learning_rate": 0.0001595504105532373, "loss": 3.667, "step": 3870 }, { "epoch": 4.95104, "grad_norm": 0.4914751350879669, "learning_rate": 0.00015951002826759993, "loss": 3.6433, "step": 3871 }, { "epoch": 4.95232, "grad_norm": 0.5151807069778442, "learning_rate": 0.00015946964598196256, "loss": 3.6871, "step": 3872 }, { "epoch": 4.9536, "grad_norm": 0.48747456073760986, "learning_rate": 0.0001594292636963252, "loss": 3.6983, "step": 3873 }, { "epoch": 4.95488, "grad_norm": 0.49215272068977356, "learning_rate": 0.00015938888141068784, "loss": 3.7184, "step": 3874 }, { "epoch": 4.95616, "grad_norm": 0.47915881872177124, "learning_rate": 0.00015934849912505047, "loss": 3.6691, "step": 3875 }, { "epoch": 4.95744, "grad_norm": 0.4974246323108673, "learning_rate": 0.0001593081168394131, "loss": 3.6795, "step": 3876 }, { "epoch": 4.95872, "grad_norm": 0.47061559557914734, "learning_rate": 0.0001592677345537757, "loss": 3.6472, "step": 3877 }, { "epoch": 4.96, "grad_norm": 0.4836273193359375, "learning_rate": 0.00015922735226813836, "loss": 3.6728, "step": 3878 }, { "epoch": 4.96128, "grad_norm": 0.5134618878364563, "learning_rate": 0.000159186969982501, "loss": 3.6932, "step": 3879 }, { "epoch": 4.96256, "grad_norm": 0.49473533034324646, "learning_rate": 0.00015914658769686362, "loss": 3.6023, "step": 3880 }, { "epoch": 4.96384, "grad_norm": 0.5070651173591614, "learning_rate": 0.00015910620541122625, "loss": 3.6855, "step": 3881 }, { "epoch": 4.96512, "grad_norm": 0.4944706857204437, "learning_rate": 0.0001590658231255889, "loss": 3.6878, "step": 3882 }, { "epoch": 4.9664, "grad_norm": 0.4829122722148895, "learning_rate": 0.00015902544083995154, "loss": 3.6283, "step": 3883 }, { "epoch": 4.96768, "grad_norm": 0.47128134965896606, "learning_rate": 0.00015898505855431417, "loss": 3.7395, "step": 3884 }, { "epoch": 4.96896, "grad_norm": 0.5024461150169373, "learning_rate": 0.00015894467626867677, "loss": 3.7182, "step": 3885 }, { "epoch": 4.97024, "grad_norm": 0.4943237602710724, "learning_rate": 0.0001589042939830394, "loss": 3.705, "step": 3886 }, { "epoch": 4.97152, "grad_norm": 0.49064311385154724, "learning_rate": 0.00015886391169740206, "loss": 3.6011, "step": 3887 }, { "epoch": 4.9728, "grad_norm": 0.4886166751384735, "learning_rate": 0.0001588235294117647, "loss": 3.6503, "step": 3888 }, { "epoch": 4.97408, "grad_norm": 0.4722157120704651, "learning_rate": 0.00015878314712612732, "loss": 3.5743, "step": 3889 }, { "epoch": 4.97536, "grad_norm": 0.49199846386909485, "learning_rate": 0.00015874276484048995, "loss": 3.6846, "step": 3890 }, { "epoch": 4.97664, "grad_norm": 0.504956066608429, "learning_rate": 0.0001587023825548526, "loss": 3.6009, "step": 3891 }, { "epoch": 4.97792, "grad_norm": 0.49585065245628357, "learning_rate": 0.00015866200026921524, "loss": 3.6558, "step": 3892 }, { "epoch": 4.9792, "grad_norm": 0.5018090605735779, "learning_rate": 0.00015862161798357784, "loss": 3.6642, "step": 3893 }, { "epoch": 4.98048, "grad_norm": 0.49588721990585327, "learning_rate": 0.00015858123569794047, "loss": 3.6914, "step": 3894 }, { "epoch": 4.9817599999999995, "grad_norm": 0.4850863516330719, "learning_rate": 0.00015854085341230313, "loss": 3.6784, "step": 3895 }, { "epoch": 4.98304, "grad_norm": 0.4720815122127533, "learning_rate": 0.00015850047112666576, "loss": 3.6412, "step": 3896 }, { "epoch": 4.98432, "grad_norm": 0.49307361245155334, "learning_rate": 0.0001584600888410284, "loss": 3.6723, "step": 3897 }, { "epoch": 4.9856, "grad_norm": 0.48994728922843933, "learning_rate": 0.00015841970655539102, "loss": 3.6367, "step": 3898 }, { "epoch": 4.98688, "grad_norm": 0.47764644026756287, "learning_rate": 0.00015837932426975365, "loss": 3.6513, "step": 3899 }, { "epoch": 4.98816, "grad_norm": 0.4945572316646576, "learning_rate": 0.0001583389419841163, "loss": 3.6575, "step": 3900 }, { "epoch": 4.98944, "grad_norm": 0.4753789007663727, "learning_rate": 0.0001582985596984789, "loss": 3.63, "step": 3901 }, { "epoch": 4.99072, "grad_norm": 0.49885818362236023, "learning_rate": 0.00015825817741284154, "loss": 3.7375, "step": 3902 }, { "epoch": 4.992, "grad_norm": 0.4667688310146332, "learning_rate": 0.00015821779512720417, "loss": 3.636, "step": 3903 }, { "epoch": 4.99328, "grad_norm": 0.46380728483200073, "learning_rate": 0.00015817741284156683, "loss": 3.5965, "step": 3904 }, { "epoch": 4.99456, "grad_norm": 0.5012388229370117, "learning_rate": 0.00015813703055592946, "loss": 3.7338, "step": 3905 }, { "epoch": 4.99584, "grad_norm": 0.5032443404197693, "learning_rate": 0.0001580966482702921, "loss": 3.6502, "step": 3906 }, { "epoch": 4.99712, "grad_norm": 0.46951258182525635, "learning_rate": 0.00015805626598465472, "loss": 3.6379, "step": 3907 }, { "epoch": 4.9984, "grad_norm": 0.502934992313385, "learning_rate": 0.00015801588369901737, "loss": 3.6745, "step": 3908 }, { "epoch": 4.99968, "grad_norm": 0.46453621983528137, "learning_rate": 0.00015797550141337998, "loss": 3.6497, "step": 3909 }, { "epoch": 5.0, "grad_norm": 0.9125704169273376, "learning_rate": 0.0001579351191277426, "loss": 3.7091, "step": 3910 }, { "epoch": 5.00128, "grad_norm": 0.5202215909957886, "learning_rate": 0.00015789473684210524, "loss": 3.5637, "step": 3911 }, { "epoch": 5.00256, "grad_norm": 0.5078494548797607, "learning_rate": 0.00015785435455646787, "loss": 3.5149, "step": 3912 }, { "epoch": 5.00384, "grad_norm": 0.463044673204422, "learning_rate": 0.00015781397227083053, "loss": 3.5784, "step": 3913 }, { "epoch": 5.00512, "grad_norm": 0.5001516342163086, "learning_rate": 0.00015777358998519316, "loss": 3.5504, "step": 3914 }, { "epoch": 5.0064, "grad_norm": 0.4898684620857239, "learning_rate": 0.00015773320769955579, "loss": 3.5146, "step": 3915 }, { "epoch": 5.00768, "grad_norm": 0.5046645998954773, "learning_rate": 0.0001576928254139184, "loss": 3.5652, "step": 3916 }, { "epoch": 5.00896, "grad_norm": 0.5051593780517578, "learning_rate": 0.00015765244312828105, "loss": 3.4985, "step": 3917 }, { "epoch": 5.01024, "grad_norm": 0.49468353390693665, "learning_rate": 0.00015761206084264368, "loss": 3.5268, "step": 3918 }, { "epoch": 5.01152, "grad_norm": 0.48806026577949524, "learning_rate": 0.0001575716785570063, "loss": 3.5752, "step": 3919 }, { "epoch": 5.0128, "grad_norm": 0.48567843437194824, "learning_rate": 0.00015753129627136894, "loss": 3.5006, "step": 3920 }, { "epoch": 5.01408, "grad_norm": 0.4762195348739624, "learning_rate": 0.0001574909139857316, "loss": 3.523, "step": 3921 }, { "epoch": 5.01536, "grad_norm": 0.4921858012676239, "learning_rate": 0.00015745053170009422, "loss": 3.501, "step": 3922 }, { "epoch": 5.01664, "grad_norm": 0.49300310015678406, "learning_rate": 0.00015741014941445685, "loss": 3.4951, "step": 3923 }, { "epoch": 5.01792, "grad_norm": 0.5041300058364868, "learning_rate": 0.00015736976712881946, "loss": 3.5221, "step": 3924 }, { "epoch": 5.0192, "grad_norm": 0.4857078790664673, "learning_rate": 0.00015732938484318214, "loss": 3.5616, "step": 3925 }, { "epoch": 5.02048, "grad_norm": 0.49047553539276123, "learning_rate": 0.00015728900255754474, "loss": 3.4968, "step": 3926 }, { "epoch": 5.0217600000000004, "grad_norm": 0.510486900806427, "learning_rate": 0.00015724862027190737, "loss": 3.5057, "step": 3927 }, { "epoch": 5.02304, "grad_norm": 0.5110107064247131, "learning_rate": 0.00015720823798627, "loss": 3.5816, "step": 3928 }, { "epoch": 5.02432, "grad_norm": 0.4852614104747772, "learning_rate": 0.00015716785570063263, "loss": 3.5548, "step": 3929 }, { "epoch": 5.0256, "grad_norm": 0.4855445921421051, "learning_rate": 0.0001571274734149953, "loss": 3.4835, "step": 3930 }, { "epoch": 5.02688, "grad_norm": 0.5051552057266235, "learning_rate": 0.00015708709112935792, "loss": 3.5332, "step": 3931 }, { "epoch": 5.02816, "grad_norm": 0.4770444631576538, "learning_rate": 0.00015704670884372052, "loss": 3.5793, "step": 3932 }, { "epoch": 5.02944, "grad_norm": 0.49611613154411316, "learning_rate": 0.00015700632655808315, "loss": 3.5763, "step": 3933 }, { "epoch": 5.03072, "grad_norm": 0.5175138711929321, "learning_rate": 0.0001569659442724458, "loss": 3.5942, "step": 3934 }, { "epoch": 5.032, "grad_norm": 0.5066589117050171, "learning_rate": 0.00015692556198680844, "loss": 3.5047, "step": 3935 }, { "epoch": 5.03328, "grad_norm": 0.5006115436553955, "learning_rate": 0.00015688517970117107, "loss": 3.44, "step": 3936 }, { "epoch": 5.03456, "grad_norm": 0.493075430393219, "learning_rate": 0.0001568447974155337, "loss": 3.591, "step": 3937 }, { "epoch": 5.03584, "grad_norm": 0.5144281387329102, "learning_rate": 0.00015680441512989636, "loss": 3.6073, "step": 3938 }, { "epoch": 5.03712, "grad_norm": 0.5123122930526733, "learning_rate": 0.000156764032844259, "loss": 3.5315, "step": 3939 }, { "epoch": 5.0384, "grad_norm": 0.5100513696670532, "learning_rate": 0.00015672365055862162, "loss": 3.4975, "step": 3940 }, { "epoch": 5.03968, "grad_norm": 0.5032030940055847, "learning_rate": 0.00015668326827298422, "loss": 3.58, "step": 3941 }, { "epoch": 5.04096, "grad_norm": 0.4905944764614105, "learning_rate": 0.00015664288598734685, "loss": 3.6283, "step": 3942 }, { "epoch": 5.04224, "grad_norm": 0.5140021443367004, "learning_rate": 0.0001566025037017095, "loss": 3.5188, "step": 3943 }, { "epoch": 5.04352, "grad_norm": 0.4894171953201294, "learning_rate": 0.00015656212141607214, "loss": 3.4722, "step": 3944 }, { "epoch": 5.0448, "grad_norm": 0.4924975335597992, "learning_rate": 0.00015652173913043477, "loss": 3.582, "step": 3945 }, { "epoch": 5.04608, "grad_norm": 0.5047486424446106, "learning_rate": 0.0001564813568447974, "loss": 3.5844, "step": 3946 }, { "epoch": 5.04736, "grad_norm": 0.5076228380203247, "learning_rate": 0.00015644097455916006, "loss": 3.4754, "step": 3947 }, { "epoch": 5.04864, "grad_norm": 0.48156487941741943, "learning_rate": 0.00015640059227352269, "loss": 3.5801, "step": 3948 }, { "epoch": 5.04992, "grad_norm": 0.52396559715271, "learning_rate": 0.0001563602099878853, "loss": 3.5727, "step": 3949 }, { "epoch": 5.0512, "grad_norm": 0.5130919814109802, "learning_rate": 0.00015631982770224792, "loss": 3.5455, "step": 3950 }, { "epoch": 5.05248, "grad_norm": 0.5077847242355347, "learning_rate": 0.00015627944541661058, "loss": 3.574, "step": 3951 }, { "epoch": 5.05376, "grad_norm": 0.5045159459114075, "learning_rate": 0.0001562390631309732, "loss": 3.5511, "step": 3952 }, { "epoch": 5.05504, "grad_norm": 0.5126574635505676, "learning_rate": 0.00015619868084533584, "loss": 3.6289, "step": 3953 }, { "epoch": 5.05632, "grad_norm": 0.4960666000843048, "learning_rate": 0.00015615829855969847, "loss": 3.5108, "step": 3954 }, { "epoch": 5.0576, "grad_norm": 0.49016058444976807, "learning_rate": 0.0001561179162740611, "loss": 3.5506, "step": 3955 }, { "epoch": 5.05888, "grad_norm": 0.5145988464355469, "learning_rate": 0.00015607753398842375, "loss": 3.5351, "step": 3956 }, { "epoch": 5.06016, "grad_norm": 0.495882123708725, "learning_rate": 0.00015603715170278636, "loss": 3.5469, "step": 3957 }, { "epoch": 5.06144, "grad_norm": 0.514789879322052, "learning_rate": 0.000155996769417149, "loss": 3.5172, "step": 3958 }, { "epoch": 5.06272, "grad_norm": 0.4931298494338989, "learning_rate": 0.00015595638713151162, "loss": 3.5787, "step": 3959 }, { "epoch": 5.064, "grad_norm": 0.47730547189712524, "learning_rate": 0.00015591600484587427, "loss": 3.5478, "step": 3960 }, { "epoch": 5.06528, "grad_norm": 0.534153938293457, "learning_rate": 0.0001558756225602369, "loss": 3.5449, "step": 3961 }, { "epoch": 5.06656, "grad_norm": 0.4863246977329254, "learning_rate": 0.00015583524027459953, "loss": 3.4829, "step": 3962 }, { "epoch": 5.06784, "grad_norm": 0.5165325999259949, "learning_rate": 0.00015579485798896216, "loss": 3.5866, "step": 3963 }, { "epoch": 5.06912, "grad_norm": 0.5204668641090393, "learning_rate": 0.00015575447570332482, "loss": 3.5346, "step": 3964 }, { "epoch": 5.0704, "grad_norm": 0.5140581130981445, "learning_rate": 0.00015571409341768742, "loss": 3.5652, "step": 3965 }, { "epoch": 5.07168, "grad_norm": 0.5002838373184204, "learning_rate": 0.00015567371113205005, "loss": 3.5275, "step": 3966 }, { "epoch": 5.07296, "grad_norm": 0.48074761033058167, "learning_rate": 0.00015563332884641268, "loss": 3.5435, "step": 3967 }, { "epoch": 5.07424, "grad_norm": 0.49693623185157776, "learning_rate": 0.00015559294656077531, "loss": 3.5735, "step": 3968 }, { "epoch": 5.07552, "grad_norm": 0.5019185543060303, "learning_rate": 0.00015555256427513797, "loss": 3.5511, "step": 3969 }, { "epoch": 5.0768, "grad_norm": 0.5159568190574646, "learning_rate": 0.0001555121819895006, "loss": 3.5598, "step": 3970 }, { "epoch": 5.07808, "grad_norm": 0.49248459935188293, "learning_rate": 0.00015547179970386323, "loss": 3.594, "step": 3971 }, { "epoch": 5.07936, "grad_norm": 0.5076256990432739, "learning_rate": 0.00015543141741822584, "loss": 3.586, "step": 3972 }, { "epoch": 5.08064, "grad_norm": 0.5257125496864319, "learning_rate": 0.0001553910351325885, "loss": 3.5359, "step": 3973 }, { "epoch": 5.08192, "grad_norm": 0.48996734619140625, "learning_rate": 0.00015535065284695112, "loss": 3.5319, "step": 3974 }, { "epoch": 5.0832, "grad_norm": 0.510186493396759, "learning_rate": 0.00015531027056131375, "loss": 3.5718, "step": 3975 }, { "epoch": 5.08448, "grad_norm": 0.49666544795036316, "learning_rate": 0.00015526988827567638, "loss": 3.5401, "step": 3976 }, { "epoch": 5.08576, "grad_norm": 0.5227068066596985, "learning_rate": 0.00015522950599003904, "loss": 3.6226, "step": 3977 }, { "epoch": 5.08704, "grad_norm": 0.5159752368927002, "learning_rate": 0.00015518912370440167, "loss": 3.5353, "step": 3978 }, { "epoch": 5.08832, "grad_norm": 0.49110716581344604, "learning_rate": 0.0001551487414187643, "loss": 3.4429, "step": 3979 }, { "epoch": 5.0896, "grad_norm": 0.5033590793609619, "learning_rate": 0.0001551083591331269, "loss": 3.5877, "step": 3980 }, { "epoch": 5.09088, "grad_norm": 0.5176244974136353, "learning_rate": 0.00015506797684748953, "loss": 3.5388, "step": 3981 }, { "epoch": 5.09216, "grad_norm": 0.5007118582725525, "learning_rate": 0.0001550275945618522, "loss": 3.5527, "step": 3982 }, { "epoch": 5.09344, "grad_norm": 0.4941405653953552, "learning_rate": 0.00015498721227621482, "loss": 3.5253, "step": 3983 }, { "epoch": 5.09472, "grad_norm": 0.49819836020469666, "learning_rate": 0.00015494682999057745, "loss": 3.5868, "step": 3984 }, { "epoch": 5.096, "grad_norm": 0.4690377712249756, "learning_rate": 0.00015490644770494008, "loss": 3.521, "step": 3985 }, { "epoch": 5.09728, "grad_norm": 0.5075995326042175, "learning_rate": 0.00015486606541930274, "loss": 3.4984, "step": 3986 }, { "epoch": 5.09856, "grad_norm": 0.4840170741081238, "learning_rate": 0.00015482568313366537, "loss": 3.5351, "step": 3987 }, { "epoch": 5.09984, "grad_norm": 0.5013239979743958, "learning_rate": 0.00015478530084802797, "loss": 3.5958, "step": 3988 }, { "epoch": 5.10112, "grad_norm": 0.500015377998352, "learning_rate": 0.0001547449185623906, "loss": 3.4832, "step": 3989 }, { "epoch": 5.1024, "grad_norm": 0.5128130316734314, "learning_rate": 0.00015470453627675326, "loss": 3.5816, "step": 3990 }, { "epoch": 5.10368, "grad_norm": 0.5172329545021057, "learning_rate": 0.0001546641539911159, "loss": 3.474, "step": 3991 }, { "epoch": 5.10496, "grad_norm": 0.4870198369026184, "learning_rate": 0.00015462377170547852, "loss": 3.5239, "step": 3992 }, { "epoch": 5.10624, "grad_norm": 0.5066513419151306, "learning_rate": 0.00015458338941984115, "loss": 3.6001, "step": 3993 }, { "epoch": 5.10752, "grad_norm": 0.5186413526535034, "learning_rate": 0.00015454300713420378, "loss": 3.5248, "step": 3994 }, { "epoch": 5.1088, "grad_norm": 0.4880644679069519, "learning_rate": 0.00015450262484856644, "loss": 3.5703, "step": 3995 }, { "epoch": 5.11008, "grad_norm": 0.5023115277290344, "learning_rate": 0.00015446224256292904, "loss": 3.6021, "step": 3996 }, { "epoch": 5.11136, "grad_norm": 0.490438312292099, "learning_rate": 0.00015442186027729167, "loss": 3.4991, "step": 3997 }, { "epoch": 5.11264, "grad_norm": 0.48022663593292236, "learning_rate": 0.0001543814779916543, "loss": 3.411, "step": 3998 }, { "epoch": 5.11392, "grad_norm": 0.4824524521827698, "learning_rate": 0.00015434109570601696, "loss": 3.5548, "step": 3999 }, { "epoch": 5.1152, "grad_norm": 0.48679637908935547, "learning_rate": 0.00015430071342037959, "loss": 3.4643, "step": 4000 }, { "epoch": 5.11648, "grad_norm": 0.5010484457015991, "learning_rate": 0.00015426033113474222, "loss": 3.5182, "step": 4001 }, { "epoch": 5.11776, "grad_norm": 0.49517738819122314, "learning_rate": 0.00015421994884910485, "loss": 3.5063, "step": 4002 }, { "epoch": 5.11904, "grad_norm": 0.5106388926506042, "learning_rate": 0.0001541795665634675, "loss": 3.6582, "step": 4003 }, { "epoch": 5.12032, "grad_norm": 0.4887462556362152, "learning_rate": 0.0001541391842778301, "loss": 3.6125, "step": 4004 }, { "epoch": 5.1216, "grad_norm": 0.4951592981815338, "learning_rate": 0.00015409880199219274, "loss": 3.5286, "step": 4005 }, { "epoch": 5.12288, "grad_norm": 0.5019987225532532, "learning_rate": 0.00015405841970655537, "loss": 3.5732, "step": 4006 }, { "epoch": 5.12416, "grad_norm": 0.48524710536003113, "learning_rate": 0.000154018037420918, "loss": 3.5385, "step": 4007 }, { "epoch": 5.12544, "grad_norm": 0.5041416883468628, "learning_rate": 0.00015397765513528065, "loss": 3.6019, "step": 4008 }, { "epoch": 5.12672, "grad_norm": 0.48453205823898315, "learning_rate": 0.00015393727284964328, "loss": 3.5418, "step": 4009 }, { "epoch": 5.128, "grad_norm": 0.5173502564430237, "learning_rate": 0.0001538968905640059, "loss": 3.597, "step": 4010 }, { "epoch": 5.12928, "grad_norm": 0.5132037997245789, "learning_rate": 0.00015385650827836852, "loss": 3.4569, "step": 4011 }, { "epoch": 5.13056, "grad_norm": 0.5146015286445618, "learning_rate": 0.0001538161259927312, "loss": 3.5441, "step": 4012 }, { "epoch": 5.13184, "grad_norm": 0.5107573866844177, "learning_rate": 0.0001537757437070938, "loss": 3.569, "step": 4013 }, { "epoch": 5.13312, "grad_norm": 0.5163334012031555, "learning_rate": 0.00015373536142145643, "loss": 3.6092, "step": 4014 }, { "epoch": 5.1344, "grad_norm": 0.512444019317627, "learning_rate": 0.00015369497913581906, "loss": 3.6463, "step": 4015 }, { "epoch": 5.13568, "grad_norm": 0.5129110813140869, "learning_rate": 0.00015365459685018172, "loss": 3.5649, "step": 4016 }, { "epoch": 5.13696, "grad_norm": 0.52217698097229, "learning_rate": 0.00015361421456454435, "loss": 3.5541, "step": 4017 }, { "epoch": 5.13824, "grad_norm": 0.5022246241569519, "learning_rate": 0.00015357383227890698, "loss": 3.4864, "step": 4018 }, { "epoch": 5.13952, "grad_norm": 0.5264055132865906, "learning_rate": 0.00015353344999326958, "loss": 3.5228, "step": 4019 }, { "epoch": 5.1408, "grad_norm": 0.5241397023200989, "learning_rate": 0.00015349306770763221, "loss": 3.5233, "step": 4020 }, { "epoch": 5.14208, "grad_norm": 0.6818204522132874, "learning_rate": 0.00015345268542199487, "loss": 3.639, "step": 4021 }, { "epoch": 5.14336, "grad_norm": 0.4898463785648346, "learning_rate": 0.0001534123031363575, "loss": 3.4437, "step": 4022 }, { "epoch": 5.14464, "grad_norm": 0.521457314491272, "learning_rate": 0.00015337192085072013, "loss": 3.5246, "step": 4023 }, { "epoch": 5.14592, "grad_norm": 0.5136117935180664, "learning_rate": 0.00015333153856508276, "loss": 3.5557, "step": 4024 }, { "epoch": 5.1472, "grad_norm": 0.49397048354148865, "learning_rate": 0.00015329115627944542, "loss": 3.4808, "step": 4025 }, { "epoch": 5.14848, "grad_norm": 0.5146060585975647, "learning_rate": 0.00015325077399380805, "loss": 3.518, "step": 4026 }, { "epoch": 5.14976, "grad_norm": 0.5058059692382812, "learning_rate": 0.00015321039170817068, "loss": 3.5399, "step": 4027 }, { "epoch": 5.15104, "grad_norm": 0.4919331967830658, "learning_rate": 0.00015317000942253328, "loss": 3.5817, "step": 4028 }, { "epoch": 5.15232, "grad_norm": 0.4798501133918762, "learning_rate": 0.00015312962713689594, "loss": 3.5386, "step": 4029 }, { "epoch": 5.1536, "grad_norm": 0.49712446331977844, "learning_rate": 0.00015308924485125857, "loss": 3.532, "step": 4030 }, { "epoch": 5.15488, "grad_norm": 0.5078490376472473, "learning_rate": 0.0001530488625656212, "loss": 3.5602, "step": 4031 }, { "epoch": 5.15616, "grad_norm": 0.49252021312713623, "learning_rate": 0.00015300848027998383, "loss": 3.5433, "step": 4032 }, { "epoch": 5.15744, "grad_norm": 0.5331763029098511, "learning_rate": 0.00015296809799434646, "loss": 3.5577, "step": 4033 }, { "epoch": 5.15872, "grad_norm": 0.4990587830543518, "learning_rate": 0.00015292771570870912, "loss": 3.559, "step": 4034 }, { "epoch": 5.16, "grad_norm": 0.5130864977836609, "learning_rate": 0.00015288733342307175, "loss": 3.5403, "step": 4035 }, { "epoch": 5.16128, "grad_norm": 0.4923115372657776, "learning_rate": 0.00015284695113743435, "loss": 3.4671, "step": 4036 }, { "epoch": 5.16256, "grad_norm": 0.5185816287994385, "learning_rate": 0.00015280656885179698, "loss": 3.6086, "step": 4037 }, { "epoch": 5.16384, "grad_norm": 0.5011401176452637, "learning_rate": 0.00015276618656615964, "loss": 3.5315, "step": 4038 }, { "epoch": 5.16512, "grad_norm": 0.5076486468315125, "learning_rate": 0.00015272580428052227, "loss": 3.5743, "step": 4039 }, { "epoch": 5.1664, "grad_norm": 0.49623793363571167, "learning_rate": 0.0001526854219948849, "loss": 3.6119, "step": 4040 }, { "epoch": 5.16768, "grad_norm": 0.5004082918167114, "learning_rate": 0.00015264503970924753, "loss": 3.4876, "step": 4041 }, { "epoch": 5.16896, "grad_norm": 0.4831116199493408, "learning_rate": 0.00015260465742361018, "loss": 3.5508, "step": 4042 }, { "epoch": 5.17024, "grad_norm": 0.5067176818847656, "learning_rate": 0.00015256427513797281, "loss": 3.56, "step": 4043 }, { "epoch": 5.17152, "grad_norm": 0.5060960650444031, "learning_rate": 0.00015252389285233542, "loss": 3.5146, "step": 4044 }, { "epoch": 5.1728, "grad_norm": 0.5012076497077942, "learning_rate": 0.00015248351056669805, "loss": 3.5358, "step": 4045 }, { "epoch": 5.17408, "grad_norm": 0.48700445890426636, "learning_rate": 0.0001524431282810607, "loss": 3.6407, "step": 4046 }, { "epoch": 5.17536, "grad_norm": 0.49862363934516907, "learning_rate": 0.00015240274599542333, "loss": 3.5476, "step": 4047 }, { "epoch": 5.17664, "grad_norm": 0.5184906721115112, "learning_rate": 0.00015236236370978596, "loss": 3.6116, "step": 4048 }, { "epoch": 5.17792, "grad_norm": 0.4845800995826721, "learning_rate": 0.0001523219814241486, "loss": 3.4903, "step": 4049 }, { "epoch": 5.1792, "grad_norm": 0.5097942352294922, "learning_rate": 0.00015228159913851122, "loss": 3.5238, "step": 4050 }, { "epoch": 5.18048, "grad_norm": 0.5073378086090088, "learning_rate": 0.00015224121685287388, "loss": 3.6045, "step": 4051 }, { "epoch": 5.18176, "grad_norm": 0.4886874258518219, "learning_rate": 0.00015220083456723649, "loss": 3.5069, "step": 4052 }, { "epoch": 5.18304, "grad_norm": 0.49647676944732666, "learning_rate": 0.00015216045228159912, "loss": 3.5864, "step": 4053 }, { "epoch": 5.18432, "grad_norm": 0.4973941445350647, "learning_rate": 0.00015212006999596175, "loss": 3.5571, "step": 4054 }, { "epoch": 5.1856, "grad_norm": 0.5050147175788879, "learning_rate": 0.0001520796877103244, "loss": 3.5203, "step": 4055 }, { "epoch": 5.18688, "grad_norm": 0.48217907547950745, "learning_rate": 0.00015203930542468703, "loss": 3.5327, "step": 4056 }, { "epoch": 5.18816, "grad_norm": 0.5200319290161133, "learning_rate": 0.00015199892313904966, "loss": 3.6177, "step": 4057 }, { "epoch": 5.18944, "grad_norm": 0.4934719204902649, "learning_rate": 0.0001519585408534123, "loss": 3.5692, "step": 4058 }, { "epoch": 5.19072, "grad_norm": 0.5016127824783325, "learning_rate": 0.00015191815856777495, "loss": 3.5915, "step": 4059 }, { "epoch": 5.192, "grad_norm": 0.502185046672821, "learning_rate": 0.00015187777628213755, "loss": 3.5411, "step": 4060 }, { "epoch": 5.19328, "grad_norm": 0.489409863948822, "learning_rate": 0.00015183739399650018, "loss": 3.5464, "step": 4061 }, { "epoch": 5.19456, "grad_norm": 0.5016448497772217, "learning_rate": 0.0001517970117108628, "loss": 3.5695, "step": 4062 }, { "epoch": 5.19584, "grad_norm": 0.49411389231681824, "learning_rate": 0.00015175662942522544, "loss": 3.5293, "step": 4063 }, { "epoch": 5.19712, "grad_norm": 0.5000140070915222, "learning_rate": 0.0001517162471395881, "loss": 3.5094, "step": 4064 }, { "epoch": 5.1984, "grad_norm": 0.49250856041908264, "learning_rate": 0.00015167586485395073, "loss": 3.5954, "step": 4065 }, { "epoch": 5.19968, "grad_norm": 0.48768413066864014, "learning_rate": 0.00015163548256831336, "loss": 3.4345, "step": 4066 }, { "epoch": 5.20096, "grad_norm": 0.506270170211792, "learning_rate": 0.00015159510028267596, "loss": 3.5516, "step": 4067 }, { "epoch": 5.20224, "grad_norm": 0.5164130926132202, "learning_rate": 0.00015155471799703862, "loss": 3.558, "step": 4068 }, { "epoch": 5.20352, "grad_norm": 0.5122447609901428, "learning_rate": 0.00015151433571140125, "loss": 3.596, "step": 4069 }, { "epoch": 5.2048, "grad_norm": 0.5188588500022888, "learning_rate": 0.00015147395342576388, "loss": 3.5264, "step": 4070 }, { "epoch": 5.20608, "grad_norm": 0.49674516916275024, "learning_rate": 0.0001514335711401265, "loss": 3.5941, "step": 4071 }, { "epoch": 5.2073599999999995, "grad_norm": 0.5053087472915649, "learning_rate": 0.00015139318885448917, "loss": 3.5922, "step": 4072 }, { "epoch": 5.20864, "grad_norm": 0.4910738468170166, "learning_rate": 0.0001513528065688518, "loss": 3.516, "step": 4073 }, { "epoch": 5.20992, "grad_norm": 0.48027095198631287, "learning_rate": 0.00015131242428321443, "loss": 3.4961, "step": 4074 }, { "epoch": 5.2112, "grad_norm": 0.5024387240409851, "learning_rate": 0.00015127204199757703, "loss": 3.4995, "step": 4075 }, { "epoch": 5.21248, "grad_norm": 0.5177842378616333, "learning_rate": 0.00015123165971193966, "loss": 3.5783, "step": 4076 }, { "epoch": 5.21376, "grad_norm": 0.4885920584201813, "learning_rate": 0.00015119127742630232, "loss": 3.6155, "step": 4077 }, { "epoch": 5.21504, "grad_norm": 0.5077448487281799, "learning_rate": 0.00015115089514066495, "loss": 3.581, "step": 4078 }, { "epoch": 5.21632, "grad_norm": 0.48989179730415344, "learning_rate": 0.00015111051285502758, "loss": 3.5222, "step": 4079 }, { "epoch": 5.2176, "grad_norm": 0.4760512113571167, "learning_rate": 0.0001510701305693902, "loss": 3.5481, "step": 4080 }, { "epoch": 5.21888, "grad_norm": 0.49710988998413086, "learning_rate": 0.00015102974828375287, "loss": 3.5808, "step": 4081 }, { "epoch": 5.22016, "grad_norm": 0.49187228083610535, "learning_rate": 0.0001509893659981155, "loss": 3.5191, "step": 4082 }, { "epoch": 5.22144, "grad_norm": 0.5187274813652039, "learning_rate": 0.0001509489837124781, "loss": 3.6189, "step": 4083 }, { "epoch": 5.22272, "grad_norm": 0.523126482963562, "learning_rate": 0.00015090860142684073, "loss": 3.6136, "step": 4084 }, { "epoch": 5.224, "grad_norm": 0.49129125475883484, "learning_rate": 0.00015086821914120339, "loss": 3.5228, "step": 4085 }, { "epoch": 5.22528, "grad_norm": 0.5184364318847656, "learning_rate": 0.00015082783685556602, "loss": 3.5247, "step": 4086 }, { "epoch": 5.22656, "grad_norm": 0.5139320492744446, "learning_rate": 0.00015078745456992865, "loss": 3.5408, "step": 4087 }, { "epoch": 5.22784, "grad_norm": 0.5081346035003662, "learning_rate": 0.00015074707228429128, "loss": 3.6262, "step": 4088 }, { "epoch": 5.22912, "grad_norm": 0.5397776365280151, "learning_rate": 0.0001507066899986539, "loss": 3.5904, "step": 4089 }, { "epoch": 5.2304, "grad_norm": 0.5284587740898132, "learning_rate": 0.00015066630771301656, "loss": 3.5313, "step": 4090 }, { "epoch": 5.23168, "grad_norm": 0.4830555319786072, "learning_rate": 0.00015062592542737917, "loss": 3.5611, "step": 4091 }, { "epoch": 5.23296, "grad_norm": 0.5219169855117798, "learning_rate": 0.0001505855431417418, "loss": 3.6486, "step": 4092 }, { "epoch": 5.23424, "grad_norm": 0.5159496068954468, "learning_rate": 0.00015054516085610443, "loss": 3.4704, "step": 4093 }, { "epoch": 5.23552, "grad_norm": 0.530392587184906, "learning_rate": 0.00015050477857046708, "loss": 3.517, "step": 4094 }, { "epoch": 5.2368, "grad_norm": 0.5039182901382446, "learning_rate": 0.00015046439628482971, "loss": 3.5843, "step": 4095 }, { "epoch": 5.23808, "grad_norm": 0.5051393508911133, "learning_rate": 0.00015042401399919234, "loss": 3.5637, "step": 4096 }, { "epoch": 5.23936, "grad_norm": 0.5062666535377502, "learning_rate": 0.00015038363171355497, "loss": 3.5598, "step": 4097 }, { "epoch": 5.24064, "grad_norm": 0.5078728199005127, "learning_rate": 0.00015034324942791763, "loss": 3.5031, "step": 4098 }, { "epoch": 5.24192, "grad_norm": 0.5176907181739807, "learning_rate": 0.00015030286714228023, "loss": 3.5606, "step": 4099 }, { "epoch": 5.2432, "grad_norm": 0.5005961060523987, "learning_rate": 0.00015026248485664286, "loss": 3.658, "step": 4100 }, { "epoch": 5.24448, "grad_norm": 0.5151212811470032, "learning_rate": 0.0001502221025710055, "loss": 3.6295, "step": 4101 }, { "epoch": 5.24576, "grad_norm": 0.5224319100379944, "learning_rate": 0.00015018172028536812, "loss": 3.5873, "step": 4102 }, { "epoch": 5.24704, "grad_norm": 0.5269538164138794, "learning_rate": 0.00015014133799973078, "loss": 3.5697, "step": 4103 }, { "epoch": 5.24832, "grad_norm": 0.4943110942840576, "learning_rate": 0.0001501009557140934, "loss": 3.4694, "step": 4104 }, { "epoch": 5.2496, "grad_norm": 0.5208888649940491, "learning_rate": 0.00015006057342845604, "loss": 3.5701, "step": 4105 }, { "epoch": 5.25088, "grad_norm": 0.5066584944725037, "learning_rate": 0.00015002019114281864, "loss": 3.6059, "step": 4106 }, { "epoch": 5.25216, "grad_norm": 0.5034709572792053, "learning_rate": 0.0001499798088571813, "loss": 3.5807, "step": 4107 }, { "epoch": 5.25344, "grad_norm": 0.49684327840805054, "learning_rate": 0.00014993942657154393, "loss": 3.5518, "step": 4108 }, { "epoch": 5.25472, "grad_norm": 0.5177910327911377, "learning_rate": 0.00014989904428590656, "loss": 3.5615, "step": 4109 }, { "epoch": 5.256, "grad_norm": 0.5041977167129517, "learning_rate": 0.00014985866200026922, "loss": 3.5278, "step": 4110 }, { "epoch": 5.25728, "grad_norm": 0.5060576796531677, "learning_rate": 0.00014981827971463182, "loss": 3.5217, "step": 4111 }, { "epoch": 5.25856, "grad_norm": 0.5150559544563293, "learning_rate": 0.00014977789742899448, "loss": 3.5016, "step": 4112 }, { "epoch": 5.25984, "grad_norm": 0.48947054147720337, "learning_rate": 0.0001497375151433571, "loss": 3.5498, "step": 4113 }, { "epoch": 5.26112, "grad_norm": 0.5176462531089783, "learning_rate": 0.00014969713285771974, "loss": 3.5463, "step": 4114 }, { "epoch": 5.2624, "grad_norm": 0.501726508140564, "learning_rate": 0.00014965675057208237, "loss": 3.5083, "step": 4115 }, { "epoch": 5.26368, "grad_norm": 0.5025134086608887, "learning_rate": 0.000149616368286445, "loss": 3.5474, "step": 4116 }, { "epoch": 5.26496, "grad_norm": 0.4934270679950714, "learning_rate": 0.00014957598600080763, "loss": 3.5807, "step": 4117 }, { "epoch": 5.26624, "grad_norm": 0.49150562286376953, "learning_rate": 0.00014953560371517026, "loss": 3.5932, "step": 4118 }, { "epoch": 5.26752, "grad_norm": 0.4923061728477478, "learning_rate": 0.0001494952214295329, "loss": 3.5125, "step": 4119 }, { "epoch": 5.2688, "grad_norm": 0.506479024887085, "learning_rate": 0.00014945483914389552, "loss": 3.6409, "step": 4120 }, { "epoch": 5.27008, "grad_norm": 0.47946614027023315, "learning_rate": 0.00014941445685825818, "loss": 3.5563, "step": 4121 }, { "epoch": 5.27136, "grad_norm": 0.5117473006248474, "learning_rate": 0.0001493740745726208, "loss": 3.5387, "step": 4122 }, { "epoch": 5.27264, "grad_norm": 0.49644434452056885, "learning_rate": 0.00014933369228698344, "loss": 3.6186, "step": 4123 }, { "epoch": 5.27392, "grad_norm": 0.5136969685554504, "learning_rate": 0.00014929331000134607, "loss": 3.6033, "step": 4124 }, { "epoch": 5.2752, "grad_norm": 0.5244076251983643, "learning_rate": 0.0001492529277157087, "loss": 3.6165, "step": 4125 }, { "epoch": 5.27648, "grad_norm": 0.49152183532714844, "learning_rate": 0.00014921254543007133, "loss": 3.4846, "step": 4126 }, { "epoch": 5.27776, "grad_norm": 0.49941569566726685, "learning_rate": 0.00014917216314443396, "loss": 3.6035, "step": 4127 }, { "epoch": 5.27904, "grad_norm": 0.502313494682312, "learning_rate": 0.0001491317808587966, "loss": 3.603, "step": 4128 }, { "epoch": 5.28032, "grad_norm": 0.5102902054786682, "learning_rate": 0.00014909139857315922, "loss": 3.6138, "step": 4129 }, { "epoch": 5.2816, "grad_norm": 0.5135358572006226, "learning_rate": 0.00014905101628752187, "loss": 3.5688, "step": 4130 }, { "epoch": 5.2828800000000005, "grad_norm": 0.4999293386936188, "learning_rate": 0.00014901063400188448, "loss": 3.536, "step": 4131 }, { "epoch": 5.28416, "grad_norm": 0.49614301323890686, "learning_rate": 0.00014897025171624714, "loss": 3.5474, "step": 4132 }, { "epoch": 5.28544, "grad_norm": 0.5085309147834778, "learning_rate": 0.00014892986943060977, "loss": 3.6116, "step": 4133 }, { "epoch": 5.28672, "grad_norm": 0.479676753282547, "learning_rate": 0.0001488894871449724, "loss": 3.5612, "step": 4134 }, { "epoch": 5.288, "grad_norm": 0.4859643280506134, "learning_rate": 0.00014884910485933503, "loss": 3.5404, "step": 4135 }, { "epoch": 5.28928, "grad_norm": 0.49102458357810974, "learning_rate": 0.00014880872257369766, "loss": 3.567, "step": 4136 }, { "epoch": 5.29056, "grad_norm": 0.4906257688999176, "learning_rate": 0.00014876834028806029, "loss": 3.57, "step": 4137 }, { "epoch": 5.29184, "grad_norm": 0.48179754614830017, "learning_rate": 0.00014872795800242294, "loss": 3.5837, "step": 4138 }, { "epoch": 5.29312, "grad_norm": 0.48550358414649963, "learning_rate": 0.00014868757571678555, "loss": 3.4787, "step": 4139 }, { "epoch": 5.2943999999999996, "grad_norm": 0.49071556329727173, "learning_rate": 0.0001486471934311482, "loss": 3.5332, "step": 4140 }, { "epoch": 5.29568, "grad_norm": 0.4878070652484894, "learning_rate": 0.00014860681114551083, "loss": 3.5856, "step": 4141 }, { "epoch": 5.29696, "grad_norm": 0.4919761121273041, "learning_rate": 0.00014856642885987344, "loss": 3.5238, "step": 4142 }, { "epoch": 5.29824, "grad_norm": 0.502760648727417, "learning_rate": 0.0001485260465742361, "loss": 3.5677, "step": 4143 }, { "epoch": 5.29952, "grad_norm": 0.5055625438690186, "learning_rate": 0.00014848566428859872, "loss": 3.6354, "step": 4144 }, { "epoch": 5.3008, "grad_norm": 0.4847923517227173, "learning_rate": 0.00014844528200296135, "loss": 3.5031, "step": 4145 }, { "epoch": 5.30208, "grad_norm": 0.4974401593208313, "learning_rate": 0.00014840489971732398, "loss": 3.5579, "step": 4146 }, { "epoch": 5.30336, "grad_norm": 0.4971027076244354, "learning_rate": 0.0001483645174316866, "loss": 3.5142, "step": 4147 }, { "epoch": 5.30464, "grad_norm": 0.5040155649185181, "learning_rate": 0.00014832413514604924, "loss": 3.629, "step": 4148 }, { "epoch": 5.30592, "grad_norm": 0.5108280181884766, "learning_rate": 0.0001482837528604119, "loss": 3.5502, "step": 4149 }, { "epoch": 5.3072, "grad_norm": 0.5081576704978943, "learning_rate": 0.0001482433705747745, "loss": 3.5522, "step": 4150 }, { "epoch": 5.30848, "grad_norm": 0.49711495637893677, "learning_rate": 0.00014820298828913716, "loss": 3.5817, "step": 4151 }, { "epoch": 5.30976, "grad_norm": 0.4980352520942688, "learning_rate": 0.0001481626060034998, "loss": 3.5515, "step": 4152 }, { "epoch": 5.31104, "grad_norm": 0.5169786214828491, "learning_rate": 0.00014812222371786242, "loss": 3.5684, "step": 4153 }, { "epoch": 5.31232, "grad_norm": 0.500923752784729, "learning_rate": 0.00014808184143222505, "loss": 3.5626, "step": 4154 }, { "epoch": 5.3136, "grad_norm": 0.4993918836116791, "learning_rate": 0.00014804145914658768, "loss": 3.5241, "step": 4155 }, { "epoch": 5.31488, "grad_norm": 0.5014583468437195, "learning_rate": 0.0001480010768609503, "loss": 3.5245, "step": 4156 }, { "epoch": 5.31616, "grad_norm": 0.5105462670326233, "learning_rate": 0.00014796069457531294, "loss": 3.5115, "step": 4157 }, { "epoch": 5.31744, "grad_norm": 0.506993293762207, "learning_rate": 0.0001479203122896756, "loss": 3.5967, "step": 4158 }, { "epoch": 5.31872, "grad_norm": 0.4856204688549042, "learning_rate": 0.0001478799300040382, "loss": 3.5997, "step": 4159 }, { "epoch": 5.32, "grad_norm": 0.5217317938804626, "learning_rate": 0.00014783954771840086, "loss": 3.5199, "step": 4160 }, { "epoch": 5.32128, "grad_norm": 0.4968002140522003, "learning_rate": 0.0001477991654327635, "loss": 3.5319, "step": 4161 }, { "epoch": 5.32256, "grad_norm": 0.49874651432037354, "learning_rate": 0.00014775878314712612, "loss": 3.5956, "step": 4162 }, { "epoch": 5.32384, "grad_norm": 0.5208538770675659, "learning_rate": 0.00014771840086148875, "loss": 3.5383, "step": 4163 }, { "epoch": 5.32512, "grad_norm": 0.48870334029197693, "learning_rate": 0.00014767801857585138, "loss": 3.526, "step": 4164 }, { "epoch": 5.3264, "grad_norm": 0.47953054308891296, "learning_rate": 0.000147637636290214, "loss": 3.5108, "step": 4165 }, { "epoch": 5.32768, "grad_norm": 0.5066089034080505, "learning_rate": 0.00014759725400457667, "loss": 3.5234, "step": 4166 }, { "epoch": 5.32896, "grad_norm": 0.5130050778388977, "learning_rate": 0.00014755687171893927, "loss": 3.6001, "step": 4167 }, { "epoch": 5.33024, "grad_norm": 0.5042834281921387, "learning_rate": 0.0001475164894333019, "loss": 3.6354, "step": 4168 }, { "epoch": 5.33152, "grad_norm": 0.5093918442726135, "learning_rate": 0.00014747610714766456, "loss": 3.5279, "step": 4169 }, { "epoch": 5.3328, "grad_norm": 0.5074060559272766, "learning_rate": 0.00014743572486202716, "loss": 3.5477, "step": 4170 }, { "epoch": 5.33408, "grad_norm": 0.5181780457496643, "learning_rate": 0.00014739534257638982, "loss": 3.5895, "step": 4171 }, { "epoch": 5.33536, "grad_norm": 0.4882206618785858, "learning_rate": 0.00014735496029075245, "loss": 3.5639, "step": 4172 }, { "epoch": 5.33664, "grad_norm": 0.4917638599872589, "learning_rate": 0.00014731457800511508, "loss": 3.5782, "step": 4173 }, { "epoch": 5.33792, "grad_norm": 0.5091182589530945, "learning_rate": 0.0001472741957194777, "loss": 3.565, "step": 4174 }, { "epoch": 5.3392, "grad_norm": 0.5003569722175598, "learning_rate": 0.00014723381343384034, "loss": 3.5255, "step": 4175 }, { "epoch": 5.34048, "grad_norm": 0.5167132019996643, "learning_rate": 0.00014719343114820297, "loss": 3.5591, "step": 4176 }, { "epoch": 5.34176, "grad_norm": 0.521111011505127, "learning_rate": 0.00014715304886256562, "loss": 3.6051, "step": 4177 }, { "epoch": 5.34304, "grad_norm": 0.5027885437011719, "learning_rate": 0.00014711266657692823, "loss": 3.5387, "step": 4178 }, { "epoch": 5.34432, "grad_norm": 0.5094881057739258, "learning_rate": 0.00014707228429129088, "loss": 3.4888, "step": 4179 }, { "epoch": 5.3456, "grad_norm": 0.5104243755340576, "learning_rate": 0.00014703190200565351, "loss": 3.6305, "step": 4180 }, { "epoch": 5.34688, "grad_norm": 0.5000970363616943, "learning_rate": 0.00014699151972001614, "loss": 3.5628, "step": 4181 }, { "epoch": 5.34816, "grad_norm": 0.5333858132362366, "learning_rate": 0.00014695113743437877, "loss": 3.553, "step": 4182 }, { "epoch": 5.3494399999999995, "grad_norm": 0.4926629364490509, "learning_rate": 0.0001469107551487414, "loss": 3.5841, "step": 4183 }, { "epoch": 5.35072, "grad_norm": 0.5149261951446533, "learning_rate": 0.00014687037286310403, "loss": 3.5844, "step": 4184 }, { "epoch": 5.352, "grad_norm": 0.48902156949043274, "learning_rate": 0.00014682999057746666, "loss": 3.5553, "step": 4185 }, { "epoch": 5.35328, "grad_norm": 0.5008950233459473, "learning_rate": 0.0001467896082918293, "loss": 3.5949, "step": 4186 }, { "epoch": 5.35456, "grad_norm": 0.5129325985908508, "learning_rate": 0.00014674922600619192, "loss": 3.5755, "step": 4187 }, { "epoch": 5.35584, "grad_norm": 0.5095540881156921, "learning_rate": 0.00014670884372055458, "loss": 3.6056, "step": 4188 }, { "epoch": 5.35712, "grad_norm": 0.497380793094635, "learning_rate": 0.0001466684614349172, "loss": 3.5998, "step": 4189 }, { "epoch": 5.3584, "grad_norm": 0.5010343194007874, "learning_rate": 0.00014662807914927984, "loss": 3.5809, "step": 4190 }, { "epoch": 5.35968, "grad_norm": 0.498839795589447, "learning_rate": 0.00014658769686364247, "loss": 3.6314, "step": 4191 }, { "epoch": 5.36096, "grad_norm": 0.5233437418937683, "learning_rate": 0.0001465473145780051, "loss": 3.5928, "step": 4192 }, { "epoch": 5.36224, "grad_norm": 0.4920739233493805, "learning_rate": 0.00014650693229236773, "loss": 3.5803, "step": 4193 }, { "epoch": 5.36352, "grad_norm": 0.5078881978988647, "learning_rate": 0.00014646655000673036, "loss": 3.6682, "step": 4194 }, { "epoch": 5.3648, "grad_norm": 0.48847028613090515, "learning_rate": 0.000146426167721093, "loss": 3.5167, "step": 4195 }, { "epoch": 5.36608, "grad_norm": 0.49251747131347656, "learning_rate": 0.00014638578543545562, "loss": 3.6127, "step": 4196 }, { "epoch": 5.36736, "grad_norm": 0.491445928812027, "learning_rate": 0.00014634540314981828, "loss": 3.6277, "step": 4197 }, { "epoch": 5.36864, "grad_norm": 0.5016657710075378, "learning_rate": 0.00014630502086418088, "loss": 3.5437, "step": 4198 }, { "epoch": 5.3699200000000005, "grad_norm": 0.4860299825668335, "learning_rate": 0.00014626463857854354, "loss": 3.5576, "step": 4199 }, { "epoch": 5.3712, "grad_norm": 0.5150024890899658, "learning_rate": 0.00014622425629290617, "loss": 3.5746, "step": 4200 }, { "epoch": 5.37248, "grad_norm": 0.49160733819007874, "learning_rate": 0.0001461838740072688, "loss": 3.5343, "step": 4201 }, { "epoch": 5.37376, "grad_norm": 0.5088993906974792, "learning_rate": 0.00014614349172163143, "loss": 3.6182, "step": 4202 }, { "epoch": 5.37504, "grad_norm": 0.4988862872123718, "learning_rate": 0.00014610310943599406, "loss": 3.5938, "step": 4203 }, { "epoch": 5.37632, "grad_norm": 0.49565553665161133, "learning_rate": 0.0001460627271503567, "loss": 3.5703, "step": 4204 }, { "epoch": 5.3776, "grad_norm": 0.5155361294746399, "learning_rate": 0.00014602234486471935, "loss": 3.5962, "step": 4205 }, { "epoch": 5.37888, "grad_norm": 0.49401092529296875, "learning_rate": 0.00014598196257908195, "loss": 3.571, "step": 4206 }, { "epoch": 5.38016, "grad_norm": 0.4905524253845215, "learning_rate": 0.0001459415802934446, "loss": 3.5594, "step": 4207 }, { "epoch": 5.38144, "grad_norm": 0.5025436878204346, "learning_rate": 0.00014590119800780724, "loss": 3.5889, "step": 4208 }, { "epoch": 5.38272, "grad_norm": 0.5062000155448914, "learning_rate": 0.00014586081572216987, "loss": 3.5641, "step": 4209 }, { "epoch": 5.384, "grad_norm": 0.4925670027732849, "learning_rate": 0.0001458204334365325, "loss": 3.5346, "step": 4210 }, { "epoch": 5.38528, "grad_norm": 0.5170202255249023, "learning_rate": 0.00014578005115089513, "loss": 3.5173, "step": 4211 }, { "epoch": 5.38656, "grad_norm": 0.4998747706413269, "learning_rate": 0.00014573966886525776, "loss": 3.4946, "step": 4212 }, { "epoch": 5.38784, "grad_norm": 0.5495707392692566, "learning_rate": 0.0001456992865796204, "loss": 3.6223, "step": 4213 }, { "epoch": 5.38912, "grad_norm": 0.5236058831214905, "learning_rate": 0.00014565890429398302, "loss": 3.5723, "step": 4214 }, { "epoch": 5.3904, "grad_norm": 0.49637115001678467, "learning_rate": 0.00014561852200834565, "loss": 3.5521, "step": 4215 }, { "epoch": 5.39168, "grad_norm": 0.5067302584648132, "learning_rate": 0.0001455781397227083, "loss": 3.6365, "step": 4216 }, { "epoch": 5.39296, "grad_norm": 0.5126550197601318, "learning_rate": 0.00014553775743707094, "loss": 3.5732, "step": 4217 }, { "epoch": 5.39424, "grad_norm": 0.4854673743247986, "learning_rate": 0.00014549737515143357, "loss": 3.5872, "step": 4218 }, { "epoch": 5.39552, "grad_norm": 0.5130731463432312, "learning_rate": 0.0001454569928657962, "loss": 3.5871, "step": 4219 }, { "epoch": 5.3968, "grad_norm": 0.5053207278251648, "learning_rate": 0.00014541661058015883, "loss": 3.5029, "step": 4220 }, { "epoch": 5.39808, "grad_norm": 0.5023342370986938, "learning_rate": 0.00014537622829452146, "loss": 3.633, "step": 4221 }, { "epoch": 5.39936, "grad_norm": 0.4882518947124481, "learning_rate": 0.00014533584600888409, "loss": 3.5118, "step": 4222 }, { "epoch": 5.40064, "grad_norm": 0.4898047149181366, "learning_rate": 0.00014529546372324672, "loss": 3.5909, "step": 4223 }, { "epoch": 5.40192, "grad_norm": 0.4916389286518097, "learning_rate": 0.00014525508143760935, "loss": 3.5209, "step": 4224 }, { "epoch": 5.4032, "grad_norm": 0.4937368631362915, "learning_rate": 0.000145214699151972, "loss": 3.5068, "step": 4225 }, { "epoch": 5.40448, "grad_norm": 0.49129781126976013, "learning_rate": 0.0001451743168663346, "loss": 3.6266, "step": 4226 }, { "epoch": 5.40576, "grad_norm": 0.5058757066726685, "learning_rate": 0.00014513393458069726, "loss": 3.5565, "step": 4227 }, { "epoch": 5.40704, "grad_norm": 0.5102645754814148, "learning_rate": 0.0001450935522950599, "loss": 3.6193, "step": 4228 }, { "epoch": 5.40832, "grad_norm": 0.5095773935317993, "learning_rate": 0.00014505317000942252, "loss": 3.5692, "step": 4229 }, { "epoch": 5.4096, "grad_norm": 0.5054805278778076, "learning_rate": 0.00014501278772378515, "loss": 3.5361, "step": 4230 }, { "epoch": 5.41088, "grad_norm": 0.4920346438884735, "learning_rate": 0.00014497240543814778, "loss": 3.5224, "step": 4231 }, { "epoch": 5.41216, "grad_norm": 0.4970761239528656, "learning_rate": 0.00014493202315251041, "loss": 3.5793, "step": 4232 }, { "epoch": 5.41344, "grad_norm": 0.502128005027771, "learning_rate": 0.00014489164086687307, "loss": 3.5549, "step": 4233 }, { "epoch": 5.41472, "grad_norm": 0.4969402551651001, "learning_rate": 0.00014485125858123567, "loss": 3.5693, "step": 4234 }, { "epoch": 5.416, "grad_norm": 0.4911673665046692, "learning_rate": 0.0001448108762955983, "loss": 3.5352, "step": 4235 }, { "epoch": 5.41728, "grad_norm": 0.5134766101837158, "learning_rate": 0.00014477049400996096, "loss": 3.5124, "step": 4236 }, { "epoch": 5.41856, "grad_norm": 0.5096125602722168, "learning_rate": 0.00014473011172432356, "loss": 3.5739, "step": 4237 }, { "epoch": 5.41984, "grad_norm": 0.5185458064079285, "learning_rate": 0.00014468972943868622, "loss": 3.5937, "step": 4238 }, { "epoch": 5.42112, "grad_norm": 0.5176596641540527, "learning_rate": 0.00014464934715304885, "loss": 3.5228, "step": 4239 }, { "epoch": 5.4224, "grad_norm": 0.5246290564537048, "learning_rate": 0.00014460896486741148, "loss": 3.5686, "step": 4240 }, { "epoch": 5.42368, "grad_norm": 0.5069420337677002, "learning_rate": 0.0001445685825817741, "loss": 3.5579, "step": 4241 }, { "epoch": 5.4249600000000004, "grad_norm": 0.5116468667984009, "learning_rate": 0.00014452820029613674, "loss": 3.5462, "step": 4242 }, { "epoch": 5.42624, "grad_norm": 0.5116104483604431, "learning_rate": 0.00014448781801049937, "loss": 3.587, "step": 4243 }, { "epoch": 5.42752, "grad_norm": 0.5079525113105774, "learning_rate": 0.00014444743572486203, "loss": 3.5666, "step": 4244 }, { "epoch": 5.4288, "grad_norm": 0.49015963077545166, "learning_rate": 0.00014440705343922466, "loss": 3.5803, "step": 4245 }, { "epoch": 5.43008, "grad_norm": 0.5075419545173645, "learning_rate": 0.0001443666711535873, "loss": 3.5527, "step": 4246 }, { "epoch": 5.43136, "grad_norm": 0.4922216534614563, "learning_rate": 0.00014432628886794992, "loss": 3.5723, "step": 4247 }, { "epoch": 5.43264, "grad_norm": 0.5076229572296143, "learning_rate": 0.00014428590658231255, "loss": 3.5729, "step": 4248 }, { "epoch": 5.43392, "grad_norm": 0.4939456284046173, "learning_rate": 0.00014424552429667518, "loss": 3.4065, "step": 4249 }, { "epoch": 5.4352, "grad_norm": 0.5079296827316284, "learning_rate": 0.0001442051420110378, "loss": 3.6077, "step": 4250 }, { "epoch": 5.4364799999999995, "grad_norm": 0.4930500090122223, "learning_rate": 0.00014416475972540044, "loss": 3.554, "step": 4251 }, { "epoch": 5.43776, "grad_norm": 0.5013241767883301, "learning_rate": 0.00014412437743976307, "loss": 3.5551, "step": 4252 }, { "epoch": 5.43904, "grad_norm": 0.5022532939910889, "learning_rate": 0.00014408399515412573, "loss": 3.5839, "step": 4253 }, { "epoch": 5.44032, "grad_norm": 0.5082839727401733, "learning_rate": 0.00014404361286848833, "loss": 3.6063, "step": 4254 }, { "epoch": 5.4416, "grad_norm": 0.5208956599235535, "learning_rate": 0.000144003230582851, "loss": 3.4991, "step": 4255 }, { "epoch": 5.44288, "grad_norm": 0.5048259496688843, "learning_rate": 0.00014396284829721362, "loss": 3.5349, "step": 4256 }, { "epoch": 5.44416, "grad_norm": 0.49307942390441895, "learning_rate": 0.00014392246601157625, "loss": 3.5114, "step": 4257 }, { "epoch": 5.44544, "grad_norm": 0.5296189188957214, "learning_rate": 0.00014388208372593888, "loss": 3.5802, "step": 4258 }, { "epoch": 5.44672, "grad_norm": 0.5138427019119263, "learning_rate": 0.0001438417014403015, "loss": 3.5841, "step": 4259 }, { "epoch": 5.448, "grad_norm": 0.4858483672142029, "learning_rate": 0.00014380131915466414, "loss": 3.5527, "step": 4260 }, { "epoch": 5.44928, "grad_norm": 0.5035324692726135, "learning_rate": 0.00014376093686902677, "loss": 3.4928, "step": 4261 }, { "epoch": 5.45056, "grad_norm": 0.49650654196739197, "learning_rate": 0.0001437205545833894, "loss": 3.631, "step": 4262 }, { "epoch": 5.45184, "grad_norm": 0.5007782578468323, "learning_rate": 0.00014368017229775203, "loss": 3.5251, "step": 4263 }, { "epoch": 5.45312, "grad_norm": 0.4857237637042999, "learning_rate": 0.00014363979001211468, "loss": 3.5328, "step": 4264 }, { "epoch": 5.4544, "grad_norm": 0.5186595320701599, "learning_rate": 0.0001435994077264773, "loss": 3.5839, "step": 4265 }, { "epoch": 5.45568, "grad_norm": 0.48634448647499084, "learning_rate": 0.00014355902544083994, "loss": 3.5082, "step": 4266 }, { "epoch": 5.45696, "grad_norm": 0.5036529302597046, "learning_rate": 0.00014351864315520257, "loss": 3.6215, "step": 4267 }, { "epoch": 5.45824, "grad_norm": 0.49025043845176697, "learning_rate": 0.0001434782608695652, "loss": 3.5371, "step": 4268 }, { "epoch": 5.45952, "grad_norm": 0.5002211928367615, "learning_rate": 0.00014343787858392783, "loss": 3.5434, "step": 4269 }, { "epoch": 5.4608, "grad_norm": 0.48219621181488037, "learning_rate": 0.00014339749629829046, "loss": 3.567, "step": 4270 }, { "epoch": 5.46208, "grad_norm": 0.4965188801288605, "learning_rate": 0.0001433571140126531, "loss": 3.5367, "step": 4271 }, { "epoch": 5.46336, "grad_norm": 0.4985227584838867, "learning_rate": 0.00014331673172701575, "loss": 3.4608, "step": 4272 }, { "epoch": 5.46464, "grad_norm": 0.4829920828342438, "learning_rate": 0.00014327634944137836, "loss": 3.5967, "step": 4273 }, { "epoch": 5.46592, "grad_norm": 0.49400630593299866, "learning_rate": 0.000143235967155741, "loss": 3.5266, "step": 4274 }, { "epoch": 5.4672, "grad_norm": 0.5178740620613098, "learning_rate": 0.00014319558487010364, "loss": 3.5871, "step": 4275 }, { "epoch": 5.46848, "grad_norm": 0.49614524841308594, "learning_rate": 0.00014315520258446627, "loss": 3.6358, "step": 4276 }, { "epoch": 5.46976, "grad_norm": 0.48628175258636475, "learning_rate": 0.0001431148202988289, "loss": 3.4888, "step": 4277 }, { "epoch": 5.47104, "grad_norm": 0.5142548680305481, "learning_rate": 0.00014307443801319153, "loss": 3.5692, "step": 4278 }, { "epoch": 5.47232, "grad_norm": 0.5002409219741821, "learning_rate": 0.00014303405572755416, "loss": 3.6088, "step": 4279 }, { "epoch": 5.4736, "grad_norm": 0.5171060562133789, "learning_rate": 0.0001429936734419168, "loss": 3.5909, "step": 4280 }, { "epoch": 5.47488, "grad_norm": 0.5077352523803711, "learning_rate": 0.00014295329115627945, "loss": 3.655, "step": 4281 }, { "epoch": 5.47616, "grad_norm": 0.4823010563850403, "learning_rate": 0.00014291290887064205, "loss": 3.5184, "step": 4282 }, { "epoch": 5.47744, "grad_norm": 0.5208946466445923, "learning_rate": 0.0001428725265850047, "loss": 3.6297, "step": 4283 }, { "epoch": 5.47872, "grad_norm": 0.5040287375450134, "learning_rate": 0.00014283214429936734, "loss": 3.5483, "step": 4284 }, { "epoch": 5.48, "grad_norm": 0.5379658341407776, "learning_rate": 0.00014279176201372997, "loss": 3.5732, "step": 4285 }, { "epoch": 5.48128, "grad_norm": 0.4982692003250122, "learning_rate": 0.0001427513797280926, "loss": 3.5691, "step": 4286 }, { "epoch": 5.48256, "grad_norm": 0.5001688003540039, "learning_rate": 0.00014271099744245523, "loss": 3.6635, "step": 4287 }, { "epoch": 5.48384, "grad_norm": 0.4979035258293152, "learning_rate": 0.00014267061515681786, "loss": 3.582, "step": 4288 }, { "epoch": 5.48512, "grad_norm": 0.49051418900489807, "learning_rate": 0.0001426302328711805, "loss": 3.5999, "step": 4289 }, { "epoch": 5.4864, "grad_norm": 0.4869391918182373, "learning_rate": 0.00014258985058554312, "loss": 3.5503, "step": 4290 }, { "epoch": 5.48768, "grad_norm": 0.5105782747268677, "learning_rate": 0.00014254946829990575, "loss": 3.6143, "step": 4291 }, { "epoch": 5.48896, "grad_norm": 0.4911419451236725, "learning_rate": 0.0001425090860142684, "loss": 3.5467, "step": 4292 }, { "epoch": 5.49024, "grad_norm": 0.4994243085384369, "learning_rate": 0.000142468703728631, "loss": 3.6287, "step": 4293 }, { "epoch": 5.49152, "grad_norm": 0.4881093502044678, "learning_rate": 0.00014242832144299367, "loss": 3.5324, "step": 4294 }, { "epoch": 5.4928, "grad_norm": 0.4933335781097412, "learning_rate": 0.0001423879391573563, "loss": 3.5793, "step": 4295 }, { "epoch": 5.49408, "grad_norm": 0.4933607876300812, "learning_rate": 0.00014234755687171893, "loss": 3.5533, "step": 4296 }, { "epoch": 5.49536, "grad_norm": 0.4941942095756531, "learning_rate": 0.00014230717458608156, "loss": 3.5883, "step": 4297 }, { "epoch": 5.49664, "grad_norm": 0.5198619961738586, "learning_rate": 0.0001422667923004442, "loss": 3.667, "step": 4298 }, { "epoch": 5.49792, "grad_norm": 0.4889330267906189, "learning_rate": 0.00014222641001480682, "loss": 3.5658, "step": 4299 }, { "epoch": 5.4992, "grad_norm": 0.48626118898391724, "learning_rate": 0.00014218602772916948, "loss": 3.5525, "step": 4300 }, { "epoch": 5.50048, "grad_norm": 0.4914554953575134, "learning_rate": 0.00014214564544353208, "loss": 3.5968, "step": 4301 }, { "epoch": 5.50176, "grad_norm": 0.49524810910224915, "learning_rate": 0.0001421052631578947, "loss": 3.5555, "step": 4302 }, { "epoch": 5.50304, "grad_norm": 0.5117614269256592, "learning_rate": 0.00014206488087225737, "loss": 3.6015, "step": 4303 }, { "epoch": 5.50432, "grad_norm": 0.4925834834575653, "learning_rate": 0.00014202449858662, "loss": 3.5378, "step": 4304 }, { "epoch": 5.5056, "grad_norm": 0.494954913854599, "learning_rate": 0.00014198411630098263, "loss": 3.5463, "step": 4305 }, { "epoch": 5.50688, "grad_norm": 0.5056374073028564, "learning_rate": 0.00014194373401534526, "loss": 3.6499, "step": 4306 }, { "epoch": 5.50816, "grad_norm": 0.495236873626709, "learning_rate": 0.00014190335172970789, "loss": 3.5903, "step": 4307 }, { "epoch": 5.50944, "grad_norm": 0.48822644352912903, "learning_rate": 0.00014186296944407052, "loss": 3.5119, "step": 4308 }, { "epoch": 5.51072, "grad_norm": 0.5125917792320251, "learning_rate": 0.00014182258715843315, "loss": 3.5481, "step": 4309 }, { "epoch": 5.5120000000000005, "grad_norm": 0.5041484832763672, "learning_rate": 0.00014178220487279578, "loss": 3.542, "step": 4310 }, { "epoch": 5.51328, "grad_norm": 0.48397096991539, "learning_rate": 0.00014174182258715843, "loss": 3.5553, "step": 4311 }, { "epoch": 5.51456, "grad_norm": 0.49939045310020447, "learning_rate": 0.00014170144030152106, "loss": 3.6071, "step": 4312 }, { "epoch": 5.51584, "grad_norm": 0.4981886148452759, "learning_rate": 0.0001416610580158837, "loss": 3.6154, "step": 4313 }, { "epoch": 5.51712, "grad_norm": 0.4908769130706787, "learning_rate": 0.00014162067573024632, "loss": 3.5811, "step": 4314 }, { "epoch": 5.5184, "grad_norm": 0.4873631000518799, "learning_rate": 0.00014158029344460895, "loss": 3.5451, "step": 4315 }, { "epoch": 5.51968, "grad_norm": 0.5031121969223022, "learning_rate": 0.00014153991115897158, "loss": 3.4982, "step": 4316 }, { "epoch": 5.52096, "grad_norm": 0.5029070973396301, "learning_rate": 0.00014149952887333421, "loss": 3.5159, "step": 4317 }, { "epoch": 5.52224, "grad_norm": 0.5104166865348816, "learning_rate": 0.00014145914658769684, "loss": 3.5713, "step": 4318 }, { "epoch": 5.5235199999999995, "grad_norm": 0.512605607509613, "learning_rate": 0.00014141876430205947, "loss": 3.5637, "step": 4319 }, { "epoch": 5.5248, "grad_norm": 0.5072982311248779, "learning_rate": 0.00014137838201642213, "loss": 3.5527, "step": 4320 }, { "epoch": 5.52608, "grad_norm": 0.5111182332038879, "learning_rate": 0.00014133799973078473, "loss": 3.6156, "step": 4321 }, { "epoch": 5.52736, "grad_norm": 0.5078898668289185, "learning_rate": 0.0001412976174451474, "loss": 3.5136, "step": 4322 }, { "epoch": 5.52864, "grad_norm": 0.49358004331588745, "learning_rate": 0.00014125723515951002, "loss": 3.481, "step": 4323 }, { "epoch": 5.52992, "grad_norm": 0.4940800070762634, "learning_rate": 0.00014121685287387265, "loss": 3.5878, "step": 4324 }, { "epoch": 5.5312, "grad_norm": 0.4859899580478668, "learning_rate": 0.00014117647058823528, "loss": 3.5579, "step": 4325 }, { "epoch": 5.53248, "grad_norm": 0.5090684294700623, "learning_rate": 0.0001411360883025979, "loss": 3.5731, "step": 4326 }, { "epoch": 5.53376, "grad_norm": 0.5048773884773254, "learning_rate": 0.00014109570601696054, "loss": 3.6074, "step": 4327 }, { "epoch": 5.53504, "grad_norm": 0.5031253099441528, "learning_rate": 0.0001410553237313232, "loss": 3.554, "step": 4328 }, { "epoch": 5.53632, "grad_norm": 0.48305338621139526, "learning_rate": 0.0001410149414456858, "loss": 3.5785, "step": 4329 }, { "epoch": 5.5376, "grad_norm": 0.5137234330177307, "learning_rate": 0.00014097455916004843, "loss": 3.5755, "step": 4330 }, { "epoch": 5.53888, "grad_norm": 0.4992203414440155, "learning_rate": 0.0001409341768744111, "loss": 3.5376, "step": 4331 }, { "epoch": 5.54016, "grad_norm": 0.5001314878463745, "learning_rate": 0.00014089379458877372, "loss": 3.6757, "step": 4332 }, { "epoch": 5.54144, "grad_norm": 0.4925899803638458, "learning_rate": 0.00014085341230313635, "loss": 3.5308, "step": 4333 }, { "epoch": 5.54272, "grad_norm": 0.5240253210067749, "learning_rate": 0.00014081303001749898, "loss": 3.539, "step": 4334 }, { "epoch": 5.5440000000000005, "grad_norm": 0.5012069940567017, "learning_rate": 0.0001407726477318616, "loss": 3.5401, "step": 4335 }, { "epoch": 5.54528, "grad_norm": 0.4972357451915741, "learning_rate": 0.00014073226544622424, "loss": 3.5214, "step": 4336 }, { "epoch": 5.54656, "grad_norm": 0.5054651498794556, "learning_rate": 0.00014069188316058687, "loss": 3.4729, "step": 4337 }, { "epoch": 5.54784, "grad_norm": 0.4873622953891754, "learning_rate": 0.0001406515008749495, "loss": 3.5886, "step": 4338 }, { "epoch": 5.54912, "grad_norm": 0.5032194256782532, "learning_rate": 0.00014061111858931216, "loss": 3.4718, "step": 4339 }, { "epoch": 5.5504, "grad_norm": 0.5260450839996338, "learning_rate": 0.0001405707363036748, "loss": 3.5285, "step": 4340 }, { "epoch": 5.55168, "grad_norm": 0.4904164671897888, "learning_rate": 0.00014053035401803742, "loss": 3.5785, "step": 4341 }, { "epoch": 5.55296, "grad_norm": 0.49703943729400635, "learning_rate": 0.00014048997173240005, "loss": 3.5338, "step": 4342 }, { "epoch": 5.55424, "grad_norm": 0.50665283203125, "learning_rate": 0.00014044958944676268, "loss": 3.5355, "step": 4343 }, { "epoch": 5.55552, "grad_norm": 0.5014834403991699, "learning_rate": 0.0001404092071611253, "loss": 3.5493, "step": 4344 }, { "epoch": 5.5568, "grad_norm": 0.5009499788284302, "learning_rate": 0.00014036882487548794, "loss": 3.6308, "step": 4345 }, { "epoch": 5.55808, "grad_norm": 0.4850377142429352, "learning_rate": 0.00014032844258985057, "loss": 3.6086, "step": 4346 }, { "epoch": 5.55936, "grad_norm": 0.5173080563545227, "learning_rate": 0.0001402880603042132, "loss": 3.5599, "step": 4347 }, { "epoch": 5.56064, "grad_norm": 0.5030738711357117, "learning_rate": 0.00014024767801857585, "loss": 3.618, "step": 4348 }, { "epoch": 5.56192, "grad_norm": 0.5130670666694641, "learning_rate": 0.00014020729573293846, "loss": 3.5968, "step": 4349 }, { "epoch": 5.5632, "grad_norm": 0.5004739761352539, "learning_rate": 0.00014016691344730111, "loss": 3.5825, "step": 4350 }, { "epoch": 5.56448, "grad_norm": 0.5110647082328796, "learning_rate": 0.00014012653116166375, "loss": 3.5971, "step": 4351 }, { "epoch": 5.56576, "grad_norm": 0.4908808767795563, "learning_rate": 0.00014008614887602638, "loss": 3.5662, "step": 4352 }, { "epoch": 5.56704, "grad_norm": 0.49879395961761475, "learning_rate": 0.000140045766590389, "loss": 3.5705, "step": 4353 }, { "epoch": 5.56832, "grad_norm": 0.5124648809432983, "learning_rate": 0.00014000538430475164, "loss": 3.618, "step": 4354 }, { "epoch": 5.5696, "grad_norm": 0.48847079277038574, "learning_rate": 0.00013996500201911427, "loss": 3.5621, "step": 4355 }, { "epoch": 5.57088, "grad_norm": 0.5030008554458618, "learning_rate": 0.0001399246197334769, "loss": 3.4727, "step": 4356 }, { "epoch": 5.57216, "grad_norm": 0.5004321932792664, "learning_rate": 0.00013988423744783953, "loss": 3.5718, "step": 4357 }, { "epoch": 5.57344, "grad_norm": 0.49990010261535645, "learning_rate": 0.00013984385516220216, "loss": 3.5607, "step": 4358 }, { "epoch": 5.57472, "grad_norm": 0.4781895577907562, "learning_rate": 0.0001398034728765648, "loss": 3.5735, "step": 4359 }, { "epoch": 5.576, "grad_norm": 0.4892998933792114, "learning_rate": 0.00013976309059092742, "loss": 3.6314, "step": 4360 }, { "epoch": 5.57728, "grad_norm": 0.5173603296279907, "learning_rate": 0.00013972270830529007, "loss": 3.5335, "step": 4361 }, { "epoch": 5.5785599999999995, "grad_norm": 0.4709049463272095, "learning_rate": 0.0001396823260196527, "loss": 3.5218, "step": 4362 }, { "epoch": 5.57984, "grad_norm": 0.5281604528427124, "learning_rate": 0.00013964194373401533, "loss": 3.589, "step": 4363 }, { "epoch": 5.58112, "grad_norm": 0.49531951546669006, "learning_rate": 0.00013960156144837796, "loss": 3.5535, "step": 4364 }, { "epoch": 5.5824, "grad_norm": 0.5021586418151855, "learning_rate": 0.0001395611791627406, "loss": 3.461, "step": 4365 }, { "epoch": 5.58368, "grad_norm": 0.5123895406723022, "learning_rate": 0.00013952079687710322, "loss": 3.4411, "step": 4366 }, { "epoch": 5.58496, "grad_norm": 0.515090823173523, "learning_rate": 0.00013948041459146588, "loss": 3.6073, "step": 4367 }, { "epoch": 5.58624, "grad_norm": 0.5333694815635681, "learning_rate": 0.0001394400323058285, "loss": 3.6164, "step": 4368 }, { "epoch": 5.58752, "grad_norm": 0.48728322982788086, "learning_rate": 0.0001393996500201911, "loss": 3.529, "step": 4369 }, { "epoch": 5.5888, "grad_norm": 0.5210532546043396, "learning_rate": 0.00013935926773455377, "loss": 3.455, "step": 4370 }, { "epoch": 5.59008, "grad_norm": 0.5110505223274231, "learning_rate": 0.0001393188854489164, "loss": 3.5412, "step": 4371 }, { "epoch": 5.59136, "grad_norm": 0.4856546223163605, "learning_rate": 0.00013927850316327903, "loss": 3.5445, "step": 4372 }, { "epoch": 5.59264, "grad_norm": 0.5165418386459351, "learning_rate": 0.00013923812087764166, "loss": 3.5215, "step": 4373 }, { "epoch": 5.59392, "grad_norm": 0.524249255657196, "learning_rate": 0.0001391977385920043, "loss": 3.579, "step": 4374 }, { "epoch": 5.5952, "grad_norm": 0.5186324715614319, "learning_rate": 0.00013915735630636692, "loss": 3.5448, "step": 4375 }, { "epoch": 5.59648, "grad_norm": 0.4949183166027069, "learning_rate": 0.00013911697402072958, "loss": 3.5652, "step": 4376 }, { "epoch": 5.59776, "grad_norm": 0.5009962916374207, "learning_rate": 0.00013907659173509218, "loss": 3.6081, "step": 4377 }, { "epoch": 5.5990400000000005, "grad_norm": 0.4920154809951782, "learning_rate": 0.00013903620944945484, "loss": 3.5249, "step": 4378 }, { "epoch": 5.60032, "grad_norm": 0.5058812499046326, "learning_rate": 0.00013899582716381747, "loss": 3.612, "step": 4379 }, { "epoch": 5.6016, "grad_norm": 0.5027937889099121, "learning_rate": 0.0001389554448781801, "loss": 3.5013, "step": 4380 }, { "epoch": 5.60288, "grad_norm": 0.5067436099052429, "learning_rate": 0.00013891506259254273, "loss": 3.5146, "step": 4381 }, { "epoch": 5.60416, "grad_norm": 0.49127253890037537, "learning_rate": 0.00013887468030690536, "loss": 3.6259, "step": 4382 }, { "epoch": 5.60544, "grad_norm": 0.5145688056945801, "learning_rate": 0.000138834298021268, "loss": 3.6815, "step": 4383 }, { "epoch": 5.60672, "grad_norm": 0.5067407488822937, "learning_rate": 0.00013879391573563062, "loss": 3.6327, "step": 4384 }, { "epoch": 5.608, "grad_norm": 0.5062004923820496, "learning_rate": 0.00013875353344999325, "loss": 3.5035, "step": 4385 }, { "epoch": 5.60928, "grad_norm": 0.483181893825531, "learning_rate": 0.00013871315116435588, "loss": 3.5152, "step": 4386 }, { "epoch": 5.6105599999999995, "grad_norm": 0.5145739912986755, "learning_rate": 0.00013867276887871854, "loss": 3.572, "step": 4387 }, { "epoch": 5.61184, "grad_norm": 0.4981113374233246, "learning_rate": 0.00013863238659308114, "loss": 3.5428, "step": 4388 }, { "epoch": 5.61312, "grad_norm": 0.4911549389362335, "learning_rate": 0.0001385920043074438, "loss": 3.5261, "step": 4389 }, { "epoch": 5.6144, "grad_norm": 0.5160556435585022, "learning_rate": 0.00013855162202180643, "loss": 3.5217, "step": 4390 }, { "epoch": 5.61568, "grad_norm": 0.4965737760066986, "learning_rate": 0.00013851123973616906, "loss": 3.5228, "step": 4391 }, { "epoch": 5.61696, "grad_norm": 0.49209117889404297, "learning_rate": 0.0001384708574505317, "loss": 3.5664, "step": 4392 }, { "epoch": 5.61824, "grad_norm": 0.4959764778614044, "learning_rate": 0.00013843047516489432, "loss": 3.6293, "step": 4393 }, { "epoch": 5.61952, "grad_norm": 0.5043753981590271, "learning_rate": 0.00013839009287925695, "loss": 3.4969, "step": 4394 }, { "epoch": 5.6208, "grad_norm": 0.5141623020172119, "learning_rate": 0.0001383497105936196, "loss": 3.5033, "step": 4395 }, { "epoch": 5.62208, "grad_norm": 0.4955049753189087, "learning_rate": 0.0001383093283079822, "loss": 3.5877, "step": 4396 }, { "epoch": 5.62336, "grad_norm": 0.4988468289375305, "learning_rate": 0.00013826894602234484, "loss": 3.5641, "step": 4397 }, { "epoch": 5.62464, "grad_norm": 0.49312546849250793, "learning_rate": 0.0001382285637367075, "loss": 3.5163, "step": 4398 }, { "epoch": 5.62592, "grad_norm": 0.49335968494415283, "learning_rate": 0.00013818818145107012, "loss": 3.5558, "step": 4399 }, { "epoch": 5.6272, "grad_norm": 0.49063315987586975, "learning_rate": 0.00013814779916543275, "loss": 3.6151, "step": 4400 }, { "epoch": 5.62848, "grad_norm": 0.48111316561698914, "learning_rate": 0.00013810741687979538, "loss": 3.5394, "step": 4401 }, { "epoch": 5.62976, "grad_norm": 0.4941888153553009, "learning_rate": 0.00013806703459415801, "loss": 3.5944, "step": 4402 }, { "epoch": 5.6310400000000005, "grad_norm": 0.5081154704093933, "learning_rate": 0.00013802665230852064, "loss": 3.546, "step": 4403 }, { "epoch": 5.63232, "grad_norm": 0.49409082531929016, "learning_rate": 0.0001379862700228833, "loss": 3.6016, "step": 4404 }, { "epoch": 5.6336, "grad_norm": 0.4921821355819702, "learning_rate": 0.0001379458877372459, "loss": 3.56, "step": 4405 }, { "epoch": 5.63488, "grad_norm": 0.5267976522445679, "learning_rate": 0.00013790550545160856, "loss": 3.5694, "step": 4406 }, { "epoch": 5.63616, "grad_norm": 0.4959132671356201, "learning_rate": 0.0001378651231659712, "loss": 3.5944, "step": 4407 }, { "epoch": 5.63744, "grad_norm": 0.4983578622341156, "learning_rate": 0.00013782474088033382, "loss": 3.514, "step": 4408 }, { "epoch": 5.63872, "grad_norm": 0.5071173310279846, "learning_rate": 0.00013778435859469645, "loss": 3.5079, "step": 4409 }, { "epoch": 5.64, "grad_norm": 0.4979630410671234, "learning_rate": 0.00013774397630905908, "loss": 3.5649, "step": 4410 }, { "epoch": 5.64128, "grad_norm": 0.48886778950691223, "learning_rate": 0.0001377035940234217, "loss": 3.5758, "step": 4411 }, { "epoch": 5.64256, "grad_norm": 0.5049862265586853, "learning_rate": 0.00013766321173778434, "loss": 3.5894, "step": 4412 }, { "epoch": 5.64384, "grad_norm": 0.4826545715332031, "learning_rate": 0.00013762282945214697, "loss": 3.6285, "step": 4413 }, { "epoch": 5.64512, "grad_norm": 0.5009142160415649, "learning_rate": 0.0001375824471665096, "loss": 3.5911, "step": 4414 }, { "epoch": 5.6464, "grad_norm": 0.49813807010650635, "learning_rate": 0.00013754206488087226, "loss": 3.5388, "step": 4415 }, { "epoch": 5.64768, "grad_norm": 0.48648300766944885, "learning_rate": 0.00013750168259523486, "loss": 3.5231, "step": 4416 }, { "epoch": 5.64896, "grad_norm": 0.5059703588485718, "learning_rate": 0.00013746130030959752, "loss": 3.5606, "step": 4417 }, { "epoch": 5.65024, "grad_norm": 0.49821940064430237, "learning_rate": 0.00013742091802396015, "loss": 3.5415, "step": 4418 }, { "epoch": 5.65152, "grad_norm": 0.5076292157173157, "learning_rate": 0.00013738053573832278, "loss": 3.5893, "step": 4419 }, { "epoch": 5.6528, "grad_norm": 0.5033696293830872, "learning_rate": 0.0001373401534526854, "loss": 3.5241, "step": 4420 }, { "epoch": 5.65408, "grad_norm": 0.49491533637046814, "learning_rate": 0.00013729977116704804, "loss": 3.5782, "step": 4421 }, { "epoch": 5.65536, "grad_norm": 0.5117203593254089, "learning_rate": 0.00013725938888141067, "loss": 3.5175, "step": 4422 }, { "epoch": 5.65664, "grad_norm": 0.5070961117744446, "learning_rate": 0.0001372190065957733, "loss": 3.5301, "step": 4423 }, { "epoch": 5.65792, "grad_norm": 0.5147043466567993, "learning_rate": 0.00013717862431013593, "loss": 3.5733, "step": 4424 }, { "epoch": 5.6592, "grad_norm": 0.5103664398193359, "learning_rate": 0.00013713824202449856, "loss": 3.6548, "step": 4425 }, { "epoch": 5.66048, "grad_norm": 0.4940325915813446, "learning_rate": 0.00013709785973886122, "loss": 3.6025, "step": 4426 }, { "epoch": 5.66176, "grad_norm": 0.49647340178489685, "learning_rate": 0.00013705747745322385, "loss": 3.4952, "step": 4427 }, { "epoch": 5.66304, "grad_norm": 0.5108480453491211, "learning_rate": 0.00013701709516758648, "loss": 3.6175, "step": 4428 }, { "epoch": 5.66432, "grad_norm": 0.5075586438179016, "learning_rate": 0.0001369767128819491, "loss": 3.5372, "step": 4429 }, { "epoch": 5.6655999999999995, "grad_norm": 0.49585655331611633, "learning_rate": 0.00013693633059631174, "loss": 3.5006, "step": 4430 }, { "epoch": 5.66688, "grad_norm": 0.5268029570579529, "learning_rate": 0.00013689594831067437, "loss": 3.5752, "step": 4431 }, { "epoch": 5.66816, "grad_norm": 0.5017437934875488, "learning_rate": 0.000136855566025037, "loss": 3.5115, "step": 4432 }, { "epoch": 5.66944, "grad_norm": 0.5259951949119568, "learning_rate": 0.00013681518373939963, "loss": 3.5269, "step": 4433 }, { "epoch": 5.67072, "grad_norm": 0.49649110436439514, "learning_rate": 0.00013677480145376229, "loss": 3.5358, "step": 4434 }, { "epoch": 5.672, "grad_norm": 0.5151202082633972, "learning_rate": 0.00013673441916812492, "loss": 3.5663, "step": 4435 }, { "epoch": 5.67328, "grad_norm": 0.5322958827018738, "learning_rate": 0.00013669403688248752, "loss": 3.5762, "step": 4436 }, { "epoch": 5.67456, "grad_norm": 0.505379319190979, "learning_rate": 0.00013665365459685018, "loss": 3.5684, "step": 4437 }, { "epoch": 5.67584, "grad_norm": 0.5019539594650269, "learning_rate": 0.0001366132723112128, "loss": 3.5522, "step": 4438 }, { "epoch": 5.67712, "grad_norm": 0.5176437497138977, "learning_rate": 0.00013657289002557544, "loss": 3.5629, "step": 4439 }, { "epoch": 5.6784, "grad_norm": 0.4964632987976074, "learning_rate": 0.00013653250773993807, "loss": 3.6072, "step": 4440 }, { "epoch": 5.67968, "grad_norm": 0.49535518884658813, "learning_rate": 0.0001364921254543007, "loss": 3.6126, "step": 4441 }, { "epoch": 5.68096, "grad_norm": 0.4958983063697815, "learning_rate": 0.00013645174316866333, "loss": 3.5765, "step": 4442 }, { "epoch": 5.68224, "grad_norm": 0.4932209253311157, "learning_rate": 0.00013641136088302598, "loss": 3.5613, "step": 4443 }, { "epoch": 5.68352, "grad_norm": 0.48881053924560547, "learning_rate": 0.00013637097859738859, "loss": 3.5207, "step": 4444 }, { "epoch": 5.6848, "grad_norm": 0.4943525791168213, "learning_rate": 0.00013633059631175124, "loss": 3.616, "step": 4445 }, { "epoch": 5.6860800000000005, "grad_norm": 0.504132091999054, "learning_rate": 0.00013629021402611387, "loss": 3.5652, "step": 4446 }, { "epoch": 5.68736, "grad_norm": 0.5116013288497925, "learning_rate": 0.0001362498317404765, "loss": 3.5694, "step": 4447 }, { "epoch": 5.68864, "grad_norm": 0.49192342162132263, "learning_rate": 0.00013620944945483913, "loss": 3.5206, "step": 4448 }, { "epoch": 5.68992, "grad_norm": 0.49895188212394714, "learning_rate": 0.00013616906716920176, "loss": 3.5208, "step": 4449 }, { "epoch": 5.6912, "grad_norm": 0.5205003619194031, "learning_rate": 0.0001361286848835644, "loss": 3.6103, "step": 4450 }, { "epoch": 5.69248, "grad_norm": 0.5131163001060486, "learning_rate": 0.00013608830259792702, "loss": 3.5963, "step": 4451 }, { "epoch": 5.69376, "grad_norm": 0.4999935030937195, "learning_rate": 0.00013604792031228965, "loss": 3.5195, "step": 4452 }, { "epoch": 5.69504, "grad_norm": 0.5080338716506958, "learning_rate": 0.00013600753802665228, "loss": 3.5515, "step": 4453 }, { "epoch": 5.69632, "grad_norm": 0.5050440430641174, "learning_rate": 0.00013596715574101494, "loss": 3.5676, "step": 4454 }, { "epoch": 5.6975999999999996, "grad_norm": 0.4881718158721924, "learning_rate": 0.00013592677345537757, "loss": 3.5505, "step": 4455 }, { "epoch": 5.69888, "grad_norm": 0.5083419680595398, "learning_rate": 0.0001358863911697402, "loss": 3.5002, "step": 4456 }, { "epoch": 5.70016, "grad_norm": 0.5217411518096924, "learning_rate": 0.00013584600888410283, "loss": 3.5094, "step": 4457 }, { "epoch": 5.70144, "grad_norm": 0.4996829628944397, "learning_rate": 0.00013580562659846546, "loss": 3.6286, "step": 4458 }, { "epoch": 5.70272, "grad_norm": 0.5101032853126526, "learning_rate": 0.0001357652443128281, "loss": 3.5412, "step": 4459 }, { "epoch": 5.704, "grad_norm": 0.5019393563270569, "learning_rate": 0.00013572486202719072, "loss": 3.5845, "step": 4460 }, { "epoch": 5.70528, "grad_norm": 0.5044521689414978, "learning_rate": 0.00013568447974155335, "loss": 3.633, "step": 4461 }, { "epoch": 5.70656, "grad_norm": 0.5104800462722778, "learning_rate": 0.000135644097455916, "loss": 3.5554, "step": 4462 }, { "epoch": 5.70784, "grad_norm": 0.4980311989784241, "learning_rate": 0.00013560371517027864, "loss": 3.5151, "step": 4463 }, { "epoch": 5.70912, "grad_norm": 0.501975417137146, "learning_rate": 0.00013556333288464124, "loss": 3.5999, "step": 4464 }, { "epoch": 5.7104, "grad_norm": 0.4794447422027588, "learning_rate": 0.0001355229505990039, "loss": 3.5524, "step": 4465 }, { "epoch": 5.71168, "grad_norm": 0.5068936944007874, "learning_rate": 0.00013548256831336653, "loss": 3.5434, "step": 4466 }, { "epoch": 5.71296, "grad_norm": 0.5029814839363098, "learning_rate": 0.00013544218602772916, "loss": 3.531, "step": 4467 }, { "epoch": 5.71424, "grad_norm": 0.48336413502693176, "learning_rate": 0.0001354018037420918, "loss": 3.5022, "step": 4468 }, { "epoch": 5.71552, "grad_norm": 0.4998653829097748, "learning_rate": 0.00013536142145645442, "loss": 3.6426, "step": 4469 }, { "epoch": 5.7168, "grad_norm": 0.5224689841270447, "learning_rate": 0.00013532103917081705, "loss": 3.6244, "step": 4470 }, { "epoch": 5.7180800000000005, "grad_norm": 0.49238261580467224, "learning_rate": 0.0001352806568851797, "loss": 3.5419, "step": 4471 }, { "epoch": 5.71936, "grad_norm": 0.5162872672080994, "learning_rate": 0.0001352402745995423, "loss": 3.5593, "step": 4472 }, { "epoch": 5.7206399999999995, "grad_norm": 0.4984869658946991, "learning_rate": 0.00013519989231390497, "loss": 3.5436, "step": 4473 }, { "epoch": 5.72192, "grad_norm": 0.49168428778648376, "learning_rate": 0.0001351595100282676, "loss": 3.5536, "step": 4474 }, { "epoch": 5.7232, "grad_norm": 0.47858181595802307, "learning_rate": 0.00013511912774263023, "loss": 3.5043, "step": 4475 }, { "epoch": 5.72448, "grad_norm": 0.5016847848892212, "learning_rate": 0.00013507874545699286, "loss": 3.6073, "step": 4476 }, { "epoch": 5.72576, "grad_norm": 0.5029941201210022, "learning_rate": 0.0001350383631713555, "loss": 3.6127, "step": 4477 }, { "epoch": 5.72704, "grad_norm": 0.48141196370124817, "learning_rate": 0.00013499798088571812, "loss": 3.574, "step": 4478 }, { "epoch": 5.72832, "grad_norm": 0.4969176948070526, "learning_rate": 0.00013495759860008075, "loss": 3.5789, "step": 4479 }, { "epoch": 5.7296, "grad_norm": 0.48834389448165894, "learning_rate": 0.00013491721631444338, "loss": 3.5863, "step": 4480 }, { "epoch": 5.73088, "grad_norm": 0.49428460001945496, "learning_rate": 0.000134876834028806, "loss": 3.5619, "step": 4481 }, { "epoch": 5.73216, "grad_norm": 0.49237480759620667, "learning_rate": 0.00013483645174316866, "loss": 3.5791, "step": 4482 }, { "epoch": 5.73344, "grad_norm": 0.5003536343574524, "learning_rate": 0.00013479606945753127, "loss": 3.595, "step": 4483 }, { "epoch": 5.73472, "grad_norm": 0.4975607395172119, "learning_rate": 0.00013475568717189392, "loss": 3.5885, "step": 4484 }, { "epoch": 5.736, "grad_norm": 0.5209734439849854, "learning_rate": 0.00013471530488625655, "loss": 3.5795, "step": 4485 }, { "epoch": 5.73728, "grad_norm": 0.49605241417884827, "learning_rate": 0.00013467492260061918, "loss": 3.5311, "step": 4486 }, { "epoch": 5.73856, "grad_norm": 0.4768206775188446, "learning_rate": 0.00013463454031498181, "loss": 3.556, "step": 4487 }, { "epoch": 5.73984, "grad_norm": 0.4915754199028015, "learning_rate": 0.00013459415802934444, "loss": 3.6062, "step": 4488 }, { "epoch": 5.7411200000000004, "grad_norm": 0.4907207489013672, "learning_rate": 0.00013455377574370707, "loss": 3.5279, "step": 4489 }, { "epoch": 5.7424, "grad_norm": 0.4975474178791046, "learning_rate": 0.0001345133934580697, "loss": 3.5248, "step": 4490 }, { "epoch": 5.74368, "grad_norm": 0.481803834438324, "learning_rate": 0.00013447301117243236, "loss": 3.578, "step": 4491 }, { "epoch": 5.74496, "grad_norm": 0.48836055397987366, "learning_rate": 0.00013443262888679497, "loss": 3.5618, "step": 4492 }, { "epoch": 5.74624, "grad_norm": 0.48223766684532166, "learning_rate": 0.00013439224660115762, "loss": 3.5529, "step": 4493 }, { "epoch": 5.74752, "grad_norm": 0.47637295722961426, "learning_rate": 0.00013435186431552025, "loss": 3.5735, "step": 4494 }, { "epoch": 5.7488, "grad_norm": 0.48114699125289917, "learning_rate": 0.00013431148202988288, "loss": 3.51, "step": 4495 }, { "epoch": 5.75008, "grad_norm": 0.49171772599220276, "learning_rate": 0.0001342710997442455, "loss": 3.6133, "step": 4496 }, { "epoch": 5.75136, "grad_norm": 0.49519824981689453, "learning_rate": 0.00013423071745860814, "loss": 3.5715, "step": 4497 }, { "epoch": 5.7526399999999995, "grad_norm": 0.4867844879627228, "learning_rate": 0.00013419033517297077, "loss": 3.5635, "step": 4498 }, { "epoch": 5.75392, "grad_norm": 0.4920177161693573, "learning_rate": 0.00013414995288733343, "loss": 3.5152, "step": 4499 }, { "epoch": 5.7552, "grad_norm": 0.4885466396808624, "learning_rate": 0.00013410957060169603, "loss": 3.5387, "step": 4500 }, { "epoch": 5.75648, "grad_norm": 0.49109700322151184, "learning_rate": 0.0001340691883160587, "loss": 3.4059, "step": 4501 }, { "epoch": 5.75776, "grad_norm": 0.4847142696380615, "learning_rate": 0.00013402880603042132, "loss": 3.4758, "step": 4502 }, { "epoch": 5.75904, "grad_norm": 0.48357903957366943, "learning_rate": 0.00013398842374478392, "loss": 3.4209, "step": 4503 }, { "epoch": 5.76032, "grad_norm": 0.4984717071056366, "learning_rate": 0.00013394804145914658, "loss": 3.4487, "step": 4504 }, { "epoch": 5.7616, "grad_norm": 0.4831765592098236, "learning_rate": 0.0001339076591735092, "loss": 3.4414, "step": 4505 }, { "epoch": 5.76288, "grad_norm": 0.5067592263221741, "learning_rate": 0.00013386727688787184, "loss": 3.4324, "step": 4506 }, { "epoch": 5.76416, "grad_norm": 0.496747225522995, "learning_rate": 0.00013382689460223447, "loss": 3.5664, "step": 4507 }, { "epoch": 5.76544, "grad_norm": 0.4953940808773041, "learning_rate": 0.0001337865123165971, "loss": 3.425, "step": 4508 }, { "epoch": 5.76672, "grad_norm": 0.5052168369293213, "learning_rate": 0.00013374613003095973, "loss": 3.3923, "step": 4509 }, { "epoch": 5.768, "grad_norm": 0.49418163299560547, "learning_rate": 0.0001337057477453224, "loss": 3.435, "step": 4510 }, { "epoch": 5.76928, "grad_norm": 0.5005443096160889, "learning_rate": 0.000133665365459685, "loss": 3.4357, "step": 4511 }, { "epoch": 5.77056, "grad_norm": 0.4933774471282959, "learning_rate": 0.00013362498317404765, "loss": 3.4941, "step": 4512 }, { "epoch": 5.77184, "grad_norm": 0.49201011657714844, "learning_rate": 0.00013358460088841028, "loss": 3.4382, "step": 4513 }, { "epoch": 5.7731200000000005, "grad_norm": 0.4911024570465088, "learning_rate": 0.0001335442186027729, "loss": 3.4343, "step": 4514 }, { "epoch": 5.7744, "grad_norm": 0.4980873167514801, "learning_rate": 0.00013350383631713554, "loss": 3.4233, "step": 4515 }, { "epoch": 5.77568, "grad_norm": 0.4919237792491913, "learning_rate": 0.00013346345403149817, "loss": 3.5077, "step": 4516 }, { "epoch": 5.77696, "grad_norm": 0.5091717839241028, "learning_rate": 0.0001334230717458608, "loss": 3.3742, "step": 4517 }, { "epoch": 5.77824, "grad_norm": 0.49231696128845215, "learning_rate": 0.00013338268946022343, "loss": 3.4706, "step": 4518 }, { "epoch": 5.77952, "grad_norm": 0.4917304813861847, "learning_rate": 0.00013334230717458606, "loss": 3.4684, "step": 4519 }, { "epoch": 5.7808, "grad_norm": 0.5030657649040222, "learning_rate": 0.0001333019248889487, "loss": 3.4561, "step": 4520 }, { "epoch": 5.78208, "grad_norm": 0.5002293586730957, "learning_rate": 0.00013326154260331135, "loss": 3.4811, "step": 4521 }, { "epoch": 5.78336, "grad_norm": 0.49681586027145386, "learning_rate": 0.00013322116031767398, "loss": 3.4188, "step": 4522 }, { "epoch": 5.78464, "grad_norm": 0.4821189045906067, "learning_rate": 0.0001331807780320366, "loss": 3.3997, "step": 4523 }, { "epoch": 5.78592, "grad_norm": 0.4861787259578705, "learning_rate": 0.00013314039574639924, "loss": 3.4388, "step": 4524 }, { "epoch": 5.7872, "grad_norm": 0.48813995718955994, "learning_rate": 0.00013310001346076187, "loss": 3.4686, "step": 4525 }, { "epoch": 5.78848, "grad_norm": 0.4976069927215576, "learning_rate": 0.0001330596311751245, "loss": 3.4327, "step": 4526 }, { "epoch": 5.78976, "grad_norm": 0.5135701298713684, "learning_rate": 0.00013301924888948715, "loss": 3.4294, "step": 4527 }, { "epoch": 5.79104, "grad_norm": 0.5241384506225586, "learning_rate": 0.00013297886660384976, "loss": 3.5, "step": 4528 }, { "epoch": 5.79232, "grad_norm": 0.523514449596405, "learning_rate": 0.0001329384843182124, "loss": 3.4332, "step": 4529 }, { "epoch": 5.7936, "grad_norm": 0.49370086193084717, "learning_rate": 0.00013289810203257504, "loss": 3.4522, "step": 4530 }, { "epoch": 5.79488, "grad_norm": 0.5221703052520752, "learning_rate": 0.00013285771974693765, "loss": 3.4623, "step": 4531 }, { "epoch": 5.79616, "grad_norm": 0.5171593427658081, "learning_rate": 0.0001328173374613003, "loss": 3.5817, "step": 4532 }, { "epoch": 5.79744, "grad_norm": 0.5053053498268127, "learning_rate": 0.00013277695517566293, "loss": 3.4717, "step": 4533 }, { "epoch": 5.79872, "grad_norm": 0.5156946182250977, "learning_rate": 0.00013273657289002556, "loss": 3.4493, "step": 4534 }, { "epoch": 5.8, "grad_norm": 0.5175525546073914, "learning_rate": 0.0001326961906043882, "loss": 3.4689, "step": 4535 }, { "epoch": 5.80128, "grad_norm": 0.4784032106399536, "learning_rate": 0.00013265580831875082, "loss": 3.3858, "step": 4536 }, { "epoch": 5.80256, "grad_norm": 0.49603456258773804, "learning_rate": 0.00013261542603311345, "loss": 3.4567, "step": 4537 }, { "epoch": 5.80384, "grad_norm": 0.512304425239563, "learning_rate": 0.0001325750437474761, "loss": 3.4817, "step": 4538 }, { "epoch": 5.80512, "grad_norm": 0.5141502618789673, "learning_rate": 0.00013253466146183871, "loss": 3.3569, "step": 4539 }, { "epoch": 5.8064, "grad_norm": 0.5332363843917847, "learning_rate": 0.00013249427917620137, "loss": 3.4232, "step": 4540 }, { "epoch": 5.8076799999999995, "grad_norm": 0.5129743814468384, "learning_rate": 0.000132453896890564, "loss": 3.5049, "step": 4541 }, { "epoch": 5.80896, "grad_norm": 0.5121658444404602, "learning_rate": 0.00013241351460492663, "loss": 3.3905, "step": 4542 }, { "epoch": 5.81024, "grad_norm": 0.5085654854774475, "learning_rate": 0.00013237313231928926, "loss": 3.4257, "step": 4543 }, { "epoch": 5.81152, "grad_norm": 0.5045654773712158, "learning_rate": 0.0001323327500336519, "loss": 3.4154, "step": 4544 }, { "epoch": 5.8128, "grad_norm": 0.49208593368530273, "learning_rate": 0.00013229236774801452, "loss": 3.4489, "step": 4545 }, { "epoch": 5.81408, "grad_norm": 0.5095329880714417, "learning_rate": 0.00013225198546237715, "loss": 3.4906, "step": 4546 }, { "epoch": 5.81536, "grad_norm": 0.5090464353561401, "learning_rate": 0.00013221160317673978, "loss": 3.4252, "step": 4547 }, { "epoch": 5.81664, "grad_norm": 0.5093858242034912, "learning_rate": 0.0001321712208911024, "loss": 3.4324, "step": 4548 }, { "epoch": 5.81792, "grad_norm": 0.5184779167175293, "learning_rate": 0.00013213083860546507, "loss": 3.4721, "step": 4549 }, { "epoch": 5.8192, "grad_norm": 0.5006645321846008, "learning_rate": 0.0001320904563198277, "loss": 3.4371, "step": 4550 }, { "epoch": 5.82048, "grad_norm": 0.5141642689704895, "learning_rate": 0.00013205007403419033, "loss": 3.3902, "step": 4551 }, { "epoch": 5.82176, "grad_norm": 0.5118807554244995, "learning_rate": 0.00013200969174855296, "loss": 3.4246, "step": 4552 }, { "epoch": 5.82304, "grad_norm": 0.5281966924667358, "learning_rate": 0.0001319693094629156, "loss": 3.5053, "step": 4553 }, { "epoch": 5.82432, "grad_norm": 0.5333678126335144, "learning_rate": 0.00013192892717727822, "loss": 3.4423, "step": 4554 }, { "epoch": 5.8256, "grad_norm": 0.5082951188087463, "learning_rate": 0.00013188854489164085, "loss": 3.4716, "step": 4555 }, { "epoch": 5.82688, "grad_norm": 0.5443100929260254, "learning_rate": 0.00013184816260600348, "loss": 3.4103, "step": 4556 }, { "epoch": 5.8281600000000005, "grad_norm": 0.5330334901809692, "learning_rate": 0.0001318077803203661, "loss": 3.4818, "step": 4557 }, { "epoch": 5.82944, "grad_norm": 0.525781512260437, "learning_rate": 0.00013176739803472877, "loss": 3.5291, "step": 4558 }, { "epoch": 5.83072, "grad_norm": 0.5201729536056519, "learning_rate": 0.00013172701574909137, "loss": 3.5319, "step": 4559 }, { "epoch": 5.832, "grad_norm": 0.5168731212615967, "learning_rate": 0.00013168663346345403, "loss": 3.3785, "step": 4560 }, { "epoch": 5.83328, "grad_norm": 0.5290531516075134, "learning_rate": 0.00013164625117781666, "loss": 3.4085, "step": 4561 }, { "epoch": 5.83456, "grad_norm": 0.5436719059944153, "learning_rate": 0.0001316058688921793, "loss": 3.5384, "step": 4562 }, { "epoch": 5.83584, "grad_norm": 0.5178592801094055, "learning_rate": 0.00013156548660654192, "loss": 3.3646, "step": 4563 }, { "epoch": 5.83712, "grad_norm": 0.5080958604812622, "learning_rate": 0.00013152510432090455, "loss": 3.4061, "step": 4564 }, { "epoch": 5.8384, "grad_norm": 0.52479487657547, "learning_rate": 0.00013148472203526718, "loss": 3.5095, "step": 4565 }, { "epoch": 5.8396799999999995, "grad_norm": 0.531376302242279, "learning_rate": 0.00013144433974962983, "loss": 3.4566, "step": 4566 }, { "epoch": 5.84096, "grad_norm": 0.5431889891624451, "learning_rate": 0.00013140395746399244, "loss": 3.508, "step": 4567 }, { "epoch": 5.84224, "grad_norm": 0.5298098921775818, "learning_rate": 0.0001313635751783551, "loss": 3.4721, "step": 4568 }, { "epoch": 5.84352, "grad_norm": 0.5101543664932251, "learning_rate": 0.00013132319289271772, "loss": 3.4789, "step": 4569 }, { "epoch": 5.8448, "grad_norm": 0.5432095527648926, "learning_rate": 0.00013128281060708033, "loss": 3.3721, "step": 4570 }, { "epoch": 5.84608, "grad_norm": 0.5078821778297424, "learning_rate": 0.00013124242832144299, "loss": 3.4997, "step": 4571 }, { "epoch": 5.84736, "grad_norm": 0.5241081118583679, "learning_rate": 0.00013120204603580562, "loss": 3.3372, "step": 4572 }, { "epoch": 5.84864, "grad_norm": 0.5384151935577393, "learning_rate": 0.00013116166375016825, "loss": 3.5231, "step": 4573 }, { "epoch": 5.84992, "grad_norm": 0.5285201072692871, "learning_rate": 0.00013112128146453088, "loss": 3.4693, "step": 4574 }, { "epoch": 5.8512, "grad_norm": 0.5212448239326477, "learning_rate": 0.0001310808991788935, "loss": 3.4739, "step": 4575 }, { "epoch": 5.85248, "grad_norm": 0.5330734252929688, "learning_rate": 0.00013104051689325614, "loss": 3.4567, "step": 4576 }, { "epoch": 5.85376, "grad_norm": 0.5170498490333557, "learning_rate": 0.0001310001346076188, "loss": 3.4205, "step": 4577 }, { "epoch": 5.85504, "grad_norm": 0.5200040340423584, "learning_rate": 0.00013095975232198142, "loss": 3.5097, "step": 4578 }, { "epoch": 5.85632, "grad_norm": 0.5016375184059143, "learning_rate": 0.00013091937003634405, "loss": 3.4698, "step": 4579 }, { "epoch": 5.8576, "grad_norm": 0.5077860951423645, "learning_rate": 0.00013087898775070668, "loss": 3.4589, "step": 4580 }, { "epoch": 5.85888, "grad_norm": 0.4943268597126007, "learning_rate": 0.0001308386054650693, "loss": 3.4539, "step": 4581 }, { "epoch": 5.8601600000000005, "grad_norm": 0.5271481871604919, "learning_rate": 0.00013079822317943194, "loss": 3.4382, "step": 4582 }, { "epoch": 5.86144, "grad_norm": 0.5146161913871765, "learning_rate": 0.00013075784089379457, "loss": 3.4512, "step": 4583 }, { "epoch": 5.86272, "grad_norm": 0.4986138343811035, "learning_rate": 0.0001307174586081572, "loss": 3.4464, "step": 4584 }, { "epoch": 5.864, "grad_norm": 0.5161074995994568, "learning_rate": 0.00013067707632251983, "loss": 3.4749, "step": 4585 }, { "epoch": 5.86528, "grad_norm": 0.5155913829803467, "learning_rate": 0.0001306366940368825, "loss": 3.3951, "step": 4586 }, { "epoch": 5.86656, "grad_norm": 0.511010468006134, "learning_rate": 0.0001305963117512451, "loss": 3.5491, "step": 4587 }, { "epoch": 5.86784, "grad_norm": 0.5245579481124878, "learning_rate": 0.00013055592946560775, "loss": 3.4911, "step": 4588 }, { "epoch": 5.86912, "grad_norm": 0.5299880504608154, "learning_rate": 0.00013051554717997038, "loss": 3.5118, "step": 4589 }, { "epoch": 5.8704, "grad_norm": 0.5251659154891968, "learning_rate": 0.000130475164894333, "loss": 3.4269, "step": 4590 }, { "epoch": 5.87168, "grad_norm": 0.5328729748725891, "learning_rate": 0.00013043478260869564, "loss": 3.4694, "step": 4591 }, { "epoch": 5.87296, "grad_norm": 0.5197456479072571, "learning_rate": 0.00013039440032305827, "loss": 3.4146, "step": 4592 }, { "epoch": 5.87424, "grad_norm": 0.5089783668518066, "learning_rate": 0.0001303540180374209, "loss": 3.4786, "step": 4593 }, { "epoch": 5.87552, "grad_norm": 0.5036661624908447, "learning_rate": 0.00013031363575178356, "loss": 3.494, "step": 4594 }, { "epoch": 5.8768, "grad_norm": 0.5152899622917175, "learning_rate": 0.00013027325346614616, "loss": 3.4804, "step": 4595 }, { "epoch": 5.87808, "grad_norm": 0.5099830031394958, "learning_rate": 0.00013023287118050882, "loss": 3.5013, "step": 4596 }, { "epoch": 5.87936, "grad_norm": 0.5049211382865906, "learning_rate": 0.00013019248889487145, "loss": 3.4966, "step": 4597 }, { "epoch": 5.88064, "grad_norm": 0.5128461122512817, "learning_rate": 0.00013015210660923405, "loss": 3.5043, "step": 4598 }, { "epoch": 5.88192, "grad_norm": 0.5299509763717651, "learning_rate": 0.0001301117243235967, "loss": 3.5566, "step": 4599 }, { "epoch": 5.8832, "grad_norm": 0.49836984276771545, "learning_rate": 0.00013007134203795934, "loss": 3.4689, "step": 4600 }, { "epoch": 5.88448, "grad_norm": 0.5294039845466614, "learning_rate": 0.00013003095975232197, "loss": 3.417, "step": 4601 }, { "epoch": 5.88576, "grad_norm": 0.5025843381881714, "learning_rate": 0.0001299905774666846, "loss": 3.4296, "step": 4602 }, { "epoch": 5.88704, "grad_norm": 0.5195445418357849, "learning_rate": 0.00012995019518104723, "loss": 3.4922, "step": 4603 }, { "epoch": 5.88832, "grad_norm": 0.5041271448135376, "learning_rate": 0.00012990981289540986, "loss": 3.3655, "step": 4604 }, { "epoch": 5.8896, "grad_norm": 0.5099116563796997, "learning_rate": 0.00012986943060977252, "loss": 3.3968, "step": 4605 }, { "epoch": 5.89088, "grad_norm": 0.5230572819709778, "learning_rate": 0.00012982904832413512, "loss": 3.4775, "step": 4606 }, { "epoch": 5.89216, "grad_norm": 0.4989769458770752, "learning_rate": 0.00012978866603849778, "loss": 3.4765, "step": 4607 }, { "epoch": 5.89344, "grad_norm": 0.5254843831062317, "learning_rate": 0.0001297482837528604, "loss": 3.4976, "step": 4608 }, { "epoch": 5.8947199999999995, "grad_norm": 0.501315176486969, "learning_rate": 0.00012970790146722304, "loss": 3.3925, "step": 4609 }, { "epoch": 5.896, "grad_norm": 0.5095430016517639, "learning_rate": 0.00012966751918158567, "loss": 3.4135, "step": 4610 }, { "epoch": 5.89728, "grad_norm": 0.5203284025192261, "learning_rate": 0.0001296271368959483, "loss": 3.5653, "step": 4611 }, { "epoch": 5.89856, "grad_norm": 0.524929940700531, "learning_rate": 0.00012958675461031093, "loss": 3.4164, "step": 4612 }, { "epoch": 5.89984, "grad_norm": 0.49622589349746704, "learning_rate": 0.00012954637232467356, "loss": 3.4858, "step": 4613 }, { "epoch": 5.90112, "grad_norm": 0.5328811407089233, "learning_rate": 0.00012950599003903621, "loss": 3.4519, "step": 4614 }, { "epoch": 5.9024, "grad_norm": 0.5233384370803833, "learning_rate": 0.00012946560775339882, "loss": 3.4802, "step": 4615 }, { "epoch": 5.90368, "grad_norm": 0.5104706287384033, "learning_rate": 0.00012942522546776147, "loss": 3.503, "step": 4616 }, { "epoch": 5.90496, "grad_norm": 0.5111770033836365, "learning_rate": 0.0001293848431821241, "loss": 3.3882, "step": 4617 }, { "epoch": 5.90624, "grad_norm": 0.5340044498443604, "learning_rate": 0.00012934446089648673, "loss": 3.4865, "step": 4618 }, { "epoch": 5.90752, "grad_norm": 0.5179080367088318, "learning_rate": 0.00012930407861084936, "loss": 3.448, "step": 4619 }, { "epoch": 5.9088, "grad_norm": 0.5176510214805603, "learning_rate": 0.000129263696325212, "loss": 3.4914, "step": 4620 }, { "epoch": 5.91008, "grad_norm": 0.5105175375938416, "learning_rate": 0.00012922331403957462, "loss": 3.4427, "step": 4621 }, { "epoch": 5.91136, "grad_norm": 0.4958018362522125, "learning_rate": 0.00012918293175393728, "loss": 3.4691, "step": 4622 }, { "epoch": 5.91264, "grad_norm": 0.5175390839576721, "learning_rate": 0.00012914254946829988, "loss": 3.4176, "step": 4623 }, { "epoch": 5.91392, "grad_norm": 0.49366575479507446, "learning_rate": 0.00012910216718266251, "loss": 3.4056, "step": 4624 }, { "epoch": 5.9152000000000005, "grad_norm": 0.49612316489219666, "learning_rate": 0.00012906178489702517, "loss": 3.4146, "step": 4625 }, { "epoch": 5.91648, "grad_norm": 0.5153398513793945, "learning_rate": 0.00012902140261138777, "loss": 3.4182, "step": 4626 }, { "epoch": 5.91776, "grad_norm": 0.5095019936561584, "learning_rate": 0.00012898102032575043, "loss": 3.4108, "step": 4627 }, { "epoch": 5.91904, "grad_norm": 0.5022311210632324, "learning_rate": 0.00012894063804011306, "loss": 3.4513, "step": 4628 }, { "epoch": 5.92032, "grad_norm": 0.5156865119934082, "learning_rate": 0.0001289002557544757, "loss": 3.4506, "step": 4629 }, { "epoch": 5.9216, "grad_norm": 0.4920865595340729, "learning_rate": 0.00012885987346883832, "loss": 3.3831, "step": 4630 }, { "epoch": 5.92288, "grad_norm": 0.5026581883430481, "learning_rate": 0.00012881949118320095, "loss": 3.4406, "step": 4631 }, { "epoch": 5.92416, "grad_norm": 0.5237811803817749, "learning_rate": 0.00012877910889756358, "loss": 3.441, "step": 4632 }, { "epoch": 5.92544, "grad_norm": 0.5131553411483765, "learning_rate": 0.00012873872661192624, "loss": 3.4405, "step": 4633 }, { "epoch": 5.9267199999999995, "grad_norm": 0.5147265791893005, "learning_rate": 0.00012869834432628884, "loss": 3.4757, "step": 4634 }, { "epoch": 5.928, "grad_norm": 0.5250530242919922, "learning_rate": 0.0001286579620406515, "loss": 3.4194, "step": 4635 }, { "epoch": 5.92928, "grad_norm": 0.5036343336105347, "learning_rate": 0.00012861757975501413, "loss": 3.3476, "step": 4636 }, { "epoch": 5.93056, "grad_norm": 0.5144938826560974, "learning_rate": 0.00012857719746937676, "loss": 3.4407, "step": 4637 }, { "epoch": 5.93184, "grad_norm": 0.5358858704566956, "learning_rate": 0.0001285368151837394, "loss": 3.4875, "step": 4638 }, { "epoch": 5.93312, "grad_norm": 0.503612220287323, "learning_rate": 0.00012849643289810202, "loss": 3.4862, "step": 4639 }, { "epoch": 5.9344, "grad_norm": 0.5070048570632935, "learning_rate": 0.00012845605061246465, "loss": 3.4842, "step": 4640 }, { "epoch": 5.93568, "grad_norm": 0.5292816758155823, "learning_rate": 0.00012841566832682728, "loss": 3.4764, "step": 4641 }, { "epoch": 5.93696, "grad_norm": 0.5156516432762146, "learning_rate": 0.0001283752860411899, "loss": 3.4486, "step": 4642 }, { "epoch": 5.93824, "grad_norm": 0.5131589770317078, "learning_rate": 0.00012833490375555254, "loss": 3.4249, "step": 4643 }, { "epoch": 5.93952, "grad_norm": 0.5197781324386597, "learning_rate": 0.0001282945214699152, "loss": 3.4929, "step": 4644 }, { "epoch": 5.9408, "grad_norm": 0.5134233236312866, "learning_rate": 0.00012825413918427783, "loss": 3.445, "step": 4645 }, { "epoch": 5.94208, "grad_norm": 0.5193766355514526, "learning_rate": 0.00012821375689864046, "loss": 3.4006, "step": 4646 }, { "epoch": 5.94336, "grad_norm": 0.5153205990791321, "learning_rate": 0.0001281733746130031, "loss": 3.4529, "step": 4647 }, { "epoch": 5.94464, "grad_norm": 0.5201491713523865, "learning_rate": 0.00012813299232736572, "loss": 3.4742, "step": 4648 }, { "epoch": 5.94592, "grad_norm": 0.5122271180152893, "learning_rate": 0.00012809261004172835, "loss": 3.4414, "step": 4649 }, { "epoch": 5.9472000000000005, "grad_norm": 0.5078105926513672, "learning_rate": 0.000128052227756091, "loss": 3.3802, "step": 4650 }, { "epoch": 5.94848, "grad_norm": 0.5191426873207092, "learning_rate": 0.0001280118454704536, "loss": 3.3924, "step": 4651 }, { "epoch": 5.94976, "grad_norm": 0.511929452419281, "learning_rate": 0.00012797146318481624, "loss": 3.4573, "step": 4652 }, { "epoch": 5.95104, "grad_norm": 0.512454628944397, "learning_rate": 0.0001279310808991789, "loss": 3.4888, "step": 4653 }, { "epoch": 5.95232, "grad_norm": 0.5255744457244873, "learning_rate": 0.0001278906986135415, "loss": 3.5094, "step": 4654 }, { "epoch": 5.9536, "grad_norm": 0.5149945616722107, "learning_rate": 0.00012785031632790416, "loss": 3.4169, "step": 4655 }, { "epoch": 5.95488, "grad_norm": 0.507247269153595, "learning_rate": 0.00012780993404226679, "loss": 3.4484, "step": 4656 }, { "epoch": 5.95616, "grad_norm": 0.5141899585723877, "learning_rate": 0.00012776955175662942, "loss": 3.4004, "step": 4657 }, { "epoch": 5.95744, "grad_norm": 0.525027334690094, "learning_rate": 0.00012772916947099205, "loss": 3.5179, "step": 4658 }, { "epoch": 5.95872, "grad_norm": 0.5136873126029968, "learning_rate": 0.00012768878718535468, "loss": 3.4939, "step": 4659 }, { "epoch": 5.96, "grad_norm": 0.5179911255836487, "learning_rate": 0.0001276484048997173, "loss": 3.4725, "step": 4660 }, { "epoch": 5.96128, "grad_norm": 0.5283648371696472, "learning_rate": 0.00012760802261407996, "loss": 3.4843, "step": 4661 }, { "epoch": 5.96256, "grad_norm": 0.5253385305404663, "learning_rate": 0.00012756764032844257, "loss": 3.4472, "step": 4662 }, { "epoch": 5.96384, "grad_norm": 0.5267717242240906, "learning_rate": 0.00012752725804280522, "loss": 3.4637, "step": 4663 }, { "epoch": 5.96512, "grad_norm": 0.5325003266334534, "learning_rate": 0.00012748687575716785, "loss": 3.4575, "step": 4664 }, { "epoch": 5.9664, "grad_norm": 0.51334148645401, "learning_rate": 0.00012744649347153048, "loss": 3.4401, "step": 4665 }, { "epoch": 5.96768, "grad_norm": 0.5157260298728943, "learning_rate": 0.0001274061111858931, "loss": 3.4944, "step": 4666 }, { "epoch": 5.96896, "grad_norm": 0.5126465559005737, "learning_rate": 0.00012736572890025574, "loss": 3.4313, "step": 4667 }, { "epoch": 5.97024, "grad_norm": 0.5327089428901672, "learning_rate": 0.00012732534661461837, "loss": 3.4903, "step": 4668 }, { "epoch": 5.97152, "grad_norm": 0.5237742066383362, "learning_rate": 0.000127284964328981, "loss": 3.4409, "step": 4669 }, { "epoch": 5.9728, "grad_norm": 0.5220187306404114, "learning_rate": 0.00012724458204334363, "loss": 3.4712, "step": 4670 }, { "epoch": 5.97408, "grad_norm": 0.5200389623641968, "learning_rate": 0.00012720419975770626, "loss": 3.5174, "step": 4671 }, { "epoch": 5.97536, "grad_norm": 0.5262846946716309, "learning_rate": 0.00012716381747206892, "loss": 3.4454, "step": 4672 }, { "epoch": 5.97664, "grad_norm": 0.5050617456436157, "learning_rate": 0.00012712343518643155, "loss": 3.4899, "step": 4673 }, { "epoch": 5.97792, "grad_norm": 0.503463089466095, "learning_rate": 0.00012708305290079418, "loss": 3.4283, "step": 4674 }, { "epoch": 5.9792, "grad_norm": 0.5130547881126404, "learning_rate": 0.0001270426706151568, "loss": 3.4625, "step": 4675 }, { "epoch": 5.98048, "grad_norm": 0.5259498953819275, "learning_rate": 0.00012700228832951944, "loss": 3.491, "step": 4676 }, { "epoch": 5.9817599999999995, "grad_norm": 0.4981289803981781, "learning_rate": 0.00012696190604388207, "loss": 3.4928, "step": 4677 }, { "epoch": 5.98304, "grad_norm": 0.520211398601532, "learning_rate": 0.0001269215237582447, "loss": 3.4893, "step": 4678 }, { "epoch": 5.98432, "grad_norm": 0.5219488739967346, "learning_rate": 0.00012688114147260733, "loss": 3.4604, "step": 4679 }, { "epoch": 5.9856, "grad_norm": 0.5058456063270569, "learning_rate": 0.00012684075918696996, "loss": 3.4903, "step": 4680 }, { "epoch": 5.98688, "grad_norm": 0.5197397470474243, "learning_rate": 0.00012680037690133262, "loss": 3.493, "step": 4681 }, { "epoch": 5.98816, "grad_norm": 0.5335855484008789, "learning_rate": 0.00012675999461569522, "loss": 3.5079, "step": 4682 }, { "epoch": 5.98944, "grad_norm": 0.5145869255065918, "learning_rate": 0.00012671961233005788, "loss": 3.4, "step": 4683 }, { "epoch": 5.99072, "grad_norm": 0.5230121612548828, "learning_rate": 0.0001266792300444205, "loss": 3.4968, "step": 4684 }, { "epoch": 5.992, "grad_norm": 0.5136299729347229, "learning_rate": 0.00012663884775878314, "loss": 3.4992, "step": 4685 }, { "epoch": 5.99328, "grad_norm": 0.5114295482635498, "learning_rate": 0.00012659846547314577, "loss": 3.4265, "step": 4686 }, { "epoch": 5.99456, "grad_norm": 0.511169970035553, "learning_rate": 0.0001265580831875084, "loss": 3.4609, "step": 4687 }, { "epoch": 5.99584, "grad_norm": 0.5237197279930115, "learning_rate": 0.00012651770090187103, "loss": 3.4253, "step": 4688 }, { "epoch": 5.99712, "grad_norm": 0.5328317880630493, "learning_rate": 0.00012647731861623369, "loss": 3.3671, "step": 4689 }, { "epoch": 5.9984, "grad_norm": 0.5123398303985596, "learning_rate": 0.0001264369363305963, "loss": 3.4323, "step": 4690 }, { "epoch": 5.99968, "grad_norm": 0.5049529671669006, "learning_rate": 0.00012639655404495892, "loss": 3.4654, "step": 4691 }, { "epoch": 6.0, "grad_norm": 0.8919804692268372, "learning_rate": 0.00012635617175932158, "loss": 3.1534, "step": 4692 }, { "epoch": 6.00128, "grad_norm": 0.5235251188278198, "learning_rate": 0.00012631578947368418, "loss": 3.476, "step": 4693 }, { "epoch": 6.00256, "grad_norm": 0.5221917629241943, "learning_rate": 0.00012627540718804684, "loss": 3.4639, "step": 4694 }, { "epoch": 6.00384, "grad_norm": 0.5179303288459778, "learning_rate": 0.00012623502490240947, "loss": 3.4869, "step": 4695 }, { "epoch": 6.00512, "grad_norm": 0.509280264377594, "learning_rate": 0.0001261946426167721, "loss": 3.5512, "step": 4696 }, { "epoch": 6.0064, "grad_norm": 0.5337740182876587, "learning_rate": 0.00012615426033113473, "loss": 3.4508, "step": 4697 }, { "epoch": 6.00768, "grad_norm": 0.511637806892395, "learning_rate": 0.00012611387804549736, "loss": 3.4186, "step": 4698 }, { "epoch": 6.00896, "grad_norm": 0.5246609449386597, "learning_rate": 0.00012607349575986, "loss": 3.4857, "step": 4699 }, { "epoch": 6.01024, "grad_norm": 0.5246115922927856, "learning_rate": 0.00012603311347422264, "loss": 3.4306, "step": 4700 }, { "epoch": 6.01152, "grad_norm": 0.5044547915458679, "learning_rate": 0.00012599273118858527, "loss": 3.4457, "step": 4701 }, { "epoch": 6.0128, "grad_norm": 0.524234414100647, "learning_rate": 0.0001259523489029479, "loss": 3.4756, "step": 4702 }, { "epoch": 6.01408, "grad_norm": 0.5325637459754944, "learning_rate": 0.00012591196661731053, "loss": 3.4793, "step": 4703 }, { "epoch": 6.01536, "grad_norm": 0.5177270174026489, "learning_rate": 0.00012587158433167316, "loss": 3.4615, "step": 4704 }, { "epoch": 6.01664, "grad_norm": 0.5080181360244751, "learning_rate": 0.0001258312020460358, "loss": 3.4701, "step": 4705 }, { "epoch": 6.01792, "grad_norm": 0.5270411968231201, "learning_rate": 0.00012579081976039842, "loss": 3.4342, "step": 4706 }, { "epoch": 6.0192, "grad_norm": 0.526719868183136, "learning_rate": 0.00012575043747476105, "loss": 3.4343, "step": 4707 }, { "epoch": 6.02048, "grad_norm": 0.5216172933578491, "learning_rate": 0.00012571005518912368, "loss": 3.4759, "step": 4708 }, { "epoch": 6.0217600000000004, "grad_norm": 0.5187539458274841, "learning_rate": 0.00012566967290348634, "loss": 3.5129, "step": 4709 }, { "epoch": 6.02304, "grad_norm": 0.5328104496002197, "learning_rate": 0.00012562929061784894, "loss": 3.4285, "step": 4710 }, { "epoch": 6.02432, "grad_norm": 0.5354236364364624, "learning_rate": 0.0001255889083322116, "loss": 3.4189, "step": 4711 }, { "epoch": 6.0256, "grad_norm": 0.5221328139305115, "learning_rate": 0.00012554852604657423, "loss": 3.338, "step": 4712 }, { "epoch": 6.02688, "grad_norm": 0.5087037682533264, "learning_rate": 0.00012550814376093686, "loss": 3.4602, "step": 4713 }, { "epoch": 6.02816, "grad_norm": 0.5138629674911499, "learning_rate": 0.0001254677614752995, "loss": 3.4034, "step": 4714 }, { "epoch": 6.02944, "grad_norm": 0.5265618562698364, "learning_rate": 0.00012542737918966212, "loss": 3.4126, "step": 4715 }, { "epoch": 6.03072, "grad_norm": 0.5210886597633362, "learning_rate": 0.00012538699690402475, "loss": 3.4671, "step": 4716 }, { "epoch": 6.032, "grad_norm": 0.5192465782165527, "learning_rate": 0.0001253466146183874, "loss": 3.3955, "step": 4717 }, { "epoch": 6.03328, "grad_norm": 0.5116483569145203, "learning_rate": 0.00012530623233275, "loss": 3.4776, "step": 4718 }, { "epoch": 6.03456, "grad_norm": 0.5206137299537659, "learning_rate": 0.00012526585004711264, "loss": 3.4733, "step": 4719 }, { "epoch": 6.03584, "grad_norm": 0.523862361907959, "learning_rate": 0.0001252254677614753, "loss": 3.3922, "step": 4720 }, { "epoch": 6.03712, "grad_norm": 0.5030114054679871, "learning_rate": 0.0001251850854758379, "loss": 3.4266, "step": 4721 }, { "epoch": 6.0384, "grad_norm": 0.5357561111450195, "learning_rate": 0.00012514470319020056, "loss": 3.4608, "step": 4722 }, { "epoch": 6.03968, "grad_norm": 0.5137104392051697, "learning_rate": 0.0001251043209045632, "loss": 3.4066, "step": 4723 }, { "epoch": 6.04096, "grad_norm": 0.49622246623039246, "learning_rate": 0.00012506393861892582, "loss": 3.4098, "step": 4724 }, { "epoch": 6.04224, "grad_norm": 0.5154051780700684, "learning_rate": 0.00012502355633328845, "loss": 3.4739, "step": 4725 }, { "epoch": 6.04352, "grad_norm": 0.5417183041572571, "learning_rate": 0.00012498317404765108, "loss": 3.4566, "step": 4726 }, { "epoch": 6.0448, "grad_norm": 0.5043236613273621, "learning_rate": 0.0001249427917620137, "loss": 3.5022, "step": 4727 }, { "epoch": 6.04608, "grad_norm": 0.5146865248680115, "learning_rate": 0.00012490240947637637, "loss": 3.5059, "step": 4728 }, { "epoch": 6.04736, "grad_norm": 0.5209075212478638, "learning_rate": 0.00012486202719073897, "loss": 3.5078, "step": 4729 }, { "epoch": 6.04864, "grad_norm": 0.5176512002944946, "learning_rate": 0.00012482164490510163, "loss": 3.4633, "step": 4730 }, { "epoch": 6.04992, "grad_norm": 0.5079405903816223, "learning_rate": 0.00012478126261946426, "loss": 3.4449, "step": 4731 }, { "epoch": 6.0512, "grad_norm": 0.5069144368171692, "learning_rate": 0.0001247408803338269, "loss": 3.5183, "step": 4732 }, { "epoch": 6.05248, "grad_norm": 0.5130882263183594, "learning_rate": 0.00012470049804818952, "loss": 3.4276, "step": 4733 }, { "epoch": 6.05376, "grad_norm": 0.5110294222831726, "learning_rate": 0.00012466011576255215, "loss": 3.4404, "step": 4734 }, { "epoch": 6.05504, "grad_norm": 0.513787031173706, "learning_rate": 0.00012461973347691478, "loss": 3.4809, "step": 4735 }, { "epoch": 6.05632, "grad_norm": 0.4997158348560333, "learning_rate": 0.0001245793511912774, "loss": 3.4271, "step": 4736 }, { "epoch": 6.0576, "grad_norm": 0.5131158828735352, "learning_rate": 0.00012453896890564007, "loss": 3.4951, "step": 4737 }, { "epoch": 6.05888, "grad_norm": 0.511933445930481, "learning_rate": 0.00012449858662000267, "loss": 3.4584, "step": 4738 }, { "epoch": 6.06016, "grad_norm": 0.5187560319900513, "learning_rate": 0.00012445820433436533, "loss": 3.4039, "step": 4739 }, { "epoch": 6.06144, "grad_norm": 0.5163741111755371, "learning_rate": 0.00012441782204872796, "loss": 3.4948, "step": 4740 }, { "epoch": 6.06272, "grad_norm": 0.5106804370880127, "learning_rate": 0.00012437743976309059, "loss": 3.4327, "step": 4741 }, { "epoch": 6.064, "grad_norm": 0.5465947985649109, "learning_rate": 0.00012433705747745322, "loss": 3.5185, "step": 4742 }, { "epoch": 6.06528, "grad_norm": 0.49931657314300537, "learning_rate": 0.00012429667519181585, "loss": 3.3982, "step": 4743 }, { "epoch": 6.06656, "grad_norm": 0.510944664478302, "learning_rate": 0.00012425629290617848, "loss": 3.4117, "step": 4744 }, { "epoch": 6.06784, "grad_norm": 0.5148774981498718, "learning_rate": 0.0001242159106205411, "loss": 3.4909, "step": 4745 }, { "epoch": 6.06912, "grad_norm": 0.5062376260757446, "learning_rate": 0.00012417552833490374, "loss": 3.4504, "step": 4746 }, { "epoch": 6.0704, "grad_norm": 0.5255651473999023, "learning_rate": 0.00012413514604926637, "loss": 3.5011, "step": 4747 }, { "epoch": 6.07168, "grad_norm": 0.5072139501571655, "learning_rate": 0.00012409476376362902, "loss": 3.5092, "step": 4748 }, { "epoch": 6.07296, "grad_norm": 0.5306870341300964, "learning_rate": 0.00012405438147799163, "loss": 3.4533, "step": 4749 }, { "epoch": 6.07424, "grad_norm": 0.5114386677742004, "learning_rate": 0.00012401399919235428, "loss": 3.4855, "step": 4750 }, { "epoch": 6.07552, "grad_norm": 0.5131011605262756, "learning_rate": 0.00012397361690671691, "loss": 3.4171, "step": 4751 }, { "epoch": 6.0768, "grad_norm": 0.5139480233192444, "learning_rate": 0.00012393323462107954, "loss": 3.3778, "step": 4752 }, { "epoch": 6.07808, "grad_norm": 0.5153706073760986, "learning_rate": 0.00012389285233544217, "loss": 3.4194, "step": 4753 }, { "epoch": 6.07936, "grad_norm": 0.5061531066894531, "learning_rate": 0.0001238524700498048, "loss": 3.4817, "step": 4754 }, { "epoch": 6.08064, "grad_norm": 0.5213887691497803, "learning_rate": 0.00012381208776416743, "loss": 3.4902, "step": 4755 }, { "epoch": 6.08192, "grad_norm": 0.5053319931030273, "learning_rate": 0.0001237717054785301, "loss": 3.4497, "step": 4756 }, { "epoch": 6.0832, "grad_norm": 0.5132631659507751, "learning_rate": 0.0001237313231928927, "loss": 3.4278, "step": 4757 }, { "epoch": 6.08448, "grad_norm": 0.5187950730323792, "learning_rate": 0.00012369094090725535, "loss": 3.4636, "step": 4758 }, { "epoch": 6.08576, "grad_norm": 0.5095174908638, "learning_rate": 0.00012365055862161798, "loss": 3.5092, "step": 4759 }, { "epoch": 6.08704, "grad_norm": 0.5095679759979248, "learning_rate": 0.0001236101763359806, "loss": 3.4271, "step": 4760 }, { "epoch": 6.08832, "grad_norm": 0.5283337831497192, "learning_rate": 0.00012356979405034324, "loss": 3.4452, "step": 4761 }, { "epoch": 6.0896, "grad_norm": 0.5230772495269775, "learning_rate": 0.00012352941176470587, "loss": 3.4283, "step": 4762 }, { "epoch": 6.09088, "grad_norm": 0.5018015503883362, "learning_rate": 0.0001234890294790685, "loss": 3.5196, "step": 4763 }, { "epoch": 6.09216, "grad_norm": 0.5031048655509949, "learning_rate": 0.00012344864719343113, "loss": 3.4459, "step": 4764 }, { "epoch": 6.09344, "grad_norm": 0.49931055307388306, "learning_rate": 0.00012340826490779376, "loss": 3.4384, "step": 4765 }, { "epoch": 6.09472, "grad_norm": 0.5125550627708435, "learning_rate": 0.0001233678826221564, "loss": 3.5177, "step": 4766 }, { "epoch": 6.096, "grad_norm": 0.5063710808753967, "learning_rate": 0.00012332750033651905, "loss": 3.3931, "step": 4767 }, { "epoch": 6.09728, "grad_norm": 0.5210559368133545, "learning_rate": 0.00012328711805088168, "loss": 3.4212, "step": 4768 }, { "epoch": 6.09856, "grad_norm": 0.5136427879333496, "learning_rate": 0.0001232467357652443, "loss": 3.513, "step": 4769 }, { "epoch": 6.09984, "grad_norm": 0.5130379796028137, "learning_rate": 0.00012320635347960694, "loss": 3.4504, "step": 4770 }, { "epoch": 6.10112, "grad_norm": 0.5283026099205017, "learning_rate": 0.00012316597119396957, "loss": 3.4788, "step": 4771 }, { "epoch": 6.1024, "grad_norm": 0.5066593289375305, "learning_rate": 0.0001231255889083322, "loss": 3.4462, "step": 4772 }, { "epoch": 6.10368, "grad_norm": 0.5073418021202087, "learning_rate": 0.00012308520662269483, "loss": 3.4944, "step": 4773 }, { "epoch": 6.10496, "grad_norm": 0.5373370051383972, "learning_rate": 0.00012304482433705746, "loss": 3.4202, "step": 4774 }, { "epoch": 6.10624, "grad_norm": 0.5177907347679138, "learning_rate": 0.0001230044420514201, "loss": 3.455, "step": 4775 }, { "epoch": 6.10752, "grad_norm": 0.515060305595398, "learning_rate": 0.00012296405976578275, "loss": 3.3564, "step": 4776 }, { "epoch": 6.1088, "grad_norm": 0.5271644592285156, "learning_rate": 0.00012292367748014535, "loss": 3.5478, "step": 4777 }, { "epoch": 6.11008, "grad_norm": 0.5147733688354492, "learning_rate": 0.000122883295194508, "loss": 3.4428, "step": 4778 }, { "epoch": 6.11136, "grad_norm": 0.5317122340202332, "learning_rate": 0.00012284291290887064, "loss": 3.4613, "step": 4779 }, { "epoch": 6.11264, "grad_norm": 0.5279807448387146, "learning_rate": 0.00012280253062323327, "loss": 3.3885, "step": 4780 }, { "epoch": 6.11392, "grad_norm": 0.5295637249946594, "learning_rate": 0.0001227621483375959, "loss": 3.4598, "step": 4781 }, { "epoch": 6.1152, "grad_norm": 0.5437308549880981, "learning_rate": 0.00012272176605195853, "loss": 3.4348, "step": 4782 }, { "epoch": 6.11648, "grad_norm": 0.5441926717758179, "learning_rate": 0.00012268138376632116, "loss": 3.4985, "step": 4783 }, { "epoch": 6.11776, "grad_norm": 0.5059524774551392, "learning_rate": 0.00012264100148068381, "loss": 3.3963, "step": 4784 }, { "epoch": 6.11904, "grad_norm": 0.5069935917854309, "learning_rate": 0.00012260061919504642, "loss": 3.4357, "step": 4785 }, { "epoch": 6.12032, "grad_norm": 0.5232740044593811, "learning_rate": 0.00012256023690940905, "loss": 3.417, "step": 4786 }, { "epoch": 6.1216, "grad_norm": 0.5216338634490967, "learning_rate": 0.0001225198546237717, "loss": 3.5042, "step": 4787 }, { "epoch": 6.12288, "grad_norm": 0.5343948006629944, "learning_rate": 0.00012247947233813433, "loss": 3.4608, "step": 4788 }, { "epoch": 6.12416, "grad_norm": 0.5192834138870239, "learning_rate": 0.00012243909005249696, "loss": 3.4506, "step": 4789 }, { "epoch": 6.12544, "grad_norm": 0.5342220067977905, "learning_rate": 0.0001223987077668596, "loss": 3.5125, "step": 4790 }, { "epoch": 6.12672, "grad_norm": 0.5249612331390381, "learning_rate": 0.00012235832548122223, "loss": 3.4471, "step": 4791 }, { "epoch": 6.128, "grad_norm": 0.521793782711029, "learning_rate": 0.00012231794319558486, "loss": 3.4716, "step": 4792 }, { "epoch": 6.12928, "grad_norm": 0.5028202533721924, "learning_rate": 0.00012227756090994749, "loss": 3.482, "step": 4793 }, { "epoch": 6.13056, "grad_norm": 0.5394800901412964, "learning_rate": 0.00012223717862431012, "loss": 3.4238, "step": 4794 }, { "epoch": 6.13184, "grad_norm": 0.5089391469955444, "learning_rate": 0.00012219679633867277, "loss": 3.4219, "step": 4795 }, { "epoch": 6.13312, "grad_norm": 0.5048559904098511, "learning_rate": 0.0001221564140530354, "loss": 3.4201, "step": 4796 }, { "epoch": 6.1344, "grad_norm": 0.5039951801300049, "learning_rate": 0.00012211603176739803, "loss": 3.492, "step": 4797 }, { "epoch": 6.13568, "grad_norm": 0.5205702185630798, "learning_rate": 0.00012207564948176066, "loss": 3.4307, "step": 4798 }, { "epoch": 6.13696, "grad_norm": 0.5324816107749939, "learning_rate": 0.00012203526719612328, "loss": 3.4693, "step": 4799 }, { "epoch": 6.13824, "grad_norm": 0.4904349446296692, "learning_rate": 0.00012199488491048592, "loss": 3.3403, "step": 4800 }, { "epoch": 6.13952, "grad_norm": 0.5032051801681519, "learning_rate": 0.00012195450262484855, "loss": 3.3592, "step": 4801 }, { "epoch": 6.1408, "grad_norm": 0.5135837197303772, "learning_rate": 0.0001219141203392112, "loss": 3.4193, "step": 4802 }, { "epoch": 6.14208, "grad_norm": 0.49779608845710754, "learning_rate": 0.00012187373805357381, "loss": 3.4737, "step": 4803 }, { "epoch": 6.14336, "grad_norm": 0.5166708827018738, "learning_rate": 0.00012183335576793646, "loss": 3.4728, "step": 4804 }, { "epoch": 6.14464, "grad_norm": 0.524376392364502, "learning_rate": 0.00012179297348229909, "loss": 3.5283, "step": 4805 }, { "epoch": 6.14592, "grad_norm": 0.5073275566101074, "learning_rate": 0.00012175259119666173, "loss": 3.4763, "step": 4806 }, { "epoch": 6.1472, "grad_norm": 0.5140321850776672, "learning_rate": 0.00012171220891102435, "loss": 3.5098, "step": 4807 }, { "epoch": 6.14848, "grad_norm": 0.5006900429725647, "learning_rate": 0.00012167182662538699, "loss": 3.4547, "step": 4808 }, { "epoch": 6.14976, "grad_norm": 0.5014587640762329, "learning_rate": 0.00012163144433974962, "loss": 3.457, "step": 4809 }, { "epoch": 6.15104, "grad_norm": 0.5160591006278992, "learning_rate": 0.00012159106205411226, "loss": 3.4686, "step": 4810 }, { "epoch": 6.15232, "grad_norm": 0.5109474658966064, "learning_rate": 0.00012155067976847488, "loss": 3.4054, "step": 4811 }, { "epoch": 6.1536, "grad_norm": 0.49859675765037537, "learning_rate": 0.00012151029748283751, "loss": 3.4768, "step": 4812 }, { "epoch": 6.15488, "grad_norm": 0.5176275372505188, "learning_rate": 0.00012146991519720015, "loss": 3.4349, "step": 4813 }, { "epoch": 6.15616, "grad_norm": 0.5007618069648743, "learning_rate": 0.00012142953291156277, "loss": 3.3093, "step": 4814 }, { "epoch": 6.15744, "grad_norm": 0.5273295044898987, "learning_rate": 0.00012138915062592541, "loss": 3.4368, "step": 4815 }, { "epoch": 6.15872, "grad_norm": 0.49494895339012146, "learning_rate": 0.00012134876834028804, "loss": 3.4666, "step": 4816 }, { "epoch": 6.16, "grad_norm": 0.5105710625648499, "learning_rate": 0.00012130838605465069, "loss": 3.4899, "step": 4817 }, { "epoch": 6.16128, "grad_norm": 0.5119180083274841, "learning_rate": 0.0001212680037690133, "loss": 3.4086, "step": 4818 }, { "epoch": 6.16256, "grad_norm": 0.5197157263755798, "learning_rate": 0.00012122762148337595, "loss": 3.4382, "step": 4819 }, { "epoch": 6.16384, "grad_norm": 0.5209784507751465, "learning_rate": 0.00012118723919773858, "loss": 3.4292, "step": 4820 }, { "epoch": 6.16512, "grad_norm": 0.5159290432929993, "learning_rate": 0.00012114685691210122, "loss": 3.48, "step": 4821 }, { "epoch": 6.1664, "grad_norm": 0.5117263793945312, "learning_rate": 0.00012110647462646384, "loss": 3.4611, "step": 4822 }, { "epoch": 6.16768, "grad_norm": 0.5287443995475769, "learning_rate": 0.00012106609234082648, "loss": 3.4852, "step": 4823 }, { "epoch": 6.16896, "grad_norm": 0.5062198042869568, "learning_rate": 0.00012102571005518911, "loss": 3.3852, "step": 4824 }, { "epoch": 6.17024, "grad_norm": 0.5322122573852539, "learning_rate": 0.00012098532776955176, "loss": 3.5064, "step": 4825 }, { "epoch": 6.17152, "grad_norm": 0.525723397731781, "learning_rate": 0.00012094494548391439, "loss": 3.486, "step": 4826 }, { "epoch": 6.1728, "grad_norm": 0.5092231035232544, "learning_rate": 0.000120904563198277, "loss": 3.4674, "step": 4827 }, { "epoch": 6.17408, "grad_norm": 0.5155490636825562, "learning_rate": 0.00012086418091263965, "loss": 3.4667, "step": 4828 }, { "epoch": 6.17536, "grad_norm": 0.5157750248908997, "learning_rate": 0.00012082379862700228, "loss": 3.4403, "step": 4829 }, { "epoch": 6.17664, "grad_norm": 0.49893108010292053, "learning_rate": 0.00012078341634136492, "loss": 3.4855, "step": 4830 }, { "epoch": 6.17792, "grad_norm": 0.5173389911651611, "learning_rate": 0.00012074303405572754, "loss": 3.5103, "step": 4831 }, { "epoch": 6.1792, "grad_norm": 0.515521228313446, "learning_rate": 0.00012070265177009018, "loss": 3.5259, "step": 4832 }, { "epoch": 6.18048, "grad_norm": 0.5073827505111694, "learning_rate": 0.00012066226948445281, "loss": 3.46, "step": 4833 }, { "epoch": 6.18176, "grad_norm": 0.5405731797218323, "learning_rate": 0.00012062188719881545, "loss": 3.4988, "step": 4834 }, { "epoch": 6.18304, "grad_norm": 0.5233080387115479, "learning_rate": 0.00012058150491317807, "loss": 3.4526, "step": 4835 }, { "epoch": 6.18432, "grad_norm": 0.5276393294334412, "learning_rate": 0.00012054112262754071, "loss": 3.4405, "step": 4836 }, { "epoch": 6.1856, "grad_norm": 0.5196059346199036, "learning_rate": 0.00012050074034190334, "loss": 3.4805, "step": 4837 }, { "epoch": 6.18688, "grad_norm": 0.5034788250923157, "learning_rate": 0.00012046035805626599, "loss": 3.5371, "step": 4838 }, { "epoch": 6.18816, "grad_norm": 0.516704797744751, "learning_rate": 0.0001204199757706286, "loss": 3.4467, "step": 4839 }, { "epoch": 6.18944, "grad_norm": 0.512168288230896, "learning_rate": 0.00012037959348499123, "loss": 3.4096, "step": 4840 }, { "epoch": 6.19072, "grad_norm": 0.5281592607498169, "learning_rate": 0.00012033921119935388, "loss": 3.5391, "step": 4841 }, { "epoch": 6.192, "grad_norm": 0.49945464730262756, "learning_rate": 0.0001202988289137165, "loss": 3.478, "step": 4842 }, { "epoch": 6.19328, "grad_norm": 0.5051952004432678, "learning_rate": 0.00012025844662807914, "loss": 3.4026, "step": 4843 }, { "epoch": 6.19456, "grad_norm": 0.5274873971939087, "learning_rate": 0.00012021806434244177, "loss": 3.5439, "step": 4844 }, { "epoch": 6.19584, "grad_norm": 0.5015951991081238, "learning_rate": 0.00012017768205680441, "loss": 3.4883, "step": 4845 }, { "epoch": 6.19712, "grad_norm": 0.5088681578636169, "learning_rate": 0.00012013729977116703, "loss": 3.4458, "step": 4846 }, { "epoch": 6.1984, "grad_norm": 0.49841299653053284, "learning_rate": 0.00012009691748552967, "loss": 3.4178, "step": 4847 }, { "epoch": 6.19968, "grad_norm": 0.5172284245491028, "learning_rate": 0.0001200565351998923, "loss": 3.4052, "step": 4848 }, { "epoch": 6.20096, "grad_norm": 0.5176679491996765, "learning_rate": 0.00012001615291425495, "loss": 3.4595, "step": 4849 }, { "epoch": 6.20224, "grad_norm": 0.5075709819793701, "learning_rate": 0.00011997577062861756, "loss": 3.439, "step": 4850 }, { "epoch": 6.20352, "grad_norm": 0.5104464888572693, "learning_rate": 0.0001199353883429802, "loss": 3.4479, "step": 4851 }, { "epoch": 6.2048, "grad_norm": 0.504559338092804, "learning_rate": 0.00011989500605734284, "loss": 3.4244, "step": 4852 }, { "epoch": 6.20608, "grad_norm": 0.5160381197929382, "learning_rate": 0.00011985462377170547, "loss": 3.4028, "step": 4853 }, { "epoch": 6.2073599999999995, "grad_norm": 0.532819390296936, "learning_rate": 0.0001198142414860681, "loss": 3.4141, "step": 4854 }, { "epoch": 6.20864, "grad_norm": 0.5191538333892822, "learning_rate": 0.00011977385920043073, "loss": 3.4793, "step": 4855 }, { "epoch": 6.20992, "grad_norm": 0.5210806727409363, "learning_rate": 0.00011973347691479337, "loss": 3.4534, "step": 4856 }, { "epoch": 6.2112, "grad_norm": 0.5080645680427551, "learning_rate": 0.000119693094629156, "loss": 3.447, "step": 4857 }, { "epoch": 6.21248, "grad_norm": 0.5201761722564697, "learning_rate": 0.00011965271234351863, "loss": 3.481, "step": 4858 }, { "epoch": 6.21376, "grad_norm": 0.5427380800247192, "learning_rate": 0.00011961233005788126, "loss": 3.462, "step": 4859 }, { "epoch": 6.21504, "grad_norm": 0.5049818158149719, "learning_rate": 0.0001195719477722439, "loss": 3.4419, "step": 4860 }, { "epoch": 6.21632, "grad_norm": 0.5197364687919617, "learning_rate": 0.00011953156548660653, "loss": 3.4193, "step": 4861 }, { "epoch": 6.2176, "grad_norm": 0.5341804623603821, "learning_rate": 0.00011949118320096918, "loss": 3.4411, "step": 4862 }, { "epoch": 6.21888, "grad_norm": 0.5286295413970947, "learning_rate": 0.0001194508009153318, "loss": 3.4425, "step": 4863 }, { "epoch": 6.22016, "grad_norm": 0.5018596053123474, "learning_rate": 0.00011941041862969444, "loss": 3.4148, "step": 4864 }, { "epoch": 6.22144, "grad_norm": 0.5178782939910889, "learning_rate": 0.00011937003634405707, "loss": 3.4321, "step": 4865 }, { "epoch": 6.22272, "grad_norm": 0.5278565287590027, "learning_rate": 0.00011932965405841968, "loss": 3.4431, "step": 4866 }, { "epoch": 6.224, "grad_norm": 0.5157015919685364, "learning_rate": 0.00011928927177278233, "loss": 3.5341, "step": 4867 }, { "epoch": 6.22528, "grad_norm": 0.5221292972564697, "learning_rate": 0.00011924888948714496, "loss": 3.4478, "step": 4868 }, { "epoch": 6.22656, "grad_norm": 0.5204195976257324, "learning_rate": 0.0001192085072015076, "loss": 3.4702, "step": 4869 }, { "epoch": 6.22784, "grad_norm": 0.5190892219543457, "learning_rate": 0.00011916812491587022, "loss": 3.553, "step": 4870 }, { "epoch": 6.22912, "grad_norm": 0.52687007188797, "learning_rate": 0.00011912774263023286, "loss": 3.4113, "step": 4871 }, { "epoch": 6.2304, "grad_norm": 0.5159428715705872, "learning_rate": 0.00011908736034459549, "loss": 3.3968, "step": 4872 }, { "epoch": 6.23168, "grad_norm": 0.5180546641349792, "learning_rate": 0.00011904697805895814, "loss": 3.4233, "step": 4873 }, { "epoch": 6.23296, "grad_norm": 0.5099559426307678, "learning_rate": 0.00011900659577332075, "loss": 3.419, "step": 4874 }, { "epoch": 6.23424, "grad_norm": 0.5090861916542053, "learning_rate": 0.0001189662134876834, "loss": 3.4224, "step": 4875 }, { "epoch": 6.23552, "grad_norm": 0.5132479667663574, "learning_rate": 0.00011892583120204603, "loss": 3.4735, "step": 4876 }, { "epoch": 6.2368, "grad_norm": 0.5198119878768921, "learning_rate": 0.00011888544891640867, "loss": 3.4072, "step": 4877 }, { "epoch": 6.23808, "grad_norm": 0.5327208042144775, "learning_rate": 0.00011884506663077129, "loss": 3.4841, "step": 4878 }, { "epoch": 6.23936, "grad_norm": 0.5163946747779846, "learning_rate": 0.00011880468434513392, "loss": 3.483, "step": 4879 }, { "epoch": 6.24064, "grad_norm": 0.4975844621658325, "learning_rate": 0.00011876430205949656, "loss": 3.5362, "step": 4880 }, { "epoch": 6.24192, "grad_norm": 0.5258901715278625, "learning_rate": 0.00011872391977385919, "loss": 3.449, "step": 4881 }, { "epoch": 6.2432, "grad_norm": 0.5171861052513123, "learning_rate": 0.00011868353748822182, "loss": 3.3967, "step": 4882 }, { "epoch": 6.24448, "grad_norm": 0.5271828174591064, "learning_rate": 0.00011864315520258445, "loss": 3.4485, "step": 4883 }, { "epoch": 6.24576, "grad_norm": 0.5008876323699951, "learning_rate": 0.00011860277291694709, "loss": 3.507, "step": 4884 }, { "epoch": 6.24704, "grad_norm": 0.5155926942825317, "learning_rate": 0.00011856239063130972, "loss": 3.4627, "step": 4885 }, { "epoch": 6.24832, "grad_norm": 0.5127007365226746, "learning_rate": 0.00011852200834567235, "loss": 3.4644, "step": 4886 }, { "epoch": 6.2496, "grad_norm": 0.5121738314628601, "learning_rate": 0.00011848162606003498, "loss": 3.4467, "step": 4887 }, { "epoch": 6.25088, "grad_norm": 0.5105927586555481, "learning_rate": 0.00011844124377439763, "loss": 3.4391, "step": 4888 }, { "epoch": 6.25216, "grad_norm": 0.5314432382583618, "learning_rate": 0.00011840086148876026, "loss": 3.5517, "step": 4889 }, { "epoch": 6.25344, "grad_norm": 0.5190565586090088, "learning_rate": 0.00011836047920312289, "loss": 3.41, "step": 4890 }, { "epoch": 6.25472, "grad_norm": 0.5455322861671448, "learning_rate": 0.00011832009691748552, "loss": 3.4535, "step": 4891 }, { "epoch": 6.256, "grad_norm": 0.5055553913116455, "learning_rate": 0.00011827971463184816, "loss": 3.5182, "step": 4892 }, { "epoch": 6.25728, "grad_norm": 0.5350437760353088, "learning_rate": 0.00011823933234621079, "loss": 3.4637, "step": 4893 }, { "epoch": 6.25856, "grad_norm": 0.5320977568626404, "learning_rate": 0.00011819895006057341, "loss": 3.4818, "step": 4894 }, { "epoch": 6.25984, "grad_norm": 0.5101127624511719, "learning_rate": 0.00011815856777493605, "loss": 3.4325, "step": 4895 }, { "epoch": 6.26112, "grad_norm": 0.5186399817466736, "learning_rate": 0.00011811818548929868, "loss": 3.4622, "step": 4896 }, { "epoch": 6.2624, "grad_norm": 0.5287879109382629, "learning_rate": 0.00011807780320366132, "loss": 3.4531, "step": 4897 }, { "epoch": 6.26368, "grad_norm": 0.5211589932441711, "learning_rate": 0.00011803742091802394, "loss": 3.5668, "step": 4898 }, { "epoch": 6.26496, "grad_norm": 0.5217843651771545, "learning_rate": 0.00011799703863238658, "loss": 3.4539, "step": 4899 }, { "epoch": 6.26624, "grad_norm": 0.5402528047561646, "learning_rate": 0.00011795665634674921, "loss": 3.4953, "step": 4900 }, { "epoch": 6.26752, "grad_norm": 0.5263811945915222, "learning_rate": 0.00011791627406111186, "loss": 3.4311, "step": 4901 }, { "epoch": 6.2688, "grad_norm": 0.5283783674240112, "learning_rate": 0.00011787589177547448, "loss": 3.4235, "step": 4902 }, { "epoch": 6.27008, "grad_norm": 0.5406724214553833, "learning_rate": 0.00011783550948983712, "loss": 3.4439, "step": 4903 }, { "epoch": 6.27136, "grad_norm": 0.5088178515434265, "learning_rate": 0.00011779512720419975, "loss": 3.4806, "step": 4904 }, { "epoch": 6.27264, "grad_norm": 0.5212264060974121, "learning_rate": 0.00011775474491856239, "loss": 3.3909, "step": 4905 }, { "epoch": 6.27392, "grad_norm": 0.5153950452804565, "learning_rate": 0.00011771436263292501, "loss": 3.3559, "step": 4906 }, { "epoch": 6.2752, "grad_norm": 0.5014641880989075, "learning_rate": 0.00011767398034728764, "loss": 3.4949, "step": 4907 }, { "epoch": 6.27648, "grad_norm": 0.5058307647705078, "learning_rate": 0.00011763359806165028, "loss": 3.4748, "step": 4908 }, { "epoch": 6.27776, "grad_norm": 0.5182768106460571, "learning_rate": 0.0001175932157760129, "loss": 3.5126, "step": 4909 }, { "epoch": 6.27904, "grad_norm": 0.50447678565979, "learning_rate": 0.00011755283349037554, "loss": 3.4261, "step": 4910 }, { "epoch": 6.28032, "grad_norm": 0.5356603860855103, "learning_rate": 0.00011751245120473817, "loss": 3.489, "step": 4911 }, { "epoch": 6.2816, "grad_norm": 0.5243074893951416, "learning_rate": 0.00011747206891910082, "loss": 3.4682, "step": 4912 }, { "epoch": 6.2828800000000005, "grad_norm": 0.5267544984817505, "learning_rate": 0.00011743168663346345, "loss": 3.4675, "step": 4913 }, { "epoch": 6.28416, "grad_norm": 0.5158196687698364, "learning_rate": 0.00011739130434782608, "loss": 3.4589, "step": 4914 }, { "epoch": 6.28544, "grad_norm": 0.5127290487289429, "learning_rate": 0.0001173509220621887, "loss": 3.4685, "step": 4915 }, { "epoch": 6.28672, "grad_norm": 0.5056057572364807, "learning_rate": 0.00011731053977655135, "loss": 3.4143, "step": 4916 }, { "epoch": 6.288, "grad_norm": 0.5179975032806396, "learning_rate": 0.00011727015749091398, "loss": 3.4381, "step": 4917 }, { "epoch": 6.28928, "grad_norm": 0.5318068265914917, "learning_rate": 0.00011722977520527661, "loss": 3.4787, "step": 4918 }, { "epoch": 6.29056, "grad_norm": 0.5078819990158081, "learning_rate": 0.00011718939291963924, "loss": 3.4634, "step": 4919 }, { "epoch": 6.29184, "grad_norm": 0.5218449831008911, "learning_rate": 0.00011714901063400187, "loss": 3.4495, "step": 4920 }, { "epoch": 6.29312, "grad_norm": 0.5181546211242676, "learning_rate": 0.00011710862834836451, "loss": 3.5156, "step": 4921 }, { "epoch": 6.2943999999999996, "grad_norm": 0.5055398344993591, "learning_rate": 0.00011706824606272713, "loss": 3.4951, "step": 4922 }, { "epoch": 6.29568, "grad_norm": 0.5111787915229797, "learning_rate": 0.00011702786377708977, "loss": 3.4858, "step": 4923 }, { "epoch": 6.29696, "grad_norm": 0.5231371521949768, "learning_rate": 0.0001169874814914524, "loss": 3.4905, "step": 4924 }, { "epoch": 6.29824, "grad_norm": 0.5168370008468628, "learning_rate": 0.00011694709920581505, "loss": 3.4655, "step": 4925 }, { "epoch": 6.29952, "grad_norm": 0.5131970047950745, "learning_rate": 0.00011690671692017766, "loss": 3.4392, "step": 4926 }, { "epoch": 6.3008, "grad_norm": 0.5175589323043823, "learning_rate": 0.00011686633463454031, "loss": 3.4428, "step": 4927 }, { "epoch": 6.30208, "grad_norm": 0.5290152430534363, "learning_rate": 0.00011682595234890294, "loss": 3.4106, "step": 4928 }, { "epoch": 6.30336, "grad_norm": 0.5033940672874451, "learning_rate": 0.00011678557006326558, "loss": 3.4195, "step": 4929 }, { "epoch": 6.30464, "grad_norm": 0.5039850473403931, "learning_rate": 0.0001167451877776282, "loss": 3.4448, "step": 4930 }, { "epoch": 6.30592, "grad_norm": 0.5078590512275696, "learning_rate": 0.00011670480549199084, "loss": 3.4318, "step": 4931 }, { "epoch": 6.3072, "grad_norm": 0.5021501779556274, "learning_rate": 0.00011666442320635347, "loss": 3.4763, "step": 4932 }, { "epoch": 6.30848, "grad_norm": 0.5251853466033936, "learning_rate": 0.00011662404092071609, "loss": 3.4158, "step": 4933 }, { "epoch": 6.30976, "grad_norm": 0.5062684416770935, "learning_rate": 0.00011658365863507873, "loss": 3.4314, "step": 4934 }, { "epoch": 6.31104, "grad_norm": 0.5202977061271667, "learning_rate": 0.00011654327634944136, "loss": 3.4411, "step": 4935 }, { "epoch": 6.31232, "grad_norm": 0.5242408514022827, "learning_rate": 0.000116502894063804, "loss": 3.5071, "step": 4936 }, { "epoch": 6.3136, "grad_norm": 0.5003301501274109, "learning_rate": 0.00011646251177816662, "loss": 3.4682, "step": 4937 }, { "epoch": 6.31488, "grad_norm": 0.5161333680152893, "learning_rate": 0.00011642212949252927, "loss": 3.4328, "step": 4938 }, { "epoch": 6.31616, "grad_norm": 0.4897507429122925, "learning_rate": 0.0001163817472068919, "loss": 3.5237, "step": 4939 }, { "epoch": 6.31744, "grad_norm": 0.5239113569259644, "learning_rate": 0.00011634136492125454, "loss": 3.4244, "step": 4940 }, { "epoch": 6.31872, "grad_norm": 0.5347805023193359, "learning_rate": 0.00011630098263561716, "loss": 3.4078, "step": 4941 }, { "epoch": 6.32, "grad_norm": 0.49066007137298584, "learning_rate": 0.0001162606003499798, "loss": 3.474, "step": 4942 }, { "epoch": 6.32128, "grad_norm": 0.5044615268707275, "learning_rate": 0.00011622021806434243, "loss": 3.5421, "step": 4943 }, { "epoch": 6.32256, "grad_norm": 0.5164585113525391, "learning_rate": 0.00011617983577870507, "loss": 3.4537, "step": 4944 }, { "epoch": 6.32384, "grad_norm": 0.5173794627189636, "learning_rate": 0.00011613945349306769, "loss": 3.4225, "step": 4945 }, { "epoch": 6.32512, "grad_norm": 0.5148087739944458, "learning_rate": 0.00011609907120743033, "loss": 3.4548, "step": 4946 }, { "epoch": 6.3264, "grad_norm": 0.533028781414032, "learning_rate": 0.00011605868892179296, "loss": 3.5002, "step": 4947 }, { "epoch": 6.32768, "grad_norm": 0.5113101005554199, "learning_rate": 0.0001160183066361556, "loss": 3.425, "step": 4948 }, { "epoch": 6.32896, "grad_norm": 0.5407309532165527, "learning_rate": 0.00011597792435051822, "loss": 3.52, "step": 4949 }, { "epoch": 6.33024, "grad_norm": 0.5209356546401978, "learning_rate": 0.00011593754206488085, "loss": 3.4593, "step": 4950 }, { "epoch": 6.33152, "grad_norm": 0.516190230846405, "learning_rate": 0.0001158971597792435, "loss": 3.4489, "step": 4951 }, { "epoch": 6.3328, "grad_norm": 0.503466784954071, "learning_rate": 0.00011585677749360613, "loss": 3.3637, "step": 4952 }, { "epoch": 6.33408, "grad_norm": 0.534572422504425, "learning_rate": 0.00011581639520796877, "loss": 3.4754, "step": 4953 }, { "epoch": 6.33536, "grad_norm": 0.5381052494049072, "learning_rate": 0.00011577601292233139, "loss": 3.4082, "step": 4954 }, { "epoch": 6.33664, "grad_norm": 0.5160523653030396, "learning_rate": 0.00011573563063669403, "loss": 3.4497, "step": 4955 }, { "epoch": 6.33792, "grad_norm": 0.5168169140815735, "learning_rate": 0.00011569524835105666, "loss": 3.4778, "step": 4956 }, { "epoch": 6.3392, "grad_norm": 0.5144256949424744, "learning_rate": 0.0001156548660654193, "loss": 3.5237, "step": 4957 }, { "epoch": 6.34048, "grad_norm": 0.529626190662384, "learning_rate": 0.00011561448377978192, "loss": 3.4262, "step": 4958 }, { "epoch": 6.34176, "grad_norm": 0.5221841335296631, "learning_rate": 0.00011557410149414457, "loss": 3.4335, "step": 4959 }, { "epoch": 6.34304, "grad_norm": 0.5182304382324219, "learning_rate": 0.0001155337192085072, "loss": 3.3717, "step": 4960 }, { "epoch": 6.34432, "grad_norm": 0.5277256369590759, "learning_rate": 0.00011549333692286981, "loss": 3.4908, "step": 4961 }, { "epoch": 6.3456, "grad_norm": 0.5436074733734131, "learning_rate": 0.00011545295463723246, "loss": 3.5252, "step": 4962 }, { "epoch": 6.34688, "grad_norm": 0.5004885196685791, "learning_rate": 0.00011541257235159509, "loss": 3.4502, "step": 4963 }, { "epoch": 6.34816, "grad_norm": 0.5198299884796143, "learning_rate": 0.00011537219006595773, "loss": 3.4588, "step": 4964 }, { "epoch": 6.3494399999999995, "grad_norm": 0.5136741399765015, "learning_rate": 0.00011533180778032035, "loss": 3.4659, "step": 4965 }, { "epoch": 6.35072, "grad_norm": 0.5468453764915466, "learning_rate": 0.00011529142549468299, "loss": 3.4652, "step": 4966 }, { "epoch": 6.352, "grad_norm": 0.4965207874774933, "learning_rate": 0.00011525104320904562, "loss": 3.4365, "step": 4967 }, { "epoch": 6.35328, "grad_norm": 0.5069467425346375, "learning_rate": 0.00011521066092340826, "loss": 3.4168, "step": 4968 }, { "epoch": 6.35456, "grad_norm": 0.5168161988258362, "learning_rate": 0.00011517027863777088, "loss": 3.519, "step": 4969 }, { "epoch": 6.35584, "grad_norm": 0.5197111368179321, "learning_rate": 0.00011512989635213352, "loss": 3.4787, "step": 4970 }, { "epoch": 6.35712, "grad_norm": 0.5127108097076416, "learning_rate": 0.00011508951406649615, "loss": 3.5048, "step": 4971 }, { "epoch": 6.3584, "grad_norm": 0.49976083636283875, "learning_rate": 0.0001150491317808588, "loss": 3.3816, "step": 4972 }, { "epoch": 6.35968, "grad_norm": 0.5142550468444824, "learning_rate": 0.00011500874949522141, "loss": 3.4407, "step": 4973 }, { "epoch": 6.36096, "grad_norm": 0.5232443809509277, "learning_rate": 0.00011496836720958404, "loss": 3.4582, "step": 4974 }, { "epoch": 6.36224, "grad_norm": 0.5081977844238281, "learning_rate": 0.00011492798492394669, "loss": 3.4799, "step": 4975 }, { "epoch": 6.36352, "grad_norm": 0.49449968338012695, "learning_rate": 0.00011488760263830932, "loss": 3.4118, "step": 4976 }, { "epoch": 6.3648, "grad_norm": 0.5099364519119263, "learning_rate": 0.00011484722035267195, "loss": 3.4475, "step": 4977 }, { "epoch": 6.36608, "grad_norm": 0.5084264278411865, "learning_rate": 0.00011480683806703458, "loss": 3.5485, "step": 4978 }, { "epoch": 6.36736, "grad_norm": 0.5079696178436279, "learning_rate": 0.00011476645578139722, "loss": 3.4334, "step": 4979 }, { "epoch": 6.36864, "grad_norm": 0.5050063729286194, "learning_rate": 0.00011472607349575985, "loss": 3.4747, "step": 4980 }, { "epoch": 6.3699200000000005, "grad_norm": 0.5045068264007568, "learning_rate": 0.00011468569121012248, "loss": 3.4687, "step": 4981 }, { "epoch": 6.3712, "grad_norm": 0.5139387845993042, "learning_rate": 0.00011464530892448511, "loss": 3.4448, "step": 4982 }, { "epoch": 6.37248, "grad_norm": 0.5034843683242798, "learning_rate": 0.00011460492663884776, "loss": 3.4381, "step": 4983 }, { "epoch": 6.37376, "grad_norm": 0.5243584513664246, "learning_rate": 0.00011456454435321039, "loss": 3.5206, "step": 4984 }, { "epoch": 6.37504, "grad_norm": 0.5162044167518616, "learning_rate": 0.00011452416206757302, "loss": 3.5582, "step": 4985 }, { "epoch": 6.37632, "grad_norm": 0.49788084626197815, "learning_rate": 0.00011448377978193565, "loss": 3.4328, "step": 4986 }, { "epoch": 6.3776, "grad_norm": 0.5192862153053284, "learning_rate": 0.00011444339749629828, "loss": 3.4449, "step": 4987 }, { "epoch": 6.37888, "grad_norm": 0.5089589357376099, "learning_rate": 0.00011440301521066092, "loss": 3.4463, "step": 4988 }, { "epoch": 6.38016, "grad_norm": 0.5230059623718262, "learning_rate": 0.00011436263292502354, "loss": 3.4992, "step": 4989 }, { "epoch": 6.38144, "grad_norm": 0.5021970868110657, "learning_rate": 0.00011432225063938618, "loss": 3.461, "step": 4990 }, { "epoch": 6.38272, "grad_norm": 0.5131734013557434, "learning_rate": 0.00011428186835374881, "loss": 3.4653, "step": 4991 }, { "epoch": 6.384, "grad_norm": 0.49421629309654236, "learning_rate": 0.00011424148606811145, "loss": 3.4365, "step": 4992 }, { "epoch": 6.38528, "grad_norm": 0.5171757340431213, "learning_rate": 0.00011420110378247407, "loss": 3.4412, "step": 4993 }, { "epoch": 6.38656, "grad_norm": 0.530982494354248, "learning_rate": 0.00011416072149683671, "loss": 3.4623, "step": 4994 }, { "epoch": 6.38784, "grad_norm": 0.5211870670318604, "learning_rate": 0.00011412033921119934, "loss": 3.4505, "step": 4995 }, { "epoch": 6.38912, "grad_norm": 0.5216143131256104, "learning_rate": 0.00011407995692556199, "loss": 3.4959, "step": 4996 }, { "epoch": 6.3904, "grad_norm": 0.5288113355636597, "learning_rate": 0.0001140395746399246, "loss": 3.4404, "step": 4997 }, { "epoch": 6.39168, "grad_norm": 0.5123031735420227, "learning_rate": 0.00011399919235428725, "loss": 3.449, "step": 4998 }, { "epoch": 6.39296, "grad_norm": 0.49802166223526, "learning_rate": 0.00011395881006864988, "loss": 3.3765, "step": 4999 }, { "epoch": 6.39424, "grad_norm": 0.5179408192634583, "learning_rate": 0.0001139184277830125, "loss": 3.4644, "step": 5000 }, { "epoch": 6.39552, "grad_norm": 0.5292978286743164, "learning_rate": 0.00011387804549737514, "loss": 3.489, "step": 5001 }, { "epoch": 6.3968, "grad_norm": 0.5212100744247437, "learning_rate": 0.00011383766321173777, "loss": 3.4714, "step": 5002 }, { "epoch": 6.39808, "grad_norm": 0.5056613087654114, "learning_rate": 0.00011379728092610041, "loss": 3.4111, "step": 5003 }, { "epoch": 6.39936, "grad_norm": 0.5097994804382324, "learning_rate": 0.00011375689864046304, "loss": 3.4453, "step": 5004 }, { "epoch": 6.40064, "grad_norm": 0.5117822885513306, "learning_rate": 0.00011371651635482567, "loss": 3.483, "step": 5005 }, { "epoch": 6.40192, "grad_norm": 0.51289302110672, "learning_rate": 0.0001136761340691883, "loss": 3.5011, "step": 5006 }, { "epoch": 6.4032, "grad_norm": 0.5105553865432739, "learning_rate": 0.00011363575178355094, "loss": 3.4876, "step": 5007 }, { "epoch": 6.40448, "grad_norm": 0.5061222314834595, "learning_rate": 0.00011359536949791357, "loss": 3.4529, "step": 5008 }, { "epoch": 6.40576, "grad_norm": 0.5081943869590759, "learning_rate": 0.0001135549872122762, "loss": 3.5208, "step": 5009 }, { "epoch": 6.40704, "grad_norm": 0.5123940706253052, "learning_rate": 0.00011351460492663883, "loss": 3.4691, "step": 5010 }, { "epoch": 6.40832, "grad_norm": 0.5150337219238281, "learning_rate": 0.00011347422264100148, "loss": 3.5093, "step": 5011 }, { "epoch": 6.4096, "grad_norm": 0.5125681161880493, "learning_rate": 0.00011343384035536411, "loss": 3.4547, "step": 5012 }, { "epoch": 6.41088, "grad_norm": 0.509105384349823, "learning_rate": 0.00011339345806972674, "loss": 3.4142, "step": 5013 }, { "epoch": 6.41216, "grad_norm": 0.5203231573104858, "learning_rate": 0.00011335307578408937, "loss": 3.4432, "step": 5014 }, { "epoch": 6.41344, "grad_norm": 0.5590437650680542, "learning_rate": 0.000113312693498452, "loss": 3.4524, "step": 5015 }, { "epoch": 6.41472, "grad_norm": 0.5066128969192505, "learning_rate": 0.00011327231121281464, "loss": 3.5085, "step": 5016 }, { "epoch": 6.416, "grad_norm": 0.5234223008155823, "learning_rate": 0.00011323192892717726, "loss": 3.5202, "step": 5017 }, { "epoch": 6.41728, "grad_norm": 0.49386492371559143, "learning_rate": 0.0001131915466415399, "loss": 3.4921, "step": 5018 }, { "epoch": 6.41856, "grad_norm": 0.5142998695373535, "learning_rate": 0.00011315116435590253, "loss": 3.4908, "step": 5019 }, { "epoch": 6.41984, "grad_norm": 0.5108798742294312, "learning_rate": 0.00011311078207026518, "loss": 3.3707, "step": 5020 }, { "epoch": 6.42112, "grad_norm": 0.5349586009979248, "learning_rate": 0.00011307039978462779, "loss": 3.5232, "step": 5021 }, { "epoch": 6.4224, "grad_norm": 0.5002762675285339, "learning_rate": 0.00011303001749899044, "loss": 3.4531, "step": 5022 }, { "epoch": 6.42368, "grad_norm": 0.5073869228363037, "learning_rate": 0.00011298963521335307, "loss": 3.4888, "step": 5023 }, { "epoch": 6.4249600000000004, "grad_norm": 0.5094159245491028, "learning_rate": 0.00011294925292771571, "loss": 3.4729, "step": 5024 }, { "epoch": 6.42624, "grad_norm": 0.502030074596405, "learning_rate": 0.00011290887064207833, "loss": 3.4268, "step": 5025 }, { "epoch": 6.42752, "grad_norm": 0.5096508264541626, "learning_rate": 0.00011286848835644097, "loss": 3.4771, "step": 5026 }, { "epoch": 6.4288, "grad_norm": 0.5121338367462158, "learning_rate": 0.0001128281060708036, "loss": 3.5188, "step": 5027 }, { "epoch": 6.43008, "grad_norm": 0.5106524229049683, "learning_rate": 0.00011278772378516622, "loss": 3.487, "step": 5028 }, { "epoch": 6.43136, "grad_norm": 0.5135172605514526, "learning_rate": 0.00011274734149952886, "loss": 3.4631, "step": 5029 }, { "epoch": 6.43264, "grad_norm": 0.5284242033958435, "learning_rate": 0.00011270695921389149, "loss": 3.5371, "step": 5030 }, { "epoch": 6.43392, "grad_norm": 0.49686378240585327, "learning_rate": 0.00011266657692825413, "loss": 3.4035, "step": 5031 }, { "epoch": 6.4352, "grad_norm": 0.5379998683929443, "learning_rate": 0.00011262619464261675, "loss": 3.4359, "step": 5032 }, { "epoch": 6.4364799999999995, "grad_norm": 0.5207564830780029, "learning_rate": 0.0001125858123569794, "loss": 3.433, "step": 5033 }, { "epoch": 6.43776, "grad_norm": 0.49627432227134705, "learning_rate": 0.00011254543007134202, "loss": 3.4353, "step": 5034 }, { "epoch": 6.43904, "grad_norm": 0.5221447944641113, "learning_rate": 0.00011250504778570467, "loss": 3.4024, "step": 5035 }, { "epoch": 6.44032, "grad_norm": 0.5198942422866821, "learning_rate": 0.00011246466550006728, "loss": 3.4782, "step": 5036 }, { "epoch": 6.4416, "grad_norm": 0.5337004661560059, "learning_rate": 0.00011242428321442993, "loss": 3.5193, "step": 5037 }, { "epoch": 6.44288, "grad_norm": 0.5141910314559937, "learning_rate": 0.00011238390092879256, "loss": 3.5241, "step": 5038 }, { "epoch": 6.44416, "grad_norm": 0.5050771832466125, "learning_rate": 0.0001123435186431552, "loss": 3.4287, "step": 5039 }, { "epoch": 6.44544, "grad_norm": 0.5285760164260864, "learning_rate": 0.00011230313635751783, "loss": 3.4792, "step": 5040 }, { "epoch": 6.44672, "grad_norm": 0.5192365646362305, "learning_rate": 0.00011226275407188045, "loss": 3.4925, "step": 5041 }, { "epoch": 6.448, "grad_norm": 0.5274104475975037, "learning_rate": 0.00011222237178624309, "loss": 3.444, "step": 5042 }, { "epoch": 6.44928, "grad_norm": 0.5355402231216431, "learning_rate": 0.00011218198950060572, "loss": 3.5535, "step": 5043 }, { "epoch": 6.45056, "grad_norm": 0.5089603662490845, "learning_rate": 0.00011214160721496837, "loss": 3.4749, "step": 5044 }, { "epoch": 6.45184, "grad_norm": 0.5029155611991882, "learning_rate": 0.00011210122492933098, "loss": 3.3917, "step": 5045 }, { "epoch": 6.45312, "grad_norm": 0.5419932007789612, "learning_rate": 0.00011206084264369363, "loss": 3.4852, "step": 5046 }, { "epoch": 6.4544, "grad_norm": 0.5222985148429871, "learning_rate": 0.00011202046035805626, "loss": 3.4503, "step": 5047 }, { "epoch": 6.45568, "grad_norm": 0.5103903412818909, "learning_rate": 0.0001119800780724189, "loss": 3.3846, "step": 5048 }, { "epoch": 6.45696, "grad_norm": 0.5334930419921875, "learning_rate": 0.00011193969578678152, "loss": 3.4451, "step": 5049 }, { "epoch": 6.45824, "grad_norm": 0.506489098072052, "learning_rate": 0.00011189931350114416, "loss": 3.4588, "step": 5050 }, { "epoch": 6.45952, "grad_norm": 0.5101386308670044, "learning_rate": 0.00011185893121550679, "loss": 3.5153, "step": 5051 }, { "epoch": 6.4608, "grad_norm": 0.5294933915138245, "learning_rate": 0.00011181854892986943, "loss": 3.4905, "step": 5052 }, { "epoch": 6.46208, "grad_norm": 0.5064314603805542, "learning_rate": 0.00011177816664423205, "loss": 3.4571, "step": 5053 }, { "epoch": 6.46336, "grad_norm": 0.526623010635376, "learning_rate": 0.00011173778435859468, "loss": 3.4502, "step": 5054 }, { "epoch": 6.46464, "grad_norm": 0.5297154784202576, "learning_rate": 0.00011169740207295732, "loss": 3.5096, "step": 5055 }, { "epoch": 6.46592, "grad_norm": 0.5068486332893372, "learning_rate": 0.00011165701978731994, "loss": 3.4319, "step": 5056 }, { "epoch": 6.4672, "grad_norm": 0.5235540866851807, "learning_rate": 0.00011161663750168258, "loss": 3.4706, "step": 5057 }, { "epoch": 6.46848, "grad_norm": 0.5198733806610107, "learning_rate": 0.00011157625521604521, "loss": 3.487, "step": 5058 }, { "epoch": 6.46976, "grad_norm": 0.5035125613212585, "learning_rate": 0.00011153587293040786, "loss": 3.4629, "step": 5059 }, { "epoch": 6.47104, "grad_norm": 0.5046782493591309, "learning_rate": 0.00011149549064477047, "loss": 3.4192, "step": 5060 }, { "epoch": 6.47232, "grad_norm": 0.50188809633255, "learning_rate": 0.00011145510835913312, "loss": 3.4571, "step": 5061 }, { "epoch": 6.4736, "grad_norm": 0.5252716541290283, "learning_rate": 0.00011141472607349575, "loss": 3.4554, "step": 5062 }, { "epoch": 6.47488, "grad_norm": 0.4985215365886688, "learning_rate": 0.00011137434378785839, "loss": 3.4759, "step": 5063 }, { "epoch": 6.47616, "grad_norm": 0.5182859897613525, "learning_rate": 0.00011133396150222101, "loss": 3.4293, "step": 5064 }, { "epoch": 6.47744, "grad_norm": 0.5109648108482361, "learning_rate": 0.00011129357921658365, "loss": 3.449, "step": 5065 }, { "epoch": 6.47872, "grad_norm": 0.4976121485233307, "learning_rate": 0.00011125319693094628, "loss": 3.4391, "step": 5066 }, { "epoch": 6.48, "grad_norm": 0.501159131526947, "learning_rate": 0.00011121281464530893, "loss": 3.4683, "step": 5067 }, { "epoch": 6.48128, "grad_norm": 0.5295117497444153, "learning_rate": 0.00011117243235967154, "loss": 3.4945, "step": 5068 }, { "epoch": 6.48256, "grad_norm": 0.5260968804359436, "learning_rate": 0.00011113205007403417, "loss": 3.4764, "step": 5069 }, { "epoch": 6.48384, "grad_norm": 0.5069438219070435, "learning_rate": 0.00011109166778839682, "loss": 3.4699, "step": 5070 }, { "epoch": 6.48512, "grad_norm": 0.5271986126899719, "learning_rate": 0.00011105128550275945, "loss": 3.5268, "step": 5071 }, { "epoch": 6.4864, "grad_norm": 0.5156261324882507, "learning_rate": 0.00011101090321712208, "loss": 3.4877, "step": 5072 }, { "epoch": 6.48768, "grad_norm": 0.5246565937995911, "learning_rate": 0.0001109705209314847, "loss": 3.4697, "step": 5073 }, { "epoch": 6.48896, "grad_norm": 0.5298268795013428, "learning_rate": 0.00011093013864584735, "loss": 3.4842, "step": 5074 }, { "epoch": 6.49024, "grad_norm": 0.5147030353546143, "learning_rate": 0.00011088975636020998, "loss": 3.5063, "step": 5075 }, { "epoch": 6.49152, "grad_norm": 0.5172019004821777, "learning_rate": 0.00011084937407457262, "loss": 3.4344, "step": 5076 }, { "epoch": 6.4928, "grad_norm": 0.5194494724273682, "learning_rate": 0.00011080899178893524, "loss": 3.4808, "step": 5077 }, { "epoch": 6.49408, "grad_norm": 0.5049477815628052, "learning_rate": 0.00011076860950329788, "loss": 3.4354, "step": 5078 }, { "epoch": 6.49536, "grad_norm": 0.5107006430625916, "learning_rate": 0.00011072822721766051, "loss": 3.4447, "step": 5079 }, { "epoch": 6.49664, "grad_norm": 0.5118290185928345, "learning_rate": 0.00011068784493202316, "loss": 3.4469, "step": 5080 }, { "epoch": 6.49792, "grad_norm": 0.5051876306533813, "learning_rate": 0.00011064746264638577, "loss": 3.4521, "step": 5081 }, { "epoch": 6.4992, "grad_norm": 0.4978385865688324, "learning_rate": 0.0001106070803607484, "loss": 3.3929, "step": 5082 }, { "epoch": 6.50048, "grad_norm": 0.5098656415939331, "learning_rate": 0.00011056669807511105, "loss": 3.4236, "step": 5083 }, { "epoch": 6.50176, "grad_norm": 0.5078462362289429, "learning_rate": 0.00011052631578947366, "loss": 3.4781, "step": 5084 }, { "epoch": 6.50304, "grad_norm": 0.5070273876190186, "learning_rate": 0.00011048593350383631, "loss": 3.4513, "step": 5085 }, { "epoch": 6.50432, "grad_norm": 0.4959174394607544, "learning_rate": 0.00011044555121819894, "loss": 3.4036, "step": 5086 }, { "epoch": 6.5056, "grad_norm": 0.5386160016059875, "learning_rate": 0.00011040516893256158, "loss": 3.5321, "step": 5087 }, { "epoch": 6.50688, "grad_norm": 0.518519937992096, "learning_rate": 0.0001103647866469242, "loss": 3.4958, "step": 5088 }, { "epoch": 6.50816, "grad_norm": 0.5082813501358032, "learning_rate": 0.00011032440436128684, "loss": 3.4061, "step": 5089 }, { "epoch": 6.50944, "grad_norm": 0.5264861583709717, "learning_rate": 0.00011028402207564947, "loss": 3.4816, "step": 5090 }, { "epoch": 6.51072, "grad_norm": 0.5055463910102844, "learning_rate": 0.00011024363979001212, "loss": 3.5055, "step": 5091 }, { "epoch": 6.5120000000000005, "grad_norm": 0.5075529217720032, "learning_rate": 0.00011020325750437473, "loss": 3.4639, "step": 5092 }, { "epoch": 6.51328, "grad_norm": 0.5140883326530457, "learning_rate": 0.00011016287521873738, "loss": 3.4615, "step": 5093 }, { "epoch": 6.51456, "grad_norm": 0.5120643377304077, "learning_rate": 0.0001101224929331, "loss": 3.4331, "step": 5094 }, { "epoch": 6.51584, "grad_norm": 0.501859724521637, "learning_rate": 0.00011008211064746264, "loss": 3.4614, "step": 5095 }, { "epoch": 6.51712, "grad_norm": 0.514984667301178, "learning_rate": 0.00011004172836182527, "loss": 3.4491, "step": 5096 }, { "epoch": 6.5184, "grad_norm": 0.5052075386047363, "learning_rate": 0.0001100013460761879, "loss": 3.5753, "step": 5097 }, { "epoch": 6.51968, "grad_norm": 0.5124235153198242, "learning_rate": 0.00010996096379055054, "loss": 3.4338, "step": 5098 }, { "epoch": 6.52096, "grad_norm": 0.48975175619125366, "learning_rate": 0.00010992058150491317, "loss": 3.4508, "step": 5099 }, { "epoch": 6.52224, "grad_norm": 0.49977898597717285, "learning_rate": 0.0001098801992192758, "loss": 3.3994, "step": 5100 }, { "epoch": 6.5235199999999995, "grad_norm": 0.4916556775569916, "learning_rate": 0.00010983981693363843, "loss": 3.4289, "step": 5101 }, { "epoch": 6.5248, "grad_norm": 0.5177762508392334, "learning_rate": 0.00010979943464800107, "loss": 3.5018, "step": 5102 }, { "epoch": 6.52608, "grad_norm": 0.5140178799629211, "learning_rate": 0.0001097590523623637, "loss": 3.4727, "step": 5103 }, { "epoch": 6.52736, "grad_norm": 0.4911567270755768, "learning_rate": 0.00010971867007672633, "loss": 3.5407, "step": 5104 }, { "epoch": 6.52864, "grad_norm": 0.505172610282898, "learning_rate": 0.00010967828779108896, "loss": 3.3872, "step": 5105 }, { "epoch": 6.52992, "grad_norm": 0.5191845297813416, "learning_rate": 0.00010963790550545161, "loss": 3.482, "step": 5106 }, { "epoch": 6.5312, "grad_norm": 0.5074360370635986, "learning_rate": 0.00010959752321981424, "loss": 3.4597, "step": 5107 }, { "epoch": 6.53248, "grad_norm": 0.5102279782295227, "learning_rate": 0.00010955714093417685, "loss": 3.3886, "step": 5108 }, { "epoch": 6.53376, "grad_norm": 0.5071060061454773, "learning_rate": 0.0001095167586485395, "loss": 3.4508, "step": 5109 }, { "epoch": 6.53504, "grad_norm": 0.5164380669593811, "learning_rate": 0.00010947637636290213, "loss": 3.446, "step": 5110 }, { "epoch": 6.53632, "grad_norm": 0.5028985142707825, "learning_rate": 0.00010943599407726477, "loss": 3.5139, "step": 5111 }, { "epoch": 6.5376, "grad_norm": 0.49001002311706543, "learning_rate": 0.00010939561179162739, "loss": 3.3891, "step": 5112 }, { "epoch": 6.53888, "grad_norm": 0.5156526565551758, "learning_rate": 0.00010935522950599003, "loss": 3.4913, "step": 5113 }, { "epoch": 6.54016, "grad_norm": 0.4975212514400482, "learning_rate": 0.00010931484722035266, "loss": 3.4462, "step": 5114 }, { "epoch": 6.54144, "grad_norm": 0.5028743147850037, "learning_rate": 0.0001092744649347153, "loss": 3.396, "step": 5115 }, { "epoch": 6.54272, "grad_norm": 0.5105941295623779, "learning_rate": 0.00010923408264907792, "loss": 3.5302, "step": 5116 }, { "epoch": 6.5440000000000005, "grad_norm": 0.5078137516975403, "learning_rate": 0.00010919370036344056, "loss": 3.4376, "step": 5117 }, { "epoch": 6.54528, "grad_norm": 0.4959731101989746, "learning_rate": 0.0001091533180778032, "loss": 3.4079, "step": 5118 }, { "epoch": 6.54656, "grad_norm": 0.5086926817893982, "learning_rate": 0.00010911293579216584, "loss": 3.4873, "step": 5119 }, { "epoch": 6.54784, "grad_norm": 0.5077411532402039, "learning_rate": 0.00010907255350652845, "loss": 3.4758, "step": 5120 }, { "epoch": 6.54912, "grad_norm": 0.5054405331611633, "learning_rate": 0.00010903217122089109, "loss": 3.4363, "step": 5121 }, { "epoch": 6.5504, "grad_norm": 0.5057424902915955, "learning_rate": 0.00010899178893525373, "loss": 3.5458, "step": 5122 }, { "epoch": 6.55168, "grad_norm": 0.5203006267547607, "learning_rate": 0.00010895140664961635, "loss": 3.5374, "step": 5123 }, { "epoch": 6.55296, "grad_norm": 0.5345029830932617, "learning_rate": 0.00010891102436397899, "loss": 3.4367, "step": 5124 }, { "epoch": 6.55424, "grad_norm": 0.5119213461875916, "learning_rate": 0.00010887064207834162, "loss": 3.4897, "step": 5125 }, { "epoch": 6.55552, "grad_norm": 0.5316451787948608, "learning_rate": 0.00010883025979270426, "loss": 3.4209, "step": 5126 }, { "epoch": 6.5568, "grad_norm": 0.5099210739135742, "learning_rate": 0.00010878987750706689, "loss": 3.418, "step": 5127 }, { "epoch": 6.55808, "grad_norm": 0.5128821730613708, "learning_rate": 0.00010874949522142952, "loss": 3.4507, "step": 5128 }, { "epoch": 6.55936, "grad_norm": 0.5047348141670227, "learning_rate": 0.00010870911293579215, "loss": 3.4232, "step": 5129 }, { "epoch": 6.56064, "grad_norm": 0.514415979385376, "learning_rate": 0.0001086687306501548, "loss": 3.5885, "step": 5130 }, { "epoch": 6.56192, "grad_norm": 0.5269995927810669, "learning_rate": 0.00010862834836451743, "loss": 3.5069, "step": 5131 }, { "epoch": 6.5632, "grad_norm": 0.5068910717964172, "learning_rate": 0.00010858796607888006, "loss": 3.427, "step": 5132 }, { "epoch": 6.56448, "grad_norm": 0.514909565448761, "learning_rate": 0.00010854758379324269, "loss": 3.5029, "step": 5133 }, { "epoch": 6.56576, "grad_norm": 0.5140718817710876, "learning_rate": 0.00010850720150760533, "loss": 3.4053, "step": 5134 }, { "epoch": 6.56704, "grad_norm": 0.524402379989624, "learning_rate": 0.00010846681922196796, "loss": 3.523, "step": 5135 }, { "epoch": 6.56832, "grad_norm": 0.519688069820404, "learning_rate": 0.00010842643693633058, "loss": 3.4752, "step": 5136 }, { "epoch": 6.5696, "grad_norm": 0.5132143497467041, "learning_rate": 0.00010838605465069322, "loss": 3.4902, "step": 5137 }, { "epoch": 6.57088, "grad_norm": 0.5264431834220886, "learning_rate": 0.00010834567236505585, "loss": 3.5565, "step": 5138 }, { "epoch": 6.57216, "grad_norm": 0.5338211059570312, "learning_rate": 0.0001083052900794185, "loss": 3.4898, "step": 5139 }, { "epoch": 6.57344, "grad_norm": 0.5249161720275879, "learning_rate": 0.00010826490779378111, "loss": 3.5718, "step": 5140 }, { "epoch": 6.57472, "grad_norm": 0.5375153422355652, "learning_rate": 0.00010822452550814375, "loss": 3.3902, "step": 5141 }, { "epoch": 6.576, "grad_norm": 0.5202714204788208, "learning_rate": 0.00010818414322250638, "loss": 3.4019, "step": 5142 }, { "epoch": 6.57728, "grad_norm": 0.5213868618011475, "learning_rate": 0.00010814376093686903, "loss": 3.5119, "step": 5143 }, { "epoch": 6.5785599999999995, "grad_norm": 0.5079566240310669, "learning_rate": 0.00010810337865123164, "loss": 3.4668, "step": 5144 }, { "epoch": 6.57984, "grad_norm": 0.5234741568565369, "learning_rate": 0.00010806299636559429, "loss": 3.4789, "step": 5145 }, { "epoch": 6.58112, "grad_norm": 0.5120872259140015, "learning_rate": 0.00010802261407995692, "loss": 3.4737, "step": 5146 }, { "epoch": 6.5824, "grad_norm": 0.5126521587371826, "learning_rate": 0.00010798223179431956, "loss": 3.4625, "step": 5147 }, { "epoch": 6.58368, "grad_norm": 0.5083807110786438, "learning_rate": 0.00010794184950868218, "loss": 3.5056, "step": 5148 }, { "epoch": 6.58496, "grad_norm": 0.5182278752326965, "learning_rate": 0.00010790146722304481, "loss": 3.4856, "step": 5149 }, { "epoch": 6.58624, "grad_norm": 0.5055496096611023, "learning_rate": 0.00010786108493740745, "loss": 3.4205, "step": 5150 }, { "epoch": 6.58752, "grad_norm": 0.5119015574455261, "learning_rate": 0.00010782070265177007, "loss": 3.4647, "step": 5151 }, { "epoch": 6.5888, "grad_norm": 0.5228475332260132, "learning_rate": 0.00010778032036613271, "loss": 3.4117, "step": 5152 }, { "epoch": 6.59008, "grad_norm": 0.5190859436988831, "learning_rate": 0.00010773993808049534, "loss": 3.4451, "step": 5153 }, { "epoch": 6.59136, "grad_norm": 0.509190022945404, "learning_rate": 0.00010769955579485799, "loss": 3.4443, "step": 5154 }, { "epoch": 6.59264, "grad_norm": 0.4972391724586487, "learning_rate": 0.0001076591735092206, "loss": 3.4357, "step": 5155 }, { "epoch": 6.59392, "grad_norm": 0.5034371614456177, "learning_rate": 0.00010761879122358325, "loss": 3.4543, "step": 5156 }, { "epoch": 6.5952, "grad_norm": 0.5267638564109802, "learning_rate": 0.00010757840893794588, "loss": 3.4192, "step": 5157 }, { "epoch": 6.59648, "grad_norm": 0.4983810484409332, "learning_rate": 0.00010753802665230852, "loss": 3.3902, "step": 5158 }, { "epoch": 6.59776, "grad_norm": 0.49326276779174805, "learning_rate": 0.00010749764436667114, "loss": 3.4052, "step": 5159 }, { "epoch": 6.5990400000000005, "grad_norm": 0.526161789894104, "learning_rate": 0.00010745726208103378, "loss": 3.4829, "step": 5160 }, { "epoch": 6.60032, "grad_norm": 0.49957576394081116, "learning_rate": 0.00010741687979539641, "loss": 3.3863, "step": 5161 }, { "epoch": 6.6016, "grad_norm": 0.5143072009086609, "learning_rate": 0.00010737649750975904, "loss": 3.502, "step": 5162 }, { "epoch": 6.60288, "grad_norm": 0.5053294897079468, "learning_rate": 0.00010733611522412168, "loss": 3.4453, "step": 5163 }, { "epoch": 6.60416, "grad_norm": 0.49979978799819946, "learning_rate": 0.0001072957329384843, "loss": 3.4394, "step": 5164 }, { "epoch": 6.60544, "grad_norm": 0.5157962441444397, "learning_rate": 0.00010725535065284694, "loss": 3.4921, "step": 5165 }, { "epoch": 6.60672, "grad_norm": 0.5077204704284668, "learning_rate": 0.00010721496836720957, "loss": 3.4838, "step": 5166 }, { "epoch": 6.608, "grad_norm": 0.5278066992759705, "learning_rate": 0.00010717458608157222, "loss": 3.4713, "step": 5167 }, { "epoch": 6.60928, "grad_norm": 0.506314218044281, "learning_rate": 0.00010713420379593483, "loss": 3.5477, "step": 5168 }, { "epoch": 6.6105599999999995, "grad_norm": 0.5169037580490112, "learning_rate": 0.00010709382151029748, "loss": 3.5052, "step": 5169 }, { "epoch": 6.61184, "grad_norm": 0.4995163679122925, "learning_rate": 0.00010705343922466011, "loss": 3.4526, "step": 5170 }, { "epoch": 6.61312, "grad_norm": 0.516046404838562, "learning_rate": 0.00010701305693902275, "loss": 3.4841, "step": 5171 }, { "epoch": 6.6144, "grad_norm": 0.5097321271896362, "learning_rate": 0.00010697267465338537, "loss": 3.502, "step": 5172 }, { "epoch": 6.61568, "grad_norm": 0.5002099871635437, "learning_rate": 0.00010693229236774801, "loss": 3.4349, "step": 5173 }, { "epoch": 6.61696, "grad_norm": 0.5027880668640137, "learning_rate": 0.00010689191008211064, "loss": 3.4844, "step": 5174 }, { "epoch": 6.61824, "grad_norm": 0.5286634564399719, "learning_rate": 0.00010685152779647326, "loss": 3.4879, "step": 5175 }, { "epoch": 6.61952, "grad_norm": 0.5041953325271606, "learning_rate": 0.0001068111455108359, "loss": 3.4475, "step": 5176 }, { "epoch": 6.6208, "grad_norm": 0.5107352137565613, "learning_rate": 0.00010677076322519853, "loss": 3.3829, "step": 5177 }, { "epoch": 6.62208, "grad_norm": 0.52018803358078, "learning_rate": 0.00010673038093956118, "loss": 3.4578, "step": 5178 }, { "epoch": 6.62336, "grad_norm": 0.5006090998649597, "learning_rate": 0.00010668999865392379, "loss": 3.4867, "step": 5179 }, { "epoch": 6.62464, "grad_norm": 0.5116918683052063, "learning_rate": 0.00010664961636828644, "loss": 3.3972, "step": 5180 }, { "epoch": 6.62592, "grad_norm": 0.5370351076126099, "learning_rate": 0.00010660923408264907, "loss": 3.4639, "step": 5181 }, { "epoch": 6.6272, "grad_norm": 0.5006325244903564, "learning_rate": 0.00010656885179701171, "loss": 3.5232, "step": 5182 }, { "epoch": 6.62848, "grad_norm": 0.5160982608795166, "learning_rate": 0.00010652846951137433, "loss": 3.4108, "step": 5183 }, { "epoch": 6.62976, "grad_norm": 0.5148961544036865, "learning_rate": 0.00010648808722573697, "loss": 3.5224, "step": 5184 }, { "epoch": 6.6310400000000005, "grad_norm": 0.4979293644428253, "learning_rate": 0.0001064477049400996, "loss": 3.4324, "step": 5185 }, { "epoch": 6.63232, "grad_norm": 0.5204696655273438, "learning_rate": 0.00010640732265446224, "loss": 3.4534, "step": 5186 }, { "epoch": 6.6336, "grad_norm": 0.5214214324951172, "learning_rate": 0.00010636694036882486, "loss": 3.5314, "step": 5187 }, { "epoch": 6.63488, "grad_norm": 0.5053163766860962, "learning_rate": 0.00010632655808318749, "loss": 3.3898, "step": 5188 }, { "epoch": 6.63616, "grad_norm": 0.5069863200187683, "learning_rate": 0.00010628617579755013, "loss": 3.4414, "step": 5189 }, { "epoch": 6.63744, "grad_norm": 0.5157361030578613, "learning_rate": 0.00010624579351191276, "loss": 3.5128, "step": 5190 }, { "epoch": 6.63872, "grad_norm": 0.500819206237793, "learning_rate": 0.0001062054112262754, "loss": 3.4923, "step": 5191 }, { "epoch": 6.64, "grad_norm": 0.5084678530693054, "learning_rate": 0.00010616502894063802, "loss": 3.4921, "step": 5192 }, { "epoch": 6.64128, "grad_norm": 0.520411491394043, "learning_rate": 0.00010612464665500067, "loss": 3.4166, "step": 5193 }, { "epoch": 6.64256, "grad_norm": 0.5096445083618164, "learning_rate": 0.0001060842643693633, "loss": 3.4323, "step": 5194 }, { "epoch": 6.64384, "grad_norm": 0.503450870513916, "learning_rate": 0.00010604388208372593, "loss": 3.3905, "step": 5195 }, { "epoch": 6.64512, "grad_norm": 0.5244500041007996, "learning_rate": 0.00010600349979808856, "loss": 3.5189, "step": 5196 }, { "epoch": 6.6464, "grad_norm": 0.5261669158935547, "learning_rate": 0.0001059631175124512, "loss": 3.4409, "step": 5197 }, { "epoch": 6.64768, "grad_norm": 0.5130569338798523, "learning_rate": 0.00010592273522681383, "loss": 3.4309, "step": 5198 }, { "epoch": 6.64896, "grad_norm": 0.5225698947906494, "learning_rate": 0.00010588235294117647, "loss": 3.4626, "step": 5199 }, { "epoch": 6.65024, "grad_norm": 0.5169934034347534, "learning_rate": 0.00010584197065553909, "loss": 3.465, "step": 5200 }, { "epoch": 6.65152, "grad_norm": 0.49897530674934387, "learning_rate": 0.00010580158836990174, "loss": 3.4524, "step": 5201 }, { "epoch": 6.6528, "grad_norm": 0.5103302001953125, "learning_rate": 0.00010576120608426437, "loss": 3.4629, "step": 5202 }, { "epoch": 6.65408, "grad_norm": 0.5289749503135681, "learning_rate": 0.00010572082379862698, "loss": 3.4733, "step": 5203 }, { "epoch": 6.65536, "grad_norm": 0.5066429376602173, "learning_rate": 0.00010568044151298963, "loss": 3.4266, "step": 5204 }, { "epoch": 6.65664, "grad_norm": 0.5125609636306763, "learning_rate": 0.00010564005922735226, "loss": 3.403, "step": 5205 }, { "epoch": 6.65792, "grad_norm": 0.5179828405380249, "learning_rate": 0.0001055996769417149, "loss": 3.4441, "step": 5206 }, { "epoch": 6.6592, "grad_norm": 0.5193979740142822, "learning_rate": 0.00010555929465607752, "loss": 3.5631, "step": 5207 }, { "epoch": 6.66048, "grad_norm": 0.5336554050445557, "learning_rate": 0.00010551891237044016, "loss": 3.5912, "step": 5208 }, { "epoch": 6.66176, "grad_norm": 0.5226610898971558, "learning_rate": 0.00010547853008480279, "loss": 3.461, "step": 5209 }, { "epoch": 6.66304, "grad_norm": 0.523395836353302, "learning_rate": 0.00010543814779916543, "loss": 3.4231, "step": 5210 }, { "epoch": 6.66432, "grad_norm": 0.5272761583328247, "learning_rate": 0.00010539776551352805, "loss": 3.4297, "step": 5211 }, { "epoch": 6.6655999999999995, "grad_norm": 0.5125918984413147, "learning_rate": 0.00010535738322789069, "loss": 3.4043, "step": 5212 }, { "epoch": 6.66688, "grad_norm": 0.4985739290714264, "learning_rate": 0.00010531700094225332, "loss": 3.5057, "step": 5213 }, { "epoch": 6.66816, "grad_norm": 0.5222551226615906, "learning_rate": 0.00010527661865661597, "loss": 3.4881, "step": 5214 }, { "epoch": 6.66944, "grad_norm": 0.5044218301773071, "learning_rate": 0.00010523623637097858, "loss": 3.4112, "step": 5215 }, { "epoch": 6.67072, "grad_norm": 0.5089839696884155, "learning_rate": 0.00010519585408534121, "loss": 3.4329, "step": 5216 }, { "epoch": 6.672, "grad_norm": 0.5202680826187134, "learning_rate": 0.00010515547179970386, "loss": 3.5092, "step": 5217 }, { "epoch": 6.67328, "grad_norm": 0.5112069845199585, "learning_rate": 0.00010511508951406649, "loss": 3.4684, "step": 5218 }, { "epoch": 6.67456, "grad_norm": 0.5102248787879944, "learning_rate": 0.00010507470722842912, "loss": 3.5571, "step": 5219 }, { "epoch": 6.67584, "grad_norm": 0.5237480998039246, "learning_rate": 0.00010503432494279175, "loss": 3.5169, "step": 5220 }, { "epoch": 6.67712, "grad_norm": 0.5136720538139343, "learning_rate": 0.00010499394265715439, "loss": 3.4614, "step": 5221 }, { "epoch": 6.6784, "grad_norm": 0.5107023119926453, "learning_rate": 0.00010495356037151702, "loss": 3.4888, "step": 5222 }, { "epoch": 6.67968, "grad_norm": 0.5016779899597168, "learning_rate": 0.00010491317808587965, "loss": 3.4266, "step": 5223 }, { "epoch": 6.68096, "grad_norm": 0.5184606909751892, "learning_rate": 0.00010487279580024228, "loss": 3.4299, "step": 5224 }, { "epoch": 6.68224, "grad_norm": 0.5030345320701599, "learning_rate": 0.00010483241351460492, "loss": 3.4783, "step": 5225 }, { "epoch": 6.68352, "grad_norm": 0.5136353373527527, "learning_rate": 0.00010479203122896755, "loss": 3.5147, "step": 5226 }, { "epoch": 6.6848, "grad_norm": 0.5140992999076843, "learning_rate": 0.00010475164894333018, "loss": 3.4818, "step": 5227 }, { "epoch": 6.6860800000000005, "grad_norm": 0.5170648694038391, "learning_rate": 0.00010471126665769281, "loss": 3.4642, "step": 5228 }, { "epoch": 6.68736, "grad_norm": 0.5128257274627686, "learning_rate": 0.00010467088437205544, "loss": 3.5038, "step": 5229 }, { "epoch": 6.68864, "grad_norm": 0.5143338441848755, "learning_rate": 0.00010463050208641809, "loss": 3.435, "step": 5230 }, { "epoch": 6.68992, "grad_norm": 0.5160678029060364, "learning_rate": 0.0001045901198007807, "loss": 3.4402, "step": 5231 }, { "epoch": 6.6912, "grad_norm": 0.4955798387527466, "learning_rate": 0.00010454973751514335, "loss": 3.4803, "step": 5232 }, { "epoch": 6.69248, "grad_norm": 0.5133627653121948, "learning_rate": 0.00010450935522950598, "loss": 3.5029, "step": 5233 }, { "epoch": 6.69376, "grad_norm": 0.5118188261985779, "learning_rate": 0.00010446897294386862, "loss": 3.4282, "step": 5234 }, { "epoch": 6.69504, "grad_norm": 0.5236147046089172, "learning_rate": 0.00010442859065823124, "loss": 3.5077, "step": 5235 }, { "epoch": 6.69632, "grad_norm": 0.510515034198761, "learning_rate": 0.00010438820837259388, "loss": 3.6193, "step": 5236 }, { "epoch": 6.6975999999999996, "grad_norm": 0.51131272315979, "learning_rate": 0.00010434782608695651, "loss": 3.5257, "step": 5237 }, { "epoch": 6.69888, "grad_norm": 0.5147497653961182, "learning_rate": 0.00010430744380131916, "loss": 3.4566, "step": 5238 }, { "epoch": 6.70016, "grad_norm": 0.4983549416065216, "learning_rate": 0.00010426706151568177, "loss": 3.5215, "step": 5239 }, { "epoch": 6.70144, "grad_norm": 0.49420973658561707, "learning_rate": 0.00010422667923004442, "loss": 3.4522, "step": 5240 }, { "epoch": 6.70272, "grad_norm": 0.5193718075752258, "learning_rate": 0.00010418629694440705, "loss": 3.5033, "step": 5241 }, { "epoch": 6.704, "grad_norm": 0.5157377123832703, "learning_rate": 0.00010414591465876966, "loss": 3.4045, "step": 5242 }, { "epoch": 6.70528, "grad_norm": 0.4960803985595703, "learning_rate": 0.0001041055323731323, "loss": 3.4672, "step": 5243 }, { "epoch": 6.70656, "grad_norm": 0.5239757299423218, "learning_rate": 0.00010406515008749494, "loss": 3.57, "step": 5244 }, { "epoch": 6.70784, "grad_norm": 0.5048173069953918, "learning_rate": 0.00010402476780185758, "loss": 3.4302, "step": 5245 }, { "epoch": 6.70912, "grad_norm": 0.5351990461349487, "learning_rate": 0.0001039843855162202, "loss": 3.5531, "step": 5246 }, { "epoch": 6.7104, "grad_norm": 0.49860915541648865, "learning_rate": 0.00010394400323058284, "loss": 3.3879, "step": 5247 }, { "epoch": 6.71168, "grad_norm": 0.5203588604927063, "learning_rate": 0.00010390362094494547, "loss": 3.4665, "step": 5248 }, { "epoch": 6.71296, "grad_norm": 0.5277810096740723, "learning_rate": 0.00010386323865930811, "loss": 3.4588, "step": 5249 }, { "epoch": 6.71424, "grad_norm": 0.5041863918304443, "learning_rate": 0.00010382285637367074, "loss": 3.4085, "step": 5250 }, { "epoch": 6.71552, "grad_norm": 0.5177097916603088, "learning_rate": 0.00010378247408803337, "loss": 3.4501, "step": 5251 }, { "epoch": 6.7168, "grad_norm": 0.5193495750427246, "learning_rate": 0.000103742091802396, "loss": 3.4086, "step": 5252 }, { "epoch": 6.7180800000000005, "grad_norm": 0.5192511081695557, "learning_rate": 0.00010370170951675865, "loss": 3.4711, "step": 5253 }, { "epoch": 6.71936, "grad_norm": 0.5242570638656616, "learning_rate": 0.00010366132723112128, "loss": 3.4035, "step": 5254 }, { "epoch": 6.7206399999999995, "grad_norm": 0.5168814063072205, "learning_rate": 0.00010362094494548391, "loss": 3.4503, "step": 5255 }, { "epoch": 6.72192, "grad_norm": 0.506004810333252, "learning_rate": 0.00010358056265984654, "loss": 3.456, "step": 5256 }, { "epoch": 6.7232, "grad_norm": 0.5204073786735535, "learning_rate": 0.00010354018037420917, "loss": 3.4142, "step": 5257 }, { "epoch": 6.72448, "grad_norm": 0.4996756315231323, "learning_rate": 0.00010349979808857181, "loss": 3.4942, "step": 5258 }, { "epoch": 6.72576, "grad_norm": 0.5085786581039429, "learning_rate": 0.00010345941580293443, "loss": 3.4686, "step": 5259 }, { "epoch": 6.72704, "grad_norm": 0.5232467651367188, "learning_rate": 0.00010341903351729707, "loss": 3.5521, "step": 5260 }, { "epoch": 6.72832, "grad_norm": 0.49586203694343567, "learning_rate": 0.0001033786512316597, "loss": 3.4765, "step": 5261 }, { "epoch": 6.7296, "grad_norm": 0.5204891562461853, "learning_rate": 0.00010333826894602235, "loss": 3.4617, "step": 5262 }, { "epoch": 6.73088, "grad_norm": 0.5140677094459534, "learning_rate": 0.00010329788666038496, "loss": 3.4447, "step": 5263 }, { "epoch": 6.73216, "grad_norm": 0.501307487487793, "learning_rate": 0.0001032575043747476, "loss": 3.4758, "step": 5264 }, { "epoch": 6.73344, "grad_norm": 0.5081751346588135, "learning_rate": 0.00010321712208911024, "loss": 3.4248, "step": 5265 }, { "epoch": 6.73472, "grad_norm": 0.5316794514656067, "learning_rate": 0.00010317673980347288, "loss": 3.4928, "step": 5266 }, { "epoch": 6.736, "grad_norm": 0.5068027377128601, "learning_rate": 0.0001031363575178355, "loss": 3.4352, "step": 5267 }, { "epoch": 6.73728, "grad_norm": 0.5198256373405457, "learning_rate": 0.00010309597523219814, "loss": 3.5322, "step": 5268 }, { "epoch": 6.73856, "grad_norm": 0.5113339424133301, "learning_rate": 0.00010305559294656077, "loss": 3.4244, "step": 5269 }, { "epoch": 6.73984, "grad_norm": 0.5019405484199524, "learning_rate": 0.00010301521066092339, "loss": 3.5323, "step": 5270 }, { "epoch": 6.7411200000000004, "grad_norm": 0.5108396410942078, "learning_rate": 0.00010297482837528603, "loss": 3.5005, "step": 5271 }, { "epoch": 6.7424, "grad_norm": 0.5199779868125916, "learning_rate": 0.00010293444608964866, "loss": 3.519, "step": 5272 }, { "epoch": 6.74368, "grad_norm": 0.5146390795707703, "learning_rate": 0.0001028940638040113, "loss": 3.4188, "step": 5273 }, { "epoch": 6.74496, "grad_norm": 0.5107973217964172, "learning_rate": 0.00010285368151837392, "loss": 3.4814, "step": 5274 }, { "epoch": 6.74624, "grad_norm": 0.5244498252868652, "learning_rate": 0.00010281329923273656, "loss": 3.4791, "step": 5275 }, { "epoch": 6.74752, "grad_norm": 0.5274310111999512, "learning_rate": 0.0001027729169470992, "loss": 3.5072, "step": 5276 }, { "epoch": 6.7488, "grad_norm": 0.528631329536438, "learning_rate": 0.00010273253466146184, "loss": 3.5248, "step": 5277 }, { "epoch": 6.75008, "grad_norm": 0.5297229886054993, "learning_rate": 0.00010269215237582445, "loss": 3.4491, "step": 5278 }, { "epoch": 6.75136, "grad_norm": 0.507632315158844, "learning_rate": 0.0001026517700901871, "loss": 3.4365, "step": 5279 }, { "epoch": 6.7526399999999995, "grad_norm": 0.5038201808929443, "learning_rate": 0.00010261138780454973, "loss": 3.5131, "step": 5280 }, { "epoch": 6.75392, "grad_norm": 0.5153136253356934, "learning_rate": 0.00010257100551891237, "loss": 3.4924, "step": 5281 }, { "epoch": 6.7552, "grad_norm": 0.4991972744464874, "learning_rate": 0.00010253062323327499, "loss": 3.4242, "step": 5282 }, { "epoch": 6.75648, "grad_norm": 0.4989522099494934, "learning_rate": 0.00010249024094763762, "loss": 3.5012, "step": 5283 }, { "epoch": 6.75776, "grad_norm": 0.49682870507240295, "learning_rate": 0.00010244985866200026, "loss": 3.4454, "step": 5284 }, { "epoch": 6.75904, "grad_norm": 0.49864479899406433, "learning_rate": 0.00010240947637636289, "loss": 3.443, "step": 5285 }, { "epoch": 6.76032, "grad_norm": 0.5117205381393433, "learning_rate": 0.00010236909409072552, "loss": 3.4396, "step": 5286 }, { "epoch": 6.7616, "grad_norm": 0.5127941966056824, "learning_rate": 0.00010232871180508815, "loss": 3.496, "step": 5287 }, { "epoch": 6.76288, "grad_norm": 0.5159007906913757, "learning_rate": 0.0001022883295194508, "loss": 3.4693, "step": 5288 }, { "epoch": 6.76416, "grad_norm": 0.5161288976669312, "learning_rate": 0.00010224794723381343, "loss": 3.4029, "step": 5289 }, { "epoch": 6.76544, "grad_norm": 0.4983842670917511, "learning_rate": 0.00010220756494817607, "loss": 3.4671, "step": 5290 }, { "epoch": 6.76672, "grad_norm": 0.5110688209533691, "learning_rate": 0.00010216718266253869, "loss": 3.435, "step": 5291 }, { "epoch": 6.768, "grad_norm": 0.524663507938385, "learning_rate": 0.00010212680037690133, "loss": 3.5548, "step": 5292 }, { "epoch": 6.76928, "grad_norm": 0.49911680817604065, "learning_rate": 0.00010208641809126396, "loss": 3.4678, "step": 5293 }, { "epoch": 6.77056, "grad_norm": 0.5108251571655273, "learning_rate": 0.0001020460358056266, "loss": 3.4816, "step": 5294 }, { "epoch": 6.77184, "grad_norm": 0.5047867298126221, "learning_rate": 0.00010200565351998922, "loss": 3.4348, "step": 5295 }, { "epoch": 6.7731200000000005, "grad_norm": 0.5126866698265076, "learning_rate": 0.00010196527123435185, "loss": 3.4728, "step": 5296 }, { "epoch": 6.7744, "grad_norm": 0.5074558258056641, "learning_rate": 0.0001019248889487145, "loss": 3.4696, "step": 5297 }, { "epoch": 6.77568, "grad_norm": 0.5098884105682373, "learning_rate": 0.00010188450666307711, "loss": 3.4211, "step": 5298 }, { "epoch": 6.77696, "grad_norm": 0.5483927726745605, "learning_rate": 0.00010184412437743975, "loss": 3.427, "step": 5299 }, { "epoch": 6.77824, "grad_norm": 0.49813124537467957, "learning_rate": 0.00010180374209180238, "loss": 3.4238, "step": 5300 }, { "epoch": 6.77952, "grad_norm": 0.5243797898292542, "learning_rate": 0.00010176335980616503, "loss": 3.5178, "step": 5301 }, { "epoch": 6.7808, "grad_norm": 0.5047301054000854, "learning_rate": 0.00010172297752052764, "loss": 3.3934, "step": 5302 }, { "epoch": 6.78208, "grad_norm": 0.4998399019241333, "learning_rate": 0.00010168259523489029, "loss": 3.4589, "step": 5303 }, { "epoch": 6.78336, "grad_norm": 0.5172979831695557, "learning_rate": 0.00010164221294925292, "loss": 3.4315, "step": 5304 }, { "epoch": 6.78464, "grad_norm": 0.5025821328163147, "learning_rate": 0.00010160183066361556, "loss": 3.4013, "step": 5305 }, { "epoch": 6.78592, "grad_norm": 0.49646785855293274, "learning_rate": 0.00010156144837797818, "loss": 3.4618, "step": 5306 }, { "epoch": 6.7872, "grad_norm": 0.49145373702049255, "learning_rate": 0.00010152106609234082, "loss": 3.4746, "step": 5307 }, { "epoch": 6.78848, "grad_norm": 0.5159743428230286, "learning_rate": 0.00010148068380670345, "loss": 3.4769, "step": 5308 }, { "epoch": 6.78976, "grad_norm": 0.4938885271549225, "learning_rate": 0.00010144030152106608, "loss": 3.3877, "step": 5309 }, { "epoch": 6.79104, "grad_norm": 0.5240892767906189, "learning_rate": 0.00010139991923542871, "loss": 3.5444, "step": 5310 }, { "epoch": 6.79232, "grad_norm": 0.5187338590621948, "learning_rate": 0.00010135953694979134, "loss": 3.5475, "step": 5311 }, { "epoch": 6.7936, "grad_norm": 0.5051224231719971, "learning_rate": 0.00010131915466415399, "loss": 3.4844, "step": 5312 }, { "epoch": 6.79488, "grad_norm": 0.5185756683349609, "learning_rate": 0.00010127877237851662, "loss": 3.5207, "step": 5313 }, { "epoch": 6.79616, "grad_norm": 0.5253816843032837, "learning_rate": 0.00010123839009287925, "loss": 3.4069, "step": 5314 }, { "epoch": 6.79744, "grad_norm": 0.511680543422699, "learning_rate": 0.00010119800780724188, "loss": 3.4456, "step": 5315 }, { "epoch": 6.79872, "grad_norm": 0.5037193298339844, "learning_rate": 0.00010115762552160452, "loss": 3.5139, "step": 5316 }, { "epoch": 6.8, "grad_norm": 0.5148517489433289, "learning_rate": 0.00010111724323596715, "loss": 3.4717, "step": 5317 }, { "epoch": 6.80128, "grad_norm": 0.4965799152851105, "learning_rate": 0.00010107686095032978, "loss": 3.4902, "step": 5318 }, { "epoch": 6.80256, "grad_norm": 0.5093517899513245, "learning_rate": 0.00010103647866469241, "loss": 3.5074, "step": 5319 }, { "epoch": 6.80384, "grad_norm": 0.5288740396499634, "learning_rate": 0.00010099609637905505, "loss": 3.4673, "step": 5320 }, { "epoch": 6.80512, "grad_norm": 0.5054797530174255, "learning_rate": 0.00010095571409341768, "loss": 3.5055, "step": 5321 }, { "epoch": 6.8064, "grad_norm": 0.5090057253837585, "learning_rate": 0.00010091533180778031, "loss": 3.5057, "step": 5322 }, { "epoch": 6.8076799999999995, "grad_norm": 0.5045148730278015, "learning_rate": 0.00010087494952214294, "loss": 3.4295, "step": 5323 }, { "epoch": 6.80896, "grad_norm": 0.5039887428283691, "learning_rate": 0.00010083456723650557, "loss": 3.4735, "step": 5324 }, { "epoch": 6.81024, "grad_norm": 0.4997764229774475, "learning_rate": 0.00010079418495086822, "loss": 3.4807, "step": 5325 }, { "epoch": 6.81152, "grad_norm": 0.5063294768333435, "learning_rate": 0.00010075380266523083, "loss": 3.446, "step": 5326 }, { "epoch": 6.8128, "grad_norm": 0.5159018039703369, "learning_rate": 0.00010071342037959348, "loss": 3.4766, "step": 5327 }, { "epoch": 6.81408, "grad_norm": 0.5194793343544006, "learning_rate": 0.00010067303809395611, "loss": 3.5094, "step": 5328 }, { "epoch": 6.81536, "grad_norm": 0.5233056545257568, "learning_rate": 0.00010063265580831875, "loss": 3.5093, "step": 5329 }, { "epoch": 6.81664, "grad_norm": 0.5220425128936768, "learning_rate": 0.00010059227352268137, "loss": 3.5481, "step": 5330 }, { "epoch": 6.81792, "grad_norm": 0.4991253912448883, "learning_rate": 0.00010055189123704401, "loss": 3.4397, "step": 5331 }, { "epoch": 6.8192, "grad_norm": 0.5196431279182434, "learning_rate": 0.00010051150895140664, "loss": 3.429, "step": 5332 }, { "epoch": 6.82048, "grad_norm": 0.510365903377533, "learning_rate": 0.00010047112666576928, "loss": 3.4965, "step": 5333 }, { "epoch": 6.82176, "grad_norm": 0.5005152821540833, "learning_rate": 0.0001004307443801319, "loss": 3.4985, "step": 5334 }, { "epoch": 6.82304, "grad_norm": 0.508727490901947, "learning_rate": 0.00010039036209449454, "loss": 3.3829, "step": 5335 }, { "epoch": 6.82432, "grad_norm": 0.5061600208282471, "learning_rate": 0.00010034997980885717, "loss": 3.4547, "step": 5336 }, { "epoch": 6.8256, "grad_norm": 0.5172441601753235, "learning_rate": 0.00010030959752321979, "loss": 3.3778, "step": 5337 }, { "epoch": 6.82688, "grad_norm": 0.4944312572479248, "learning_rate": 0.00010026921523758243, "loss": 3.4106, "step": 5338 }, { "epoch": 6.8281600000000005, "grad_norm": 0.5171020030975342, "learning_rate": 0.00010022883295194506, "loss": 3.4994, "step": 5339 }, { "epoch": 6.82944, "grad_norm": 0.5048860907554626, "learning_rate": 0.00010018845066630771, "loss": 3.4575, "step": 5340 }, { "epoch": 6.83072, "grad_norm": 0.5153966546058655, "learning_rate": 0.00010014806838067034, "loss": 3.4745, "step": 5341 }, { "epoch": 6.832, "grad_norm": 0.5013549327850342, "learning_rate": 0.00010010768609503297, "loss": 3.4541, "step": 5342 }, { "epoch": 6.83328, "grad_norm": 0.520453929901123, "learning_rate": 0.0001000673038093956, "loss": 3.4551, "step": 5343 }, { "epoch": 6.83456, "grad_norm": 0.5147715210914612, "learning_rate": 0.00010002692152375824, "loss": 3.4985, "step": 5344 }, { "epoch": 6.83584, "grad_norm": 0.5121247172355652, "learning_rate": 9.998653923812087e-05, "loss": 3.4887, "step": 5345 }, { "epoch": 6.83712, "grad_norm": 0.5188484787940979, "learning_rate": 9.99461569524835e-05, "loss": 3.4733, "step": 5346 }, { "epoch": 6.8384, "grad_norm": 0.5076323747634888, "learning_rate": 9.990577466684613e-05, "loss": 3.4318, "step": 5347 }, { "epoch": 6.8396799999999995, "grad_norm": 0.5205701589584351, "learning_rate": 9.986539238120878e-05, "loss": 3.4467, "step": 5348 }, { "epoch": 6.84096, "grad_norm": 0.5038503408432007, "learning_rate": 9.98250100955714e-05, "loss": 3.3956, "step": 5349 }, { "epoch": 6.84224, "grad_norm": 0.4975230395793915, "learning_rate": 9.978462780993402e-05, "loss": 3.4585, "step": 5350 }, { "epoch": 6.84352, "grad_norm": 0.5168045163154602, "learning_rate": 9.974424552429667e-05, "loss": 3.4813, "step": 5351 }, { "epoch": 6.8448, "grad_norm": 0.5278074145317078, "learning_rate": 9.97038632386593e-05, "loss": 3.5233, "step": 5352 }, { "epoch": 6.84608, "grad_norm": 0.5026320219039917, "learning_rate": 9.966348095302194e-05, "loss": 3.3954, "step": 5353 }, { "epoch": 6.84736, "grad_norm": 0.5222524404525757, "learning_rate": 9.962309866738456e-05, "loss": 3.5426, "step": 5354 }, { "epoch": 6.84864, "grad_norm": 0.49351999163627625, "learning_rate": 9.95827163817472e-05, "loss": 3.5092, "step": 5355 }, { "epoch": 6.84992, "grad_norm": 0.5170454978942871, "learning_rate": 9.954233409610983e-05, "loss": 3.5309, "step": 5356 }, { "epoch": 6.8512, "grad_norm": 0.49120914936065674, "learning_rate": 9.950195181047247e-05, "loss": 3.4582, "step": 5357 }, { "epoch": 6.85248, "grad_norm": 0.5041413903236389, "learning_rate": 9.946156952483509e-05, "loss": 3.5076, "step": 5358 }, { "epoch": 6.85376, "grad_norm": 0.5105189085006714, "learning_rate": 9.942118723919773e-05, "loss": 3.5075, "step": 5359 }, { "epoch": 6.85504, "grad_norm": 0.4975397288799286, "learning_rate": 9.938080495356036e-05, "loss": 3.4318, "step": 5360 }, { "epoch": 6.85632, "grad_norm": 0.49615657329559326, "learning_rate": 9.934042266792301e-05, "loss": 3.4473, "step": 5361 }, { "epoch": 6.8576, "grad_norm": 0.4980040490627289, "learning_rate": 9.930004038228562e-05, "loss": 3.4313, "step": 5362 }, { "epoch": 6.85888, "grad_norm": 0.4903441071510315, "learning_rate": 9.925965809664825e-05, "loss": 3.4639, "step": 5363 }, { "epoch": 6.8601600000000005, "grad_norm": 0.5194531679153442, "learning_rate": 9.92192758110109e-05, "loss": 3.5111, "step": 5364 }, { "epoch": 6.86144, "grad_norm": 0.5149429440498352, "learning_rate": 9.917889352537351e-05, "loss": 3.4815, "step": 5365 }, { "epoch": 6.86272, "grad_norm": 0.49559643864631653, "learning_rate": 9.913851123973616e-05, "loss": 3.4756, "step": 5366 }, { "epoch": 6.864, "grad_norm": 0.5016883015632629, "learning_rate": 9.909812895409879e-05, "loss": 3.4241, "step": 5367 }, { "epoch": 6.86528, "grad_norm": 0.5163260102272034, "learning_rate": 9.905774666846143e-05, "loss": 3.4777, "step": 5368 }, { "epoch": 6.86656, "grad_norm": 0.5174842476844788, "learning_rate": 9.901736438282405e-05, "loss": 3.534, "step": 5369 }, { "epoch": 6.86784, "grad_norm": 0.49278444051742554, "learning_rate": 9.897698209718669e-05, "loss": 3.453, "step": 5370 }, { "epoch": 6.86912, "grad_norm": 0.5156992673873901, "learning_rate": 9.893659981154932e-05, "loss": 3.4667, "step": 5371 }, { "epoch": 6.8704, "grad_norm": 0.5196961164474487, "learning_rate": 9.889621752591197e-05, "loss": 3.5111, "step": 5372 }, { "epoch": 6.87168, "grad_norm": 0.5395616888999939, "learning_rate": 9.885583524027458e-05, "loss": 3.4736, "step": 5373 }, { "epoch": 6.87296, "grad_norm": 0.49754607677459717, "learning_rate": 9.881545295463723e-05, "loss": 3.4057, "step": 5374 }, { "epoch": 6.87424, "grad_norm": 0.5215137004852295, "learning_rate": 9.877507066899986e-05, "loss": 3.4588, "step": 5375 }, { "epoch": 6.87552, "grad_norm": 0.5358836650848389, "learning_rate": 9.87346883833625e-05, "loss": 3.542, "step": 5376 }, { "epoch": 6.8768, "grad_norm": 0.5070124268531799, "learning_rate": 9.869430609772513e-05, "loss": 3.4484, "step": 5377 }, { "epoch": 6.87808, "grad_norm": 0.5137901902198792, "learning_rate": 9.865392381208775e-05, "loss": 3.447, "step": 5378 }, { "epoch": 6.87936, "grad_norm": 0.5140761137008667, "learning_rate": 9.861354152645039e-05, "loss": 3.4393, "step": 5379 }, { "epoch": 6.88064, "grad_norm": 0.5206302404403687, "learning_rate": 9.857315924081302e-05, "loss": 3.4926, "step": 5380 }, { "epoch": 6.88192, "grad_norm": 0.5339552760124207, "learning_rate": 9.853277695517566e-05, "loss": 3.5058, "step": 5381 }, { "epoch": 6.8832, "grad_norm": 0.5175068974494934, "learning_rate": 9.849239466953828e-05, "loss": 3.4939, "step": 5382 }, { "epoch": 6.88448, "grad_norm": 0.4954850673675537, "learning_rate": 9.845201238390092e-05, "loss": 3.4226, "step": 5383 }, { "epoch": 6.88576, "grad_norm": 0.5178061127662659, "learning_rate": 9.841163009826355e-05, "loss": 3.4637, "step": 5384 }, { "epoch": 6.88704, "grad_norm": 0.5334754586219788, "learning_rate": 9.83712478126262e-05, "loss": 3.4926, "step": 5385 }, { "epoch": 6.88832, "grad_norm": 0.5084394216537476, "learning_rate": 9.833086552698881e-05, "loss": 3.413, "step": 5386 }, { "epoch": 6.8896, "grad_norm": 0.5201271772384644, "learning_rate": 9.829048324135146e-05, "loss": 3.4399, "step": 5387 }, { "epoch": 6.89088, "grad_norm": 0.5018450021743774, "learning_rate": 9.825010095571409e-05, "loss": 3.4989, "step": 5388 }, { "epoch": 6.89216, "grad_norm": 0.524716854095459, "learning_rate": 9.820971867007673e-05, "loss": 3.4713, "step": 5389 }, { "epoch": 6.89344, "grad_norm": 0.5103277564048767, "learning_rate": 9.816933638443935e-05, "loss": 3.4293, "step": 5390 }, { "epoch": 6.8947199999999995, "grad_norm": 0.5264323353767395, "learning_rate": 9.812895409880198e-05, "loss": 3.5141, "step": 5391 }, { "epoch": 6.896, "grad_norm": 0.5120908617973328, "learning_rate": 9.808857181316462e-05, "loss": 3.5326, "step": 5392 }, { "epoch": 6.89728, "grad_norm": 0.510351300239563, "learning_rate": 9.804818952752724e-05, "loss": 3.3871, "step": 5393 }, { "epoch": 6.89856, "grad_norm": 0.5060296058654785, "learning_rate": 9.800780724188988e-05, "loss": 3.4455, "step": 5394 }, { "epoch": 6.89984, "grad_norm": 0.5129922032356262, "learning_rate": 9.796742495625251e-05, "loss": 3.5359, "step": 5395 }, { "epoch": 6.90112, "grad_norm": 0.5016512274742126, "learning_rate": 9.792704267061516e-05, "loss": 3.457, "step": 5396 }, { "epoch": 6.9024, "grad_norm": 0.50436931848526, "learning_rate": 9.788666038497777e-05, "loss": 3.5133, "step": 5397 }, { "epoch": 6.90368, "grad_norm": 0.5089812874794006, "learning_rate": 9.784627809934042e-05, "loss": 3.4324, "step": 5398 }, { "epoch": 6.90496, "grad_norm": 0.503563642501831, "learning_rate": 9.780589581370305e-05, "loss": 3.4798, "step": 5399 }, { "epoch": 6.90624, "grad_norm": 0.49585676193237305, "learning_rate": 9.776551352806569e-05, "loss": 3.4961, "step": 5400 }, { "epoch": 6.90752, "grad_norm": 0.5292691588401794, "learning_rate": 9.77251312424283e-05, "loss": 3.4779, "step": 5401 }, { "epoch": 6.9088, "grad_norm": 0.4982292056083679, "learning_rate": 9.768474895679095e-05, "loss": 3.5183, "step": 5402 }, { "epoch": 6.91008, "grad_norm": 0.5063697099685669, "learning_rate": 9.764436667115358e-05, "loss": 3.5731, "step": 5403 }, { "epoch": 6.91136, "grad_norm": 0.5067952871322632, "learning_rate": 9.760398438551621e-05, "loss": 3.4067, "step": 5404 }, { "epoch": 6.91264, "grad_norm": 0.5001175403594971, "learning_rate": 9.756360209987884e-05, "loss": 3.4509, "step": 5405 }, { "epoch": 6.91392, "grad_norm": 0.5022578239440918, "learning_rate": 9.752321981424147e-05, "loss": 3.5178, "step": 5406 }, { "epoch": 6.9152000000000005, "grad_norm": 0.49592211842536926, "learning_rate": 9.748283752860411e-05, "loss": 3.4671, "step": 5407 }, { "epoch": 6.91648, "grad_norm": 0.5008174777030945, "learning_rate": 9.744245524296674e-05, "loss": 3.463, "step": 5408 }, { "epoch": 6.91776, "grad_norm": 0.49405258893966675, "learning_rate": 9.740207295732937e-05, "loss": 3.4687, "step": 5409 }, { "epoch": 6.91904, "grad_norm": 0.5040400624275208, "learning_rate": 9.7361690671692e-05, "loss": 3.4612, "step": 5410 }, { "epoch": 6.92032, "grad_norm": 0.5239134430885315, "learning_rate": 9.732130838605465e-05, "loss": 3.4698, "step": 5411 }, { "epoch": 6.9216, "grad_norm": 0.5057811737060547, "learning_rate": 9.728092610041728e-05, "loss": 3.4569, "step": 5412 }, { "epoch": 6.92288, "grad_norm": 0.49874454736709595, "learning_rate": 9.724054381477992e-05, "loss": 3.4118, "step": 5413 }, { "epoch": 6.92416, "grad_norm": 0.5236087441444397, "learning_rate": 9.720016152914254e-05, "loss": 3.5346, "step": 5414 }, { "epoch": 6.92544, "grad_norm": 0.5117159485816956, "learning_rate": 9.715977924350518e-05, "loss": 3.5194, "step": 5415 }, { "epoch": 6.9267199999999995, "grad_norm": 0.5199319124221802, "learning_rate": 9.711939695786781e-05, "loss": 3.4622, "step": 5416 }, { "epoch": 6.928, "grad_norm": 0.5007228255271912, "learning_rate": 9.707901467223043e-05, "loss": 3.4657, "step": 5417 }, { "epoch": 6.92928, "grad_norm": 0.4935159683227539, "learning_rate": 9.703863238659307e-05, "loss": 3.473, "step": 5418 }, { "epoch": 6.93056, "grad_norm": 0.5131574273109436, "learning_rate": 9.69982501009557e-05, "loss": 3.4111, "step": 5419 }, { "epoch": 6.93184, "grad_norm": 0.4788220226764679, "learning_rate": 9.695786781531834e-05, "loss": 3.4845, "step": 5420 }, { "epoch": 6.93312, "grad_norm": 0.5057500600814819, "learning_rate": 9.691748552968096e-05, "loss": 3.505, "step": 5421 }, { "epoch": 6.9344, "grad_norm": 0.5060935020446777, "learning_rate": 9.68771032440436e-05, "loss": 3.4094, "step": 5422 }, { "epoch": 6.93568, "grad_norm": 0.4928586184978485, "learning_rate": 9.683672095840624e-05, "loss": 3.4754, "step": 5423 }, { "epoch": 6.93696, "grad_norm": 0.5091453790664673, "learning_rate": 9.679633867276888e-05, "loss": 3.4907, "step": 5424 }, { "epoch": 6.93824, "grad_norm": 0.5088662505149841, "learning_rate": 9.67559563871315e-05, "loss": 3.4644, "step": 5425 }, { "epoch": 6.93952, "grad_norm": 0.50926274061203, "learning_rate": 9.671557410149414e-05, "loss": 3.4988, "step": 5426 }, { "epoch": 6.9408, "grad_norm": 0.5116270780563354, "learning_rate": 9.667519181585677e-05, "loss": 3.4536, "step": 5427 }, { "epoch": 6.94208, "grad_norm": 0.4971657395362854, "learning_rate": 9.663480953021941e-05, "loss": 3.472, "step": 5428 }, { "epoch": 6.94336, "grad_norm": 0.5055294632911682, "learning_rate": 9.659442724458203e-05, "loss": 3.4944, "step": 5429 }, { "epoch": 6.94464, "grad_norm": 0.5095595121383667, "learning_rate": 9.655404495894466e-05, "loss": 3.4677, "step": 5430 }, { "epoch": 6.94592, "grad_norm": 0.4956967830657959, "learning_rate": 9.65136626733073e-05, "loss": 3.4282, "step": 5431 }, { "epoch": 6.9472000000000005, "grad_norm": 0.5110241174697876, "learning_rate": 9.647328038766993e-05, "loss": 3.4498, "step": 5432 }, { "epoch": 6.94848, "grad_norm": 0.4950363039970398, "learning_rate": 9.643289810203256e-05, "loss": 3.4326, "step": 5433 }, { "epoch": 6.94976, "grad_norm": 0.5211616158485413, "learning_rate": 9.639251581639519e-05, "loss": 3.512, "step": 5434 }, { "epoch": 6.95104, "grad_norm": 0.5052586793899536, "learning_rate": 9.635213353075784e-05, "loss": 3.4677, "step": 5435 }, { "epoch": 6.95232, "grad_norm": 0.509345293045044, "learning_rate": 9.631175124512047e-05, "loss": 3.4526, "step": 5436 }, { "epoch": 6.9536, "grad_norm": 0.5046288967132568, "learning_rate": 9.62713689594831e-05, "loss": 3.4675, "step": 5437 }, { "epoch": 6.95488, "grad_norm": 0.511081337928772, "learning_rate": 9.623098667384573e-05, "loss": 3.4853, "step": 5438 }, { "epoch": 6.95616, "grad_norm": 0.512008011341095, "learning_rate": 9.619060438820837e-05, "loss": 3.4673, "step": 5439 }, { "epoch": 6.95744, "grad_norm": 0.5058484077453613, "learning_rate": 9.6150222102571e-05, "loss": 3.4265, "step": 5440 }, { "epoch": 6.95872, "grad_norm": 0.5225197076797485, "learning_rate": 9.610983981693363e-05, "loss": 3.495, "step": 5441 }, { "epoch": 6.96, "grad_norm": 0.5050714015960693, "learning_rate": 9.606945753129626e-05, "loss": 3.4189, "step": 5442 }, { "epoch": 6.96128, "grad_norm": 0.5137665271759033, "learning_rate": 9.60290752456589e-05, "loss": 3.4752, "step": 5443 }, { "epoch": 6.96256, "grad_norm": 0.5180103778839111, "learning_rate": 9.598869296002153e-05, "loss": 3.4813, "step": 5444 }, { "epoch": 6.96384, "grad_norm": 0.5160430073738098, "learning_rate": 9.594831067438415e-05, "loss": 3.4638, "step": 5445 }, { "epoch": 6.96512, "grad_norm": 0.511764407157898, "learning_rate": 9.59079283887468e-05, "loss": 3.4263, "step": 5446 }, { "epoch": 6.9664, "grad_norm": 0.5118876099586487, "learning_rate": 9.586754610310942e-05, "loss": 3.4935, "step": 5447 }, { "epoch": 6.96768, "grad_norm": 0.5115414261817932, "learning_rate": 9.582716381747207e-05, "loss": 3.4721, "step": 5448 }, { "epoch": 6.96896, "grad_norm": 0.5058209896087646, "learning_rate": 9.578678153183468e-05, "loss": 3.464, "step": 5449 }, { "epoch": 6.97024, "grad_norm": 0.4981795847415924, "learning_rate": 9.574639924619733e-05, "loss": 3.473, "step": 5450 }, { "epoch": 6.97152, "grad_norm": 0.5040668249130249, "learning_rate": 9.570601696055996e-05, "loss": 3.3973, "step": 5451 }, { "epoch": 6.9728, "grad_norm": 0.49496889114379883, "learning_rate": 9.56656346749226e-05, "loss": 3.372, "step": 5452 }, { "epoch": 6.97408, "grad_norm": 0.5042381882667542, "learning_rate": 9.562525238928522e-05, "loss": 3.4545, "step": 5453 }, { "epoch": 6.97536, "grad_norm": 0.5015059113502502, "learning_rate": 9.558487010364786e-05, "loss": 3.4237, "step": 5454 }, { "epoch": 6.97664, "grad_norm": 0.5201092958450317, "learning_rate": 9.554448781801049e-05, "loss": 3.4955, "step": 5455 }, { "epoch": 6.97792, "grad_norm": 0.5009555220603943, "learning_rate": 9.550410553237314e-05, "loss": 3.4304, "step": 5456 }, { "epoch": 6.9792, "grad_norm": 0.49957409501075745, "learning_rate": 9.546372324673575e-05, "loss": 3.435, "step": 5457 }, { "epoch": 6.98048, "grad_norm": 0.512995719909668, "learning_rate": 9.542334096109838e-05, "loss": 3.5062, "step": 5458 }, { "epoch": 6.9817599999999995, "grad_norm": 0.5111671090126038, "learning_rate": 9.538295867546103e-05, "loss": 3.4766, "step": 5459 }, { "epoch": 6.98304, "grad_norm": 0.5001948475837708, "learning_rate": 9.534257638982364e-05, "loss": 3.4496, "step": 5460 }, { "epoch": 6.98432, "grad_norm": 0.5309041738510132, "learning_rate": 9.530219410418629e-05, "loss": 3.5057, "step": 5461 }, { "epoch": 6.9856, "grad_norm": 0.5163221955299377, "learning_rate": 9.526181181854892e-05, "loss": 3.4416, "step": 5462 }, { "epoch": 6.98688, "grad_norm": 0.4984453022480011, "learning_rate": 9.522142953291156e-05, "loss": 3.46, "step": 5463 }, { "epoch": 6.98816, "grad_norm": 0.5184572339057922, "learning_rate": 9.518104724727419e-05, "loss": 3.4491, "step": 5464 }, { "epoch": 6.98944, "grad_norm": 0.5225724577903748, "learning_rate": 9.514066496163682e-05, "loss": 3.4354, "step": 5465 }, { "epoch": 6.99072, "grad_norm": 0.507361114025116, "learning_rate": 9.510028267599945e-05, "loss": 3.4749, "step": 5466 }, { "epoch": 6.992, "grad_norm": 0.5018404126167297, "learning_rate": 9.50599003903621e-05, "loss": 3.4538, "step": 5467 }, { "epoch": 6.99328, "grad_norm": 0.5017131567001343, "learning_rate": 9.501951810472472e-05, "loss": 3.4954, "step": 5468 }, { "epoch": 6.99456, "grad_norm": 0.5111985802650452, "learning_rate": 9.497913581908735e-05, "loss": 3.4424, "step": 5469 }, { "epoch": 6.99584, "grad_norm": 0.5290077328681946, "learning_rate": 9.493875353344998e-05, "loss": 3.4288, "step": 5470 }, { "epoch": 6.99712, "grad_norm": 0.5016758441925049, "learning_rate": 9.489837124781261e-05, "loss": 3.4364, "step": 5471 }, { "epoch": 6.9984, "grad_norm": 0.491237074136734, "learning_rate": 9.485798896217526e-05, "loss": 3.4449, "step": 5472 }, { "epoch": 6.99968, "grad_norm": 0.5003752112388611, "learning_rate": 9.481760667653787e-05, "loss": 3.4199, "step": 5473 }, { "epoch": 7.0, "grad_norm": 0.9872975945472717, "learning_rate": 9.477722439090052e-05, "loss": 3.5161, "step": 5474 }, { "epoch": 7.00128, "grad_norm": 0.5078200101852417, "learning_rate": 9.473684210526315e-05, "loss": 3.33, "step": 5475 }, { "epoch": 7.00256, "grad_norm": 0.5141358971595764, "learning_rate": 9.469645981962579e-05, "loss": 3.333, "step": 5476 }, { "epoch": 7.00384, "grad_norm": 0.5374777913093567, "learning_rate": 9.465607753398841e-05, "loss": 3.3227, "step": 5477 }, { "epoch": 7.00512, "grad_norm": 0.5191708207130432, "learning_rate": 9.461569524835105e-05, "loss": 3.4003, "step": 5478 }, { "epoch": 7.0064, "grad_norm": 0.5081069469451904, "learning_rate": 9.457531296271368e-05, "loss": 3.3985, "step": 5479 }, { "epoch": 7.00768, "grad_norm": 0.5270673036575317, "learning_rate": 9.453493067707633e-05, "loss": 3.3482, "step": 5480 }, { "epoch": 7.00896, "grad_norm": 0.5204823017120361, "learning_rate": 9.449454839143894e-05, "loss": 3.3075, "step": 5481 }, { "epoch": 7.01024, "grad_norm": 0.5076407194137573, "learning_rate": 9.445416610580159e-05, "loss": 3.2981, "step": 5482 }, { "epoch": 7.01152, "grad_norm": 0.5210766196250916, "learning_rate": 9.441378382016422e-05, "loss": 3.4319, "step": 5483 }, { "epoch": 7.0128, "grad_norm": 0.5091572403907776, "learning_rate": 9.437340153452683e-05, "loss": 3.3706, "step": 5484 }, { "epoch": 7.01408, "grad_norm": 0.5025621056556702, "learning_rate": 9.433301924888948e-05, "loss": 3.274, "step": 5485 }, { "epoch": 7.01536, "grad_norm": 0.5144043564796448, "learning_rate": 9.42926369632521e-05, "loss": 3.2991, "step": 5486 }, { "epoch": 7.01664, "grad_norm": 0.5310031175613403, "learning_rate": 9.425225467761475e-05, "loss": 3.3532, "step": 5487 }, { "epoch": 7.01792, "grad_norm": 0.506899893283844, "learning_rate": 9.421187239197737e-05, "loss": 3.3682, "step": 5488 }, { "epoch": 7.0192, "grad_norm": 0.5226014256477356, "learning_rate": 9.417149010634001e-05, "loss": 3.3653, "step": 5489 }, { "epoch": 7.02048, "grad_norm": 0.5002744197845459, "learning_rate": 9.413110782070264e-05, "loss": 3.3786, "step": 5490 }, { "epoch": 7.0217600000000004, "grad_norm": 0.5125488638877869, "learning_rate": 9.409072553506528e-05, "loss": 3.3929, "step": 5491 }, { "epoch": 7.02304, "grad_norm": 0.5321469902992249, "learning_rate": 9.40503432494279e-05, "loss": 3.3667, "step": 5492 }, { "epoch": 7.02432, "grad_norm": 0.5275914072990417, "learning_rate": 9.400996096379054e-05, "loss": 3.3928, "step": 5493 }, { "epoch": 7.0256, "grad_norm": 0.5226854681968689, "learning_rate": 9.396957867815317e-05, "loss": 3.3962, "step": 5494 }, { "epoch": 7.02688, "grad_norm": 0.5123041272163391, "learning_rate": 9.392919639251582e-05, "loss": 3.2311, "step": 5495 }, { "epoch": 7.02816, "grad_norm": 0.49747586250305176, "learning_rate": 9.388881410687843e-05, "loss": 3.3694, "step": 5496 }, { "epoch": 7.02944, "grad_norm": 0.5055866241455078, "learning_rate": 9.384843182124106e-05, "loss": 3.3472, "step": 5497 }, { "epoch": 7.03072, "grad_norm": 0.5089617371559143, "learning_rate": 9.380804953560371e-05, "loss": 3.3635, "step": 5498 }, { "epoch": 7.032, "grad_norm": 0.5178612470626831, "learning_rate": 9.376766724996634e-05, "loss": 3.3485, "step": 5499 }, { "epoch": 7.03328, "grad_norm": 0.5137771964073181, "learning_rate": 9.372728496432898e-05, "loss": 3.3414, "step": 5500 }, { "epoch": 7.03456, "grad_norm": 0.5222017765045166, "learning_rate": 9.36869026786916e-05, "loss": 3.3504, "step": 5501 }, { "epoch": 7.03584, "grad_norm": 0.506715714931488, "learning_rate": 9.364652039305424e-05, "loss": 3.342, "step": 5502 }, { "epoch": 7.03712, "grad_norm": 0.5213739275932312, "learning_rate": 9.360613810741687e-05, "loss": 3.3492, "step": 5503 }, { "epoch": 7.0384, "grad_norm": 0.5213996767997742, "learning_rate": 9.356575582177952e-05, "loss": 3.3364, "step": 5504 }, { "epoch": 7.03968, "grad_norm": 0.5308609008789062, "learning_rate": 9.352537353614213e-05, "loss": 3.2428, "step": 5505 }, { "epoch": 7.04096, "grad_norm": 0.520922064781189, "learning_rate": 9.348499125050478e-05, "loss": 3.3741, "step": 5506 }, { "epoch": 7.04224, "grad_norm": 0.49800974130630493, "learning_rate": 9.34446089648674e-05, "loss": 3.35, "step": 5507 }, { "epoch": 7.04352, "grad_norm": 0.53228360414505, "learning_rate": 9.340422667923005e-05, "loss": 3.3215, "step": 5508 }, { "epoch": 7.0448, "grad_norm": 0.5500988960266113, "learning_rate": 9.336384439359267e-05, "loss": 3.4593, "step": 5509 }, { "epoch": 7.04608, "grad_norm": 0.5122029185295105, "learning_rate": 9.332346210795531e-05, "loss": 3.3557, "step": 5510 }, { "epoch": 7.04736, "grad_norm": 0.4989195764064789, "learning_rate": 9.328307982231794e-05, "loss": 3.3231, "step": 5511 }, { "epoch": 7.04864, "grad_norm": 0.521932065486908, "learning_rate": 9.324269753668056e-05, "loss": 3.3911, "step": 5512 }, { "epoch": 7.04992, "grad_norm": 0.5335235595703125, "learning_rate": 9.32023152510432e-05, "loss": 3.3212, "step": 5513 }, { "epoch": 7.0512, "grad_norm": 0.5094990730285645, "learning_rate": 9.316193296540583e-05, "loss": 3.2888, "step": 5514 }, { "epoch": 7.05248, "grad_norm": 0.5203416347503662, "learning_rate": 9.312155067976847e-05, "loss": 3.4157, "step": 5515 }, { "epoch": 7.05376, "grad_norm": 0.5284749865531921, "learning_rate": 9.308116839413109e-05, "loss": 3.3612, "step": 5516 }, { "epoch": 7.05504, "grad_norm": 0.5061756372451782, "learning_rate": 9.304078610849373e-05, "loss": 3.4084, "step": 5517 }, { "epoch": 7.05632, "grad_norm": 0.5212084054946899, "learning_rate": 9.300040382285636e-05, "loss": 3.2901, "step": 5518 }, { "epoch": 7.0576, "grad_norm": 0.528948187828064, "learning_rate": 9.296002153721901e-05, "loss": 3.3865, "step": 5519 }, { "epoch": 7.05888, "grad_norm": 0.507111132144928, "learning_rate": 9.291963925158162e-05, "loss": 3.4119, "step": 5520 }, { "epoch": 7.06016, "grad_norm": 0.5240997672080994, "learning_rate": 9.287925696594427e-05, "loss": 3.4243, "step": 5521 }, { "epoch": 7.06144, "grad_norm": 0.5230147242546082, "learning_rate": 9.28388746803069e-05, "loss": 3.376, "step": 5522 }, { "epoch": 7.06272, "grad_norm": 0.526611864566803, "learning_rate": 9.279849239466954e-05, "loss": 3.3711, "step": 5523 }, { "epoch": 7.064, "grad_norm": 0.5132043957710266, "learning_rate": 9.275811010903216e-05, "loss": 3.2992, "step": 5524 }, { "epoch": 7.06528, "grad_norm": 0.4980516731739044, "learning_rate": 9.271772782339479e-05, "loss": 3.2729, "step": 5525 }, { "epoch": 7.06656, "grad_norm": 0.5084584355354309, "learning_rate": 9.267734553775743e-05, "loss": 3.2491, "step": 5526 }, { "epoch": 7.06784, "grad_norm": 0.515275776386261, "learning_rate": 9.263696325212006e-05, "loss": 3.3712, "step": 5527 }, { "epoch": 7.06912, "grad_norm": 0.5161235928535461, "learning_rate": 9.259658096648269e-05, "loss": 3.4019, "step": 5528 }, { "epoch": 7.0704, "grad_norm": 0.5367205142974854, "learning_rate": 9.255619868084532e-05, "loss": 3.4287, "step": 5529 }, { "epoch": 7.07168, "grad_norm": 0.5217522978782654, "learning_rate": 9.251581639520796e-05, "loss": 3.4843, "step": 5530 }, { "epoch": 7.07296, "grad_norm": 0.5207579135894775, "learning_rate": 9.24754341095706e-05, "loss": 3.3275, "step": 5531 }, { "epoch": 7.07424, "grad_norm": 0.5166409611701965, "learning_rate": 9.243505182393323e-05, "loss": 3.293, "step": 5532 }, { "epoch": 7.07552, "grad_norm": 0.5201752185821533, "learning_rate": 9.239466953829586e-05, "loss": 3.3349, "step": 5533 }, { "epoch": 7.0768, "grad_norm": 0.5157431364059448, "learning_rate": 9.23542872526585e-05, "loss": 3.3274, "step": 5534 }, { "epoch": 7.07808, "grad_norm": 0.5315886735916138, "learning_rate": 9.231390496702113e-05, "loss": 3.3802, "step": 5535 }, { "epoch": 7.07936, "grad_norm": 0.5255934000015259, "learning_rate": 9.227352268138377e-05, "loss": 3.3859, "step": 5536 }, { "epoch": 7.08064, "grad_norm": 0.5185539126396179, "learning_rate": 9.223314039574639e-05, "loss": 3.3037, "step": 5537 }, { "epoch": 7.08192, "grad_norm": 0.5157800912857056, "learning_rate": 9.219275811010902e-05, "loss": 3.4169, "step": 5538 }, { "epoch": 7.0832, "grad_norm": 0.5315351486206055, "learning_rate": 9.215237582447166e-05, "loss": 3.3251, "step": 5539 }, { "epoch": 7.08448, "grad_norm": 0.5046893358230591, "learning_rate": 9.211199353883428e-05, "loss": 3.3554, "step": 5540 }, { "epoch": 7.08576, "grad_norm": 0.5140607357025146, "learning_rate": 9.207161125319692e-05, "loss": 3.3262, "step": 5541 }, { "epoch": 7.08704, "grad_norm": 0.508526086807251, "learning_rate": 9.203122896755955e-05, "loss": 3.2896, "step": 5542 }, { "epoch": 7.08832, "grad_norm": 0.5273757576942444, "learning_rate": 9.19908466819222e-05, "loss": 3.4774, "step": 5543 }, { "epoch": 7.0896, "grad_norm": 0.5205854773521423, "learning_rate": 9.195046439628481e-05, "loss": 3.3347, "step": 5544 }, { "epoch": 7.09088, "grad_norm": 0.5332198739051819, "learning_rate": 9.191008211064746e-05, "loss": 3.3342, "step": 5545 }, { "epoch": 7.09216, "grad_norm": 0.5330145955085754, "learning_rate": 9.186969982501009e-05, "loss": 3.3769, "step": 5546 }, { "epoch": 7.09344, "grad_norm": 0.5021401047706604, "learning_rate": 9.182931753937273e-05, "loss": 3.3476, "step": 5547 }, { "epoch": 7.09472, "grad_norm": 0.5273832678794861, "learning_rate": 9.178893525373535e-05, "loss": 3.333, "step": 5548 }, { "epoch": 7.096, "grad_norm": 0.5313226580619812, "learning_rate": 9.174855296809799e-05, "loss": 3.4208, "step": 5549 }, { "epoch": 7.09728, "grad_norm": 0.5218507051467896, "learning_rate": 9.170817068246062e-05, "loss": 3.3408, "step": 5550 }, { "epoch": 7.09856, "grad_norm": 0.5151420831680298, "learning_rate": 9.166778839682325e-05, "loss": 3.3944, "step": 5551 }, { "epoch": 7.09984, "grad_norm": 0.5464621186256409, "learning_rate": 9.162740611118588e-05, "loss": 3.3923, "step": 5552 }, { "epoch": 7.10112, "grad_norm": 0.5159791707992554, "learning_rate": 9.158702382554851e-05, "loss": 3.379, "step": 5553 }, { "epoch": 7.1024, "grad_norm": 0.526742160320282, "learning_rate": 9.154664153991115e-05, "loss": 3.3621, "step": 5554 }, { "epoch": 7.10368, "grad_norm": 0.5101780891418457, "learning_rate": 9.150625925427378e-05, "loss": 3.3674, "step": 5555 }, { "epoch": 7.10496, "grad_norm": 0.5260143876075745, "learning_rate": 9.146587696863641e-05, "loss": 3.3164, "step": 5556 }, { "epoch": 7.10624, "grad_norm": 0.5367382168769836, "learning_rate": 9.142549468299904e-05, "loss": 3.3863, "step": 5557 }, { "epoch": 7.10752, "grad_norm": 0.5072177648544312, "learning_rate": 9.138511239736169e-05, "loss": 3.335, "step": 5558 }, { "epoch": 7.1088, "grad_norm": 0.5360726118087769, "learning_rate": 9.134473011172432e-05, "loss": 3.3996, "step": 5559 }, { "epoch": 7.11008, "grad_norm": 0.5281416177749634, "learning_rate": 9.130434782608695e-05, "loss": 3.3302, "step": 5560 }, { "epoch": 7.11136, "grad_norm": 0.526565432548523, "learning_rate": 9.126396554044958e-05, "loss": 3.3779, "step": 5561 }, { "epoch": 7.11264, "grad_norm": 0.5221660733222961, "learning_rate": 9.122358325481222e-05, "loss": 3.3879, "step": 5562 }, { "epoch": 7.11392, "grad_norm": 0.5545408129692078, "learning_rate": 9.118320096917485e-05, "loss": 3.3599, "step": 5563 }, { "epoch": 7.1152, "grad_norm": 0.5166964530944824, "learning_rate": 9.114281868353748e-05, "loss": 3.3786, "step": 5564 }, { "epoch": 7.11648, "grad_norm": 0.5115810632705688, "learning_rate": 9.110243639790011e-05, "loss": 3.3983, "step": 5565 }, { "epoch": 7.11776, "grad_norm": 0.527052640914917, "learning_rate": 9.106205411226274e-05, "loss": 3.3886, "step": 5566 }, { "epoch": 7.11904, "grad_norm": 0.5167108774185181, "learning_rate": 9.102167182662539e-05, "loss": 3.3837, "step": 5567 }, { "epoch": 7.12032, "grad_norm": 0.5196815729141235, "learning_rate": 9.0981289540988e-05, "loss": 3.4007, "step": 5568 }, { "epoch": 7.1216, "grad_norm": 0.5209911465644836, "learning_rate": 9.094090725535065e-05, "loss": 3.3465, "step": 5569 }, { "epoch": 7.12288, "grad_norm": 0.5412890315055847, "learning_rate": 9.090052496971328e-05, "loss": 3.3554, "step": 5570 }, { "epoch": 7.12416, "grad_norm": 0.5221083164215088, "learning_rate": 9.086014268407592e-05, "loss": 3.3713, "step": 5571 }, { "epoch": 7.12544, "grad_norm": 0.5288161039352417, "learning_rate": 9.081976039843854e-05, "loss": 3.3445, "step": 5572 }, { "epoch": 7.12672, "grad_norm": 0.5313680171966553, "learning_rate": 9.077937811280118e-05, "loss": 3.3708, "step": 5573 }, { "epoch": 7.128, "grad_norm": 0.5292986035346985, "learning_rate": 9.073899582716381e-05, "loss": 3.3137, "step": 5574 }, { "epoch": 7.12928, "grad_norm": 0.5138509273529053, "learning_rate": 9.069861354152645e-05, "loss": 3.3887, "step": 5575 }, { "epoch": 7.13056, "grad_norm": 0.540255606174469, "learning_rate": 9.065823125588907e-05, "loss": 3.3745, "step": 5576 }, { "epoch": 7.13184, "grad_norm": 0.5347320437431335, "learning_rate": 9.061784897025171e-05, "loss": 3.3579, "step": 5577 }, { "epoch": 7.13312, "grad_norm": 0.49910813570022583, "learning_rate": 9.057746668461434e-05, "loss": 3.3225, "step": 5578 }, { "epoch": 7.1344, "grad_norm": 0.5152397751808167, "learning_rate": 9.053708439897696e-05, "loss": 3.3496, "step": 5579 }, { "epoch": 7.13568, "grad_norm": 0.5248477458953857, "learning_rate": 9.04967021133396e-05, "loss": 3.3574, "step": 5580 }, { "epoch": 7.13696, "grad_norm": 0.5127560496330261, "learning_rate": 9.045631982770223e-05, "loss": 3.3692, "step": 5581 }, { "epoch": 7.13824, "grad_norm": 0.5316688418388367, "learning_rate": 9.041593754206488e-05, "loss": 3.3383, "step": 5582 }, { "epoch": 7.13952, "grad_norm": 0.5397014021873474, "learning_rate": 9.03755552564275e-05, "loss": 3.3939, "step": 5583 }, { "epoch": 7.1408, "grad_norm": 0.5297314524650574, "learning_rate": 9.033517297079014e-05, "loss": 3.3205, "step": 5584 }, { "epoch": 7.14208, "grad_norm": 0.5164687633514404, "learning_rate": 9.029479068515277e-05, "loss": 3.311, "step": 5585 }, { "epoch": 7.14336, "grad_norm": 0.5183284878730774, "learning_rate": 9.025440839951541e-05, "loss": 3.3063, "step": 5586 }, { "epoch": 7.14464, "grad_norm": 0.508436918258667, "learning_rate": 9.021402611387804e-05, "loss": 3.3496, "step": 5587 }, { "epoch": 7.14592, "grad_norm": 0.5277997851371765, "learning_rate": 9.017364382824067e-05, "loss": 3.3782, "step": 5588 }, { "epoch": 7.1472, "grad_norm": 0.5223129987716675, "learning_rate": 9.01332615426033e-05, "loss": 3.3175, "step": 5589 }, { "epoch": 7.14848, "grad_norm": 0.4987069070339203, "learning_rate": 9.009287925696595e-05, "loss": 3.3346, "step": 5590 }, { "epoch": 7.14976, "grad_norm": 0.5348743796348572, "learning_rate": 9.005249697132858e-05, "loss": 3.3543, "step": 5591 }, { "epoch": 7.15104, "grad_norm": 0.5277005434036255, "learning_rate": 9.001211468569119e-05, "loss": 3.4288, "step": 5592 }, { "epoch": 7.15232, "grad_norm": 0.5248242020606995, "learning_rate": 8.997173240005384e-05, "loss": 3.3001, "step": 5593 }, { "epoch": 7.1536, "grad_norm": 0.5100671052932739, "learning_rate": 8.993135011441647e-05, "loss": 3.3571, "step": 5594 }, { "epoch": 7.15488, "grad_norm": 0.5024705529212952, "learning_rate": 8.989096782877911e-05, "loss": 3.35, "step": 5595 }, { "epoch": 7.15616, "grad_norm": 0.5296070575714111, "learning_rate": 8.985058554314173e-05, "loss": 3.3975, "step": 5596 }, { "epoch": 7.15744, "grad_norm": 0.535521388053894, "learning_rate": 8.981020325750437e-05, "loss": 3.4045, "step": 5597 }, { "epoch": 7.15872, "grad_norm": 0.5225574374198914, "learning_rate": 8.9769820971867e-05, "loss": 3.3518, "step": 5598 }, { "epoch": 7.16, "grad_norm": 0.509151816368103, "learning_rate": 8.972943868622964e-05, "loss": 3.438, "step": 5599 }, { "epoch": 7.16128, "grad_norm": 0.5296933054924011, "learning_rate": 8.968905640059226e-05, "loss": 3.4388, "step": 5600 }, { "epoch": 7.16256, "grad_norm": 0.527597963809967, "learning_rate": 8.96486741149549e-05, "loss": 3.36, "step": 5601 }, { "epoch": 7.16384, "grad_norm": 0.5163085460662842, "learning_rate": 8.960829182931753e-05, "loss": 3.3173, "step": 5602 }, { "epoch": 7.16512, "grad_norm": 0.5081968307495117, "learning_rate": 8.956790954368018e-05, "loss": 3.3379, "step": 5603 }, { "epoch": 7.1664, "grad_norm": 0.5179334878921509, "learning_rate": 8.95275272580428e-05, "loss": 3.3674, "step": 5604 }, { "epoch": 7.16768, "grad_norm": 0.5077155828475952, "learning_rate": 8.948714497240542e-05, "loss": 3.3791, "step": 5605 }, { "epoch": 7.16896, "grad_norm": 0.5360821485519409, "learning_rate": 8.944676268676807e-05, "loss": 3.3437, "step": 5606 }, { "epoch": 7.17024, "grad_norm": 0.5339843034744263, "learning_rate": 8.940638040113068e-05, "loss": 3.4416, "step": 5607 }, { "epoch": 7.17152, "grad_norm": 0.5199359059333801, "learning_rate": 8.936599811549333e-05, "loss": 3.4527, "step": 5608 }, { "epoch": 7.1728, "grad_norm": 0.5207464098930359, "learning_rate": 8.932561582985596e-05, "loss": 3.3807, "step": 5609 }, { "epoch": 7.17408, "grad_norm": 0.5150240659713745, "learning_rate": 8.92852335442186e-05, "loss": 3.3255, "step": 5610 }, { "epoch": 7.17536, "grad_norm": 0.5204200744628906, "learning_rate": 8.924485125858122e-05, "loss": 3.3301, "step": 5611 }, { "epoch": 7.17664, "grad_norm": 0.5184539556503296, "learning_rate": 8.920446897294386e-05, "loss": 3.3256, "step": 5612 }, { "epoch": 7.17792, "grad_norm": 0.5265978574752808, "learning_rate": 8.916408668730649e-05, "loss": 3.3752, "step": 5613 }, { "epoch": 7.1792, "grad_norm": 0.5245677828788757, "learning_rate": 8.912370440166914e-05, "loss": 3.3356, "step": 5614 }, { "epoch": 7.18048, "grad_norm": 0.5343059301376343, "learning_rate": 8.908332211603175e-05, "loss": 3.3374, "step": 5615 }, { "epoch": 7.18176, "grad_norm": 0.5347888469696045, "learning_rate": 8.90429398303944e-05, "loss": 3.3665, "step": 5616 }, { "epoch": 7.18304, "grad_norm": 0.5191536545753479, "learning_rate": 8.900255754475703e-05, "loss": 3.3387, "step": 5617 }, { "epoch": 7.18432, "grad_norm": 0.5158880949020386, "learning_rate": 8.896217525911966e-05, "loss": 3.312, "step": 5618 }, { "epoch": 7.1856, "grad_norm": 0.5173276662826538, "learning_rate": 8.892179297348229e-05, "loss": 3.352, "step": 5619 }, { "epoch": 7.18688, "grad_norm": 0.515876829624176, "learning_rate": 8.888141068784492e-05, "loss": 3.3369, "step": 5620 }, { "epoch": 7.18816, "grad_norm": 0.5200967788696289, "learning_rate": 8.884102840220756e-05, "loss": 3.3315, "step": 5621 }, { "epoch": 7.18944, "grad_norm": 0.5234977602958679, "learning_rate": 8.880064611657019e-05, "loss": 3.3648, "step": 5622 }, { "epoch": 7.19072, "grad_norm": 0.5436128377914429, "learning_rate": 8.876026383093282e-05, "loss": 3.3157, "step": 5623 }, { "epoch": 7.192, "grad_norm": 0.5315131545066833, "learning_rate": 8.871988154529545e-05, "loss": 3.3696, "step": 5624 }, { "epoch": 7.19328, "grad_norm": 0.5198819637298584, "learning_rate": 8.867949925965809e-05, "loss": 3.3841, "step": 5625 }, { "epoch": 7.19456, "grad_norm": 0.5186936259269714, "learning_rate": 8.863911697402072e-05, "loss": 3.3401, "step": 5626 }, { "epoch": 7.19584, "grad_norm": 0.5158634781837463, "learning_rate": 8.859873468838337e-05, "loss": 3.3416, "step": 5627 }, { "epoch": 7.19712, "grad_norm": 0.5173478722572327, "learning_rate": 8.855835240274598e-05, "loss": 3.3616, "step": 5628 }, { "epoch": 7.1984, "grad_norm": 0.5127228498458862, "learning_rate": 8.851797011710863e-05, "loss": 3.4288, "step": 5629 }, { "epoch": 7.19968, "grad_norm": 0.5157126188278198, "learning_rate": 8.847758783147126e-05, "loss": 3.3122, "step": 5630 }, { "epoch": 7.20096, "grad_norm": 0.5168270468711853, "learning_rate": 8.84372055458339e-05, "loss": 3.3206, "step": 5631 }, { "epoch": 7.20224, "grad_norm": 0.5436537861824036, "learning_rate": 8.839682326019652e-05, "loss": 3.415, "step": 5632 }, { "epoch": 7.20352, "grad_norm": 0.5260339975357056, "learning_rate": 8.835644097455915e-05, "loss": 3.3191, "step": 5633 }, { "epoch": 7.2048, "grad_norm": 0.5202707648277283, "learning_rate": 8.831605868892179e-05, "loss": 3.3391, "step": 5634 }, { "epoch": 7.20608, "grad_norm": 0.5285276174545288, "learning_rate": 8.827567640328441e-05, "loss": 3.3521, "step": 5635 }, { "epoch": 7.2073599999999995, "grad_norm": 0.5395596027374268, "learning_rate": 8.823529411764705e-05, "loss": 3.3539, "step": 5636 }, { "epoch": 7.20864, "grad_norm": 0.543609619140625, "learning_rate": 8.819491183200968e-05, "loss": 3.3652, "step": 5637 }, { "epoch": 7.20992, "grad_norm": 0.5348373651504517, "learning_rate": 8.815452954637232e-05, "loss": 3.3671, "step": 5638 }, { "epoch": 7.2112, "grad_norm": 0.5275009274482727, "learning_rate": 8.811414726073494e-05, "loss": 3.3737, "step": 5639 }, { "epoch": 7.21248, "grad_norm": 0.5249333381652832, "learning_rate": 8.807376497509758e-05, "loss": 3.4308, "step": 5640 }, { "epoch": 7.21376, "grad_norm": 0.5271218419075012, "learning_rate": 8.803338268946022e-05, "loss": 3.3363, "step": 5641 }, { "epoch": 7.21504, "grad_norm": 0.5315967798233032, "learning_rate": 8.799300040382286e-05, "loss": 3.4132, "step": 5642 }, { "epoch": 7.21632, "grad_norm": 0.5222828984260559, "learning_rate": 8.795261811818548e-05, "loss": 3.3832, "step": 5643 }, { "epoch": 7.2176, "grad_norm": 0.5247751474380493, "learning_rate": 8.791223583254812e-05, "loss": 3.255, "step": 5644 }, { "epoch": 7.21888, "grad_norm": 0.5310077667236328, "learning_rate": 8.787185354691075e-05, "loss": 3.3478, "step": 5645 }, { "epoch": 7.22016, "grad_norm": 0.5162127614021301, "learning_rate": 8.783147126127338e-05, "loss": 3.3882, "step": 5646 }, { "epoch": 7.22144, "grad_norm": 0.5231219530105591, "learning_rate": 8.779108897563601e-05, "loss": 3.3114, "step": 5647 }, { "epoch": 7.22272, "grad_norm": 0.5083349943161011, "learning_rate": 8.775070668999864e-05, "loss": 3.2972, "step": 5648 }, { "epoch": 7.224, "grad_norm": 0.5252796411514282, "learning_rate": 8.771032440436128e-05, "loss": 3.2982, "step": 5649 }, { "epoch": 7.22528, "grad_norm": 0.5133306980133057, "learning_rate": 8.766994211872391e-05, "loss": 3.3952, "step": 5650 }, { "epoch": 7.22656, "grad_norm": 0.5313632488250732, "learning_rate": 8.762955983308654e-05, "loss": 3.4222, "step": 5651 }, { "epoch": 7.22784, "grad_norm": 0.5258992910385132, "learning_rate": 8.758917754744917e-05, "loss": 3.3971, "step": 5652 }, { "epoch": 7.22912, "grad_norm": 0.5275927186012268, "learning_rate": 8.754879526181182e-05, "loss": 3.3187, "step": 5653 }, { "epoch": 7.2304, "grad_norm": 0.5042412877082825, "learning_rate": 8.750841297617445e-05, "loss": 3.3668, "step": 5654 }, { "epoch": 7.23168, "grad_norm": 0.528590202331543, "learning_rate": 8.746803069053708e-05, "loss": 3.4068, "step": 5655 }, { "epoch": 7.23296, "grad_norm": 0.5280982255935669, "learning_rate": 8.742764840489971e-05, "loss": 3.4039, "step": 5656 }, { "epoch": 7.23424, "grad_norm": 0.5211199522018433, "learning_rate": 8.738726611926235e-05, "loss": 3.3706, "step": 5657 }, { "epoch": 7.23552, "grad_norm": 0.5214046239852905, "learning_rate": 8.734688383362498e-05, "loss": 3.3703, "step": 5658 }, { "epoch": 7.2368, "grad_norm": 0.5262265205383301, "learning_rate": 8.73065015479876e-05, "loss": 3.2918, "step": 5659 }, { "epoch": 7.23808, "grad_norm": 0.534403920173645, "learning_rate": 8.726611926235024e-05, "loss": 3.3363, "step": 5660 }, { "epoch": 7.23936, "grad_norm": 0.5397913455963135, "learning_rate": 8.722573697671287e-05, "loss": 3.3867, "step": 5661 }, { "epoch": 7.24064, "grad_norm": 0.5387073755264282, "learning_rate": 8.718535469107551e-05, "loss": 3.4315, "step": 5662 }, { "epoch": 7.24192, "grad_norm": 0.5351658463478088, "learning_rate": 8.714497240543813e-05, "loss": 3.4319, "step": 5663 }, { "epoch": 7.2432, "grad_norm": 0.5469005107879639, "learning_rate": 8.710459011980077e-05, "loss": 3.4048, "step": 5664 }, { "epoch": 7.24448, "grad_norm": 0.539505660533905, "learning_rate": 8.70642078341634e-05, "loss": 3.4154, "step": 5665 }, { "epoch": 7.24576, "grad_norm": 0.543161928653717, "learning_rate": 8.702382554852605e-05, "loss": 3.4247, "step": 5666 }, { "epoch": 7.24704, "grad_norm": 0.5213647484779358, "learning_rate": 8.698344326288866e-05, "loss": 3.3403, "step": 5667 }, { "epoch": 7.24832, "grad_norm": 0.5247822403907776, "learning_rate": 8.694306097725131e-05, "loss": 3.3623, "step": 5668 }, { "epoch": 7.2496, "grad_norm": 0.5359296202659607, "learning_rate": 8.690267869161394e-05, "loss": 3.3437, "step": 5669 }, { "epoch": 7.25088, "grad_norm": 0.5125774145126343, "learning_rate": 8.686229640597658e-05, "loss": 3.4178, "step": 5670 }, { "epoch": 7.25216, "grad_norm": 0.5092083215713501, "learning_rate": 8.68219141203392e-05, "loss": 3.3376, "step": 5671 }, { "epoch": 7.25344, "grad_norm": 0.5361836552619934, "learning_rate": 8.678153183470183e-05, "loss": 3.2985, "step": 5672 }, { "epoch": 7.25472, "grad_norm": 0.525852382183075, "learning_rate": 8.674114954906447e-05, "loss": 3.3765, "step": 5673 }, { "epoch": 7.256, "grad_norm": 0.5261217355728149, "learning_rate": 8.670076726342709e-05, "loss": 3.3436, "step": 5674 }, { "epoch": 7.25728, "grad_norm": 0.5285188555717468, "learning_rate": 8.666038497778973e-05, "loss": 3.4639, "step": 5675 }, { "epoch": 7.25856, "grad_norm": 0.5249500274658203, "learning_rate": 8.662000269215236e-05, "loss": 3.3356, "step": 5676 }, { "epoch": 7.25984, "grad_norm": 0.5341816544532776, "learning_rate": 8.6579620406515e-05, "loss": 3.3849, "step": 5677 }, { "epoch": 7.26112, "grad_norm": 0.538826048374176, "learning_rate": 8.653923812087764e-05, "loss": 3.4035, "step": 5678 }, { "epoch": 7.2624, "grad_norm": 0.5222164392471313, "learning_rate": 8.649885583524027e-05, "loss": 3.3679, "step": 5679 }, { "epoch": 7.26368, "grad_norm": 0.5226588249206543, "learning_rate": 8.64584735496029e-05, "loss": 3.4012, "step": 5680 }, { "epoch": 7.26496, "grad_norm": 0.5181993246078491, "learning_rate": 8.641809126396554e-05, "loss": 3.3866, "step": 5681 }, { "epoch": 7.26624, "grad_norm": 0.5294241905212402, "learning_rate": 8.637770897832817e-05, "loss": 3.4343, "step": 5682 }, { "epoch": 7.26752, "grad_norm": 0.5253875851631165, "learning_rate": 8.63373266926908e-05, "loss": 3.3867, "step": 5683 }, { "epoch": 7.2688, "grad_norm": 0.5317622423171997, "learning_rate": 8.629694440705343e-05, "loss": 3.3478, "step": 5684 }, { "epoch": 7.27008, "grad_norm": 0.5254741907119751, "learning_rate": 8.625656212141606e-05, "loss": 3.4261, "step": 5685 }, { "epoch": 7.27136, "grad_norm": 0.5245876312255859, "learning_rate": 8.62161798357787e-05, "loss": 3.3631, "step": 5686 }, { "epoch": 7.27264, "grad_norm": 0.5217685699462891, "learning_rate": 8.617579755014132e-05, "loss": 3.3236, "step": 5687 }, { "epoch": 7.27392, "grad_norm": 0.5240602493286133, "learning_rate": 8.613541526450396e-05, "loss": 3.3932, "step": 5688 }, { "epoch": 7.2752, "grad_norm": 0.5256776809692383, "learning_rate": 8.60950329788666e-05, "loss": 3.4072, "step": 5689 }, { "epoch": 7.27648, "grad_norm": 0.5401302576065063, "learning_rate": 8.605465069322924e-05, "loss": 3.3593, "step": 5690 }, { "epoch": 7.27776, "grad_norm": 0.5390993356704712, "learning_rate": 8.601426840759185e-05, "loss": 3.3646, "step": 5691 }, { "epoch": 7.27904, "grad_norm": 0.5312697887420654, "learning_rate": 8.59738861219545e-05, "loss": 3.4073, "step": 5692 }, { "epoch": 7.28032, "grad_norm": 0.5275899171829224, "learning_rate": 8.593350383631713e-05, "loss": 3.3039, "step": 5693 }, { "epoch": 7.2816, "grad_norm": 0.5374751687049866, "learning_rate": 8.589312155067977e-05, "loss": 3.3517, "step": 5694 }, { "epoch": 7.2828800000000005, "grad_norm": 0.5126158595085144, "learning_rate": 8.585273926504239e-05, "loss": 3.3677, "step": 5695 }, { "epoch": 7.28416, "grad_norm": 0.5134846568107605, "learning_rate": 8.581235697940503e-05, "loss": 3.2869, "step": 5696 }, { "epoch": 7.28544, "grad_norm": 0.5251624584197998, "learning_rate": 8.577197469376766e-05, "loss": 3.3785, "step": 5697 }, { "epoch": 7.28672, "grad_norm": 0.5131708979606628, "learning_rate": 8.57315924081303e-05, "loss": 3.3422, "step": 5698 }, { "epoch": 7.288, "grad_norm": 0.5143713355064392, "learning_rate": 8.569121012249292e-05, "loss": 3.3171, "step": 5699 }, { "epoch": 7.28928, "grad_norm": 0.5281633734703064, "learning_rate": 8.565082783685555e-05, "loss": 3.324, "step": 5700 }, { "epoch": 7.29056, "grad_norm": 0.5322510600090027, "learning_rate": 8.56104455512182e-05, "loss": 3.3639, "step": 5701 }, { "epoch": 7.29184, "grad_norm": 0.5279068946838379, "learning_rate": 8.557006326558081e-05, "loss": 3.3392, "step": 5702 }, { "epoch": 7.29312, "grad_norm": 0.5267346501350403, "learning_rate": 8.552968097994346e-05, "loss": 3.4094, "step": 5703 }, { "epoch": 7.2943999999999996, "grad_norm": 0.512809693813324, "learning_rate": 8.548929869430609e-05, "loss": 3.3388, "step": 5704 }, { "epoch": 7.29568, "grad_norm": 0.5133838057518005, "learning_rate": 8.544891640866873e-05, "loss": 3.3506, "step": 5705 }, { "epoch": 7.29696, "grad_norm": 0.5310163497924805, "learning_rate": 8.540853412303135e-05, "loss": 3.3633, "step": 5706 }, { "epoch": 7.29824, "grad_norm": 0.5210456848144531, "learning_rate": 8.536815183739399e-05, "loss": 3.3905, "step": 5707 }, { "epoch": 7.29952, "grad_norm": 0.5178525447845459, "learning_rate": 8.532776955175662e-05, "loss": 3.3811, "step": 5708 }, { "epoch": 7.3008, "grad_norm": 0.5075005292892456, "learning_rate": 8.528738726611926e-05, "loss": 3.4205, "step": 5709 }, { "epoch": 7.30208, "grad_norm": 0.5155579447746277, "learning_rate": 8.524700498048188e-05, "loss": 3.4302, "step": 5710 }, { "epoch": 7.30336, "grad_norm": 0.5333580374717712, "learning_rate": 8.520662269484452e-05, "loss": 3.4141, "step": 5711 }, { "epoch": 7.30464, "grad_norm": 0.520826518535614, "learning_rate": 8.516624040920715e-05, "loss": 3.3472, "step": 5712 }, { "epoch": 7.30592, "grad_norm": 0.5248406529426575, "learning_rate": 8.512585812356978e-05, "loss": 3.3902, "step": 5713 }, { "epoch": 7.3072, "grad_norm": 0.5231212973594666, "learning_rate": 8.508547583793243e-05, "loss": 3.376, "step": 5714 }, { "epoch": 7.30848, "grad_norm": 0.5204564929008484, "learning_rate": 8.504509355229504e-05, "loss": 3.3655, "step": 5715 }, { "epoch": 7.30976, "grad_norm": 0.5213709473609924, "learning_rate": 8.500471126665769e-05, "loss": 3.3372, "step": 5716 }, { "epoch": 7.31104, "grad_norm": 0.5297428369522095, "learning_rate": 8.496432898102032e-05, "loss": 3.3941, "step": 5717 }, { "epoch": 7.31232, "grad_norm": 0.5178290605545044, "learning_rate": 8.492394669538296e-05, "loss": 3.3335, "step": 5718 }, { "epoch": 7.3136, "grad_norm": 0.537998616695404, "learning_rate": 8.488356440974558e-05, "loss": 3.4031, "step": 5719 }, { "epoch": 7.31488, "grad_norm": 0.5190197825431824, "learning_rate": 8.484318212410822e-05, "loss": 3.435, "step": 5720 }, { "epoch": 7.31616, "grad_norm": 0.5143560767173767, "learning_rate": 8.480279983847085e-05, "loss": 3.3524, "step": 5721 }, { "epoch": 7.31744, "grad_norm": 0.5168914198875427, "learning_rate": 8.47624175528335e-05, "loss": 3.365, "step": 5722 }, { "epoch": 7.31872, "grad_norm": 0.5233278870582581, "learning_rate": 8.472203526719611e-05, "loss": 3.3914, "step": 5723 }, { "epoch": 7.32, "grad_norm": 0.5222857594490051, "learning_rate": 8.468165298155876e-05, "loss": 3.445, "step": 5724 }, { "epoch": 7.32128, "grad_norm": 0.5074923634529114, "learning_rate": 8.464127069592139e-05, "loss": 3.3527, "step": 5725 }, { "epoch": 7.32256, "grad_norm": 0.5240098834037781, "learning_rate": 8.4600888410284e-05, "loss": 3.33, "step": 5726 }, { "epoch": 7.32384, "grad_norm": 0.513421893119812, "learning_rate": 8.456050612464665e-05, "loss": 3.3835, "step": 5727 }, { "epoch": 7.32512, "grad_norm": 0.524813175201416, "learning_rate": 8.452012383900928e-05, "loss": 3.3284, "step": 5728 }, { "epoch": 7.3264, "grad_norm": 0.5159763693809509, "learning_rate": 8.447974155337192e-05, "loss": 3.3274, "step": 5729 }, { "epoch": 7.32768, "grad_norm": 0.5181670784950256, "learning_rate": 8.443935926773454e-05, "loss": 3.4098, "step": 5730 }, { "epoch": 7.32896, "grad_norm": 0.5126925706863403, "learning_rate": 8.439897698209718e-05, "loss": 3.2956, "step": 5731 }, { "epoch": 7.33024, "grad_norm": 0.531437873840332, "learning_rate": 8.435859469645981e-05, "loss": 3.395, "step": 5732 }, { "epoch": 7.33152, "grad_norm": 0.5225799083709717, "learning_rate": 8.431821241082245e-05, "loss": 3.3421, "step": 5733 }, { "epoch": 7.3328, "grad_norm": 0.5173592567443848, "learning_rate": 8.427783012518507e-05, "loss": 3.3345, "step": 5734 }, { "epoch": 7.33408, "grad_norm": 0.525583028793335, "learning_rate": 8.423744783954771e-05, "loss": 3.4155, "step": 5735 }, { "epoch": 7.33536, "grad_norm": 0.5271779894828796, "learning_rate": 8.419706555391034e-05, "loss": 3.4121, "step": 5736 }, { "epoch": 7.33664, "grad_norm": 0.5142373442649841, "learning_rate": 8.415668326827299e-05, "loss": 3.38, "step": 5737 }, { "epoch": 7.33792, "grad_norm": 0.5095165967941284, "learning_rate": 8.41163009826356e-05, "loss": 3.3846, "step": 5738 }, { "epoch": 7.3392, "grad_norm": 0.5181048512458801, "learning_rate": 8.407591869699823e-05, "loss": 3.4532, "step": 5739 }, { "epoch": 7.34048, "grad_norm": 0.5084232091903687, "learning_rate": 8.403553641136088e-05, "loss": 3.3129, "step": 5740 }, { "epoch": 7.34176, "grad_norm": 0.5222226977348328, "learning_rate": 8.399515412572351e-05, "loss": 3.3317, "step": 5741 }, { "epoch": 7.34304, "grad_norm": 0.5268555283546448, "learning_rate": 8.395477184008614e-05, "loss": 3.3111, "step": 5742 }, { "epoch": 7.34432, "grad_norm": 0.5401803851127625, "learning_rate": 8.391438955444877e-05, "loss": 3.3389, "step": 5743 }, { "epoch": 7.3456, "grad_norm": 0.49834394454956055, "learning_rate": 8.387400726881141e-05, "loss": 3.3859, "step": 5744 }, { "epoch": 7.34688, "grad_norm": 0.5210273265838623, "learning_rate": 8.383362498317404e-05, "loss": 3.371, "step": 5745 }, { "epoch": 7.34816, "grad_norm": 0.5339854955673218, "learning_rate": 8.379324269753667e-05, "loss": 3.3539, "step": 5746 }, { "epoch": 7.3494399999999995, "grad_norm": 0.5159910917282104, "learning_rate": 8.37528604118993e-05, "loss": 3.3968, "step": 5747 }, { "epoch": 7.35072, "grad_norm": 0.514874279499054, "learning_rate": 8.371247812626194e-05, "loss": 3.4268, "step": 5748 }, { "epoch": 7.352, "grad_norm": 0.5367988348007202, "learning_rate": 8.367209584062457e-05, "loss": 3.4254, "step": 5749 }, { "epoch": 7.35328, "grad_norm": 0.5135504603385925, "learning_rate": 8.363171355498722e-05, "loss": 3.4124, "step": 5750 }, { "epoch": 7.35456, "grad_norm": 0.5267705321311951, "learning_rate": 8.359133126934984e-05, "loss": 3.4418, "step": 5751 }, { "epoch": 7.35584, "grad_norm": 0.5178395509719849, "learning_rate": 8.355094898371248e-05, "loss": 3.3394, "step": 5752 }, { "epoch": 7.35712, "grad_norm": 0.5211403369903564, "learning_rate": 8.351056669807511e-05, "loss": 3.3646, "step": 5753 }, { "epoch": 7.3584, "grad_norm": 0.5357264876365662, "learning_rate": 8.347018441243773e-05, "loss": 3.4218, "step": 5754 }, { "epoch": 7.35968, "grad_norm": 0.5282320976257324, "learning_rate": 8.342980212680037e-05, "loss": 3.3026, "step": 5755 }, { "epoch": 7.36096, "grad_norm": 0.5219427347183228, "learning_rate": 8.3389419841163e-05, "loss": 3.3327, "step": 5756 }, { "epoch": 7.36224, "grad_norm": 0.5319931507110596, "learning_rate": 8.334903755552564e-05, "loss": 3.4432, "step": 5757 }, { "epoch": 7.36352, "grad_norm": 0.5045892000198364, "learning_rate": 8.330865526988826e-05, "loss": 3.2485, "step": 5758 }, { "epoch": 7.3648, "grad_norm": 0.5378910303115845, "learning_rate": 8.32682729842509e-05, "loss": 3.3374, "step": 5759 }, { "epoch": 7.36608, "grad_norm": 0.5356464982032776, "learning_rate": 8.322789069861353e-05, "loss": 3.3517, "step": 5760 }, { "epoch": 7.36736, "grad_norm": 0.5344239473342896, "learning_rate": 8.318750841297618e-05, "loss": 3.4034, "step": 5761 }, { "epoch": 7.36864, "grad_norm": 0.530697226524353, "learning_rate": 8.314712612733879e-05, "loss": 3.3704, "step": 5762 }, { "epoch": 7.3699200000000005, "grad_norm": 0.5439151525497437, "learning_rate": 8.310674384170144e-05, "loss": 3.3844, "step": 5763 }, { "epoch": 7.3712, "grad_norm": 0.5485351085662842, "learning_rate": 8.306636155606407e-05, "loss": 3.3354, "step": 5764 }, { "epoch": 7.37248, "grad_norm": 0.5290150046348572, "learning_rate": 8.302597927042671e-05, "loss": 3.3475, "step": 5765 }, { "epoch": 7.37376, "grad_norm": 0.5363977551460266, "learning_rate": 8.298559698478933e-05, "loss": 3.3585, "step": 5766 }, { "epoch": 7.37504, "grad_norm": 0.5304407477378845, "learning_rate": 8.294521469915196e-05, "loss": 3.3573, "step": 5767 }, { "epoch": 7.37632, "grad_norm": 0.5171595215797424, "learning_rate": 8.29048324135146e-05, "loss": 3.3317, "step": 5768 }, { "epoch": 7.3776, "grad_norm": 0.5493373870849609, "learning_rate": 8.286445012787723e-05, "loss": 3.4143, "step": 5769 }, { "epoch": 7.37888, "grad_norm": 0.5111392736434937, "learning_rate": 8.282406784223986e-05, "loss": 3.3583, "step": 5770 }, { "epoch": 7.38016, "grad_norm": 0.5278791189193726, "learning_rate": 8.278368555660249e-05, "loss": 3.3444, "step": 5771 }, { "epoch": 7.38144, "grad_norm": 0.5462014079093933, "learning_rate": 8.274330327096513e-05, "loss": 3.3613, "step": 5772 }, { "epoch": 7.38272, "grad_norm": 0.5223116874694824, "learning_rate": 8.270292098532776e-05, "loss": 3.3495, "step": 5773 }, { "epoch": 7.384, "grad_norm": 0.5300905108451843, "learning_rate": 8.26625386996904e-05, "loss": 3.351, "step": 5774 }, { "epoch": 7.38528, "grad_norm": 0.5270613431930542, "learning_rate": 8.262215641405302e-05, "loss": 3.3506, "step": 5775 }, { "epoch": 7.38656, "grad_norm": 0.5194157958030701, "learning_rate": 8.258177412841567e-05, "loss": 3.3194, "step": 5776 }, { "epoch": 7.38784, "grad_norm": 0.5169289708137512, "learning_rate": 8.25413918427783e-05, "loss": 3.3789, "step": 5777 }, { "epoch": 7.38912, "grad_norm": 0.5390181541442871, "learning_rate": 8.250100955714093e-05, "loss": 3.3735, "step": 5778 }, { "epoch": 7.3904, "grad_norm": 0.5157946348190308, "learning_rate": 8.246062727150356e-05, "loss": 3.3887, "step": 5779 }, { "epoch": 7.39168, "grad_norm": 0.5124242901802063, "learning_rate": 8.242024498586619e-05, "loss": 3.3066, "step": 5780 }, { "epoch": 7.39296, "grad_norm": 0.523602306842804, "learning_rate": 8.237986270022883e-05, "loss": 3.4837, "step": 5781 }, { "epoch": 7.39424, "grad_norm": 0.525345504283905, "learning_rate": 8.233948041459145e-05, "loss": 3.343, "step": 5782 }, { "epoch": 7.39552, "grad_norm": 0.5302904844284058, "learning_rate": 8.229909812895409e-05, "loss": 3.358, "step": 5783 }, { "epoch": 7.3968, "grad_norm": 0.5098133087158203, "learning_rate": 8.225871584331672e-05, "loss": 3.3791, "step": 5784 }, { "epoch": 7.39808, "grad_norm": 0.5192782282829285, "learning_rate": 8.221833355767937e-05, "loss": 3.4295, "step": 5785 }, { "epoch": 7.39936, "grad_norm": 0.5070282816886902, "learning_rate": 8.217795127204198e-05, "loss": 3.3926, "step": 5786 }, { "epoch": 7.40064, "grad_norm": 0.5000091791152954, "learning_rate": 8.213756898640463e-05, "loss": 3.3154, "step": 5787 }, { "epoch": 7.40192, "grad_norm": 0.5293865203857422, "learning_rate": 8.209718670076726e-05, "loss": 3.3813, "step": 5788 }, { "epoch": 7.4032, "grad_norm": 0.5276476144790649, "learning_rate": 8.20568044151299e-05, "loss": 3.3801, "step": 5789 }, { "epoch": 7.40448, "grad_norm": 0.5471776127815247, "learning_rate": 8.201642212949252e-05, "loss": 3.4044, "step": 5790 }, { "epoch": 7.40576, "grad_norm": 0.5328643918037415, "learning_rate": 8.197603984385516e-05, "loss": 3.3514, "step": 5791 }, { "epoch": 7.40704, "grad_norm": 0.5258887410163879, "learning_rate": 8.193565755821779e-05, "loss": 3.3467, "step": 5792 }, { "epoch": 7.40832, "grad_norm": 0.5234854221343994, "learning_rate": 8.18952752725804e-05, "loss": 3.4287, "step": 5793 }, { "epoch": 7.4096, "grad_norm": 0.516001284122467, "learning_rate": 8.185489298694305e-05, "loss": 3.3589, "step": 5794 }, { "epoch": 7.41088, "grad_norm": 0.5244291424751282, "learning_rate": 8.181451070130568e-05, "loss": 3.4387, "step": 5795 }, { "epoch": 7.41216, "grad_norm": 0.5443776249885559, "learning_rate": 8.177412841566832e-05, "loss": 3.3916, "step": 5796 }, { "epoch": 7.41344, "grad_norm": 0.5309216380119324, "learning_rate": 8.173374613003094e-05, "loss": 3.3954, "step": 5797 }, { "epoch": 7.41472, "grad_norm": 0.5038948059082031, "learning_rate": 8.169336384439358e-05, "loss": 3.343, "step": 5798 }, { "epoch": 7.416, "grad_norm": 0.52728271484375, "learning_rate": 8.165298155875621e-05, "loss": 3.3784, "step": 5799 }, { "epoch": 7.41728, "grad_norm": 0.5411390066146851, "learning_rate": 8.161259927311886e-05, "loss": 3.4319, "step": 5800 }, { "epoch": 7.41856, "grad_norm": 0.5185636878013611, "learning_rate": 8.157221698748149e-05, "loss": 3.4297, "step": 5801 }, { "epoch": 7.41984, "grad_norm": 0.5342210531234741, "learning_rate": 8.153183470184412e-05, "loss": 3.3683, "step": 5802 }, { "epoch": 7.42112, "grad_norm": 0.5324079394340515, "learning_rate": 8.149145241620675e-05, "loss": 3.3805, "step": 5803 }, { "epoch": 7.4224, "grad_norm": 0.520026445388794, "learning_rate": 8.145107013056939e-05, "loss": 3.3222, "step": 5804 }, { "epoch": 7.42368, "grad_norm": 0.5147387981414795, "learning_rate": 8.141068784493202e-05, "loss": 3.3722, "step": 5805 }, { "epoch": 7.4249600000000004, "grad_norm": 0.5206732749938965, "learning_rate": 8.137030555929464e-05, "loss": 3.4011, "step": 5806 }, { "epoch": 7.42624, "grad_norm": 0.5213762521743774, "learning_rate": 8.132992327365728e-05, "loss": 3.3676, "step": 5807 }, { "epoch": 7.42752, "grad_norm": 0.5201925039291382, "learning_rate": 8.128954098801991e-05, "loss": 3.3553, "step": 5808 }, { "epoch": 7.4288, "grad_norm": 0.5245599150657654, "learning_rate": 8.124915870238256e-05, "loss": 3.4287, "step": 5809 }, { "epoch": 7.43008, "grad_norm": 0.5151320695877075, "learning_rate": 8.120877641674517e-05, "loss": 3.3672, "step": 5810 }, { "epoch": 7.43136, "grad_norm": 0.5449185967445374, "learning_rate": 8.116839413110782e-05, "loss": 3.4269, "step": 5811 }, { "epoch": 7.43264, "grad_norm": 0.5150814056396484, "learning_rate": 8.112801184547045e-05, "loss": 3.4574, "step": 5812 }, { "epoch": 7.43392, "grad_norm": 0.5201151371002197, "learning_rate": 8.108762955983309e-05, "loss": 3.3769, "step": 5813 }, { "epoch": 7.4352, "grad_norm": 0.5087258219718933, "learning_rate": 8.10472472741957e-05, "loss": 3.4079, "step": 5814 }, { "epoch": 7.4364799999999995, "grad_norm": 0.5286344289779663, "learning_rate": 8.100686498855835e-05, "loss": 3.4387, "step": 5815 }, { "epoch": 7.43776, "grad_norm": 0.5217651724815369, "learning_rate": 8.096648270292098e-05, "loss": 3.3826, "step": 5816 }, { "epoch": 7.43904, "grad_norm": 0.5195396542549133, "learning_rate": 8.092610041728362e-05, "loss": 3.3419, "step": 5817 }, { "epoch": 7.44032, "grad_norm": 0.5105847716331482, "learning_rate": 8.088571813164624e-05, "loss": 3.3915, "step": 5818 }, { "epoch": 7.4416, "grad_norm": 0.5170384645462036, "learning_rate": 8.084533584600888e-05, "loss": 3.3614, "step": 5819 }, { "epoch": 7.44288, "grad_norm": 0.5139781832695007, "learning_rate": 8.080495356037151e-05, "loss": 3.427, "step": 5820 }, { "epoch": 7.44416, "grad_norm": 0.5179570317268372, "learning_rate": 8.076457127473413e-05, "loss": 3.4167, "step": 5821 }, { "epoch": 7.44544, "grad_norm": 0.5289480090141296, "learning_rate": 8.072418898909677e-05, "loss": 3.3394, "step": 5822 }, { "epoch": 7.44672, "grad_norm": 0.5202321410179138, "learning_rate": 8.06838067034594e-05, "loss": 3.3681, "step": 5823 }, { "epoch": 7.448, "grad_norm": 0.5124974846839905, "learning_rate": 8.064342441782205e-05, "loss": 3.3676, "step": 5824 }, { "epoch": 7.44928, "grad_norm": 0.5194815993309021, "learning_rate": 8.060304213218466e-05, "loss": 3.3824, "step": 5825 }, { "epoch": 7.45056, "grad_norm": 0.5214881896972656, "learning_rate": 8.056265984654731e-05, "loss": 3.3963, "step": 5826 }, { "epoch": 7.45184, "grad_norm": 0.5211467742919922, "learning_rate": 8.052227756090994e-05, "loss": 3.3754, "step": 5827 }, { "epoch": 7.45312, "grad_norm": 0.5255600214004517, "learning_rate": 8.048189527527258e-05, "loss": 3.4049, "step": 5828 }, { "epoch": 7.4544, "grad_norm": 0.5339398980140686, "learning_rate": 8.04415129896352e-05, "loss": 3.3593, "step": 5829 }, { "epoch": 7.45568, "grad_norm": 0.5176902413368225, "learning_rate": 8.040113070399784e-05, "loss": 3.3843, "step": 5830 }, { "epoch": 7.45696, "grad_norm": 0.5297396779060364, "learning_rate": 8.036074841836047e-05, "loss": 3.3186, "step": 5831 }, { "epoch": 7.45824, "grad_norm": 0.531353235244751, "learning_rate": 8.032036613272312e-05, "loss": 3.3528, "step": 5832 }, { "epoch": 7.45952, "grad_norm": 0.5194066166877747, "learning_rate": 8.027998384708573e-05, "loss": 3.4387, "step": 5833 }, { "epoch": 7.4608, "grad_norm": 0.5318957567214966, "learning_rate": 8.023960156144836e-05, "loss": 3.3612, "step": 5834 }, { "epoch": 7.46208, "grad_norm": 0.5308012366294861, "learning_rate": 8.0199219275811e-05, "loss": 3.4369, "step": 5835 }, { "epoch": 7.46336, "grad_norm": 0.5104942321777344, "learning_rate": 8.015883699017364e-05, "loss": 3.4146, "step": 5836 }, { "epoch": 7.46464, "grad_norm": 0.5228284597396851, "learning_rate": 8.011845470453628e-05, "loss": 3.3655, "step": 5837 }, { "epoch": 7.46592, "grad_norm": 0.5318959951400757, "learning_rate": 8.00780724188989e-05, "loss": 3.3857, "step": 5838 }, { "epoch": 7.4672, "grad_norm": 0.5241490006446838, "learning_rate": 8.003769013326154e-05, "loss": 3.4119, "step": 5839 }, { "epoch": 7.46848, "grad_norm": 0.5517506003379822, "learning_rate": 7.999730784762417e-05, "loss": 3.395, "step": 5840 }, { "epoch": 7.46976, "grad_norm": 0.529964804649353, "learning_rate": 7.995692556198681e-05, "loss": 3.3613, "step": 5841 }, { "epoch": 7.47104, "grad_norm": 0.5344815850257874, "learning_rate": 7.991654327634943e-05, "loss": 3.3307, "step": 5842 }, { "epoch": 7.47232, "grad_norm": 0.5124824643135071, "learning_rate": 7.987616099071207e-05, "loss": 3.3268, "step": 5843 }, { "epoch": 7.4736, "grad_norm": 0.5115407705307007, "learning_rate": 7.98357787050747e-05, "loss": 3.3561, "step": 5844 }, { "epoch": 7.47488, "grad_norm": 0.5252261757850647, "learning_rate": 7.979539641943735e-05, "loss": 3.3923, "step": 5845 }, { "epoch": 7.47616, "grad_norm": 0.521026611328125, "learning_rate": 7.975501413379996e-05, "loss": 3.3531, "step": 5846 }, { "epoch": 7.47744, "grad_norm": 0.5293687582015991, "learning_rate": 7.97146318481626e-05, "loss": 3.4022, "step": 5847 }, { "epoch": 7.47872, "grad_norm": 0.5290361642837524, "learning_rate": 7.967424956252524e-05, "loss": 3.3675, "step": 5848 }, { "epoch": 7.48, "grad_norm": 0.5217385292053223, "learning_rate": 7.963386727688785e-05, "loss": 3.3527, "step": 5849 }, { "epoch": 7.48128, "grad_norm": 0.5071624517440796, "learning_rate": 7.95934849912505e-05, "loss": 3.3443, "step": 5850 }, { "epoch": 7.48256, "grad_norm": 0.5297966599464417, "learning_rate": 7.955310270561313e-05, "loss": 3.3398, "step": 5851 }, { "epoch": 7.48384, "grad_norm": 0.5170907378196716, "learning_rate": 7.951272041997577e-05, "loss": 3.3573, "step": 5852 }, { "epoch": 7.48512, "grad_norm": 0.5123015642166138, "learning_rate": 7.947233813433839e-05, "loss": 3.3852, "step": 5853 }, { "epoch": 7.4864, "grad_norm": 0.5429021120071411, "learning_rate": 7.943195584870103e-05, "loss": 3.435, "step": 5854 }, { "epoch": 7.48768, "grad_norm": 0.5190651416778564, "learning_rate": 7.939157356306366e-05, "loss": 3.4174, "step": 5855 }, { "epoch": 7.48896, "grad_norm": 0.5276857614517212, "learning_rate": 7.93511912774263e-05, "loss": 3.368, "step": 5856 }, { "epoch": 7.49024, "grad_norm": 0.5189977288246155, "learning_rate": 7.931080899178892e-05, "loss": 3.3205, "step": 5857 }, { "epoch": 7.49152, "grad_norm": 0.5297253131866455, "learning_rate": 7.927042670615156e-05, "loss": 3.3809, "step": 5858 }, { "epoch": 7.4928, "grad_norm": 0.513972818851471, "learning_rate": 7.92300444205142e-05, "loss": 3.3473, "step": 5859 }, { "epoch": 7.49408, "grad_norm": 0.539738118648529, "learning_rate": 7.918966213487682e-05, "loss": 3.3157, "step": 5860 }, { "epoch": 7.49536, "grad_norm": 0.5355290174484253, "learning_rate": 7.914927984923946e-05, "loss": 3.4172, "step": 5861 }, { "epoch": 7.49664, "grad_norm": 0.5001344084739685, "learning_rate": 7.910889756360209e-05, "loss": 3.2782, "step": 5862 }, { "epoch": 7.49792, "grad_norm": 0.5256953835487366, "learning_rate": 7.906851527796473e-05, "loss": 3.3973, "step": 5863 }, { "epoch": 7.4992, "grad_norm": 0.526117205619812, "learning_rate": 7.902813299232736e-05, "loss": 3.3484, "step": 5864 }, { "epoch": 7.50048, "grad_norm": 0.5159139037132263, "learning_rate": 7.898775070668999e-05, "loss": 3.2791, "step": 5865 }, { "epoch": 7.50176, "grad_norm": 0.5434691309928894, "learning_rate": 7.894736842105262e-05, "loss": 3.3503, "step": 5866 }, { "epoch": 7.50304, "grad_norm": 0.5286847949028015, "learning_rate": 7.890698613541526e-05, "loss": 3.3695, "step": 5867 }, { "epoch": 7.50432, "grad_norm": 0.5285123586654663, "learning_rate": 7.886660384977789e-05, "loss": 3.4208, "step": 5868 }, { "epoch": 7.5056, "grad_norm": 0.5370778441429138, "learning_rate": 7.882622156414052e-05, "loss": 3.4278, "step": 5869 }, { "epoch": 7.50688, "grad_norm": 0.5134227275848389, "learning_rate": 7.878583927850315e-05, "loss": 3.3727, "step": 5870 }, { "epoch": 7.50816, "grad_norm": 0.5174378156661987, "learning_rate": 7.87454569928658e-05, "loss": 3.3265, "step": 5871 }, { "epoch": 7.50944, "grad_norm": 0.5034676194190979, "learning_rate": 7.870507470722843e-05, "loss": 3.3688, "step": 5872 }, { "epoch": 7.51072, "grad_norm": 0.5375760197639465, "learning_rate": 7.866469242159107e-05, "loss": 3.3616, "step": 5873 }, { "epoch": 7.5120000000000005, "grad_norm": 0.5142561197280884, "learning_rate": 7.862431013595369e-05, "loss": 3.3702, "step": 5874 }, { "epoch": 7.51328, "grad_norm": 0.5261555314064026, "learning_rate": 7.858392785031632e-05, "loss": 3.372, "step": 5875 }, { "epoch": 7.51456, "grad_norm": 0.5355465412139893, "learning_rate": 7.854354556467896e-05, "loss": 3.3866, "step": 5876 }, { "epoch": 7.51584, "grad_norm": 0.5262174010276794, "learning_rate": 7.850316327904158e-05, "loss": 3.4209, "step": 5877 }, { "epoch": 7.51712, "grad_norm": 0.5196813344955444, "learning_rate": 7.846278099340422e-05, "loss": 3.4005, "step": 5878 }, { "epoch": 7.5184, "grad_norm": 0.5177279114723206, "learning_rate": 7.842239870776685e-05, "loss": 3.364, "step": 5879 }, { "epoch": 7.51968, "grad_norm": 0.5248557329177856, "learning_rate": 7.83820164221295e-05, "loss": 3.3458, "step": 5880 }, { "epoch": 7.52096, "grad_norm": 0.5306951999664307, "learning_rate": 7.834163413649211e-05, "loss": 3.3503, "step": 5881 }, { "epoch": 7.52224, "grad_norm": 0.5404332876205444, "learning_rate": 7.830125185085475e-05, "loss": 3.2522, "step": 5882 }, { "epoch": 7.5235199999999995, "grad_norm": 0.5501363277435303, "learning_rate": 7.826086956521738e-05, "loss": 3.3744, "step": 5883 }, { "epoch": 7.5248, "grad_norm": 0.5222617387771606, "learning_rate": 7.822048727958003e-05, "loss": 3.3439, "step": 5884 }, { "epoch": 7.52608, "grad_norm": 0.5240084528923035, "learning_rate": 7.818010499394264e-05, "loss": 3.3858, "step": 5885 }, { "epoch": 7.52736, "grad_norm": 0.5329712629318237, "learning_rate": 7.813972270830529e-05, "loss": 3.3657, "step": 5886 }, { "epoch": 7.52864, "grad_norm": 0.5383173823356628, "learning_rate": 7.809934042266792e-05, "loss": 3.4141, "step": 5887 }, { "epoch": 7.52992, "grad_norm": 0.5195038318634033, "learning_rate": 7.805895813703055e-05, "loss": 3.43, "step": 5888 }, { "epoch": 7.5312, "grad_norm": 0.5309056043624878, "learning_rate": 7.801857585139318e-05, "loss": 3.3459, "step": 5889 }, { "epoch": 7.53248, "grad_norm": 0.5201624631881714, "learning_rate": 7.797819356575581e-05, "loss": 3.3979, "step": 5890 }, { "epoch": 7.53376, "grad_norm": 0.520959198474884, "learning_rate": 7.793781128011845e-05, "loss": 3.3557, "step": 5891 }, { "epoch": 7.53504, "grad_norm": 0.5142325758934021, "learning_rate": 7.789742899448108e-05, "loss": 3.4258, "step": 5892 }, { "epoch": 7.53632, "grad_norm": 0.5339785814285278, "learning_rate": 7.785704670884371e-05, "loss": 3.3424, "step": 5893 }, { "epoch": 7.5376, "grad_norm": 0.5324097275733948, "learning_rate": 7.781666442320634e-05, "loss": 3.3253, "step": 5894 }, { "epoch": 7.53888, "grad_norm": 0.5203735828399658, "learning_rate": 7.777628213756899e-05, "loss": 3.422, "step": 5895 }, { "epoch": 7.54016, "grad_norm": 0.5267855525016785, "learning_rate": 7.773589985193162e-05, "loss": 3.3743, "step": 5896 }, { "epoch": 7.54144, "grad_norm": 0.51844722032547, "learning_rate": 7.769551756629425e-05, "loss": 3.2686, "step": 5897 }, { "epoch": 7.54272, "grad_norm": 0.5209645628929138, "learning_rate": 7.765513528065688e-05, "loss": 3.3467, "step": 5898 }, { "epoch": 7.5440000000000005, "grad_norm": 0.530583918094635, "learning_rate": 7.761475299501952e-05, "loss": 3.437, "step": 5899 }, { "epoch": 7.54528, "grad_norm": 0.5081465840339661, "learning_rate": 7.757437070938215e-05, "loss": 3.3694, "step": 5900 }, { "epoch": 7.54656, "grad_norm": 0.5179983973503113, "learning_rate": 7.753398842374477e-05, "loss": 3.3602, "step": 5901 }, { "epoch": 7.54784, "grad_norm": 0.5412976145744324, "learning_rate": 7.749360613810741e-05, "loss": 3.3402, "step": 5902 }, { "epoch": 7.54912, "grad_norm": 0.5343058705329895, "learning_rate": 7.745322385247004e-05, "loss": 3.4374, "step": 5903 }, { "epoch": 7.5504, "grad_norm": 0.5409019589424133, "learning_rate": 7.741284156683268e-05, "loss": 3.323, "step": 5904 }, { "epoch": 7.55168, "grad_norm": 0.5322821736335754, "learning_rate": 7.73724592811953e-05, "loss": 3.4257, "step": 5905 }, { "epoch": 7.55296, "grad_norm": 0.5157960653305054, "learning_rate": 7.733207699555794e-05, "loss": 3.3649, "step": 5906 }, { "epoch": 7.55424, "grad_norm": 0.5299696922302246, "learning_rate": 7.729169470992057e-05, "loss": 3.3594, "step": 5907 }, { "epoch": 7.55552, "grad_norm": 0.5254634618759155, "learning_rate": 7.725131242428322e-05, "loss": 3.4228, "step": 5908 }, { "epoch": 7.5568, "grad_norm": 0.5235505104064941, "learning_rate": 7.721093013864583e-05, "loss": 3.357, "step": 5909 }, { "epoch": 7.55808, "grad_norm": 0.5225314497947693, "learning_rate": 7.717054785300848e-05, "loss": 3.3528, "step": 5910 }, { "epoch": 7.55936, "grad_norm": 0.51315838098526, "learning_rate": 7.713016556737111e-05, "loss": 3.4245, "step": 5911 }, { "epoch": 7.56064, "grad_norm": 0.536595344543457, "learning_rate": 7.708978328173375e-05, "loss": 3.3816, "step": 5912 }, { "epoch": 7.56192, "grad_norm": 0.5261996984481812, "learning_rate": 7.704940099609637e-05, "loss": 3.3648, "step": 5913 }, { "epoch": 7.5632, "grad_norm": 0.5061861276626587, "learning_rate": 7.7009018710459e-05, "loss": 3.4095, "step": 5914 }, { "epoch": 7.56448, "grad_norm": 0.52558434009552, "learning_rate": 7.696863642482164e-05, "loss": 3.4055, "step": 5915 }, { "epoch": 7.56576, "grad_norm": 0.5246904492378235, "learning_rate": 7.692825413918426e-05, "loss": 3.341, "step": 5916 }, { "epoch": 7.56704, "grad_norm": 0.5209987759590149, "learning_rate": 7.68878718535469e-05, "loss": 3.3632, "step": 5917 }, { "epoch": 7.56832, "grad_norm": 0.5232482552528381, "learning_rate": 7.684748956790953e-05, "loss": 3.3235, "step": 5918 }, { "epoch": 7.5696, "grad_norm": 0.5276979207992554, "learning_rate": 7.680710728227218e-05, "loss": 3.4077, "step": 5919 }, { "epoch": 7.57088, "grad_norm": 0.5282018184661865, "learning_rate": 7.676672499663479e-05, "loss": 3.2798, "step": 5920 }, { "epoch": 7.57216, "grad_norm": 0.5135851502418518, "learning_rate": 7.672634271099744e-05, "loss": 3.4252, "step": 5921 }, { "epoch": 7.57344, "grad_norm": 0.5193025469779968, "learning_rate": 7.668596042536007e-05, "loss": 3.2833, "step": 5922 }, { "epoch": 7.57472, "grad_norm": 0.5133464336395264, "learning_rate": 7.664557813972271e-05, "loss": 3.3827, "step": 5923 }, { "epoch": 7.576, "grad_norm": 0.5185580253601074, "learning_rate": 7.660519585408534e-05, "loss": 3.3816, "step": 5924 }, { "epoch": 7.57728, "grad_norm": 0.5164629220962524, "learning_rate": 7.656481356844797e-05, "loss": 3.4205, "step": 5925 }, { "epoch": 7.5785599999999995, "grad_norm": 0.5288913249969482, "learning_rate": 7.65244312828106e-05, "loss": 3.3824, "step": 5926 }, { "epoch": 7.57984, "grad_norm": 0.518733024597168, "learning_rate": 7.648404899717323e-05, "loss": 3.3578, "step": 5927 }, { "epoch": 7.58112, "grad_norm": 0.5246930718421936, "learning_rate": 7.644366671153587e-05, "loss": 3.3632, "step": 5928 }, { "epoch": 7.5824, "grad_norm": 0.5315341949462891, "learning_rate": 7.640328442589849e-05, "loss": 3.3564, "step": 5929 }, { "epoch": 7.58368, "grad_norm": 0.5214976668357849, "learning_rate": 7.636290214026113e-05, "loss": 3.3396, "step": 5930 }, { "epoch": 7.58496, "grad_norm": 0.5225616097450256, "learning_rate": 7.632251985462376e-05, "loss": 3.4089, "step": 5931 }, { "epoch": 7.58624, "grad_norm": 0.5472208857536316, "learning_rate": 7.628213756898641e-05, "loss": 3.397, "step": 5932 }, { "epoch": 7.58752, "grad_norm": 0.5345306396484375, "learning_rate": 7.624175528334902e-05, "loss": 3.395, "step": 5933 }, { "epoch": 7.5888, "grad_norm": 0.5214905142784119, "learning_rate": 7.620137299771167e-05, "loss": 3.3558, "step": 5934 }, { "epoch": 7.59008, "grad_norm": 0.53025221824646, "learning_rate": 7.61609907120743e-05, "loss": 3.3575, "step": 5935 }, { "epoch": 7.59136, "grad_norm": 0.5355333685874939, "learning_rate": 7.612060842643694e-05, "loss": 3.3473, "step": 5936 }, { "epoch": 7.59264, "grad_norm": 0.5342869758605957, "learning_rate": 7.608022614079956e-05, "loss": 3.3687, "step": 5937 }, { "epoch": 7.59392, "grad_norm": 0.5135994553565979, "learning_rate": 7.60398438551622e-05, "loss": 3.3285, "step": 5938 }, { "epoch": 7.5952, "grad_norm": 0.523440957069397, "learning_rate": 7.599946156952483e-05, "loss": 3.3989, "step": 5939 }, { "epoch": 7.59648, "grad_norm": 0.5180022716522217, "learning_rate": 7.595907928388747e-05, "loss": 3.3353, "step": 5940 }, { "epoch": 7.59776, "grad_norm": 0.5203397870063782, "learning_rate": 7.591869699825009e-05, "loss": 3.3903, "step": 5941 }, { "epoch": 7.5990400000000005, "grad_norm": 0.5081101059913635, "learning_rate": 7.587831471261272e-05, "loss": 3.3339, "step": 5942 }, { "epoch": 7.60032, "grad_norm": 0.5131347179412842, "learning_rate": 7.583793242697537e-05, "loss": 3.4133, "step": 5943 }, { "epoch": 7.6016, "grad_norm": 0.5185050964355469, "learning_rate": 7.579755014133798e-05, "loss": 3.2982, "step": 5944 }, { "epoch": 7.60288, "grad_norm": 0.5262473821640015, "learning_rate": 7.575716785570063e-05, "loss": 3.4203, "step": 5945 }, { "epoch": 7.60416, "grad_norm": 0.5225714445114136, "learning_rate": 7.571678557006326e-05, "loss": 3.4279, "step": 5946 }, { "epoch": 7.60544, "grad_norm": 0.5234572887420654, "learning_rate": 7.56764032844259e-05, "loss": 3.4388, "step": 5947 }, { "epoch": 7.60672, "grad_norm": 0.51933753490448, "learning_rate": 7.563602099878852e-05, "loss": 3.4044, "step": 5948 }, { "epoch": 7.608, "grad_norm": 0.5347757339477539, "learning_rate": 7.559563871315116e-05, "loss": 3.4157, "step": 5949 }, { "epoch": 7.60928, "grad_norm": 0.5258873105049133, "learning_rate": 7.555525642751379e-05, "loss": 3.4091, "step": 5950 }, { "epoch": 7.6105599999999995, "grad_norm": 0.5191522836685181, "learning_rate": 7.551487414187643e-05, "loss": 3.2995, "step": 5951 }, { "epoch": 7.61184, "grad_norm": 0.5296809673309326, "learning_rate": 7.547449185623905e-05, "loss": 3.4695, "step": 5952 }, { "epoch": 7.61312, "grad_norm": 0.5205982327461243, "learning_rate": 7.543410957060169e-05, "loss": 3.3205, "step": 5953 }, { "epoch": 7.6144, "grad_norm": 0.5297776460647583, "learning_rate": 7.539372728496432e-05, "loss": 3.4047, "step": 5954 }, { "epoch": 7.61568, "grad_norm": 0.5211127996444702, "learning_rate": 7.535334499932695e-05, "loss": 3.2961, "step": 5955 }, { "epoch": 7.61696, "grad_norm": 0.5078245401382446, "learning_rate": 7.531296271368958e-05, "loss": 3.3458, "step": 5956 }, { "epoch": 7.61824, "grad_norm": 0.5190492272377014, "learning_rate": 7.527258042805221e-05, "loss": 3.3822, "step": 5957 }, { "epoch": 7.61952, "grad_norm": 0.506162703037262, "learning_rate": 7.523219814241486e-05, "loss": 3.3511, "step": 5958 }, { "epoch": 7.6208, "grad_norm": 0.5184814929962158, "learning_rate": 7.519181585677749e-05, "loss": 3.3438, "step": 5959 }, { "epoch": 7.62208, "grad_norm": 0.516089916229248, "learning_rate": 7.515143357114012e-05, "loss": 3.4503, "step": 5960 }, { "epoch": 7.62336, "grad_norm": 0.5319845676422119, "learning_rate": 7.511105128550275e-05, "loss": 3.3838, "step": 5961 }, { "epoch": 7.62464, "grad_norm": 0.5250706076622009, "learning_rate": 7.507066899986539e-05, "loss": 3.4235, "step": 5962 }, { "epoch": 7.62592, "grad_norm": 0.512394368648529, "learning_rate": 7.503028671422802e-05, "loss": 3.3163, "step": 5963 }, { "epoch": 7.6272, "grad_norm": 0.5351422429084778, "learning_rate": 7.498990442859065e-05, "loss": 3.4371, "step": 5964 }, { "epoch": 7.62848, "grad_norm": 0.5451759696006775, "learning_rate": 7.494952214295328e-05, "loss": 3.3743, "step": 5965 }, { "epoch": 7.62976, "grad_norm": 0.5345867872238159, "learning_rate": 7.490913985731591e-05, "loss": 3.3628, "step": 5966 }, { "epoch": 7.6310400000000005, "grad_norm": 0.5288296341896057, "learning_rate": 7.486875757167855e-05, "loss": 3.3488, "step": 5967 }, { "epoch": 7.63232, "grad_norm": 0.5270797610282898, "learning_rate": 7.482837528604118e-05, "loss": 3.3402, "step": 5968 }, { "epoch": 7.6336, "grad_norm": 0.5452633500099182, "learning_rate": 7.478799300040381e-05, "loss": 3.3927, "step": 5969 }, { "epoch": 7.63488, "grad_norm": 0.5138344764709473, "learning_rate": 7.474761071476644e-05, "loss": 3.3503, "step": 5970 }, { "epoch": 7.63616, "grad_norm": 0.5245453715324402, "learning_rate": 7.470722842912909e-05, "loss": 3.3953, "step": 5971 }, { "epoch": 7.63744, "grad_norm": 0.5355793237686157, "learning_rate": 7.466684614349172e-05, "loss": 3.3593, "step": 5972 }, { "epoch": 7.63872, "grad_norm": 0.5104573369026184, "learning_rate": 7.462646385785435e-05, "loss": 3.347, "step": 5973 }, { "epoch": 7.64, "grad_norm": 0.5135825276374817, "learning_rate": 7.458608157221698e-05, "loss": 3.3998, "step": 5974 }, { "epoch": 7.64128, "grad_norm": 0.5211784243583679, "learning_rate": 7.454569928657961e-05, "loss": 3.3404, "step": 5975 }, { "epoch": 7.64256, "grad_norm": 0.5213080644607544, "learning_rate": 7.450531700094224e-05, "loss": 3.41, "step": 5976 }, { "epoch": 7.64384, "grad_norm": 0.518886923789978, "learning_rate": 7.446493471530488e-05, "loss": 3.41, "step": 5977 }, { "epoch": 7.64512, "grad_norm": 0.5232623815536499, "learning_rate": 7.442455242966751e-05, "loss": 3.324, "step": 5978 }, { "epoch": 7.6464, "grad_norm": 0.5307015776634216, "learning_rate": 7.438417014403014e-05, "loss": 3.3834, "step": 5979 }, { "epoch": 7.64768, "grad_norm": 0.5195972323417664, "learning_rate": 7.434378785839277e-05, "loss": 3.4509, "step": 5980 }, { "epoch": 7.64896, "grad_norm": 0.5420621037483215, "learning_rate": 7.430340557275542e-05, "loss": 3.3845, "step": 5981 }, { "epoch": 7.65024, "grad_norm": 0.5237659215927124, "learning_rate": 7.426302328711805e-05, "loss": 3.3393, "step": 5982 }, { "epoch": 7.65152, "grad_norm": 0.5215779542922974, "learning_rate": 7.422264100148068e-05, "loss": 3.3277, "step": 5983 }, { "epoch": 7.6528, "grad_norm": 0.5218585729598999, "learning_rate": 7.41822587158433e-05, "loss": 3.3724, "step": 5984 }, { "epoch": 7.65408, "grad_norm": 0.5390816926956177, "learning_rate": 7.414187643020595e-05, "loss": 3.3104, "step": 5985 }, { "epoch": 7.65536, "grad_norm": 0.5215725302696228, "learning_rate": 7.410149414456858e-05, "loss": 3.3578, "step": 5986 }, { "epoch": 7.65664, "grad_norm": 0.52614825963974, "learning_rate": 7.406111185893121e-05, "loss": 3.3742, "step": 5987 }, { "epoch": 7.65792, "grad_norm": 0.5331546664237976, "learning_rate": 7.402072957329384e-05, "loss": 3.4524, "step": 5988 }, { "epoch": 7.6592, "grad_norm": 0.5238944888114929, "learning_rate": 7.398034728765647e-05, "loss": 3.3707, "step": 5989 }, { "epoch": 7.66048, "grad_norm": 0.5260823965072632, "learning_rate": 7.39399650020191e-05, "loss": 3.4557, "step": 5990 }, { "epoch": 7.66176, "grad_norm": 0.5286345481872559, "learning_rate": 7.389958271638174e-05, "loss": 3.3191, "step": 5991 }, { "epoch": 7.66304, "grad_norm": 0.5183789134025574, "learning_rate": 7.385920043074437e-05, "loss": 3.3766, "step": 5992 }, { "epoch": 7.66432, "grad_norm": 0.5158443450927734, "learning_rate": 7.3818818145107e-05, "loss": 3.3405, "step": 5993 }, { "epoch": 7.6655999999999995, "grad_norm": 0.5058552026748657, "learning_rate": 7.377843585946963e-05, "loss": 3.3266, "step": 5994 }, { "epoch": 7.66688, "grad_norm": 0.5316519737243652, "learning_rate": 7.373805357383228e-05, "loss": 3.4087, "step": 5995 }, { "epoch": 7.66816, "grad_norm": 0.5409033298492432, "learning_rate": 7.369767128819491e-05, "loss": 3.3138, "step": 5996 }, { "epoch": 7.66944, "grad_norm": 0.5183295607566833, "learning_rate": 7.365728900255754e-05, "loss": 3.3506, "step": 5997 }, { "epoch": 7.67072, "grad_norm": 0.5203860998153687, "learning_rate": 7.361690671692017e-05, "loss": 3.3259, "step": 5998 }, { "epoch": 7.672, "grad_norm": 0.5194439888000488, "learning_rate": 7.357652443128281e-05, "loss": 3.3583, "step": 5999 }, { "epoch": 7.67328, "grad_norm": 0.5380210280418396, "learning_rate": 7.353614214564544e-05, "loss": 3.3976, "step": 6000 }, { "epoch": 7.67456, "grad_norm": 0.5133769512176514, "learning_rate": 7.349575986000807e-05, "loss": 3.3776, "step": 6001 }, { "epoch": 7.67584, "grad_norm": 0.5389159321784973, "learning_rate": 7.34553775743707e-05, "loss": 3.3787, "step": 6002 }, { "epoch": 7.67712, "grad_norm": 0.5545948147773743, "learning_rate": 7.341499528873333e-05, "loss": 3.3445, "step": 6003 }, { "epoch": 7.6784, "grad_norm": 0.5201895236968994, "learning_rate": 7.337461300309596e-05, "loss": 3.2763, "step": 6004 }, { "epoch": 7.67968, "grad_norm": 0.538422167301178, "learning_rate": 7.33342307174586e-05, "loss": 3.3572, "step": 6005 }, { "epoch": 7.68096, "grad_norm": 0.5424717664718628, "learning_rate": 7.329384843182124e-05, "loss": 3.3556, "step": 6006 }, { "epoch": 7.68224, "grad_norm": 0.5273032784461975, "learning_rate": 7.325346614618387e-05, "loss": 3.3478, "step": 6007 }, { "epoch": 7.68352, "grad_norm": 0.5107418298721313, "learning_rate": 7.32130838605465e-05, "loss": 3.4369, "step": 6008 }, { "epoch": 7.6848, "grad_norm": 0.5244372487068176, "learning_rate": 7.317270157490914e-05, "loss": 3.3214, "step": 6009 }, { "epoch": 7.6860800000000005, "grad_norm": 0.5354893207550049, "learning_rate": 7.313231928927177e-05, "loss": 3.3396, "step": 6010 }, { "epoch": 7.68736, "grad_norm": 0.5245274901390076, "learning_rate": 7.30919370036344e-05, "loss": 3.423, "step": 6011 }, { "epoch": 7.68864, "grad_norm": 0.528121829032898, "learning_rate": 7.305155471799703e-05, "loss": 3.4042, "step": 6012 }, { "epoch": 7.68992, "grad_norm": 0.5323206782341003, "learning_rate": 7.301117243235967e-05, "loss": 3.4324, "step": 6013 }, { "epoch": 7.6912, "grad_norm": 0.5250928401947021, "learning_rate": 7.29707901467223e-05, "loss": 3.3946, "step": 6014 }, { "epoch": 7.69248, "grad_norm": 0.5359845757484436, "learning_rate": 7.293040786108493e-05, "loss": 3.3705, "step": 6015 }, { "epoch": 7.69376, "grad_norm": 0.5194463729858398, "learning_rate": 7.289002557544756e-05, "loss": 3.3554, "step": 6016 }, { "epoch": 7.69504, "grad_norm": 0.5283463597297668, "learning_rate": 7.28496432898102e-05, "loss": 3.3502, "step": 6017 }, { "epoch": 7.69632, "grad_norm": 0.5234790444374084, "learning_rate": 7.280926100417282e-05, "loss": 3.3667, "step": 6018 }, { "epoch": 7.6975999999999996, "grad_norm": 0.5353789329528809, "learning_rate": 7.276887871853547e-05, "loss": 3.4113, "step": 6019 }, { "epoch": 7.69888, "grad_norm": 0.5060636401176453, "learning_rate": 7.27284964328981e-05, "loss": 3.4047, "step": 6020 }, { "epoch": 7.70016, "grad_norm": 0.5186939239501953, "learning_rate": 7.268811414726073e-05, "loss": 3.3966, "step": 6021 }, { "epoch": 7.70144, "grad_norm": 0.5250869989395142, "learning_rate": 7.264773186162336e-05, "loss": 3.4134, "step": 6022 }, { "epoch": 7.70272, "grad_norm": 0.5262473821640015, "learning_rate": 7.2607349575986e-05, "loss": 3.3685, "step": 6023 }, { "epoch": 7.704, "grad_norm": 0.5311341881752014, "learning_rate": 7.256696729034863e-05, "loss": 3.3809, "step": 6024 }, { "epoch": 7.70528, "grad_norm": 0.522276759147644, "learning_rate": 7.252658500471126e-05, "loss": 3.3186, "step": 6025 }, { "epoch": 7.70656, "grad_norm": 0.5295814871788025, "learning_rate": 7.248620271907389e-05, "loss": 3.3868, "step": 6026 }, { "epoch": 7.70784, "grad_norm": 0.5124343037605286, "learning_rate": 7.244582043343654e-05, "loss": 3.4317, "step": 6027 }, { "epoch": 7.70912, "grad_norm": 0.5283524990081787, "learning_rate": 7.240543814779915e-05, "loss": 3.3263, "step": 6028 }, { "epoch": 7.7104, "grad_norm": 0.5237511992454529, "learning_rate": 7.236505586216178e-05, "loss": 3.3889, "step": 6029 }, { "epoch": 7.71168, "grad_norm": 0.5321406126022339, "learning_rate": 7.232467357652443e-05, "loss": 3.3616, "step": 6030 }, { "epoch": 7.71296, "grad_norm": 0.5302574634552002, "learning_rate": 7.228429129088706e-05, "loss": 3.4753, "step": 6031 }, { "epoch": 7.71424, "grad_norm": 0.5124385356903076, "learning_rate": 7.224390900524969e-05, "loss": 3.3905, "step": 6032 }, { "epoch": 7.71552, "grad_norm": 0.5328637361526489, "learning_rate": 7.220352671961233e-05, "loss": 3.4005, "step": 6033 }, { "epoch": 7.7168, "grad_norm": 0.5470924377441406, "learning_rate": 7.216314443397496e-05, "loss": 3.3782, "step": 6034 }, { "epoch": 7.7180800000000005, "grad_norm": 0.553785502910614, "learning_rate": 7.212276214833759e-05, "loss": 3.3522, "step": 6035 }, { "epoch": 7.71936, "grad_norm": 0.533015251159668, "learning_rate": 7.208237986270022e-05, "loss": 3.3962, "step": 6036 }, { "epoch": 7.7206399999999995, "grad_norm": 0.5438454747200012, "learning_rate": 7.204199757706286e-05, "loss": 3.3756, "step": 6037 }, { "epoch": 7.72192, "grad_norm": 0.5408405661582947, "learning_rate": 7.20016152914255e-05, "loss": 3.3191, "step": 6038 }, { "epoch": 7.7232, "grad_norm": 0.543483316898346, "learning_rate": 7.196123300578812e-05, "loss": 3.4062, "step": 6039 }, { "epoch": 7.72448, "grad_norm": 0.49515143036842346, "learning_rate": 7.192085072015075e-05, "loss": 3.3466, "step": 6040 }, { "epoch": 7.72576, "grad_norm": 0.5400490164756775, "learning_rate": 7.188046843451338e-05, "loss": 3.3837, "step": 6041 }, { "epoch": 7.72704, "grad_norm": 0.5192726850509644, "learning_rate": 7.184008614887601e-05, "loss": 3.3819, "step": 6042 }, { "epoch": 7.72832, "grad_norm": 0.5206571221351624, "learning_rate": 7.179970386323864e-05, "loss": 3.4132, "step": 6043 }, { "epoch": 7.7296, "grad_norm": 0.5262637138366699, "learning_rate": 7.175932157760129e-05, "loss": 3.4121, "step": 6044 }, { "epoch": 7.73088, "grad_norm": 0.5163931846618652, "learning_rate": 7.171893929196392e-05, "loss": 3.3883, "step": 6045 }, { "epoch": 7.73216, "grad_norm": 0.5316890478134155, "learning_rate": 7.167855700632655e-05, "loss": 3.3518, "step": 6046 }, { "epoch": 7.73344, "grad_norm": 0.5298048853874207, "learning_rate": 7.163817472068918e-05, "loss": 3.4238, "step": 6047 }, { "epoch": 7.73472, "grad_norm": 0.5294202566146851, "learning_rate": 7.159779243505182e-05, "loss": 3.3889, "step": 6048 }, { "epoch": 7.736, "grad_norm": 0.5132120847702026, "learning_rate": 7.155741014941445e-05, "loss": 3.365, "step": 6049 }, { "epoch": 7.73728, "grad_norm": 0.5124995708465576, "learning_rate": 7.151702786377708e-05, "loss": 3.3348, "step": 6050 }, { "epoch": 7.73856, "grad_norm": 0.5307555794715881, "learning_rate": 7.147664557813973e-05, "loss": 3.3825, "step": 6051 }, { "epoch": 7.73984, "grad_norm": 0.5224402546882629, "learning_rate": 7.143626329250236e-05, "loss": 3.3489, "step": 6052 }, { "epoch": 7.7411200000000004, "grad_norm": 0.5100345015525818, "learning_rate": 7.139588100686499e-05, "loss": 3.4441, "step": 6053 }, { "epoch": 7.7424, "grad_norm": 0.5374388694763184, "learning_rate": 7.135549872122762e-05, "loss": 3.3539, "step": 6054 }, { "epoch": 7.74368, "grad_norm": 0.5344933867454529, "learning_rate": 7.131511643559025e-05, "loss": 3.3818, "step": 6055 }, { "epoch": 7.74496, "grad_norm": 0.5270034670829773, "learning_rate": 7.127473414995288e-05, "loss": 3.3496, "step": 6056 }, { "epoch": 7.74624, "grad_norm": 0.5249508619308472, "learning_rate": 7.12343518643155e-05, "loss": 3.4065, "step": 6057 }, { "epoch": 7.74752, "grad_norm": 0.5132743716239929, "learning_rate": 7.119396957867815e-05, "loss": 3.3701, "step": 6058 }, { "epoch": 7.7488, "grad_norm": 0.5167114734649658, "learning_rate": 7.115358729304078e-05, "loss": 3.385, "step": 6059 }, { "epoch": 7.75008, "grad_norm": 0.5301070809364319, "learning_rate": 7.111320500740341e-05, "loss": 3.3957, "step": 6060 }, { "epoch": 7.75136, "grad_norm": 0.5281143188476562, "learning_rate": 7.107282272176604e-05, "loss": 3.4052, "step": 6061 }, { "epoch": 7.7526399999999995, "grad_norm": 0.5261697173118591, "learning_rate": 7.103244043612868e-05, "loss": 3.4694, "step": 6062 }, { "epoch": 7.75392, "grad_norm": 0.5167369246482849, "learning_rate": 7.099205815049131e-05, "loss": 3.378, "step": 6063 }, { "epoch": 7.7552, "grad_norm": 0.5419825911521912, "learning_rate": 7.095167586485394e-05, "loss": 3.4047, "step": 6064 }, { "epoch": 7.75648, "grad_norm": 0.5268843173980713, "learning_rate": 7.091129357921657e-05, "loss": 3.3606, "step": 6065 }, { "epoch": 7.75776, "grad_norm": 0.5122654438018799, "learning_rate": 7.087091129357922e-05, "loss": 3.3676, "step": 6066 }, { "epoch": 7.75904, "grad_norm": 0.5293072462081909, "learning_rate": 7.083052900794185e-05, "loss": 3.3321, "step": 6067 }, { "epoch": 7.76032, "grad_norm": 0.5248163342475891, "learning_rate": 7.079014672230448e-05, "loss": 3.348, "step": 6068 }, { "epoch": 7.7616, "grad_norm": 0.5331951975822449, "learning_rate": 7.074976443666711e-05, "loss": 3.3557, "step": 6069 }, { "epoch": 7.76288, "grad_norm": 0.5159996151924133, "learning_rate": 7.070938215102974e-05, "loss": 3.3483, "step": 6070 }, { "epoch": 7.76416, "grad_norm": 0.519347071647644, "learning_rate": 7.066899986539237e-05, "loss": 3.3843, "step": 6071 }, { "epoch": 7.76544, "grad_norm": 0.5147475004196167, "learning_rate": 7.062861757975501e-05, "loss": 3.3476, "step": 6072 }, { "epoch": 7.76672, "grad_norm": 0.5350569486618042, "learning_rate": 7.058823529411764e-05, "loss": 3.4024, "step": 6073 }, { "epoch": 7.768, "grad_norm": 0.5119604468345642, "learning_rate": 7.054785300848027e-05, "loss": 3.3335, "step": 6074 }, { "epoch": 7.76928, "grad_norm": 0.5323851108551025, "learning_rate": 7.05074707228429e-05, "loss": 3.4564, "step": 6075 }, { "epoch": 7.77056, "grad_norm": 0.5254625678062439, "learning_rate": 7.046708843720554e-05, "loss": 3.4561, "step": 6076 }, { "epoch": 7.77184, "grad_norm": 0.5181812644004822, "learning_rate": 7.042670615156817e-05, "loss": 3.3622, "step": 6077 }, { "epoch": 7.7731200000000005, "grad_norm": 0.5225897431373596, "learning_rate": 7.03863238659308e-05, "loss": 3.4193, "step": 6078 }, { "epoch": 7.7744, "grad_norm": 0.5444203615188599, "learning_rate": 7.034594158029343e-05, "loss": 3.4026, "step": 6079 }, { "epoch": 7.77568, "grad_norm": 0.5264379978179932, "learning_rate": 7.030555929465608e-05, "loss": 3.448, "step": 6080 }, { "epoch": 7.77696, "grad_norm": 0.5221927762031555, "learning_rate": 7.026517700901871e-05, "loss": 3.3665, "step": 6081 }, { "epoch": 7.77824, "grad_norm": 0.5278477072715759, "learning_rate": 7.022479472338134e-05, "loss": 3.3952, "step": 6082 }, { "epoch": 7.77952, "grad_norm": 0.5419744849205017, "learning_rate": 7.018441243774397e-05, "loss": 3.4158, "step": 6083 }, { "epoch": 7.7808, "grad_norm": 0.5510439276695251, "learning_rate": 7.01440301521066e-05, "loss": 3.3966, "step": 6084 }, { "epoch": 7.78208, "grad_norm": 0.5095901489257812, "learning_rate": 7.010364786646923e-05, "loss": 3.3745, "step": 6085 }, { "epoch": 7.78336, "grad_norm": 0.5376297831535339, "learning_rate": 7.006326558083187e-05, "loss": 3.3438, "step": 6086 }, { "epoch": 7.78464, "grad_norm": 0.5214537382125854, "learning_rate": 7.00228832951945e-05, "loss": 3.4108, "step": 6087 }, { "epoch": 7.78592, "grad_norm": 0.5194911956787109, "learning_rate": 6.998250100955713e-05, "loss": 3.4137, "step": 6088 }, { "epoch": 7.7872, "grad_norm": 0.528439462184906, "learning_rate": 6.994211872391976e-05, "loss": 3.3615, "step": 6089 }, { "epoch": 7.78848, "grad_norm": 0.5447664856910706, "learning_rate": 6.99017364382824e-05, "loss": 3.3933, "step": 6090 }, { "epoch": 7.78976, "grad_norm": 0.5379071235656738, "learning_rate": 6.986135415264504e-05, "loss": 3.4011, "step": 6091 }, { "epoch": 7.79104, "grad_norm": 0.5375171303749084, "learning_rate": 6.982097186700767e-05, "loss": 3.4155, "step": 6092 }, { "epoch": 7.79232, "grad_norm": 0.5257745385169983, "learning_rate": 6.97805895813703e-05, "loss": 3.4416, "step": 6093 }, { "epoch": 7.7936, "grad_norm": 0.5261110663414001, "learning_rate": 6.974020729573294e-05, "loss": 3.4095, "step": 6094 }, { "epoch": 7.79488, "grad_norm": 0.5397837162017822, "learning_rate": 6.969982501009556e-05, "loss": 3.2971, "step": 6095 }, { "epoch": 7.79616, "grad_norm": 0.5269252061843872, "learning_rate": 6.96594427244582e-05, "loss": 3.3842, "step": 6096 }, { "epoch": 7.79744, "grad_norm": 0.514562726020813, "learning_rate": 6.961906043882083e-05, "loss": 3.4729, "step": 6097 }, { "epoch": 7.79872, "grad_norm": 0.5287720561027527, "learning_rate": 6.957867815318346e-05, "loss": 3.3533, "step": 6098 }, { "epoch": 7.8, "grad_norm": 0.5171098709106445, "learning_rate": 6.953829586754609e-05, "loss": 3.356, "step": 6099 }, { "epoch": 7.80128, "grad_norm": 0.5358878374099731, "learning_rate": 6.949791358190873e-05, "loss": 3.3566, "step": 6100 }, { "epoch": 7.80256, "grad_norm": 0.5136488080024719, "learning_rate": 6.945753129627136e-05, "loss": 3.4024, "step": 6101 }, { "epoch": 7.80384, "grad_norm": 0.5352822542190552, "learning_rate": 6.9417149010634e-05, "loss": 3.4127, "step": 6102 }, { "epoch": 7.80512, "grad_norm": 0.5107256174087524, "learning_rate": 6.937676672499662e-05, "loss": 3.3742, "step": 6103 }, { "epoch": 7.8064, "grad_norm": 0.527970552444458, "learning_rate": 6.933638443935927e-05, "loss": 3.4109, "step": 6104 }, { "epoch": 7.8076799999999995, "grad_norm": 0.5363277196884155, "learning_rate": 6.92960021537219e-05, "loss": 3.3898, "step": 6105 }, { "epoch": 7.80896, "grad_norm": 0.5282407999038696, "learning_rate": 6.925561986808453e-05, "loss": 3.3657, "step": 6106 }, { "epoch": 7.81024, "grad_norm": 0.5155186057090759, "learning_rate": 6.921523758244716e-05, "loss": 3.3866, "step": 6107 }, { "epoch": 7.81152, "grad_norm": 0.5109987258911133, "learning_rate": 6.91748552968098e-05, "loss": 3.3924, "step": 6108 }, { "epoch": 7.8128, "grad_norm": 0.5373751521110535, "learning_rate": 6.913447301117242e-05, "loss": 3.3803, "step": 6109 }, { "epoch": 7.81408, "grad_norm": 0.5129071474075317, "learning_rate": 6.909409072553506e-05, "loss": 3.3581, "step": 6110 }, { "epoch": 7.81536, "grad_norm": 0.5419071316719055, "learning_rate": 6.905370843989769e-05, "loss": 3.367, "step": 6111 }, { "epoch": 7.81664, "grad_norm": 0.5342729687690735, "learning_rate": 6.901332615426032e-05, "loss": 3.5005, "step": 6112 }, { "epoch": 7.81792, "grad_norm": 0.5251415371894836, "learning_rate": 6.897294386862295e-05, "loss": 3.3677, "step": 6113 }, { "epoch": 7.8192, "grad_norm": 0.5298883318901062, "learning_rate": 6.89325615829856e-05, "loss": 3.4143, "step": 6114 }, { "epoch": 7.82048, "grad_norm": 0.5260635018348694, "learning_rate": 6.889217929734823e-05, "loss": 3.4216, "step": 6115 }, { "epoch": 7.82176, "grad_norm": 0.5399680733680725, "learning_rate": 6.885179701171086e-05, "loss": 3.3927, "step": 6116 }, { "epoch": 7.82304, "grad_norm": 0.5211296081542969, "learning_rate": 6.881141472607349e-05, "loss": 3.4147, "step": 6117 }, { "epoch": 7.82432, "grad_norm": 0.5149162411689758, "learning_rate": 6.877103244043613e-05, "loss": 3.4004, "step": 6118 }, { "epoch": 7.8256, "grad_norm": 0.5208600759506226, "learning_rate": 6.873065015479876e-05, "loss": 3.305, "step": 6119 }, { "epoch": 7.82688, "grad_norm": 0.5258411765098572, "learning_rate": 6.869026786916139e-05, "loss": 3.3602, "step": 6120 }, { "epoch": 7.8281600000000005, "grad_norm": 0.5164110660552979, "learning_rate": 6.864988558352402e-05, "loss": 3.295, "step": 6121 }, { "epoch": 7.82944, "grad_norm": 0.539932131767273, "learning_rate": 6.860950329788665e-05, "loss": 3.3754, "step": 6122 }, { "epoch": 7.83072, "grad_norm": 0.5369081497192383, "learning_rate": 6.856912101224928e-05, "loss": 3.3797, "step": 6123 }, { "epoch": 7.832, "grad_norm": 0.5187609791755676, "learning_rate": 6.852873872661192e-05, "loss": 3.3654, "step": 6124 }, { "epoch": 7.83328, "grad_norm": 0.5491527915000916, "learning_rate": 6.848835644097455e-05, "loss": 3.4431, "step": 6125 }, { "epoch": 7.83456, "grad_norm": 0.5081817507743835, "learning_rate": 6.844797415533718e-05, "loss": 3.4171, "step": 6126 }, { "epoch": 7.83584, "grad_norm": 0.5206778645515442, "learning_rate": 6.840759186969981e-05, "loss": 3.38, "step": 6127 }, { "epoch": 7.83712, "grad_norm": 0.5148397088050842, "learning_rate": 6.836720958406246e-05, "loss": 3.3939, "step": 6128 }, { "epoch": 7.8384, "grad_norm": 0.5235511660575867, "learning_rate": 6.832682729842509e-05, "loss": 3.3396, "step": 6129 }, { "epoch": 7.8396799999999995, "grad_norm": 0.5200296640396118, "learning_rate": 6.828644501278772e-05, "loss": 3.3397, "step": 6130 }, { "epoch": 7.84096, "grad_norm": 0.5297372341156006, "learning_rate": 6.824606272715035e-05, "loss": 3.4623, "step": 6131 }, { "epoch": 7.84224, "grad_norm": 0.5361821055412292, "learning_rate": 6.820568044151299e-05, "loss": 3.381, "step": 6132 }, { "epoch": 7.84352, "grad_norm": 0.5185701847076416, "learning_rate": 6.816529815587562e-05, "loss": 3.3566, "step": 6133 }, { "epoch": 7.8448, "grad_norm": 0.5242226123809814, "learning_rate": 6.812491587023825e-05, "loss": 3.3903, "step": 6134 }, { "epoch": 7.84608, "grad_norm": 0.532424807548523, "learning_rate": 6.808453358460088e-05, "loss": 3.4556, "step": 6135 }, { "epoch": 7.84736, "grad_norm": 0.5150805115699768, "learning_rate": 6.804415129896351e-05, "loss": 3.3373, "step": 6136 }, { "epoch": 7.84864, "grad_norm": 0.5233043432235718, "learning_rate": 6.800376901332614e-05, "loss": 3.4448, "step": 6137 }, { "epoch": 7.84992, "grad_norm": 0.517623245716095, "learning_rate": 6.796338672768879e-05, "loss": 3.3695, "step": 6138 }, { "epoch": 7.8512, "grad_norm": 0.51863032579422, "learning_rate": 6.792300444205142e-05, "loss": 3.3963, "step": 6139 }, { "epoch": 7.85248, "grad_norm": 0.52508544921875, "learning_rate": 6.788262215641405e-05, "loss": 3.3959, "step": 6140 }, { "epoch": 7.85376, "grad_norm": 0.525643527507782, "learning_rate": 6.784223987077668e-05, "loss": 3.3873, "step": 6141 }, { "epoch": 7.85504, "grad_norm": 0.5046698451042175, "learning_rate": 6.780185758513932e-05, "loss": 3.4145, "step": 6142 }, { "epoch": 7.85632, "grad_norm": 0.5362650156021118, "learning_rate": 6.776147529950195e-05, "loss": 3.4785, "step": 6143 }, { "epoch": 7.8576, "grad_norm": 0.5106462240219116, "learning_rate": 6.772109301386458e-05, "loss": 3.4154, "step": 6144 }, { "epoch": 7.85888, "grad_norm": 0.5172496438026428, "learning_rate": 6.768071072822721e-05, "loss": 3.4247, "step": 6145 }, { "epoch": 7.8601600000000005, "grad_norm": 0.5204678773880005, "learning_rate": 6.764032844258985e-05, "loss": 3.4397, "step": 6146 }, { "epoch": 7.86144, "grad_norm": 0.5216001868247986, "learning_rate": 6.759994615695248e-05, "loss": 3.4111, "step": 6147 }, { "epoch": 7.86272, "grad_norm": 0.5186067819595337, "learning_rate": 6.755956387131511e-05, "loss": 3.3803, "step": 6148 }, { "epoch": 7.864, "grad_norm": 0.5313200950622559, "learning_rate": 6.751918158567774e-05, "loss": 3.3722, "step": 6149 }, { "epoch": 7.86528, "grad_norm": 0.530690610408783, "learning_rate": 6.747879930004037e-05, "loss": 3.3698, "step": 6150 }, { "epoch": 7.86656, "grad_norm": 0.5053249001502991, "learning_rate": 6.7438417014403e-05, "loss": 3.2738, "step": 6151 }, { "epoch": 7.86784, "grad_norm": 0.5100921988487244, "learning_rate": 6.739803472876563e-05, "loss": 3.3805, "step": 6152 }, { "epoch": 7.86912, "grad_norm": 0.5200990438461304, "learning_rate": 6.735765244312828e-05, "loss": 3.3805, "step": 6153 }, { "epoch": 7.8704, "grad_norm": 0.5229262113571167, "learning_rate": 6.731727015749091e-05, "loss": 3.3675, "step": 6154 }, { "epoch": 7.87168, "grad_norm": 0.536566436290741, "learning_rate": 6.727688787185354e-05, "loss": 3.4692, "step": 6155 }, { "epoch": 7.87296, "grad_norm": 0.5252888798713684, "learning_rate": 6.723650558621618e-05, "loss": 3.375, "step": 6156 }, { "epoch": 7.87424, "grad_norm": 0.5295806527137756, "learning_rate": 6.719612330057881e-05, "loss": 3.3758, "step": 6157 }, { "epoch": 7.87552, "grad_norm": 0.5360879302024841, "learning_rate": 6.715574101494144e-05, "loss": 3.394, "step": 6158 }, { "epoch": 7.8768, "grad_norm": 0.5231373310089111, "learning_rate": 6.711535872930407e-05, "loss": 3.3922, "step": 6159 }, { "epoch": 7.87808, "grad_norm": 0.5426596999168396, "learning_rate": 6.707497644366671e-05, "loss": 3.4305, "step": 6160 }, { "epoch": 7.87936, "grad_norm": 0.5190793871879578, "learning_rate": 6.703459415802935e-05, "loss": 3.3645, "step": 6161 }, { "epoch": 7.88064, "grad_norm": 0.5220685005187988, "learning_rate": 6.699421187239196e-05, "loss": 3.3472, "step": 6162 }, { "epoch": 7.88192, "grad_norm": 0.5117445588111877, "learning_rate": 6.69538295867546e-05, "loss": 3.3592, "step": 6163 }, { "epoch": 7.8832, "grad_norm": 0.520155668258667, "learning_rate": 6.691344730111724e-05, "loss": 3.4109, "step": 6164 }, { "epoch": 7.88448, "grad_norm": 0.5211771726608276, "learning_rate": 6.687306501547987e-05, "loss": 3.359, "step": 6165 }, { "epoch": 7.88576, "grad_norm": 0.5182546973228455, "learning_rate": 6.68326827298425e-05, "loss": 3.3939, "step": 6166 }, { "epoch": 7.88704, "grad_norm": 0.5254797339439392, "learning_rate": 6.679230044420514e-05, "loss": 3.359, "step": 6167 }, { "epoch": 7.88832, "grad_norm": 0.5227944254875183, "learning_rate": 6.675191815856777e-05, "loss": 3.4111, "step": 6168 }, { "epoch": 7.8896, "grad_norm": 0.5160208940505981, "learning_rate": 6.67115358729304e-05, "loss": 3.3495, "step": 6169 }, { "epoch": 7.89088, "grad_norm": 0.5098345279693604, "learning_rate": 6.667115358729303e-05, "loss": 3.329, "step": 6170 }, { "epoch": 7.89216, "grad_norm": 0.5212696194648743, "learning_rate": 6.663077130165567e-05, "loss": 3.4255, "step": 6171 }, { "epoch": 7.89344, "grad_norm": 0.5201159715652466, "learning_rate": 6.65903890160183e-05, "loss": 3.3864, "step": 6172 }, { "epoch": 7.8947199999999995, "grad_norm": 0.5263365507125854, "learning_rate": 6.655000673038093e-05, "loss": 3.3921, "step": 6173 }, { "epoch": 7.896, "grad_norm": 0.5230242013931274, "learning_rate": 6.650962444474358e-05, "loss": 3.3675, "step": 6174 }, { "epoch": 7.89728, "grad_norm": 0.5143063068389893, "learning_rate": 6.64692421591062e-05, "loss": 3.3708, "step": 6175 }, { "epoch": 7.89856, "grad_norm": 0.5212923288345337, "learning_rate": 6.642885987346882e-05, "loss": 3.4216, "step": 6176 }, { "epoch": 7.89984, "grad_norm": 0.5219258069992065, "learning_rate": 6.638847758783147e-05, "loss": 3.3633, "step": 6177 }, { "epoch": 7.90112, "grad_norm": 0.5209653377532959, "learning_rate": 6.63480953021941e-05, "loss": 3.3671, "step": 6178 }, { "epoch": 7.9024, "grad_norm": 0.5172528624534607, "learning_rate": 6.630771301655673e-05, "loss": 3.398, "step": 6179 }, { "epoch": 7.90368, "grad_norm": 0.5204746127128601, "learning_rate": 6.626733073091936e-05, "loss": 3.3294, "step": 6180 }, { "epoch": 7.90496, "grad_norm": 0.5181044936180115, "learning_rate": 6.6226948445282e-05, "loss": 3.2986, "step": 6181 }, { "epoch": 7.90624, "grad_norm": 0.503993034362793, "learning_rate": 6.618656615964463e-05, "loss": 3.2642, "step": 6182 }, { "epoch": 7.90752, "grad_norm": 0.5091139674186707, "learning_rate": 6.614618387400726e-05, "loss": 3.3122, "step": 6183 }, { "epoch": 7.9088, "grad_norm": 0.5170398950576782, "learning_rate": 6.610580158836989e-05, "loss": 3.3814, "step": 6184 }, { "epoch": 7.91008, "grad_norm": 0.5216673016548157, "learning_rate": 6.606541930273253e-05, "loss": 3.3738, "step": 6185 }, { "epoch": 7.91136, "grad_norm": 0.5129463076591492, "learning_rate": 6.602503701709516e-05, "loss": 3.285, "step": 6186 }, { "epoch": 7.91264, "grad_norm": 0.5073363184928894, "learning_rate": 6.59846547314578e-05, "loss": 3.3271, "step": 6187 }, { "epoch": 7.91392, "grad_norm": 0.5254284143447876, "learning_rate": 6.594427244582042e-05, "loss": 3.3854, "step": 6188 }, { "epoch": 7.9152000000000005, "grad_norm": 0.5197746157646179, "learning_rate": 6.590389016018305e-05, "loss": 3.4062, "step": 6189 }, { "epoch": 7.91648, "grad_norm": 0.5055711269378662, "learning_rate": 6.586350787454568e-05, "loss": 3.3037, "step": 6190 }, { "epoch": 7.91776, "grad_norm": 0.523854672908783, "learning_rate": 6.582312558890833e-05, "loss": 3.3311, "step": 6191 }, { "epoch": 7.91904, "grad_norm": 0.5296027064323425, "learning_rate": 6.578274330327096e-05, "loss": 3.4084, "step": 6192 }, { "epoch": 7.92032, "grad_norm": 0.5208480954170227, "learning_rate": 6.574236101763359e-05, "loss": 3.4572, "step": 6193 }, { "epoch": 7.9216, "grad_norm": 0.5234736800193787, "learning_rate": 6.570197873199622e-05, "loss": 3.4112, "step": 6194 }, { "epoch": 7.92288, "grad_norm": 0.5259544849395752, "learning_rate": 6.566159644635886e-05, "loss": 3.4369, "step": 6195 }, { "epoch": 7.92416, "grad_norm": 0.5302821397781372, "learning_rate": 6.562121416072149e-05, "loss": 3.3975, "step": 6196 }, { "epoch": 7.92544, "grad_norm": 0.5262648463249207, "learning_rate": 6.558083187508412e-05, "loss": 3.4066, "step": 6197 }, { "epoch": 7.9267199999999995, "grad_norm": 0.5254402756690979, "learning_rate": 6.554044958944675e-05, "loss": 3.3711, "step": 6198 }, { "epoch": 7.928, "grad_norm": 0.5265082120895386, "learning_rate": 6.55000673038094e-05, "loss": 3.367, "step": 6199 }, { "epoch": 7.92928, "grad_norm": 0.5290309190750122, "learning_rate": 6.545968501817203e-05, "loss": 3.3912, "step": 6200 }, { "epoch": 7.93056, "grad_norm": 0.5364521145820618, "learning_rate": 6.541930273253466e-05, "loss": 3.466, "step": 6201 }, { "epoch": 7.93184, "grad_norm": 0.5207248330116272, "learning_rate": 6.537892044689729e-05, "loss": 3.3728, "step": 6202 }, { "epoch": 7.93312, "grad_norm": 0.533179759979248, "learning_rate": 6.533853816125992e-05, "loss": 3.354, "step": 6203 }, { "epoch": 7.9344, "grad_norm": 0.5137979388237, "learning_rate": 6.529815587562255e-05, "loss": 3.3751, "step": 6204 }, { "epoch": 7.93568, "grad_norm": 0.5287781953811646, "learning_rate": 6.525777358998519e-05, "loss": 3.3851, "step": 6205 }, { "epoch": 7.93696, "grad_norm": 0.5349233746528625, "learning_rate": 6.521739130434782e-05, "loss": 3.4174, "step": 6206 }, { "epoch": 7.93824, "grad_norm": 0.5217055082321167, "learning_rate": 6.517700901871045e-05, "loss": 3.3981, "step": 6207 }, { "epoch": 7.93952, "grad_norm": 0.5294625759124756, "learning_rate": 6.513662673307308e-05, "loss": 3.4748, "step": 6208 }, { "epoch": 7.9408, "grad_norm": 0.5194503664970398, "learning_rate": 6.509624444743572e-05, "loss": 3.3751, "step": 6209 }, { "epoch": 7.94208, "grad_norm": 0.528069257736206, "learning_rate": 6.505586216179835e-05, "loss": 3.3722, "step": 6210 }, { "epoch": 7.94336, "grad_norm": 0.5228124856948853, "learning_rate": 6.501547987616098e-05, "loss": 3.3791, "step": 6211 }, { "epoch": 7.94464, "grad_norm": 0.5172755718231201, "learning_rate": 6.497509759052361e-05, "loss": 3.4098, "step": 6212 }, { "epoch": 7.94592, "grad_norm": 0.5152941346168518, "learning_rate": 6.493471530488626e-05, "loss": 3.4003, "step": 6213 }, { "epoch": 7.9472000000000005, "grad_norm": 0.53346186876297, "learning_rate": 6.489433301924889e-05, "loss": 3.3025, "step": 6214 }, { "epoch": 7.94848, "grad_norm": 0.5327422022819519, "learning_rate": 6.485395073361152e-05, "loss": 3.422, "step": 6215 }, { "epoch": 7.94976, "grad_norm": 0.5168991088867188, "learning_rate": 6.481356844797415e-05, "loss": 3.3234, "step": 6216 }, { "epoch": 7.95104, "grad_norm": 0.5401486754417419, "learning_rate": 6.477318616233678e-05, "loss": 3.4159, "step": 6217 }, { "epoch": 7.95232, "grad_norm": 0.5209543704986572, "learning_rate": 6.473280387669941e-05, "loss": 3.3988, "step": 6218 }, { "epoch": 7.9536, "grad_norm": 0.5165715217590332, "learning_rate": 6.469242159106205e-05, "loss": 3.3658, "step": 6219 }, { "epoch": 7.95488, "grad_norm": 0.5266816020011902, "learning_rate": 6.465203930542468e-05, "loss": 3.4037, "step": 6220 }, { "epoch": 7.95616, "grad_norm": 0.5167005658149719, "learning_rate": 6.461165701978731e-05, "loss": 3.343, "step": 6221 }, { "epoch": 7.95744, "grad_norm": 0.5324759483337402, "learning_rate": 6.457127473414994e-05, "loss": 3.3873, "step": 6222 }, { "epoch": 7.95872, "grad_norm": 0.5235298871994019, "learning_rate": 6.453089244851259e-05, "loss": 3.4334, "step": 6223 }, { "epoch": 7.96, "grad_norm": 0.5192801356315613, "learning_rate": 6.449051016287522e-05, "loss": 3.3825, "step": 6224 }, { "epoch": 7.96128, "grad_norm": 0.5173263549804688, "learning_rate": 6.445012787723785e-05, "loss": 3.4143, "step": 6225 }, { "epoch": 7.96256, "grad_norm": 0.5249988436698914, "learning_rate": 6.440974559160048e-05, "loss": 3.3573, "step": 6226 }, { "epoch": 7.96384, "grad_norm": 0.5259899497032166, "learning_rate": 6.436936330596312e-05, "loss": 3.3955, "step": 6227 }, { "epoch": 7.96512, "grad_norm": 0.5165438055992126, "learning_rate": 6.432898102032575e-05, "loss": 3.4257, "step": 6228 }, { "epoch": 7.9664, "grad_norm": 0.5043834447860718, "learning_rate": 6.428859873468838e-05, "loss": 3.3238, "step": 6229 }, { "epoch": 7.96768, "grad_norm": 0.5150227546691895, "learning_rate": 6.424821644905101e-05, "loss": 3.3857, "step": 6230 }, { "epoch": 7.96896, "grad_norm": 0.5162477493286133, "learning_rate": 6.420783416341364e-05, "loss": 3.351, "step": 6231 }, { "epoch": 7.97024, "grad_norm": 0.5149223804473877, "learning_rate": 6.416745187777627e-05, "loss": 3.3808, "step": 6232 }, { "epoch": 7.97152, "grad_norm": 0.5162880420684814, "learning_rate": 6.412706959213891e-05, "loss": 3.3048, "step": 6233 }, { "epoch": 7.9728, "grad_norm": 0.5108062624931335, "learning_rate": 6.408668730650154e-05, "loss": 3.442, "step": 6234 }, { "epoch": 7.97408, "grad_norm": 0.5145354270935059, "learning_rate": 6.404630502086417e-05, "loss": 3.3521, "step": 6235 }, { "epoch": 7.97536, "grad_norm": 0.5244855284690857, "learning_rate": 6.40059227352268e-05, "loss": 3.376, "step": 6236 }, { "epoch": 7.97664, "grad_norm": 0.5116865038871765, "learning_rate": 6.396554044958945e-05, "loss": 3.3633, "step": 6237 }, { "epoch": 7.97792, "grad_norm": 0.5205397009849548, "learning_rate": 6.392515816395208e-05, "loss": 3.4108, "step": 6238 }, { "epoch": 7.9792, "grad_norm": 0.5130834579467773, "learning_rate": 6.388477587831471e-05, "loss": 3.3463, "step": 6239 }, { "epoch": 7.98048, "grad_norm": 0.5261620283126831, "learning_rate": 6.384439359267734e-05, "loss": 3.4818, "step": 6240 }, { "epoch": 7.9817599999999995, "grad_norm": 0.5290659070014954, "learning_rate": 6.380401130703998e-05, "loss": 3.4639, "step": 6241 }, { "epoch": 7.98304, "grad_norm": 0.5278947353363037, "learning_rate": 6.376362902140261e-05, "loss": 3.4368, "step": 6242 }, { "epoch": 7.98432, "grad_norm": 0.5171101093292236, "learning_rate": 6.372324673576524e-05, "loss": 3.3449, "step": 6243 }, { "epoch": 7.9856, "grad_norm": 0.5271726250648499, "learning_rate": 6.368286445012787e-05, "loss": 3.3427, "step": 6244 }, { "epoch": 7.98688, "grad_norm": 0.5079402923583984, "learning_rate": 6.36424821644905e-05, "loss": 3.3444, "step": 6245 }, { "epoch": 7.98816, "grad_norm": 0.5159564018249512, "learning_rate": 6.360209987885313e-05, "loss": 3.3274, "step": 6246 }, { "epoch": 7.98944, "grad_norm": 0.5179126858711243, "learning_rate": 6.356171759321578e-05, "loss": 3.2998, "step": 6247 }, { "epoch": 7.99072, "grad_norm": 0.5274621248245239, "learning_rate": 6.35213353075784e-05, "loss": 3.2867, "step": 6248 }, { "epoch": 7.992, "grad_norm": 0.5217450857162476, "learning_rate": 6.348095302194104e-05, "loss": 3.4232, "step": 6249 }, { "epoch": 7.99328, "grad_norm": 0.5240947604179382, "learning_rate": 6.344057073630367e-05, "loss": 3.4344, "step": 6250 }, { "epoch": 7.99456, "grad_norm": 0.5134342312812805, "learning_rate": 6.340018845066631e-05, "loss": 3.365, "step": 6251 }, { "epoch": 7.99584, "grad_norm": 0.5243435502052307, "learning_rate": 6.335980616502894e-05, "loss": 3.3951, "step": 6252 }, { "epoch": 7.99712, "grad_norm": 0.5112805366516113, "learning_rate": 6.331942387939157e-05, "loss": 3.3713, "step": 6253 }, { "epoch": 7.9984, "grad_norm": 0.532479465007782, "learning_rate": 6.32790415937542e-05, "loss": 3.4131, "step": 6254 }, { "epoch": 7.99968, "grad_norm": 0.5240690112113953, "learning_rate": 6.323865930811684e-05, "loss": 3.3779, "step": 6255 }, { "epoch": 8.0, "grad_norm": 1.0870201587677002, "learning_rate": 6.319827702247946e-05, "loss": 3.4098, "step": 6256 }, { "epoch": 8.00128, "grad_norm": 0.5369760394096375, "learning_rate": 6.315789473684209e-05, "loss": 3.3303, "step": 6257 }, { "epoch": 8.00256, "grad_norm": 0.5155583620071411, "learning_rate": 6.311751245120473e-05, "loss": 3.3235, "step": 6258 }, { "epoch": 8.00384, "grad_norm": 0.50521320104599, "learning_rate": 6.307713016556736e-05, "loss": 3.3097, "step": 6259 }, { "epoch": 8.00512, "grad_norm": 0.5237175226211548, "learning_rate": 6.303674787993e-05, "loss": 3.3051, "step": 6260 }, { "epoch": 8.0064, "grad_norm": 0.521220862865448, "learning_rate": 6.299636559429264e-05, "loss": 3.2319, "step": 6261 }, { "epoch": 8.00768, "grad_norm": 0.5294133424758911, "learning_rate": 6.295598330865527e-05, "loss": 3.2934, "step": 6262 }, { "epoch": 8.00896, "grad_norm": 0.5125667452812195, "learning_rate": 6.29156010230179e-05, "loss": 3.1766, "step": 6263 }, { "epoch": 8.01024, "grad_norm": 0.5298846364021301, "learning_rate": 6.287521873738053e-05, "loss": 3.2827, "step": 6264 }, { "epoch": 8.01152, "grad_norm": 0.5239753127098083, "learning_rate": 6.283483645174317e-05, "loss": 3.256, "step": 6265 }, { "epoch": 8.0128, "grad_norm": 0.5199098587036133, "learning_rate": 6.27944541661058e-05, "loss": 3.2353, "step": 6266 }, { "epoch": 8.01408, "grad_norm": 0.5292956233024597, "learning_rate": 6.275407188046843e-05, "loss": 3.2311, "step": 6267 }, { "epoch": 8.01536, "grad_norm": 0.5216575860977173, "learning_rate": 6.271368959483106e-05, "loss": 3.3264, "step": 6268 }, { "epoch": 8.01664, "grad_norm": 0.514209508895874, "learning_rate": 6.26733073091937e-05, "loss": 3.2135, "step": 6269 }, { "epoch": 8.01792, "grad_norm": 0.5441368818283081, "learning_rate": 6.263292502355632e-05, "loss": 3.3422, "step": 6270 }, { "epoch": 8.0192, "grad_norm": 0.5475019216537476, "learning_rate": 6.259254273791895e-05, "loss": 3.2504, "step": 6271 }, { "epoch": 8.02048, "grad_norm": 0.5108786821365356, "learning_rate": 6.25521604522816e-05, "loss": 3.291, "step": 6272 }, { "epoch": 8.02176, "grad_norm": 0.532529890537262, "learning_rate": 6.251177816664423e-05, "loss": 3.3117, "step": 6273 }, { "epoch": 8.02304, "grad_norm": 0.5336194634437561, "learning_rate": 6.247139588100686e-05, "loss": 3.2797, "step": 6274 }, { "epoch": 8.02432, "grad_norm": 0.5180751085281372, "learning_rate": 6.243101359536949e-05, "loss": 3.2857, "step": 6275 }, { "epoch": 8.0256, "grad_norm": 0.5190191864967346, "learning_rate": 6.239063130973213e-05, "loss": 3.2449, "step": 6276 }, { "epoch": 8.02688, "grad_norm": 0.533854603767395, "learning_rate": 6.235024902409476e-05, "loss": 3.3588, "step": 6277 }, { "epoch": 8.02816, "grad_norm": 0.5298550724983215, "learning_rate": 6.230986673845739e-05, "loss": 3.2968, "step": 6278 }, { "epoch": 8.02944, "grad_norm": 0.5271071791648865, "learning_rate": 6.226948445282003e-05, "loss": 3.18, "step": 6279 }, { "epoch": 8.03072, "grad_norm": 0.5124052166938782, "learning_rate": 6.222910216718266e-05, "loss": 3.3069, "step": 6280 }, { "epoch": 8.032, "grad_norm": 0.5358180403709412, "learning_rate": 6.218871988154529e-05, "loss": 3.2968, "step": 6281 }, { "epoch": 8.03328, "grad_norm": 0.5310022234916687, "learning_rate": 6.214833759590792e-05, "loss": 3.3983, "step": 6282 }, { "epoch": 8.03456, "grad_norm": 0.5346135497093201, "learning_rate": 6.210795531027055e-05, "loss": 3.2622, "step": 6283 }, { "epoch": 8.03584, "grad_norm": 0.5122181177139282, "learning_rate": 6.206757302463318e-05, "loss": 3.3501, "step": 6284 }, { "epoch": 8.03712, "grad_norm": 0.5202990770339966, "learning_rate": 6.202719073899581e-05, "loss": 3.2425, "step": 6285 }, { "epoch": 8.0384, "grad_norm": 0.5240968465805054, "learning_rate": 6.198680845335846e-05, "loss": 3.2669, "step": 6286 }, { "epoch": 8.03968, "grad_norm": 0.5298383831977844, "learning_rate": 6.194642616772109e-05, "loss": 3.3057, "step": 6287 }, { "epoch": 8.04096, "grad_norm": 0.5407096743583679, "learning_rate": 6.190604388208372e-05, "loss": 3.2781, "step": 6288 }, { "epoch": 8.04224, "grad_norm": 0.5193719267845154, "learning_rate": 6.186566159644635e-05, "loss": 3.3438, "step": 6289 }, { "epoch": 8.043520000000001, "grad_norm": 0.5216055512428284, "learning_rate": 6.182527931080899e-05, "loss": 3.2728, "step": 6290 }, { "epoch": 8.0448, "grad_norm": 0.5255246758460999, "learning_rate": 6.178489702517162e-05, "loss": 3.2996, "step": 6291 }, { "epoch": 8.04608, "grad_norm": 0.5254226326942444, "learning_rate": 6.174451473953425e-05, "loss": 3.293, "step": 6292 }, { "epoch": 8.04736, "grad_norm": 0.5267927050590515, "learning_rate": 6.170413245389688e-05, "loss": 3.2104, "step": 6293 }, { "epoch": 8.04864, "grad_norm": 0.5204477906227112, "learning_rate": 6.166375016825952e-05, "loss": 3.297, "step": 6294 }, { "epoch": 8.04992, "grad_norm": 0.5418780446052551, "learning_rate": 6.162336788262215e-05, "loss": 3.3136, "step": 6295 }, { "epoch": 8.0512, "grad_norm": 0.5511976480484009, "learning_rate": 6.158298559698478e-05, "loss": 3.332, "step": 6296 }, { "epoch": 8.05248, "grad_norm": 0.5536786317825317, "learning_rate": 6.154260331134741e-05, "loss": 3.3544, "step": 6297 }, { "epoch": 8.05376, "grad_norm": 0.5378971099853516, "learning_rate": 6.150222102571004e-05, "loss": 3.2811, "step": 6298 }, { "epoch": 8.05504, "grad_norm": 0.5422542095184326, "learning_rate": 6.146183874007267e-05, "loss": 3.2559, "step": 6299 }, { "epoch": 8.05632, "grad_norm": 0.5333569049835205, "learning_rate": 6.142145645443532e-05, "loss": 3.3139, "step": 6300 }, { "epoch": 8.0576, "grad_norm": 0.5348880290985107, "learning_rate": 6.138107416879795e-05, "loss": 3.3467, "step": 6301 }, { "epoch": 8.05888, "grad_norm": 0.5167289972305298, "learning_rate": 6.134069188316058e-05, "loss": 3.2289, "step": 6302 }, { "epoch": 8.06016, "grad_norm": 0.5306516289710999, "learning_rate": 6.130030959752321e-05, "loss": 3.3242, "step": 6303 }, { "epoch": 8.06144, "grad_norm": 0.5348290801048279, "learning_rate": 6.125992731188585e-05, "loss": 3.36, "step": 6304 }, { "epoch": 8.06272, "grad_norm": 0.5243771076202393, "learning_rate": 6.121954502624848e-05, "loss": 3.2549, "step": 6305 }, { "epoch": 8.064, "grad_norm": 0.5279974341392517, "learning_rate": 6.117916274061111e-05, "loss": 3.3017, "step": 6306 }, { "epoch": 8.06528, "grad_norm": 0.539170503616333, "learning_rate": 6.113878045497374e-05, "loss": 3.2932, "step": 6307 }, { "epoch": 8.06656, "grad_norm": 0.5232791304588318, "learning_rate": 6.109839816933639e-05, "loss": 3.2744, "step": 6308 }, { "epoch": 8.06784, "grad_norm": 0.5283025503158569, "learning_rate": 6.105801588369902e-05, "loss": 3.3529, "step": 6309 }, { "epoch": 8.06912, "grad_norm": 0.5246297121047974, "learning_rate": 6.101763359806164e-05, "loss": 3.2392, "step": 6310 }, { "epoch": 8.0704, "grad_norm": 0.5458753108978271, "learning_rate": 6.0977251312424276e-05, "loss": 3.2406, "step": 6311 }, { "epoch": 8.07168, "grad_norm": 0.5204522609710693, "learning_rate": 6.0936869026786907e-05, "loss": 3.3418, "step": 6312 }, { "epoch": 8.07296, "grad_norm": 0.5313632488250732, "learning_rate": 6.089648674114954e-05, "loss": 3.3164, "step": 6313 }, { "epoch": 8.07424, "grad_norm": 0.5227934122085571, "learning_rate": 6.0856104455512173e-05, "loss": 3.2728, "step": 6314 }, { "epoch": 8.07552, "grad_norm": 0.5291623473167419, "learning_rate": 6.081572216987481e-05, "loss": 3.307, "step": 6315 }, { "epoch": 8.0768, "grad_norm": 0.5221740007400513, "learning_rate": 6.077533988423744e-05, "loss": 3.301, "step": 6316 }, { "epoch": 8.07808, "grad_norm": 0.528735339641571, "learning_rate": 6.073495759860008e-05, "loss": 3.2217, "step": 6317 }, { "epoch": 8.07936, "grad_norm": 0.5461089611053467, "learning_rate": 6.069457531296271e-05, "loss": 3.2215, "step": 6318 }, { "epoch": 8.08064, "grad_norm": 0.5363235473632812, "learning_rate": 6.0654193027325344e-05, "loss": 3.2331, "step": 6319 }, { "epoch": 8.08192, "grad_norm": 0.5322678089141846, "learning_rate": 6.0613810741687974e-05, "loss": 3.2927, "step": 6320 }, { "epoch": 8.0832, "grad_norm": 0.5260446071624756, "learning_rate": 6.057342845605061e-05, "loss": 3.2021, "step": 6321 }, { "epoch": 8.08448, "grad_norm": 0.5323824286460876, "learning_rate": 6.053304617041324e-05, "loss": 3.3024, "step": 6322 }, { "epoch": 8.08576, "grad_norm": 0.5511333346366882, "learning_rate": 6.049266388477588e-05, "loss": 3.2838, "step": 6323 }, { "epoch": 8.08704, "grad_norm": 0.5331417322158813, "learning_rate": 6.04522815991385e-05, "loss": 3.3127, "step": 6324 }, { "epoch": 8.08832, "grad_norm": 0.5371091961860657, "learning_rate": 6.041189931350114e-05, "loss": 3.3004, "step": 6325 }, { "epoch": 8.0896, "grad_norm": 0.5571906566619873, "learning_rate": 6.037151702786377e-05, "loss": 3.2509, "step": 6326 }, { "epoch": 8.09088, "grad_norm": 0.5286134481430054, "learning_rate": 6.0331134742226405e-05, "loss": 3.3488, "step": 6327 }, { "epoch": 8.09216, "grad_norm": 0.5509176254272461, "learning_rate": 6.0290752456589035e-05, "loss": 3.3017, "step": 6328 }, { "epoch": 8.09344, "grad_norm": 0.5480915307998657, "learning_rate": 6.025037017095167e-05, "loss": 3.2503, "step": 6329 }, { "epoch": 8.09472, "grad_norm": 0.5310816764831543, "learning_rate": 6.02099878853143e-05, "loss": 3.309, "step": 6330 }, { "epoch": 8.096, "grad_norm": 0.5437869429588318, "learning_rate": 6.016960559967694e-05, "loss": 3.2537, "step": 6331 }, { "epoch": 8.09728, "grad_norm": 0.5316812992095947, "learning_rate": 6.012922331403957e-05, "loss": 3.2957, "step": 6332 }, { "epoch": 8.09856, "grad_norm": 0.5401766896247864, "learning_rate": 6.0088841028402206e-05, "loss": 3.2455, "step": 6333 }, { "epoch": 8.09984, "grad_norm": 0.518835723400116, "learning_rate": 6.0048458742764836e-05, "loss": 3.298, "step": 6334 }, { "epoch": 8.10112, "grad_norm": 0.531012237071991, "learning_rate": 6.000807645712747e-05, "loss": 3.2884, "step": 6335 }, { "epoch": 8.1024, "grad_norm": 0.5368117094039917, "learning_rate": 5.99676941714901e-05, "loss": 3.2962, "step": 6336 }, { "epoch": 8.10368, "grad_norm": 0.5301125049591064, "learning_rate": 5.992731188585273e-05, "loss": 3.366, "step": 6337 }, { "epoch": 8.10496, "grad_norm": 0.5062065720558167, "learning_rate": 5.988692960021536e-05, "loss": 3.2497, "step": 6338 }, { "epoch": 8.10624, "grad_norm": 0.5318953990936279, "learning_rate": 5.9846547314578e-05, "loss": 3.3522, "step": 6339 }, { "epoch": 8.10752, "grad_norm": 0.5129379034042358, "learning_rate": 5.980616502894063e-05, "loss": 3.2826, "step": 6340 }, { "epoch": 8.1088, "grad_norm": 0.5241764187812805, "learning_rate": 5.976578274330327e-05, "loss": 3.295, "step": 6341 }, { "epoch": 8.11008, "grad_norm": 0.5228458046913147, "learning_rate": 5.97254004576659e-05, "loss": 3.2485, "step": 6342 }, { "epoch": 8.11136, "grad_norm": 0.523200273513794, "learning_rate": 5.9685018172028534e-05, "loss": 3.306, "step": 6343 }, { "epoch": 8.11264, "grad_norm": 0.5235135555267334, "learning_rate": 5.9644635886391164e-05, "loss": 3.2701, "step": 6344 }, { "epoch": 8.11392, "grad_norm": 0.5147677063941956, "learning_rate": 5.96042536007538e-05, "loss": 3.2849, "step": 6345 }, { "epoch": 8.1152, "grad_norm": 0.5381238460540771, "learning_rate": 5.956387131511643e-05, "loss": 3.3421, "step": 6346 }, { "epoch": 8.11648, "grad_norm": 0.5454177260398865, "learning_rate": 5.952348902947907e-05, "loss": 3.3036, "step": 6347 }, { "epoch": 8.11776, "grad_norm": 0.5195689797401428, "learning_rate": 5.94831067438417e-05, "loss": 3.2856, "step": 6348 }, { "epoch": 8.11904, "grad_norm": 0.5343729853630066, "learning_rate": 5.9442724458204335e-05, "loss": 3.2902, "step": 6349 }, { "epoch": 8.12032, "grad_norm": 0.5302938222885132, "learning_rate": 5.940234217256696e-05, "loss": 3.2755, "step": 6350 }, { "epoch": 8.1216, "grad_norm": 0.5336959958076477, "learning_rate": 5.9361959886929595e-05, "loss": 3.2973, "step": 6351 }, { "epoch": 8.12288, "grad_norm": 0.525851845741272, "learning_rate": 5.9321577601292225e-05, "loss": 3.2702, "step": 6352 }, { "epoch": 8.12416, "grad_norm": 0.5346911549568176, "learning_rate": 5.928119531565486e-05, "loss": 3.3683, "step": 6353 }, { "epoch": 8.12544, "grad_norm": 0.5326193571090698, "learning_rate": 5.924081303001749e-05, "loss": 3.3173, "step": 6354 }, { "epoch": 8.12672, "grad_norm": 0.5400025844573975, "learning_rate": 5.920043074438013e-05, "loss": 3.279, "step": 6355 }, { "epoch": 8.128, "grad_norm": 0.5323216915130615, "learning_rate": 5.916004845874276e-05, "loss": 3.2818, "step": 6356 }, { "epoch": 8.12928, "grad_norm": 0.5249941945075989, "learning_rate": 5.9119666173105395e-05, "loss": 3.2571, "step": 6357 }, { "epoch": 8.13056, "grad_norm": 0.5245072245597839, "learning_rate": 5.9079283887468026e-05, "loss": 3.2323, "step": 6358 }, { "epoch": 8.13184, "grad_norm": 0.5230678915977478, "learning_rate": 5.903890160183066e-05, "loss": 3.2932, "step": 6359 }, { "epoch": 8.13312, "grad_norm": 0.5385542511940002, "learning_rate": 5.899851931619329e-05, "loss": 3.3357, "step": 6360 }, { "epoch": 8.1344, "grad_norm": 0.5333831310272217, "learning_rate": 5.895813703055593e-05, "loss": 3.301, "step": 6361 }, { "epoch": 8.13568, "grad_norm": 0.5369766354560852, "learning_rate": 5.891775474491856e-05, "loss": 3.2642, "step": 6362 }, { "epoch": 8.13696, "grad_norm": 0.5322566032409668, "learning_rate": 5.8877372459281196e-05, "loss": 3.2542, "step": 6363 }, { "epoch": 8.13824, "grad_norm": 0.5302499532699585, "learning_rate": 5.883699017364382e-05, "loss": 3.2494, "step": 6364 }, { "epoch": 8.13952, "grad_norm": 0.5144988298416138, "learning_rate": 5.879660788800645e-05, "loss": 3.1926, "step": 6365 }, { "epoch": 8.1408, "grad_norm": 0.5225915908813477, "learning_rate": 5.8756225602369086e-05, "loss": 3.3514, "step": 6366 }, { "epoch": 8.14208, "grad_norm": 0.5414659976959229, "learning_rate": 5.871584331673172e-05, "loss": 3.3684, "step": 6367 }, { "epoch": 8.14336, "grad_norm": 0.5391842722892761, "learning_rate": 5.867546103109435e-05, "loss": 3.3141, "step": 6368 }, { "epoch": 8.14464, "grad_norm": 0.5198903679847717, "learning_rate": 5.863507874545699e-05, "loss": 3.284, "step": 6369 }, { "epoch": 8.14592, "grad_norm": 0.5301628112792969, "learning_rate": 5.859469645981962e-05, "loss": 3.2225, "step": 6370 }, { "epoch": 8.1472, "grad_norm": 0.5428996682167053, "learning_rate": 5.855431417418226e-05, "loss": 3.3104, "step": 6371 }, { "epoch": 8.14848, "grad_norm": 0.5339773297309875, "learning_rate": 5.851393188854489e-05, "loss": 3.3018, "step": 6372 }, { "epoch": 8.14976, "grad_norm": 0.52174973487854, "learning_rate": 5.8473549602907524e-05, "loss": 3.2703, "step": 6373 }, { "epoch": 8.15104, "grad_norm": 0.5308415293693542, "learning_rate": 5.8433167317270154e-05, "loss": 3.2691, "step": 6374 }, { "epoch": 8.15232, "grad_norm": 0.5573276877403259, "learning_rate": 5.839278503163279e-05, "loss": 3.34, "step": 6375 }, { "epoch": 8.1536, "grad_norm": 0.5412933230400085, "learning_rate": 5.835240274599542e-05, "loss": 3.286, "step": 6376 }, { "epoch": 8.15488, "grad_norm": 0.5277814269065857, "learning_rate": 5.8312020460358044e-05, "loss": 3.3651, "step": 6377 }, { "epoch": 8.15616, "grad_norm": 0.5270769596099854, "learning_rate": 5.827163817472068e-05, "loss": 3.3258, "step": 6378 }, { "epoch": 8.15744, "grad_norm": 0.5277435779571533, "learning_rate": 5.823125588908331e-05, "loss": 3.1801, "step": 6379 }, { "epoch": 8.15872, "grad_norm": 0.5209450125694275, "learning_rate": 5.819087360344595e-05, "loss": 3.2954, "step": 6380 }, { "epoch": 8.16, "grad_norm": 0.5338136553764343, "learning_rate": 5.815049131780858e-05, "loss": 3.2964, "step": 6381 }, { "epoch": 8.16128, "grad_norm": 0.547455370426178, "learning_rate": 5.8110109032171215e-05, "loss": 3.3137, "step": 6382 }, { "epoch": 8.16256, "grad_norm": 0.5422819256782532, "learning_rate": 5.8069726746533845e-05, "loss": 3.2683, "step": 6383 }, { "epoch": 8.16384, "grad_norm": 0.536233127117157, "learning_rate": 5.802934446089648e-05, "loss": 3.3036, "step": 6384 }, { "epoch": 8.16512, "grad_norm": 0.5339389443397522, "learning_rate": 5.798896217525911e-05, "loss": 3.3953, "step": 6385 }, { "epoch": 8.1664, "grad_norm": 0.5347949266433716, "learning_rate": 5.794857988962175e-05, "loss": 3.3728, "step": 6386 }, { "epoch": 8.16768, "grad_norm": 0.5407658219337463, "learning_rate": 5.7908197603984386e-05, "loss": 3.3447, "step": 6387 }, { "epoch": 8.16896, "grad_norm": 0.5457347631454468, "learning_rate": 5.7867815318347016e-05, "loss": 3.3002, "step": 6388 }, { "epoch": 8.17024, "grad_norm": 0.5405694842338562, "learning_rate": 5.782743303270965e-05, "loss": 3.2992, "step": 6389 }, { "epoch": 8.17152, "grad_norm": 0.5365909934043884, "learning_rate": 5.778705074707228e-05, "loss": 3.3, "step": 6390 }, { "epoch": 8.1728, "grad_norm": 0.5313602089881897, "learning_rate": 5.7746668461434906e-05, "loss": 3.2851, "step": 6391 }, { "epoch": 8.17408, "grad_norm": 0.5397671461105347, "learning_rate": 5.770628617579754e-05, "loss": 3.3395, "step": 6392 }, { "epoch": 8.17536, "grad_norm": 0.5422872304916382, "learning_rate": 5.766590389016017e-05, "loss": 3.2891, "step": 6393 }, { "epoch": 8.17664, "grad_norm": 0.5259747505187988, "learning_rate": 5.762552160452281e-05, "loss": 3.2912, "step": 6394 }, { "epoch": 8.17792, "grad_norm": 0.5186034440994263, "learning_rate": 5.758513931888544e-05, "loss": 3.2599, "step": 6395 }, { "epoch": 8.1792, "grad_norm": 0.5253992080688477, "learning_rate": 5.754475703324808e-05, "loss": 3.2672, "step": 6396 }, { "epoch": 8.18048, "grad_norm": 0.5303462743759155, "learning_rate": 5.750437474761071e-05, "loss": 3.3237, "step": 6397 }, { "epoch": 8.18176, "grad_norm": 0.5276507139205933, "learning_rate": 5.7463992461973344e-05, "loss": 3.2787, "step": 6398 }, { "epoch": 8.18304, "grad_norm": 0.5117890238761902, "learning_rate": 5.7423610176335974e-05, "loss": 3.2535, "step": 6399 }, { "epoch": 8.18432, "grad_norm": 0.5390210151672363, "learning_rate": 5.738322789069861e-05, "loss": 3.2985, "step": 6400 }, { "epoch": 8.1856, "grad_norm": 0.5502768754959106, "learning_rate": 5.734284560506124e-05, "loss": 3.2703, "step": 6401 }, { "epoch": 8.18688, "grad_norm": 0.533101499080658, "learning_rate": 5.730246331942388e-05, "loss": 3.3369, "step": 6402 }, { "epoch": 8.18816, "grad_norm": 0.5256611108779907, "learning_rate": 5.726208103378651e-05, "loss": 3.286, "step": 6403 }, { "epoch": 8.18944, "grad_norm": 0.5344136357307434, "learning_rate": 5.722169874814914e-05, "loss": 3.2811, "step": 6404 }, { "epoch": 8.19072, "grad_norm": 0.53043532371521, "learning_rate": 5.718131646251177e-05, "loss": 3.1944, "step": 6405 }, { "epoch": 8.192, "grad_norm": 0.5280171036720276, "learning_rate": 5.7140934176874405e-05, "loss": 3.3055, "step": 6406 }, { "epoch": 8.19328, "grad_norm": 0.5500734448432922, "learning_rate": 5.7100551891237035e-05, "loss": 3.3447, "step": 6407 }, { "epoch": 8.19456, "grad_norm": 0.5249525308609009, "learning_rate": 5.706016960559967e-05, "loss": 3.2479, "step": 6408 }, { "epoch": 8.19584, "grad_norm": 0.5362176895141602, "learning_rate": 5.70197873199623e-05, "loss": 3.3179, "step": 6409 }, { "epoch": 8.19712, "grad_norm": 0.53925621509552, "learning_rate": 5.697940503432494e-05, "loss": 3.3497, "step": 6410 }, { "epoch": 8.1984, "grad_norm": 0.5505580306053162, "learning_rate": 5.693902274868757e-05, "loss": 3.3724, "step": 6411 }, { "epoch": 8.19968, "grad_norm": 0.5253210067749023, "learning_rate": 5.6898640463050205e-05, "loss": 3.2298, "step": 6412 }, { "epoch": 8.20096, "grad_norm": 0.5256701707839966, "learning_rate": 5.6858258177412836e-05, "loss": 3.3066, "step": 6413 }, { "epoch": 8.20224, "grad_norm": 0.5349451899528503, "learning_rate": 5.681787589177547e-05, "loss": 3.3191, "step": 6414 }, { "epoch": 8.20352, "grad_norm": 0.5368797779083252, "learning_rate": 5.67774936061381e-05, "loss": 3.2995, "step": 6415 }, { "epoch": 8.2048, "grad_norm": 0.5200009942054749, "learning_rate": 5.673711132050074e-05, "loss": 3.3066, "step": 6416 }, { "epoch": 8.20608, "grad_norm": 0.5384882092475891, "learning_rate": 5.669672903486337e-05, "loss": 3.2356, "step": 6417 }, { "epoch": 8.20736, "grad_norm": 0.5600325465202332, "learning_rate": 5.6656346749226e-05, "loss": 3.341, "step": 6418 }, { "epoch": 8.20864, "grad_norm": 0.5298986434936523, "learning_rate": 5.661596446358863e-05, "loss": 3.278, "step": 6419 }, { "epoch": 8.20992, "grad_norm": 0.5479633808135986, "learning_rate": 5.6575582177951266e-05, "loss": 3.313, "step": 6420 }, { "epoch": 8.2112, "grad_norm": 0.5331804156303406, "learning_rate": 5.6535199892313896e-05, "loss": 3.2668, "step": 6421 }, { "epoch": 8.21248, "grad_norm": 0.5298319458961487, "learning_rate": 5.649481760667653e-05, "loss": 3.2764, "step": 6422 }, { "epoch": 8.21376, "grad_norm": 0.5214959383010864, "learning_rate": 5.645443532103916e-05, "loss": 3.2612, "step": 6423 }, { "epoch": 8.21504, "grad_norm": 0.5243988633155823, "learning_rate": 5.64140530354018e-05, "loss": 3.2165, "step": 6424 }, { "epoch": 8.21632, "grad_norm": 0.5409027934074402, "learning_rate": 5.637367074976443e-05, "loss": 3.3805, "step": 6425 }, { "epoch": 8.2176, "grad_norm": 0.5333059430122375, "learning_rate": 5.633328846412707e-05, "loss": 3.3277, "step": 6426 }, { "epoch": 8.21888, "grad_norm": 0.5418739914894104, "learning_rate": 5.62929061784897e-05, "loss": 3.3094, "step": 6427 }, { "epoch": 8.22016, "grad_norm": 0.544068455696106, "learning_rate": 5.6252523892852334e-05, "loss": 3.3722, "step": 6428 }, { "epoch": 8.22144, "grad_norm": 0.5321391820907593, "learning_rate": 5.6212141607214964e-05, "loss": 3.2523, "step": 6429 }, { "epoch": 8.22272, "grad_norm": 0.538314163684845, "learning_rate": 5.61717593215776e-05, "loss": 3.3437, "step": 6430 }, { "epoch": 8.224, "grad_norm": 0.519304096698761, "learning_rate": 5.6131377035940224e-05, "loss": 3.3433, "step": 6431 }, { "epoch": 8.22528, "grad_norm": 0.5347556471824646, "learning_rate": 5.609099475030286e-05, "loss": 3.2909, "step": 6432 }, { "epoch": 8.22656, "grad_norm": 0.5274844169616699, "learning_rate": 5.605061246466549e-05, "loss": 3.31, "step": 6433 }, { "epoch": 8.22784, "grad_norm": 0.516024112701416, "learning_rate": 5.601023017902813e-05, "loss": 3.2947, "step": 6434 }, { "epoch": 8.22912, "grad_norm": 0.5266866087913513, "learning_rate": 5.596984789339076e-05, "loss": 3.2854, "step": 6435 }, { "epoch": 8.2304, "grad_norm": 0.537972629070282, "learning_rate": 5.5929465607753395e-05, "loss": 3.3274, "step": 6436 }, { "epoch": 8.23168, "grad_norm": 0.5341406464576721, "learning_rate": 5.5889083322116025e-05, "loss": 3.2714, "step": 6437 }, { "epoch": 8.23296, "grad_norm": 0.5305014252662659, "learning_rate": 5.584870103647866e-05, "loss": 3.2615, "step": 6438 }, { "epoch": 8.23424, "grad_norm": 0.5244127511978149, "learning_rate": 5.580831875084129e-05, "loss": 3.328, "step": 6439 }, { "epoch": 8.23552, "grad_norm": 0.54160475730896, "learning_rate": 5.576793646520393e-05, "loss": 3.258, "step": 6440 }, { "epoch": 8.2368, "grad_norm": 0.5337183475494385, "learning_rate": 5.572755417956656e-05, "loss": 3.279, "step": 6441 }, { "epoch": 8.23808, "grad_norm": 0.5248258709907532, "learning_rate": 5.5687171893929196e-05, "loss": 3.2938, "step": 6442 }, { "epoch": 8.23936, "grad_norm": 0.5294501185417175, "learning_rate": 5.5646789608291826e-05, "loss": 3.2288, "step": 6443 }, { "epoch": 8.24064, "grad_norm": 0.5414791703224182, "learning_rate": 5.560640732265446e-05, "loss": 3.2559, "step": 6444 }, { "epoch": 8.24192, "grad_norm": 0.5217268466949463, "learning_rate": 5.5566025037017086e-05, "loss": 3.341, "step": 6445 }, { "epoch": 8.2432, "grad_norm": 0.5463381409645081, "learning_rate": 5.552564275137972e-05, "loss": 3.3459, "step": 6446 }, { "epoch": 8.24448, "grad_norm": 0.5450925827026367, "learning_rate": 5.548526046574235e-05, "loss": 3.287, "step": 6447 }, { "epoch": 8.24576, "grad_norm": 0.5325472950935364, "learning_rate": 5.544487818010499e-05, "loss": 3.285, "step": 6448 }, { "epoch": 8.24704, "grad_norm": 0.5504860877990723, "learning_rate": 5.540449589446762e-05, "loss": 3.3032, "step": 6449 }, { "epoch": 8.24832, "grad_norm": 0.5341525077819824, "learning_rate": 5.536411360883026e-05, "loss": 3.2986, "step": 6450 }, { "epoch": 8.2496, "grad_norm": 0.5327278971672058, "learning_rate": 5.532373132319289e-05, "loss": 3.2495, "step": 6451 }, { "epoch": 8.25088, "grad_norm": 0.5406553745269775, "learning_rate": 5.5283349037555524e-05, "loss": 3.3444, "step": 6452 }, { "epoch": 8.25216, "grad_norm": 0.5242081880569458, "learning_rate": 5.5242966751918154e-05, "loss": 3.264, "step": 6453 }, { "epoch": 8.25344, "grad_norm": 0.534663200378418, "learning_rate": 5.520258446628079e-05, "loss": 3.3117, "step": 6454 }, { "epoch": 8.25472, "grad_norm": 0.5145909190177917, "learning_rate": 5.516220218064342e-05, "loss": 3.255, "step": 6455 }, { "epoch": 8.256, "grad_norm": 0.5177584886550903, "learning_rate": 5.512181989500606e-05, "loss": 3.2581, "step": 6456 }, { "epoch": 8.25728, "grad_norm": 0.5454705357551575, "learning_rate": 5.508143760936869e-05, "loss": 3.2777, "step": 6457 }, { "epoch": 8.25856, "grad_norm": 0.5337566137313843, "learning_rate": 5.504105532373132e-05, "loss": 3.2425, "step": 6458 }, { "epoch": 8.25984, "grad_norm": 0.5299128293991089, "learning_rate": 5.500067303809395e-05, "loss": 3.3318, "step": 6459 }, { "epoch": 8.26112, "grad_norm": 0.5477068424224854, "learning_rate": 5.4960290752456585e-05, "loss": 3.3498, "step": 6460 }, { "epoch": 8.2624, "grad_norm": 0.5416424870491028, "learning_rate": 5.4919908466819215e-05, "loss": 3.3292, "step": 6461 }, { "epoch": 8.26368, "grad_norm": 0.5232448577880859, "learning_rate": 5.487952618118185e-05, "loss": 3.282, "step": 6462 }, { "epoch": 8.26496, "grad_norm": 0.5199217796325684, "learning_rate": 5.483914389554448e-05, "loss": 3.3211, "step": 6463 }, { "epoch": 8.26624, "grad_norm": 0.5238966941833496, "learning_rate": 5.479876160990712e-05, "loss": 3.2954, "step": 6464 }, { "epoch": 8.26752, "grad_norm": 0.5222386121749878, "learning_rate": 5.475837932426975e-05, "loss": 3.3141, "step": 6465 }, { "epoch": 8.2688, "grad_norm": 0.5436253547668457, "learning_rate": 5.4717997038632385e-05, "loss": 3.3027, "step": 6466 }, { "epoch": 8.27008, "grad_norm": 0.5344957709312439, "learning_rate": 5.4677614752995015e-05, "loss": 3.3603, "step": 6467 }, { "epoch": 8.27136, "grad_norm": 0.5461180806159973, "learning_rate": 5.463723246735765e-05, "loss": 3.3247, "step": 6468 }, { "epoch": 8.272639999999999, "grad_norm": 0.5331898927688599, "learning_rate": 5.459685018172028e-05, "loss": 3.2901, "step": 6469 }, { "epoch": 8.27392, "grad_norm": 0.5247555375099182, "learning_rate": 5.455646789608292e-05, "loss": 3.2163, "step": 6470 }, { "epoch": 8.2752, "grad_norm": 0.5188766121864319, "learning_rate": 5.451608561044554e-05, "loss": 3.2779, "step": 6471 }, { "epoch": 8.27648, "grad_norm": 0.5405585169792175, "learning_rate": 5.447570332480817e-05, "loss": 3.2365, "step": 6472 }, { "epoch": 8.27776, "grad_norm": 0.529931366443634, "learning_rate": 5.443532103917081e-05, "loss": 3.3514, "step": 6473 }, { "epoch": 8.27904, "grad_norm": 0.5279687643051147, "learning_rate": 5.4394938753533446e-05, "loss": 3.2361, "step": 6474 }, { "epoch": 8.28032, "grad_norm": 0.537344753742218, "learning_rate": 5.4354556467896076e-05, "loss": 3.2811, "step": 6475 }, { "epoch": 8.2816, "grad_norm": 0.5340175628662109, "learning_rate": 5.431417418225871e-05, "loss": 3.2844, "step": 6476 }, { "epoch": 8.28288, "grad_norm": 0.5298900604248047, "learning_rate": 5.427379189662134e-05, "loss": 3.3131, "step": 6477 }, { "epoch": 8.28416, "grad_norm": 0.5220656394958496, "learning_rate": 5.423340961098398e-05, "loss": 3.2696, "step": 6478 }, { "epoch": 8.28544, "grad_norm": 0.5230215191841125, "learning_rate": 5.419302732534661e-05, "loss": 3.2052, "step": 6479 }, { "epoch": 8.28672, "grad_norm": 0.5259697437286377, "learning_rate": 5.415264503970925e-05, "loss": 3.2847, "step": 6480 }, { "epoch": 8.288, "grad_norm": 0.5399904251098633, "learning_rate": 5.411226275407188e-05, "loss": 3.3553, "step": 6481 }, { "epoch": 8.28928, "grad_norm": 0.5356093645095825, "learning_rate": 5.4071880468434514e-05, "loss": 3.2998, "step": 6482 }, { "epoch": 8.29056, "grad_norm": 0.5491902232170105, "learning_rate": 5.4031498182797144e-05, "loss": 3.3246, "step": 6483 }, { "epoch": 8.29184, "grad_norm": 0.5284157991409302, "learning_rate": 5.399111589715978e-05, "loss": 3.3414, "step": 6484 }, { "epoch": 8.29312, "grad_norm": 0.5413424372673035, "learning_rate": 5.3950733611522404e-05, "loss": 3.3315, "step": 6485 }, { "epoch": 8.2944, "grad_norm": 0.5393568873405457, "learning_rate": 5.3910351325885034e-05, "loss": 3.3165, "step": 6486 }, { "epoch": 8.29568, "grad_norm": 0.5338280200958252, "learning_rate": 5.386996904024767e-05, "loss": 3.2115, "step": 6487 }, { "epoch": 8.29696, "grad_norm": 0.5379251837730408, "learning_rate": 5.38295867546103e-05, "loss": 3.2415, "step": 6488 }, { "epoch": 8.29824, "grad_norm": 0.5346460938453674, "learning_rate": 5.378920446897294e-05, "loss": 3.2432, "step": 6489 }, { "epoch": 8.29952, "grad_norm": 0.5359485149383545, "learning_rate": 5.374882218333557e-05, "loss": 3.285, "step": 6490 }, { "epoch": 8.3008, "grad_norm": 0.5321866869926453, "learning_rate": 5.3708439897698205e-05, "loss": 3.2818, "step": 6491 }, { "epoch": 8.30208, "grad_norm": 0.5524401068687439, "learning_rate": 5.366805761206084e-05, "loss": 3.317, "step": 6492 }, { "epoch": 8.30336, "grad_norm": 0.5479095578193665, "learning_rate": 5.362767532642347e-05, "loss": 3.3955, "step": 6493 }, { "epoch": 8.30464, "grad_norm": 0.5439134836196899, "learning_rate": 5.358729304078611e-05, "loss": 3.3354, "step": 6494 }, { "epoch": 8.30592, "grad_norm": 0.5380696058273315, "learning_rate": 5.354691075514874e-05, "loss": 3.2792, "step": 6495 }, { "epoch": 8.3072, "grad_norm": 0.537322998046875, "learning_rate": 5.3506528469511376e-05, "loss": 3.3121, "step": 6496 }, { "epoch": 8.30848, "grad_norm": 0.5562627911567688, "learning_rate": 5.3466146183874006e-05, "loss": 3.3261, "step": 6497 }, { "epoch": 8.30976, "grad_norm": 0.5425490140914917, "learning_rate": 5.342576389823663e-05, "loss": 3.3206, "step": 6498 }, { "epoch": 8.31104, "grad_norm": 0.5292763113975525, "learning_rate": 5.3385381612599266e-05, "loss": 3.283, "step": 6499 }, { "epoch": 8.31232, "grad_norm": 0.5407996773719788, "learning_rate": 5.3344999326961896e-05, "loss": 3.2909, "step": 6500 }, { "epoch": 8.3136, "grad_norm": 0.547214925289154, "learning_rate": 5.330461704132453e-05, "loss": 3.3512, "step": 6501 }, { "epoch": 8.31488, "grad_norm": 0.5352652668952942, "learning_rate": 5.326423475568716e-05, "loss": 3.337, "step": 6502 }, { "epoch": 8.31616, "grad_norm": 0.5354547500610352, "learning_rate": 5.32238524700498e-05, "loss": 3.2523, "step": 6503 }, { "epoch": 8.31744, "grad_norm": 0.5254250168800354, "learning_rate": 5.318347018441243e-05, "loss": 3.3103, "step": 6504 }, { "epoch": 8.31872, "grad_norm": 0.5346370935440063, "learning_rate": 5.314308789877507e-05, "loss": 3.2874, "step": 6505 }, { "epoch": 8.32, "grad_norm": 0.5400400161743164, "learning_rate": 5.31027056131377e-05, "loss": 3.3643, "step": 6506 }, { "epoch": 8.32128, "grad_norm": 0.5396955609321594, "learning_rate": 5.3062323327500334e-05, "loss": 3.2664, "step": 6507 }, { "epoch": 8.32256, "grad_norm": 0.5290980339050293, "learning_rate": 5.3021941041862964e-05, "loss": 3.2937, "step": 6508 }, { "epoch": 8.32384, "grad_norm": 0.5446150898933411, "learning_rate": 5.29815587562256e-05, "loss": 3.3385, "step": 6509 }, { "epoch": 8.32512, "grad_norm": 0.5492910146713257, "learning_rate": 5.294117647058824e-05, "loss": 3.2376, "step": 6510 }, { "epoch": 8.3264, "grad_norm": 0.518017053604126, "learning_rate": 5.290079418495087e-05, "loss": 3.2401, "step": 6511 }, { "epoch": 8.32768, "grad_norm": 0.527169406414032, "learning_rate": 5.286041189931349e-05, "loss": 3.2828, "step": 6512 }, { "epoch": 8.32896, "grad_norm": 0.5413787364959717, "learning_rate": 5.282002961367613e-05, "loss": 3.338, "step": 6513 }, { "epoch": 8.33024, "grad_norm": 0.5491598844528198, "learning_rate": 5.277964732803876e-05, "loss": 3.3348, "step": 6514 }, { "epoch": 8.33152, "grad_norm": 0.529373049736023, "learning_rate": 5.2739265042401395e-05, "loss": 3.3055, "step": 6515 }, { "epoch": 8.3328, "grad_norm": 0.5360579490661621, "learning_rate": 5.2698882756764025e-05, "loss": 3.2354, "step": 6516 }, { "epoch": 8.33408, "grad_norm": 0.5181642770767212, "learning_rate": 5.265850047112666e-05, "loss": 3.2713, "step": 6517 }, { "epoch": 8.33536, "grad_norm": 0.5401854515075684, "learning_rate": 5.261811818548929e-05, "loss": 3.314, "step": 6518 }, { "epoch": 8.33664, "grad_norm": 0.5461652874946594, "learning_rate": 5.257773589985193e-05, "loss": 3.2553, "step": 6519 }, { "epoch": 8.33792, "grad_norm": 0.5387153625488281, "learning_rate": 5.253735361421456e-05, "loss": 3.3327, "step": 6520 }, { "epoch": 8.3392, "grad_norm": 0.5197851657867432, "learning_rate": 5.2496971328577195e-05, "loss": 3.3562, "step": 6521 }, { "epoch": 8.34048, "grad_norm": 0.5335418581962585, "learning_rate": 5.2456589042939825e-05, "loss": 3.3083, "step": 6522 }, { "epoch": 8.34176, "grad_norm": 0.5208085775375366, "learning_rate": 5.241620675730246e-05, "loss": 3.2347, "step": 6523 }, { "epoch": 8.34304, "grad_norm": 0.5384852290153503, "learning_rate": 5.237582447166509e-05, "loss": 3.285, "step": 6524 }, { "epoch": 8.34432, "grad_norm": 0.5363668203353882, "learning_rate": 5.233544218602772e-05, "loss": 3.3247, "step": 6525 }, { "epoch": 8.3456, "grad_norm": 0.5379889011383057, "learning_rate": 5.229505990039035e-05, "loss": 3.3292, "step": 6526 }, { "epoch": 8.34688, "grad_norm": 0.5510496497154236, "learning_rate": 5.225467761475299e-05, "loss": 3.3561, "step": 6527 }, { "epoch": 8.34816, "grad_norm": 0.5368717908859253, "learning_rate": 5.221429532911562e-05, "loss": 3.2615, "step": 6528 }, { "epoch": 8.34944, "grad_norm": 0.5422154068946838, "learning_rate": 5.2173913043478256e-05, "loss": 3.2918, "step": 6529 }, { "epoch": 8.35072, "grad_norm": 0.5283968448638916, "learning_rate": 5.2133530757840886e-05, "loss": 3.2817, "step": 6530 }, { "epoch": 8.352, "grad_norm": 0.5295470952987671, "learning_rate": 5.209314847220352e-05, "loss": 3.2432, "step": 6531 }, { "epoch": 8.35328, "grad_norm": 0.5349634885787964, "learning_rate": 5.205276618656615e-05, "loss": 3.2913, "step": 6532 }, { "epoch": 8.35456, "grad_norm": 0.5370112657546997, "learning_rate": 5.201238390092879e-05, "loss": 3.2895, "step": 6533 }, { "epoch": 8.35584, "grad_norm": 0.5439594984054565, "learning_rate": 5.197200161529142e-05, "loss": 3.3035, "step": 6534 }, { "epoch": 8.35712, "grad_norm": 0.5383702516555786, "learning_rate": 5.193161932965406e-05, "loss": 3.3121, "step": 6535 }, { "epoch": 8.3584, "grad_norm": 0.5429950952529907, "learning_rate": 5.189123704401669e-05, "loss": 3.3598, "step": 6536 }, { "epoch": 8.35968, "grad_norm": 0.5377165675163269, "learning_rate": 5.1850854758379324e-05, "loss": 3.2763, "step": 6537 }, { "epoch": 8.36096, "grad_norm": 0.5333889126777649, "learning_rate": 5.1810472472741954e-05, "loss": 3.2872, "step": 6538 }, { "epoch": 8.36224, "grad_norm": 0.5367612242698669, "learning_rate": 5.1770090187104584e-05, "loss": 3.2694, "step": 6539 }, { "epoch": 8.36352, "grad_norm": 0.5359439253807068, "learning_rate": 5.1729707901467214e-05, "loss": 3.2822, "step": 6540 }, { "epoch": 8.3648, "grad_norm": 0.531587541103363, "learning_rate": 5.168932561582985e-05, "loss": 3.3343, "step": 6541 }, { "epoch": 8.36608, "grad_norm": 0.5479581952095032, "learning_rate": 5.164894333019248e-05, "loss": 3.2928, "step": 6542 }, { "epoch": 8.36736, "grad_norm": 0.5197001099586487, "learning_rate": 5.160856104455512e-05, "loss": 3.3011, "step": 6543 }, { "epoch": 8.36864, "grad_norm": 0.5361149311065674, "learning_rate": 5.156817875891775e-05, "loss": 3.2673, "step": 6544 }, { "epoch": 8.36992, "grad_norm": 0.5395957827568054, "learning_rate": 5.1527796473280385e-05, "loss": 3.3285, "step": 6545 }, { "epoch": 8.3712, "grad_norm": 0.538285493850708, "learning_rate": 5.1487414187643015e-05, "loss": 3.3397, "step": 6546 }, { "epoch": 8.37248, "grad_norm": 0.5274959206581116, "learning_rate": 5.144703190200565e-05, "loss": 3.2938, "step": 6547 }, { "epoch": 8.37376, "grad_norm": 0.5292668342590332, "learning_rate": 5.140664961636828e-05, "loss": 3.2236, "step": 6548 }, { "epoch": 8.37504, "grad_norm": 0.5355281233787537, "learning_rate": 5.136626733073092e-05, "loss": 3.3204, "step": 6549 }, { "epoch": 8.37632, "grad_norm": 0.5475792288780212, "learning_rate": 5.132588504509355e-05, "loss": 3.2975, "step": 6550 }, { "epoch": 8.3776, "grad_norm": 0.5429788827896118, "learning_rate": 5.1285502759456186e-05, "loss": 3.2851, "step": 6551 }, { "epoch": 8.37888, "grad_norm": 0.5370329022407532, "learning_rate": 5.124512047381881e-05, "loss": 3.32, "step": 6552 }, { "epoch": 8.38016, "grad_norm": 0.5378831028938293, "learning_rate": 5.1204738188181446e-05, "loss": 3.2567, "step": 6553 }, { "epoch": 8.38144, "grad_norm": 0.5425311326980591, "learning_rate": 5.1164355902544076e-05, "loss": 3.2856, "step": 6554 }, { "epoch": 8.38272, "grad_norm": 0.5385202765464783, "learning_rate": 5.112397361690671e-05, "loss": 3.3203, "step": 6555 }, { "epoch": 8.384, "grad_norm": 0.5249854922294617, "learning_rate": 5.108359133126934e-05, "loss": 3.2775, "step": 6556 }, { "epoch": 8.38528, "grad_norm": 0.5497921705245972, "learning_rate": 5.104320904563198e-05, "loss": 3.375, "step": 6557 }, { "epoch": 8.38656, "grad_norm": 0.5290823578834534, "learning_rate": 5.100282675999461e-05, "loss": 3.4186, "step": 6558 }, { "epoch": 8.38784, "grad_norm": 0.5338512659072876, "learning_rate": 5.096244447435725e-05, "loss": 3.3006, "step": 6559 }, { "epoch": 8.38912, "grad_norm": 0.5281364321708679, "learning_rate": 5.092206218871988e-05, "loss": 3.3595, "step": 6560 }, { "epoch": 8.3904, "grad_norm": 0.5322358012199402, "learning_rate": 5.0881679903082514e-05, "loss": 3.2731, "step": 6561 }, { "epoch": 8.39168, "grad_norm": 0.5335200428962708, "learning_rate": 5.0841297617445144e-05, "loss": 3.2838, "step": 6562 }, { "epoch": 8.39296, "grad_norm": 0.5297821760177612, "learning_rate": 5.080091533180778e-05, "loss": 3.3354, "step": 6563 }, { "epoch": 8.39424, "grad_norm": 0.5368463397026062, "learning_rate": 5.076053304617041e-05, "loss": 3.3192, "step": 6564 }, { "epoch": 8.39552, "grad_norm": 0.5247287154197693, "learning_rate": 5.072015076053304e-05, "loss": 3.3062, "step": 6565 }, { "epoch": 8.3968, "grad_norm": 0.540341317653656, "learning_rate": 5.067976847489567e-05, "loss": 3.3337, "step": 6566 }, { "epoch": 8.39808, "grad_norm": 0.5336794853210449, "learning_rate": 5.063938618925831e-05, "loss": 3.2744, "step": 6567 }, { "epoch": 8.39936, "grad_norm": 0.5233598351478577, "learning_rate": 5.059900390362094e-05, "loss": 3.2749, "step": 6568 }, { "epoch": 8.40064, "grad_norm": 0.5248898267745972, "learning_rate": 5.0558621617983575e-05, "loss": 3.2279, "step": 6569 }, { "epoch": 8.40192, "grad_norm": 0.5336484313011169, "learning_rate": 5.0518239332346205e-05, "loss": 3.2535, "step": 6570 }, { "epoch": 8.4032, "grad_norm": 0.550398051738739, "learning_rate": 5.047785704670884e-05, "loss": 3.3059, "step": 6571 }, { "epoch": 8.40448, "grad_norm": 0.5399869084358215, "learning_rate": 5.043747476107147e-05, "loss": 3.3524, "step": 6572 }, { "epoch": 8.40576, "grad_norm": 0.5220796465873718, "learning_rate": 5.039709247543411e-05, "loss": 3.2817, "step": 6573 }, { "epoch": 8.40704, "grad_norm": 0.5332768559455872, "learning_rate": 5.035671018979674e-05, "loss": 3.2817, "step": 6574 }, { "epoch": 8.40832, "grad_norm": 0.5253673195838928, "learning_rate": 5.0316327904159375e-05, "loss": 3.2993, "step": 6575 }, { "epoch": 8.4096, "grad_norm": 0.5454734563827515, "learning_rate": 5.0275945618522005e-05, "loss": 3.2808, "step": 6576 }, { "epoch": 8.41088, "grad_norm": 0.5498039126396179, "learning_rate": 5.023556333288464e-05, "loss": 3.3277, "step": 6577 }, { "epoch": 8.41216, "grad_norm": 0.52730792760849, "learning_rate": 5.019518104724727e-05, "loss": 3.2521, "step": 6578 }, { "epoch": 8.41344, "grad_norm": 0.5146779417991638, "learning_rate": 5.0154798761609896e-05, "loss": 3.3197, "step": 6579 }, { "epoch": 8.414719999999999, "grad_norm": 0.5284165740013123, "learning_rate": 5.011441647597253e-05, "loss": 3.3388, "step": 6580 }, { "epoch": 8.416, "grad_norm": 0.5253511667251587, "learning_rate": 5.007403419033517e-05, "loss": 3.324, "step": 6581 }, { "epoch": 8.41728, "grad_norm": 0.53730309009552, "learning_rate": 5.00336519046978e-05, "loss": 3.3254, "step": 6582 }, { "epoch": 8.41856, "grad_norm": 0.5318365693092346, "learning_rate": 4.9993269619060436e-05, "loss": 3.3386, "step": 6583 }, { "epoch": 8.41984, "grad_norm": 0.522120475769043, "learning_rate": 4.9952887333423066e-05, "loss": 3.2756, "step": 6584 }, { "epoch": 8.42112, "grad_norm": 0.5341548919677734, "learning_rate": 4.99125050477857e-05, "loss": 3.2815, "step": 6585 }, { "epoch": 8.4224, "grad_norm": 0.5254530310630798, "learning_rate": 4.987212276214833e-05, "loss": 3.3148, "step": 6586 }, { "epoch": 8.42368, "grad_norm": 0.516562283039093, "learning_rate": 4.983174047651097e-05, "loss": 3.3143, "step": 6587 }, { "epoch": 8.42496, "grad_norm": 0.5405133366584778, "learning_rate": 4.97913581908736e-05, "loss": 3.2954, "step": 6588 }, { "epoch": 8.42624, "grad_norm": 0.5392136573791504, "learning_rate": 4.975097590523624e-05, "loss": 3.2129, "step": 6589 }, { "epoch": 8.42752, "grad_norm": 0.5335177779197693, "learning_rate": 4.971059361959887e-05, "loss": 3.2511, "step": 6590 }, { "epoch": 8.4288, "grad_norm": 0.5130221843719482, "learning_rate": 4.9670211333961504e-05, "loss": 3.2803, "step": 6591 }, { "epoch": 8.43008, "grad_norm": 0.5377788543701172, "learning_rate": 4.962982904832413e-05, "loss": 3.3044, "step": 6592 }, { "epoch": 8.43136, "grad_norm": 0.535243034362793, "learning_rate": 4.958944676268676e-05, "loss": 3.3071, "step": 6593 }, { "epoch": 8.43264, "grad_norm": 0.5345697999000549, "learning_rate": 4.9549064477049394e-05, "loss": 3.3025, "step": 6594 }, { "epoch": 8.43392, "grad_norm": 0.5292974710464478, "learning_rate": 4.9508682191412024e-05, "loss": 3.3006, "step": 6595 }, { "epoch": 8.4352, "grad_norm": 0.5351077318191528, "learning_rate": 4.946829990577466e-05, "loss": 3.3633, "step": 6596 }, { "epoch": 8.43648, "grad_norm": 0.546981155872345, "learning_rate": 4.942791762013729e-05, "loss": 3.3898, "step": 6597 }, { "epoch": 8.43776, "grad_norm": 0.5353581309318542, "learning_rate": 4.938753533449993e-05, "loss": 3.2787, "step": 6598 }, { "epoch": 8.43904, "grad_norm": 0.5259009003639221, "learning_rate": 4.9347153048862565e-05, "loss": 3.204, "step": 6599 }, { "epoch": 8.44032, "grad_norm": 0.5294379591941833, "learning_rate": 4.9306770763225195e-05, "loss": 3.3074, "step": 6600 }, { "epoch": 8.4416, "grad_norm": 0.5439087748527527, "learning_rate": 4.926638847758783e-05, "loss": 3.2858, "step": 6601 }, { "epoch": 8.44288, "grad_norm": 0.5535233020782471, "learning_rate": 4.922600619195046e-05, "loss": 3.2854, "step": 6602 }, { "epoch": 8.44416, "grad_norm": 0.5227919220924377, "learning_rate": 4.91856239063131e-05, "loss": 3.2956, "step": 6603 }, { "epoch": 8.44544, "grad_norm": 0.525883674621582, "learning_rate": 4.914524162067573e-05, "loss": 3.3417, "step": 6604 }, { "epoch": 8.44672, "grad_norm": 0.5234944224357605, "learning_rate": 4.9104859335038366e-05, "loss": 3.2156, "step": 6605 }, { "epoch": 8.448, "grad_norm": 0.5246995091438293, "learning_rate": 4.906447704940099e-05, "loss": 3.2844, "step": 6606 }, { "epoch": 8.44928, "grad_norm": 0.5292709469795227, "learning_rate": 4.902409476376362e-05, "loss": 3.3131, "step": 6607 }, { "epoch": 8.45056, "grad_norm": 0.5411702394485474, "learning_rate": 4.8983712478126256e-05, "loss": 3.2209, "step": 6608 }, { "epoch": 8.45184, "grad_norm": 0.5438616871833801, "learning_rate": 4.8943330192488886e-05, "loss": 3.3588, "step": 6609 }, { "epoch": 8.45312, "grad_norm": 0.5124935507774353, "learning_rate": 4.890294790685152e-05, "loss": 3.2722, "step": 6610 }, { "epoch": 8.4544, "grad_norm": 0.5373281240463257, "learning_rate": 4.886256562121415e-05, "loss": 3.2317, "step": 6611 }, { "epoch": 8.45568, "grad_norm": 0.5373220443725586, "learning_rate": 4.882218333557679e-05, "loss": 3.2904, "step": 6612 }, { "epoch": 8.45696, "grad_norm": 0.526901125907898, "learning_rate": 4.878180104993942e-05, "loss": 3.3668, "step": 6613 }, { "epoch": 8.45824, "grad_norm": 0.5479453802108765, "learning_rate": 4.874141876430206e-05, "loss": 3.3688, "step": 6614 }, { "epoch": 8.45952, "grad_norm": 0.5343961715698242, "learning_rate": 4.870103647866469e-05, "loss": 3.3287, "step": 6615 }, { "epoch": 8.4608, "grad_norm": 0.5295287370681763, "learning_rate": 4.8660654193027324e-05, "loss": 3.3605, "step": 6616 }, { "epoch": 8.46208, "grad_norm": 0.5407337546348572, "learning_rate": 4.862027190738996e-05, "loss": 3.2565, "step": 6617 }, { "epoch": 8.46336, "grad_norm": 0.5319713950157166, "learning_rate": 4.857988962175259e-05, "loss": 3.2649, "step": 6618 }, { "epoch": 8.46464, "grad_norm": 0.5339657664299011, "learning_rate": 4.8539507336115214e-05, "loss": 3.2722, "step": 6619 }, { "epoch": 8.46592, "grad_norm": 0.5170288681983948, "learning_rate": 4.849912505047785e-05, "loss": 3.2732, "step": 6620 }, { "epoch": 8.4672, "grad_norm": 0.5304208397865295, "learning_rate": 4.845874276484048e-05, "loss": 3.2746, "step": 6621 }, { "epoch": 8.46848, "grad_norm": 0.5211367011070251, "learning_rate": 4.841836047920312e-05, "loss": 3.2367, "step": 6622 }, { "epoch": 8.46976, "grad_norm": 0.5377680659294128, "learning_rate": 4.837797819356575e-05, "loss": 3.3002, "step": 6623 }, { "epoch": 8.47104, "grad_norm": 0.532772421836853, "learning_rate": 4.8337595907928385e-05, "loss": 3.3249, "step": 6624 }, { "epoch": 8.47232, "grad_norm": 0.5326812863349915, "learning_rate": 4.8297213622291015e-05, "loss": 3.277, "step": 6625 }, { "epoch": 8.4736, "grad_norm": 0.532048761844635, "learning_rate": 4.825683133665365e-05, "loss": 3.3765, "step": 6626 }, { "epoch": 8.47488, "grad_norm": 0.5287336707115173, "learning_rate": 4.821644905101628e-05, "loss": 3.3689, "step": 6627 }, { "epoch": 8.47616, "grad_norm": 0.5333081483840942, "learning_rate": 4.817606676537892e-05, "loss": 3.3058, "step": 6628 }, { "epoch": 8.47744, "grad_norm": 0.5384047031402588, "learning_rate": 4.813568447974155e-05, "loss": 3.2469, "step": 6629 }, { "epoch": 8.47872, "grad_norm": 0.5469292998313904, "learning_rate": 4.8095302194104185e-05, "loss": 3.2969, "step": 6630 }, { "epoch": 8.48, "grad_norm": 0.5489159226417542, "learning_rate": 4.8054919908466815e-05, "loss": 3.3041, "step": 6631 }, { "epoch": 8.48128, "grad_norm": 0.5341904759407043, "learning_rate": 4.801453762282945e-05, "loss": 3.2527, "step": 6632 }, { "epoch": 8.48256, "grad_norm": 0.5368700623512268, "learning_rate": 4.7974155337192076e-05, "loss": 3.2925, "step": 6633 }, { "epoch": 8.48384, "grad_norm": 0.5526073575019836, "learning_rate": 4.793377305155471e-05, "loss": 3.2979, "step": 6634 }, { "epoch": 8.48512, "grad_norm": 0.5394701361656189, "learning_rate": 4.789339076591734e-05, "loss": 3.3142, "step": 6635 }, { "epoch": 8.4864, "grad_norm": 0.5406348705291748, "learning_rate": 4.785300848027998e-05, "loss": 3.277, "step": 6636 }, { "epoch": 8.48768, "grad_norm": 0.5281654000282288, "learning_rate": 4.781262619464261e-05, "loss": 3.2689, "step": 6637 }, { "epoch": 8.48896, "grad_norm": 0.519383430480957, "learning_rate": 4.7772243909005246e-05, "loss": 3.2823, "step": 6638 }, { "epoch": 8.49024, "grad_norm": 0.5329931974411011, "learning_rate": 4.7731861623367876e-05, "loss": 3.3037, "step": 6639 }, { "epoch": 8.49152, "grad_norm": 0.5432103872299194, "learning_rate": 4.769147933773051e-05, "loss": 3.3509, "step": 6640 }, { "epoch": 8.4928, "grad_norm": 0.523330807685852, "learning_rate": 4.765109705209314e-05, "loss": 3.3401, "step": 6641 }, { "epoch": 8.49408, "grad_norm": 0.5295400619506836, "learning_rate": 4.761071476645578e-05, "loss": 3.2433, "step": 6642 }, { "epoch": 8.49536, "grad_norm": 0.5371250510215759, "learning_rate": 4.757033248081841e-05, "loss": 3.2771, "step": 6643 }, { "epoch": 8.49664, "grad_norm": 0.5461635589599609, "learning_rate": 4.752995019518105e-05, "loss": 3.3759, "step": 6644 }, { "epoch": 8.49792, "grad_norm": 0.5435198545455933, "learning_rate": 4.748956790954368e-05, "loss": 3.343, "step": 6645 }, { "epoch": 8.4992, "grad_norm": 0.5389981269836426, "learning_rate": 4.744918562390631e-05, "loss": 3.2387, "step": 6646 }, { "epoch": 8.50048, "grad_norm": 0.5361154675483704, "learning_rate": 4.740880333826894e-05, "loss": 3.312, "step": 6647 }, { "epoch": 8.50176, "grad_norm": 0.5247445702552795, "learning_rate": 4.7368421052631574e-05, "loss": 3.2876, "step": 6648 }, { "epoch": 8.50304, "grad_norm": 0.5444439053535461, "learning_rate": 4.7328038766994204e-05, "loss": 3.2557, "step": 6649 }, { "epoch": 8.50432, "grad_norm": 0.549291729927063, "learning_rate": 4.728765648135684e-05, "loss": 3.291, "step": 6650 }, { "epoch": 8.5056, "grad_norm": 0.5521546602249146, "learning_rate": 4.724727419571947e-05, "loss": 3.3253, "step": 6651 }, { "epoch": 8.50688, "grad_norm": 0.5204415917396545, "learning_rate": 4.720689191008211e-05, "loss": 3.2887, "step": 6652 }, { "epoch": 8.50816, "grad_norm": 0.5449312329292297, "learning_rate": 4.716650962444474e-05, "loss": 3.2632, "step": 6653 }, { "epoch": 8.50944, "grad_norm": 0.558323323726654, "learning_rate": 4.7126127338807375e-05, "loss": 3.356, "step": 6654 }, { "epoch": 8.51072, "grad_norm": 0.5528385639190674, "learning_rate": 4.7085745053170005e-05, "loss": 3.3238, "step": 6655 }, { "epoch": 8.512, "grad_norm": 0.5442092418670654, "learning_rate": 4.704536276753264e-05, "loss": 3.1951, "step": 6656 }, { "epoch": 8.51328, "grad_norm": 0.5273060202598572, "learning_rate": 4.700498048189527e-05, "loss": 3.2714, "step": 6657 }, { "epoch": 8.51456, "grad_norm": 0.5554311871528625, "learning_rate": 4.696459819625791e-05, "loss": 3.3199, "step": 6658 }, { "epoch": 8.51584, "grad_norm": 0.540524423122406, "learning_rate": 4.692421591062053e-05, "loss": 3.3171, "step": 6659 }, { "epoch": 8.51712, "grad_norm": 0.543006420135498, "learning_rate": 4.688383362498317e-05, "loss": 3.3168, "step": 6660 }, { "epoch": 8.5184, "grad_norm": 0.552527666091919, "learning_rate": 4.68434513393458e-05, "loss": 3.3671, "step": 6661 }, { "epoch": 8.51968, "grad_norm": 0.5287879109382629, "learning_rate": 4.6803069053708436e-05, "loss": 3.2603, "step": 6662 }, { "epoch": 8.52096, "grad_norm": 0.5465594530105591, "learning_rate": 4.6762686768071066e-05, "loss": 3.287, "step": 6663 }, { "epoch": 8.52224, "grad_norm": 0.5492598414421082, "learning_rate": 4.67223044824337e-05, "loss": 3.299, "step": 6664 }, { "epoch": 8.52352, "grad_norm": 0.534814178943634, "learning_rate": 4.668192219679633e-05, "loss": 3.2982, "step": 6665 }, { "epoch": 8.5248, "grad_norm": 0.5302172303199768, "learning_rate": 4.664153991115897e-05, "loss": 3.2304, "step": 6666 }, { "epoch": 8.52608, "grad_norm": 0.5542375445365906, "learning_rate": 4.66011576255216e-05, "loss": 3.3307, "step": 6667 }, { "epoch": 8.52736, "grad_norm": 0.5232375860214233, "learning_rate": 4.6560775339884237e-05, "loss": 3.2925, "step": 6668 }, { "epoch": 8.52864, "grad_norm": 0.5362772941589355, "learning_rate": 4.652039305424687e-05, "loss": 3.3613, "step": 6669 }, { "epoch": 8.52992, "grad_norm": 0.5412434935569763, "learning_rate": 4.6480010768609504e-05, "loss": 3.3056, "step": 6670 }, { "epoch": 8.5312, "grad_norm": 0.5264190435409546, "learning_rate": 4.6439628482972134e-05, "loss": 3.2676, "step": 6671 }, { "epoch": 8.53248, "grad_norm": 0.5494565367698669, "learning_rate": 4.639924619733477e-05, "loss": 3.3138, "step": 6672 }, { "epoch": 8.533760000000001, "grad_norm": 0.5212265253067017, "learning_rate": 4.6358863911697394e-05, "loss": 3.3393, "step": 6673 }, { "epoch": 8.53504, "grad_norm": 0.5425857901573181, "learning_rate": 4.631848162606003e-05, "loss": 3.3338, "step": 6674 }, { "epoch": 8.53632, "grad_norm": 0.5471029877662659, "learning_rate": 4.627809934042266e-05, "loss": 3.2664, "step": 6675 }, { "epoch": 8.5376, "grad_norm": 0.5544648170471191, "learning_rate": 4.62377170547853e-05, "loss": 3.2952, "step": 6676 }, { "epoch": 8.53888, "grad_norm": 0.5421332716941833, "learning_rate": 4.619733476914793e-05, "loss": 3.3276, "step": 6677 }, { "epoch": 8.54016, "grad_norm": 0.5286380052566528, "learning_rate": 4.6156952483510564e-05, "loss": 3.2644, "step": 6678 }, { "epoch": 8.54144, "grad_norm": 0.5330486297607422, "learning_rate": 4.6116570197873195e-05, "loss": 3.3331, "step": 6679 }, { "epoch": 8.54272, "grad_norm": 0.5347878932952881, "learning_rate": 4.607618791223583e-05, "loss": 3.2441, "step": 6680 }, { "epoch": 8.544, "grad_norm": 0.5383884906768799, "learning_rate": 4.603580562659846e-05, "loss": 3.3088, "step": 6681 }, { "epoch": 8.54528, "grad_norm": 0.5470399856567383, "learning_rate": 4.59954233409611e-05, "loss": 3.2866, "step": 6682 }, { "epoch": 8.54656, "grad_norm": 0.5385159254074097, "learning_rate": 4.595504105532373e-05, "loss": 3.248, "step": 6683 }, { "epoch": 8.54784, "grad_norm": 0.5333342552185059, "learning_rate": 4.5914658769686365e-05, "loss": 3.3123, "step": 6684 }, { "epoch": 8.54912, "grad_norm": 0.5450199842453003, "learning_rate": 4.5874276484048995e-05, "loss": 3.2946, "step": 6685 }, { "epoch": 8.5504, "grad_norm": 0.552905797958374, "learning_rate": 4.5833894198411625e-05, "loss": 3.3255, "step": 6686 }, { "epoch": 8.55168, "grad_norm": 0.5411385297775269, "learning_rate": 4.5793511912774255e-05, "loss": 3.3116, "step": 6687 }, { "epoch": 8.55296, "grad_norm": 0.5458307266235352, "learning_rate": 4.575312962713689e-05, "loss": 3.3065, "step": 6688 }, { "epoch": 8.55424, "grad_norm": 0.526465892791748, "learning_rate": 4.571274734149952e-05, "loss": 3.3649, "step": 6689 }, { "epoch": 8.55552, "grad_norm": 0.5384424924850464, "learning_rate": 4.567236505586216e-05, "loss": 3.3591, "step": 6690 }, { "epoch": 8.556799999999999, "grad_norm": 0.5338189601898193, "learning_rate": 4.563198277022479e-05, "loss": 3.2857, "step": 6691 }, { "epoch": 8.55808, "grad_norm": 0.5296444296836853, "learning_rate": 4.5591600484587426e-05, "loss": 3.2774, "step": 6692 }, { "epoch": 8.55936, "grad_norm": 0.5314897894859314, "learning_rate": 4.5551218198950056e-05, "loss": 3.3251, "step": 6693 }, { "epoch": 8.56064, "grad_norm": 0.5258769989013672, "learning_rate": 4.551083591331269e-05, "loss": 3.285, "step": 6694 }, { "epoch": 8.56192, "grad_norm": 0.5228273868560791, "learning_rate": 4.547045362767532e-05, "loss": 3.327, "step": 6695 }, { "epoch": 8.5632, "grad_norm": 0.5333476662635803, "learning_rate": 4.543007134203796e-05, "loss": 3.2938, "step": 6696 }, { "epoch": 8.56448, "grad_norm": 0.5421110987663269, "learning_rate": 4.538968905640059e-05, "loss": 3.3763, "step": 6697 }, { "epoch": 8.565760000000001, "grad_norm": 0.526771605014801, "learning_rate": 4.534930677076323e-05, "loss": 3.2587, "step": 6698 }, { "epoch": 8.56704, "grad_norm": 0.5357040762901306, "learning_rate": 4.530892448512586e-05, "loss": 3.3598, "step": 6699 }, { "epoch": 8.56832, "grad_norm": 0.5350444316864014, "learning_rate": 4.526854219948848e-05, "loss": 3.2902, "step": 6700 }, { "epoch": 8.5696, "grad_norm": 0.524205207824707, "learning_rate": 4.522815991385112e-05, "loss": 3.3145, "step": 6701 }, { "epoch": 8.57088, "grad_norm": 0.5205404758453369, "learning_rate": 4.518777762821375e-05, "loss": 3.2893, "step": 6702 }, { "epoch": 8.57216, "grad_norm": 0.5430803894996643, "learning_rate": 4.5147395342576384e-05, "loss": 3.3511, "step": 6703 }, { "epoch": 8.57344, "grad_norm": 0.5332580804824829, "learning_rate": 4.510701305693902e-05, "loss": 3.3326, "step": 6704 }, { "epoch": 8.57472, "grad_norm": 0.5290676951408386, "learning_rate": 4.506663077130165e-05, "loss": 3.2984, "step": 6705 }, { "epoch": 8.576, "grad_norm": 0.5440507531166077, "learning_rate": 4.502624848566429e-05, "loss": 3.3492, "step": 6706 }, { "epoch": 8.57728, "grad_norm": 0.5364778637886047, "learning_rate": 4.498586620002692e-05, "loss": 3.2265, "step": 6707 }, { "epoch": 8.57856, "grad_norm": 0.5364824533462524, "learning_rate": 4.4945483914389555e-05, "loss": 3.291, "step": 6708 }, { "epoch": 8.57984, "grad_norm": 0.5298408269882202, "learning_rate": 4.4905101628752185e-05, "loss": 3.3376, "step": 6709 }, { "epoch": 8.58112, "grad_norm": 0.5235030651092529, "learning_rate": 4.486471934311482e-05, "loss": 3.3014, "step": 6710 }, { "epoch": 8.5824, "grad_norm": 0.5457448959350586, "learning_rate": 4.482433705747745e-05, "loss": 3.3132, "step": 6711 }, { "epoch": 8.58368, "grad_norm": 0.5244759321212769, "learning_rate": 4.478395477184009e-05, "loss": 3.3322, "step": 6712 }, { "epoch": 8.58496, "grad_norm": 0.5247620344161987, "learning_rate": 4.474357248620271e-05, "loss": 3.3435, "step": 6713 }, { "epoch": 8.58624, "grad_norm": 0.5359330177307129, "learning_rate": 4.470319020056534e-05, "loss": 3.2807, "step": 6714 }, { "epoch": 8.58752, "grad_norm": 0.5311152338981628, "learning_rate": 4.466280791492798e-05, "loss": 3.2728, "step": 6715 }, { "epoch": 8.588799999999999, "grad_norm": 0.5377166867256165, "learning_rate": 4.462242562929061e-05, "loss": 3.2665, "step": 6716 }, { "epoch": 8.59008, "grad_norm": 0.5189564824104309, "learning_rate": 4.4582043343653246e-05, "loss": 3.2582, "step": 6717 }, { "epoch": 8.59136, "grad_norm": 0.5309081673622131, "learning_rate": 4.4541661058015876e-05, "loss": 3.3129, "step": 6718 }, { "epoch": 8.59264, "grad_norm": 0.536917507648468, "learning_rate": 4.450127877237851e-05, "loss": 3.3365, "step": 6719 }, { "epoch": 8.59392, "grad_norm": 0.5495189428329468, "learning_rate": 4.446089648674114e-05, "loss": 3.2733, "step": 6720 }, { "epoch": 8.5952, "grad_norm": 0.5371974110603333, "learning_rate": 4.442051420110378e-05, "loss": 3.2837, "step": 6721 }, { "epoch": 8.59648, "grad_norm": 0.5370466113090515, "learning_rate": 4.438013191546641e-05, "loss": 3.3062, "step": 6722 }, { "epoch": 8.59776, "grad_norm": 0.5342411398887634, "learning_rate": 4.4339749629829047e-05, "loss": 3.3036, "step": 6723 }, { "epoch": 8.59904, "grad_norm": 0.5245928764343262, "learning_rate": 4.4299367344191683e-05, "loss": 3.2969, "step": 6724 }, { "epoch": 8.60032, "grad_norm": 0.5341728329658508, "learning_rate": 4.4258985058554314e-05, "loss": 3.3438, "step": 6725 }, { "epoch": 8.6016, "grad_norm": 0.5377516150474548, "learning_rate": 4.421860277291695e-05, "loss": 3.2733, "step": 6726 }, { "epoch": 8.60288, "grad_norm": 0.5328512191772461, "learning_rate": 4.4178220487279574e-05, "loss": 3.3902, "step": 6727 }, { "epoch": 8.60416, "grad_norm": 0.5404960513114929, "learning_rate": 4.4137838201642204e-05, "loss": 3.3356, "step": 6728 }, { "epoch": 8.60544, "grad_norm": 0.5246291160583496, "learning_rate": 4.409745591600484e-05, "loss": 3.2186, "step": 6729 }, { "epoch": 8.60672, "grad_norm": 0.5350708961486816, "learning_rate": 4.405707363036747e-05, "loss": 3.3441, "step": 6730 }, { "epoch": 8.608, "grad_norm": 0.525844395160675, "learning_rate": 4.401669134473011e-05, "loss": 3.3156, "step": 6731 }, { "epoch": 8.60928, "grad_norm": 0.5407573580741882, "learning_rate": 4.397630905909274e-05, "loss": 3.2902, "step": 6732 }, { "epoch": 8.61056, "grad_norm": 0.5487748384475708, "learning_rate": 4.3935926773455374e-05, "loss": 3.2765, "step": 6733 }, { "epoch": 8.61184, "grad_norm": 0.56423020362854, "learning_rate": 4.3895544487818005e-05, "loss": 3.2918, "step": 6734 }, { "epoch": 8.61312, "grad_norm": 0.5294039249420166, "learning_rate": 4.385516220218064e-05, "loss": 3.3598, "step": 6735 }, { "epoch": 8.6144, "grad_norm": 0.5347636342048645, "learning_rate": 4.381477991654327e-05, "loss": 3.2215, "step": 6736 }, { "epoch": 8.61568, "grad_norm": 0.5291707515716553, "learning_rate": 4.377439763090591e-05, "loss": 3.3315, "step": 6737 }, { "epoch": 8.61696, "grad_norm": 0.5520625710487366, "learning_rate": 4.373401534526854e-05, "loss": 3.3297, "step": 6738 }, { "epoch": 8.61824, "grad_norm": 0.5564367771148682, "learning_rate": 4.3693633059631175e-05, "loss": 3.3697, "step": 6739 }, { "epoch": 8.61952, "grad_norm": 0.5352851748466492, "learning_rate": 4.36532507739938e-05, "loss": 3.322, "step": 6740 }, { "epoch": 8.6208, "grad_norm": 0.5492033362388611, "learning_rate": 4.3612868488356435e-05, "loss": 3.3269, "step": 6741 }, { "epoch": 8.62208, "grad_norm": 0.5320399403572083, "learning_rate": 4.3572486202719065e-05, "loss": 3.3004, "step": 6742 }, { "epoch": 8.62336, "grad_norm": 0.5452033877372742, "learning_rate": 4.35321039170817e-05, "loss": 3.3551, "step": 6743 }, { "epoch": 8.62464, "grad_norm": 0.5418961644172668, "learning_rate": 4.349172163144433e-05, "loss": 3.3384, "step": 6744 }, { "epoch": 8.62592, "grad_norm": 0.5382152199745178, "learning_rate": 4.345133934580697e-05, "loss": 3.3212, "step": 6745 }, { "epoch": 8.6272, "grad_norm": 0.5336072444915771, "learning_rate": 4.34109570601696e-05, "loss": 3.2777, "step": 6746 }, { "epoch": 8.62848, "grad_norm": 0.5362292528152466, "learning_rate": 4.3370574774532236e-05, "loss": 3.3193, "step": 6747 }, { "epoch": 8.62976, "grad_norm": 0.5334807634353638, "learning_rate": 4.3330192488894866e-05, "loss": 3.2778, "step": 6748 }, { "epoch": 8.63104, "grad_norm": 0.5414783954620361, "learning_rate": 4.32898102032575e-05, "loss": 3.3341, "step": 6749 }, { "epoch": 8.63232, "grad_norm": 0.5455512404441833, "learning_rate": 4.324942791762013e-05, "loss": 3.2904, "step": 6750 }, { "epoch": 8.6336, "grad_norm": 0.5280025601387024, "learning_rate": 4.320904563198277e-05, "loss": 3.2243, "step": 6751 }, { "epoch": 8.63488, "grad_norm": 0.5390767455101013, "learning_rate": 4.31686633463454e-05, "loss": 3.3436, "step": 6752 }, { "epoch": 8.63616, "grad_norm": 0.5390846133232117, "learning_rate": 4.312828106070803e-05, "loss": 3.261, "step": 6753 }, { "epoch": 8.63744, "grad_norm": 0.5264686942100525, "learning_rate": 4.308789877507066e-05, "loss": 3.3319, "step": 6754 }, { "epoch": 8.63872, "grad_norm": 0.5262269377708435, "learning_rate": 4.30475164894333e-05, "loss": 3.2829, "step": 6755 }, { "epoch": 8.64, "grad_norm": 0.5335661768913269, "learning_rate": 4.300713420379593e-05, "loss": 3.2751, "step": 6756 }, { "epoch": 8.64128, "grad_norm": 0.5311907529830933, "learning_rate": 4.2966751918158564e-05, "loss": 3.3261, "step": 6757 }, { "epoch": 8.64256, "grad_norm": 0.5352111458778381, "learning_rate": 4.2926369632521194e-05, "loss": 3.2754, "step": 6758 }, { "epoch": 8.64384, "grad_norm": 0.5307966470718384, "learning_rate": 4.288598734688383e-05, "loss": 3.3116, "step": 6759 }, { "epoch": 8.64512, "grad_norm": 0.536453127861023, "learning_rate": 4.284560506124646e-05, "loss": 3.3126, "step": 6760 }, { "epoch": 8.6464, "grad_norm": 0.5201568007469177, "learning_rate": 4.28052227756091e-05, "loss": 3.2846, "step": 6761 }, { "epoch": 8.64768, "grad_norm": 0.5415171980857849, "learning_rate": 4.276484048997173e-05, "loss": 3.3514, "step": 6762 }, { "epoch": 8.64896, "grad_norm": 0.5305605530738831, "learning_rate": 4.2724458204334365e-05, "loss": 3.2887, "step": 6763 }, { "epoch": 8.65024, "grad_norm": 0.5250943303108215, "learning_rate": 4.2684075918696995e-05, "loss": 3.3334, "step": 6764 }, { "epoch": 8.65152, "grad_norm": 0.5324791073799133, "learning_rate": 4.264369363305963e-05, "loss": 3.246, "step": 6765 }, { "epoch": 8.6528, "grad_norm": 0.5431379675865173, "learning_rate": 4.260331134742226e-05, "loss": 3.2801, "step": 6766 }, { "epoch": 8.65408, "grad_norm": 0.5285992622375488, "learning_rate": 4.256292906178489e-05, "loss": 3.2725, "step": 6767 }, { "epoch": 8.65536, "grad_norm": 0.5373889803886414, "learning_rate": 4.252254677614752e-05, "loss": 3.3643, "step": 6768 }, { "epoch": 8.65664, "grad_norm": 0.5431947708129883, "learning_rate": 4.248216449051016e-05, "loss": 3.3618, "step": 6769 }, { "epoch": 8.65792, "grad_norm": 0.5260449647903442, "learning_rate": 4.244178220487279e-05, "loss": 3.265, "step": 6770 }, { "epoch": 8.6592, "grad_norm": 0.5484868288040161, "learning_rate": 4.2401399919235426e-05, "loss": 3.3274, "step": 6771 }, { "epoch": 8.66048, "grad_norm": 0.5380119681358337, "learning_rate": 4.2361017633598056e-05, "loss": 3.2953, "step": 6772 }, { "epoch": 8.66176, "grad_norm": 0.539348840713501, "learning_rate": 4.232063534796069e-05, "loss": 3.2966, "step": 6773 }, { "epoch": 8.66304, "grad_norm": 0.528852641582489, "learning_rate": 4.228025306232332e-05, "loss": 3.3094, "step": 6774 }, { "epoch": 8.66432, "grad_norm": 0.5262014269828796, "learning_rate": 4.223987077668596e-05, "loss": 3.237, "step": 6775 }, { "epoch": 8.6656, "grad_norm": 0.5200825333595276, "learning_rate": 4.219948849104859e-05, "loss": 3.2816, "step": 6776 }, { "epoch": 8.66688, "grad_norm": 0.5474440455436707, "learning_rate": 4.2159106205411227e-05, "loss": 3.3406, "step": 6777 }, { "epoch": 8.66816, "grad_norm": 0.5352291464805603, "learning_rate": 4.2118723919773857e-05, "loss": 3.279, "step": 6778 }, { "epoch": 8.66944, "grad_norm": 0.5264368653297424, "learning_rate": 4.2078341634136493e-05, "loss": 3.2559, "step": 6779 }, { "epoch": 8.67072, "grad_norm": 0.5307362675666809, "learning_rate": 4.203795934849912e-05, "loss": 3.2877, "step": 6780 }, { "epoch": 8.672, "grad_norm": 0.5339285135269165, "learning_rate": 4.1997577062861754e-05, "loss": 3.3039, "step": 6781 }, { "epoch": 8.67328, "grad_norm": 0.5298980474472046, "learning_rate": 4.1957194777224384e-05, "loss": 3.2824, "step": 6782 }, { "epoch": 8.67456, "grad_norm": 0.5294161438941956, "learning_rate": 4.191681249158702e-05, "loss": 3.3326, "step": 6783 }, { "epoch": 8.67584, "grad_norm": 0.5197745561599731, "learning_rate": 4.187643020594965e-05, "loss": 3.2917, "step": 6784 }, { "epoch": 8.67712, "grad_norm": 0.5530399680137634, "learning_rate": 4.183604792031229e-05, "loss": 3.2526, "step": 6785 }, { "epoch": 8.6784, "grad_norm": 0.534653902053833, "learning_rate": 4.179566563467492e-05, "loss": 3.3122, "step": 6786 }, { "epoch": 8.67968, "grad_norm": 0.5447145700454712, "learning_rate": 4.1755283349037554e-05, "loss": 3.3016, "step": 6787 }, { "epoch": 8.68096, "grad_norm": 0.5226910710334778, "learning_rate": 4.1714901063400184e-05, "loss": 3.2551, "step": 6788 }, { "epoch": 8.68224, "grad_norm": 0.5207378268241882, "learning_rate": 4.167451877776282e-05, "loss": 3.247, "step": 6789 }, { "epoch": 8.68352, "grad_norm": 0.5464200973510742, "learning_rate": 4.163413649212545e-05, "loss": 3.2968, "step": 6790 }, { "epoch": 8.6848, "grad_norm": 0.5588473081588745, "learning_rate": 4.159375420648809e-05, "loss": 3.301, "step": 6791 }, { "epoch": 8.68608, "grad_norm": 0.5364888906478882, "learning_rate": 4.155337192085072e-05, "loss": 3.2832, "step": 6792 }, { "epoch": 8.68736, "grad_norm": 0.5507677793502808, "learning_rate": 4.1512989635213355e-05, "loss": 3.3312, "step": 6793 }, { "epoch": 8.68864, "grad_norm": 0.5295298099517822, "learning_rate": 4.147260734957598e-05, "loss": 3.3066, "step": 6794 }, { "epoch": 8.68992, "grad_norm": 0.5314168334007263, "learning_rate": 4.1432225063938615e-05, "loss": 3.2516, "step": 6795 }, { "epoch": 8.6912, "grad_norm": 0.5450182557106018, "learning_rate": 4.1391842778301245e-05, "loss": 3.309, "step": 6796 }, { "epoch": 8.69248, "grad_norm": 0.5385010242462158, "learning_rate": 4.135146049266388e-05, "loss": 3.1977, "step": 6797 }, { "epoch": 8.69376, "grad_norm": 0.5317445397377014, "learning_rate": 4.131107820702651e-05, "loss": 3.2691, "step": 6798 }, { "epoch": 8.69504, "grad_norm": 0.536864697933197, "learning_rate": 4.127069592138915e-05, "loss": 3.2888, "step": 6799 }, { "epoch": 8.69632, "grad_norm": 0.5189259648323059, "learning_rate": 4.123031363575178e-05, "loss": 3.2824, "step": 6800 }, { "epoch": 8.6976, "grad_norm": 0.5167062282562256, "learning_rate": 4.1189931350114416e-05, "loss": 3.2385, "step": 6801 }, { "epoch": 8.698879999999999, "grad_norm": 0.5420131683349609, "learning_rate": 4.1149549064477046e-05, "loss": 3.4164, "step": 6802 }, { "epoch": 8.70016, "grad_norm": 0.533211886882782, "learning_rate": 4.110916677883968e-05, "loss": 3.2774, "step": 6803 }, { "epoch": 8.70144, "grad_norm": 0.5351250767707825, "learning_rate": 4.106878449320231e-05, "loss": 3.3219, "step": 6804 }, { "epoch": 8.70272, "grad_norm": 0.5335774421691895, "learning_rate": 4.102840220756495e-05, "loss": 3.272, "step": 6805 }, { "epoch": 8.704, "grad_norm": 0.5470417737960815, "learning_rate": 4.098801992192758e-05, "loss": 3.265, "step": 6806 }, { "epoch": 8.70528, "grad_norm": 0.5419219732284546, "learning_rate": 4.09476376362902e-05, "loss": 3.3382, "step": 6807 }, { "epoch": 8.70656, "grad_norm": 0.5278633832931519, "learning_rate": 4.090725535065284e-05, "loss": 3.341, "step": 6808 }, { "epoch": 8.707840000000001, "grad_norm": 0.5203990936279297, "learning_rate": 4.086687306501547e-05, "loss": 3.3018, "step": 6809 }, { "epoch": 8.70912, "grad_norm": 0.530105710029602, "learning_rate": 4.082649077937811e-05, "loss": 3.3272, "step": 6810 }, { "epoch": 8.7104, "grad_norm": 0.5211985111236572, "learning_rate": 4.0786108493740744e-05, "loss": 3.2695, "step": 6811 }, { "epoch": 8.71168, "grad_norm": 0.5373910665512085, "learning_rate": 4.0745726208103374e-05, "loss": 3.3031, "step": 6812 }, { "epoch": 8.71296, "grad_norm": 0.5470989942550659, "learning_rate": 4.070534392246601e-05, "loss": 3.3386, "step": 6813 }, { "epoch": 8.71424, "grad_norm": 0.5321961045265198, "learning_rate": 4.066496163682864e-05, "loss": 3.2882, "step": 6814 }, { "epoch": 8.71552, "grad_norm": 0.5434589385986328, "learning_rate": 4.062457935119128e-05, "loss": 3.3023, "step": 6815 }, { "epoch": 8.7168, "grad_norm": 0.5319420099258423, "learning_rate": 4.058419706555391e-05, "loss": 3.3207, "step": 6816 }, { "epoch": 8.71808, "grad_norm": 0.5350143313407898, "learning_rate": 4.0543814779916545e-05, "loss": 3.3401, "step": 6817 }, { "epoch": 8.71936, "grad_norm": 0.5368631482124329, "learning_rate": 4.0503432494279175e-05, "loss": 3.2471, "step": 6818 }, { "epoch": 8.72064, "grad_norm": 0.541717529296875, "learning_rate": 4.046305020864181e-05, "loss": 3.3548, "step": 6819 }, { "epoch": 8.72192, "grad_norm": 0.5320764183998108, "learning_rate": 4.042266792300444e-05, "loss": 3.2743, "step": 6820 }, { "epoch": 8.7232, "grad_norm": 0.536125659942627, "learning_rate": 4.0382285637367065e-05, "loss": 3.3071, "step": 6821 }, { "epoch": 8.72448, "grad_norm": 0.5400855541229248, "learning_rate": 4.03419033517297e-05, "loss": 3.3063, "step": 6822 }, { "epoch": 8.72576, "grad_norm": 0.5443190336227417, "learning_rate": 4.030152106609233e-05, "loss": 3.3044, "step": 6823 }, { "epoch": 8.72704, "grad_norm": 0.5323216319084167, "learning_rate": 4.026113878045497e-05, "loss": 3.2328, "step": 6824 }, { "epoch": 8.72832, "grad_norm": 0.5174821615219116, "learning_rate": 4.02207564948176e-05, "loss": 3.3146, "step": 6825 }, { "epoch": 8.7296, "grad_norm": 0.5285962224006653, "learning_rate": 4.0180374209180236e-05, "loss": 3.3673, "step": 6826 }, { "epoch": 8.730879999999999, "grad_norm": 0.525948703289032, "learning_rate": 4.0139991923542866e-05, "loss": 3.3524, "step": 6827 }, { "epoch": 8.73216, "grad_norm": 0.526118814945221, "learning_rate": 4.00996096379055e-05, "loss": 3.3361, "step": 6828 }, { "epoch": 8.73344, "grad_norm": 0.5256155729293823, "learning_rate": 4.005922735226814e-05, "loss": 3.35, "step": 6829 }, { "epoch": 8.73472, "grad_norm": 0.5380839705467224, "learning_rate": 4.001884506663077e-05, "loss": 3.3115, "step": 6830 }, { "epoch": 8.736, "grad_norm": 0.5296849012374878, "learning_rate": 3.9978462780993406e-05, "loss": 3.2831, "step": 6831 }, { "epoch": 8.73728, "grad_norm": 0.5297383069992065, "learning_rate": 3.9938080495356037e-05, "loss": 3.2457, "step": 6832 }, { "epoch": 8.73856, "grad_norm": 0.528474748134613, "learning_rate": 3.989769820971867e-05, "loss": 3.2796, "step": 6833 }, { "epoch": 8.739840000000001, "grad_norm": 0.5146127939224243, "learning_rate": 3.98573159240813e-05, "loss": 3.2998, "step": 6834 }, { "epoch": 8.74112, "grad_norm": 0.5278341174125671, "learning_rate": 3.981693363844393e-05, "loss": 3.2445, "step": 6835 }, { "epoch": 8.7424, "grad_norm": 0.5160564184188843, "learning_rate": 3.9776551352806564e-05, "loss": 3.307, "step": 6836 }, { "epoch": 8.74368, "grad_norm": 0.5266233682632446, "learning_rate": 3.9736169067169194e-05, "loss": 3.2835, "step": 6837 }, { "epoch": 8.74496, "grad_norm": 0.5305745601654053, "learning_rate": 3.969578678153183e-05, "loss": 3.3538, "step": 6838 }, { "epoch": 8.74624, "grad_norm": 0.5437188744544983, "learning_rate": 3.965540449589446e-05, "loss": 3.3189, "step": 6839 }, { "epoch": 8.74752, "grad_norm": 0.5442191958427429, "learning_rate": 3.96150222102571e-05, "loss": 3.3207, "step": 6840 }, { "epoch": 8.7488, "grad_norm": 0.5521953701972961, "learning_rate": 3.957463992461973e-05, "loss": 3.3707, "step": 6841 }, { "epoch": 8.75008, "grad_norm": 0.5367425084114075, "learning_rate": 3.9534257638982364e-05, "loss": 3.3494, "step": 6842 }, { "epoch": 8.75136, "grad_norm": 0.5323737263679504, "learning_rate": 3.9493875353344994e-05, "loss": 3.3304, "step": 6843 }, { "epoch": 8.75264, "grad_norm": 0.5559576749801636, "learning_rate": 3.945349306770763e-05, "loss": 3.2795, "step": 6844 }, { "epoch": 8.75392, "grad_norm": 0.5402018427848816, "learning_rate": 3.941311078207026e-05, "loss": 3.2885, "step": 6845 }, { "epoch": 8.7552, "grad_norm": 0.5311586856842041, "learning_rate": 3.93727284964329e-05, "loss": 3.2718, "step": 6846 }, { "epoch": 8.75648, "grad_norm": 0.5346354246139526, "learning_rate": 3.9332346210795535e-05, "loss": 3.2709, "step": 6847 }, { "epoch": 8.75776, "grad_norm": 0.5428527593612671, "learning_rate": 3.929196392515816e-05, "loss": 3.296, "step": 6848 }, { "epoch": 8.75904, "grad_norm": 0.5390904545783997, "learning_rate": 3.925158163952079e-05, "loss": 3.2799, "step": 6849 }, { "epoch": 8.76032, "grad_norm": 0.5223475694656372, "learning_rate": 3.9211199353883425e-05, "loss": 3.3201, "step": 6850 }, { "epoch": 8.7616, "grad_norm": 0.5381412506103516, "learning_rate": 3.9170817068246055e-05, "loss": 3.3655, "step": 6851 }, { "epoch": 8.76288, "grad_norm": 0.5343440175056458, "learning_rate": 3.913043478260869e-05, "loss": 3.264, "step": 6852 }, { "epoch": 8.76416, "grad_norm": 0.5274061560630798, "learning_rate": 3.909005249697132e-05, "loss": 3.35, "step": 6853 }, { "epoch": 8.76544, "grad_norm": 0.5369771718978882, "learning_rate": 3.904967021133396e-05, "loss": 3.3448, "step": 6854 }, { "epoch": 8.76672, "grad_norm": 0.5344805717468262, "learning_rate": 3.900928792569659e-05, "loss": 3.3441, "step": 6855 }, { "epoch": 8.768, "grad_norm": 0.5286930203437805, "learning_rate": 3.8968905640059226e-05, "loss": 3.3262, "step": 6856 }, { "epoch": 8.76928, "grad_norm": 0.5351747870445251, "learning_rate": 3.8928523354421856e-05, "loss": 3.3277, "step": 6857 }, { "epoch": 8.77056, "grad_norm": 0.5334113836288452, "learning_rate": 3.888814106878449e-05, "loss": 3.3462, "step": 6858 }, { "epoch": 8.77184, "grad_norm": 0.5328750014305115, "learning_rate": 3.884775878314712e-05, "loss": 3.257, "step": 6859 }, { "epoch": 8.77312, "grad_norm": 0.5237380266189575, "learning_rate": 3.880737649750976e-05, "loss": 3.2508, "step": 6860 }, { "epoch": 8.7744, "grad_norm": 0.542219877243042, "learning_rate": 3.876699421187238e-05, "loss": 3.3518, "step": 6861 }, { "epoch": 8.77568, "grad_norm": 0.5408708453178406, "learning_rate": 3.872661192623502e-05, "loss": 3.4064, "step": 6862 }, { "epoch": 8.77696, "grad_norm": 0.543810248374939, "learning_rate": 3.868622964059765e-05, "loss": 3.3481, "step": 6863 }, { "epoch": 8.77824, "grad_norm": 0.5442402958869934, "learning_rate": 3.864584735496029e-05, "loss": 3.3938, "step": 6864 }, { "epoch": 8.77952, "grad_norm": 0.5541428923606873, "learning_rate": 3.860546506932292e-05, "loss": 3.3481, "step": 6865 }, { "epoch": 8.7808, "grad_norm": 0.5401941537857056, "learning_rate": 3.8565082783685554e-05, "loss": 3.3197, "step": 6866 }, { "epoch": 8.78208, "grad_norm": 0.5313037633895874, "learning_rate": 3.8524700498048184e-05, "loss": 3.2591, "step": 6867 }, { "epoch": 8.78336, "grad_norm": 0.5377905368804932, "learning_rate": 3.848431821241082e-05, "loss": 3.3435, "step": 6868 }, { "epoch": 8.78464, "grad_norm": 0.5325095057487488, "learning_rate": 3.844393592677345e-05, "loss": 3.3114, "step": 6869 }, { "epoch": 8.78592, "grad_norm": 0.540044367313385, "learning_rate": 3.840355364113609e-05, "loss": 3.3397, "step": 6870 }, { "epoch": 8.7872, "grad_norm": 0.531450092792511, "learning_rate": 3.836317135549872e-05, "loss": 3.2683, "step": 6871 }, { "epoch": 8.78848, "grad_norm": 0.5396316647529602, "learning_rate": 3.8322789069861355e-05, "loss": 3.2794, "step": 6872 }, { "epoch": 8.78976, "grad_norm": 0.5314230918884277, "learning_rate": 3.8282406784223985e-05, "loss": 3.305, "step": 6873 }, { "epoch": 8.79104, "grad_norm": 0.5359961986541748, "learning_rate": 3.8242024498586615e-05, "loss": 3.3323, "step": 6874 }, { "epoch": 8.79232, "grad_norm": 0.5214905738830566, "learning_rate": 3.8201642212949245e-05, "loss": 3.2747, "step": 6875 }, { "epoch": 8.7936, "grad_norm": 0.5209900140762329, "learning_rate": 3.816125992731188e-05, "loss": 3.2886, "step": 6876 }, { "epoch": 8.79488, "grad_norm": 0.5249080061912537, "learning_rate": 3.812087764167451e-05, "loss": 3.3115, "step": 6877 }, { "epoch": 8.79616, "grad_norm": 0.561204195022583, "learning_rate": 3.808049535603715e-05, "loss": 3.3694, "step": 6878 }, { "epoch": 8.79744, "grad_norm": 0.5340297222137451, "learning_rate": 3.804011307039978e-05, "loss": 3.2916, "step": 6879 }, { "epoch": 8.79872, "grad_norm": 0.5248726010322571, "learning_rate": 3.7999730784762416e-05, "loss": 3.3112, "step": 6880 }, { "epoch": 8.8, "grad_norm": 0.5359779596328735, "learning_rate": 3.7959348499125046e-05, "loss": 3.3402, "step": 6881 }, { "epoch": 8.80128, "grad_norm": 0.5329644083976746, "learning_rate": 3.791896621348768e-05, "loss": 3.3107, "step": 6882 }, { "epoch": 8.80256, "grad_norm": 0.5371171236038208, "learning_rate": 3.787858392785031e-05, "loss": 3.3409, "step": 6883 }, { "epoch": 8.80384, "grad_norm": 0.5442470908164978, "learning_rate": 3.783820164221295e-05, "loss": 3.3277, "step": 6884 }, { "epoch": 8.80512, "grad_norm": 0.5545779466629028, "learning_rate": 3.779781935657558e-05, "loss": 3.3126, "step": 6885 }, { "epoch": 8.8064, "grad_norm": 0.5447943210601807, "learning_rate": 3.7757437070938216e-05, "loss": 3.2873, "step": 6886 }, { "epoch": 8.80768, "grad_norm": 0.5211206674575806, "learning_rate": 3.7717054785300847e-05, "loss": 3.3024, "step": 6887 }, { "epoch": 8.80896, "grad_norm": 0.5305944681167603, "learning_rate": 3.7676672499663477e-05, "loss": 3.3325, "step": 6888 }, { "epoch": 8.81024, "grad_norm": 0.5540784001350403, "learning_rate": 3.763629021402611e-05, "loss": 3.3269, "step": 6889 }, { "epoch": 8.81152, "grad_norm": 0.5730512738227844, "learning_rate": 3.7595907928388744e-05, "loss": 3.3965, "step": 6890 }, { "epoch": 8.8128, "grad_norm": 0.5502961874008179, "learning_rate": 3.7555525642751374e-05, "loss": 3.2786, "step": 6891 }, { "epoch": 8.81408, "grad_norm": 0.5448852777481079, "learning_rate": 3.751514335711401e-05, "loss": 3.3031, "step": 6892 }, { "epoch": 8.81536, "grad_norm": 0.5359817743301392, "learning_rate": 3.747476107147664e-05, "loss": 3.318, "step": 6893 }, { "epoch": 8.81664, "grad_norm": 0.5279159545898438, "learning_rate": 3.743437878583928e-05, "loss": 3.3592, "step": 6894 }, { "epoch": 8.81792, "grad_norm": 0.5456517338752747, "learning_rate": 3.739399650020191e-05, "loss": 3.3106, "step": 6895 }, { "epoch": 8.8192, "grad_norm": 0.5353114604949951, "learning_rate": 3.7353614214564544e-05, "loss": 3.3135, "step": 6896 }, { "epoch": 8.82048, "grad_norm": 0.5308430194854736, "learning_rate": 3.7313231928927174e-05, "loss": 3.2659, "step": 6897 }, { "epoch": 8.82176, "grad_norm": 0.5371167659759521, "learning_rate": 3.7272849643289804e-05, "loss": 3.244, "step": 6898 }, { "epoch": 8.82304, "grad_norm": 0.5514832735061646, "learning_rate": 3.723246735765244e-05, "loss": 3.3401, "step": 6899 }, { "epoch": 8.82432, "grad_norm": 0.536336362361908, "learning_rate": 3.719208507201507e-05, "loss": 3.3118, "step": 6900 }, { "epoch": 8.8256, "grad_norm": 0.5184128284454346, "learning_rate": 3.715170278637771e-05, "loss": 3.1998, "step": 6901 }, { "epoch": 8.82688, "grad_norm": 0.5253881812095642, "learning_rate": 3.711132050074034e-05, "loss": 3.1437, "step": 6902 }, { "epoch": 8.82816, "grad_norm": 0.5361356735229492, "learning_rate": 3.7070938215102975e-05, "loss": 3.1978, "step": 6903 }, { "epoch": 8.82944, "grad_norm": 0.533018946647644, "learning_rate": 3.7030555929465605e-05, "loss": 3.2561, "step": 6904 }, { "epoch": 8.83072, "grad_norm": 0.5180771350860596, "learning_rate": 3.6990173643828235e-05, "loss": 3.2524, "step": 6905 }, { "epoch": 8.832, "grad_norm": 0.5097092986106873, "learning_rate": 3.694979135819087e-05, "loss": 3.1094, "step": 6906 }, { "epoch": 8.83328, "grad_norm": 0.5372468829154968, "learning_rate": 3.69094090725535e-05, "loss": 3.1413, "step": 6907 }, { "epoch": 8.83456, "grad_norm": 0.5473780632019043, "learning_rate": 3.686902678691614e-05, "loss": 3.2555, "step": 6908 }, { "epoch": 8.83584, "grad_norm": 0.537969172000885, "learning_rate": 3.682864450127877e-05, "loss": 3.0862, "step": 6909 }, { "epoch": 8.83712, "grad_norm": 0.5439502596855164, "learning_rate": 3.6788262215641406e-05, "loss": 3.1283, "step": 6910 }, { "epoch": 8.8384, "grad_norm": 0.5461182594299316, "learning_rate": 3.6747879930004036e-05, "loss": 3.2333, "step": 6911 }, { "epoch": 8.83968, "grad_norm": 0.5343428254127502, "learning_rate": 3.6707497644366666e-05, "loss": 3.1861, "step": 6912 }, { "epoch": 8.84096, "grad_norm": 0.5404158234596252, "learning_rate": 3.66671153587293e-05, "loss": 3.2274, "step": 6913 }, { "epoch": 8.84224, "grad_norm": 0.5451922416687012, "learning_rate": 3.662673307309193e-05, "loss": 3.1915, "step": 6914 }, { "epoch": 8.84352, "grad_norm": 0.5269055366516113, "learning_rate": 3.658635078745457e-05, "loss": 3.2036, "step": 6915 }, { "epoch": 8.8448, "grad_norm": 0.5376463532447815, "learning_rate": 3.65459685018172e-05, "loss": 3.0966, "step": 6916 }, { "epoch": 8.84608, "grad_norm": 0.5396373271942139, "learning_rate": 3.650558621617984e-05, "loss": 3.2148, "step": 6917 }, { "epoch": 8.84736, "grad_norm": 0.5474383234977722, "learning_rate": 3.646520393054247e-05, "loss": 3.0508, "step": 6918 }, { "epoch": 8.84864, "grad_norm": 0.5413127541542053, "learning_rate": 3.64248216449051e-05, "loss": 3.2371, "step": 6919 }, { "epoch": 8.849920000000001, "grad_norm": 0.5485610365867615, "learning_rate": 3.6384439359267734e-05, "loss": 3.183, "step": 6920 }, { "epoch": 8.8512, "grad_norm": 0.551889955997467, "learning_rate": 3.6344057073630364e-05, "loss": 3.2028, "step": 6921 }, { "epoch": 8.85248, "grad_norm": 0.5297598242759705, "learning_rate": 3.6303674787993e-05, "loss": 3.1678, "step": 6922 }, { "epoch": 8.85376, "grad_norm": 0.5409109592437744, "learning_rate": 3.626329250235563e-05, "loss": 3.1554, "step": 6923 }, { "epoch": 8.85504, "grad_norm": 0.5404353737831116, "learning_rate": 3.622291021671827e-05, "loss": 3.2286, "step": 6924 }, { "epoch": 8.85632, "grad_norm": 0.5331123471260071, "learning_rate": 3.618252793108089e-05, "loss": 3.1951, "step": 6925 }, { "epoch": 8.8576, "grad_norm": 0.53252112865448, "learning_rate": 3.614214564544353e-05, "loss": 3.1818, "step": 6926 }, { "epoch": 8.85888, "grad_norm": 0.5230981707572937, "learning_rate": 3.6101763359806165e-05, "loss": 3.1821, "step": 6927 }, { "epoch": 8.86016, "grad_norm": 0.5399007797241211, "learning_rate": 3.6061381074168795e-05, "loss": 3.1559, "step": 6928 }, { "epoch": 8.86144, "grad_norm": 0.5414989590644836, "learning_rate": 3.602099878853143e-05, "loss": 3.1675, "step": 6929 }, { "epoch": 8.86272, "grad_norm": 0.5386769771575928, "learning_rate": 3.598061650289406e-05, "loss": 3.1707, "step": 6930 }, { "epoch": 8.864, "grad_norm": 0.5298625230789185, "learning_rate": 3.594023421725669e-05, "loss": 3.2093, "step": 6931 }, { "epoch": 8.86528, "grad_norm": 0.5370173454284668, "learning_rate": 3.589985193161932e-05, "loss": 3.1237, "step": 6932 }, { "epoch": 8.86656, "grad_norm": 0.5402517914772034, "learning_rate": 3.585946964598196e-05, "loss": 3.2736, "step": 6933 }, { "epoch": 8.86784, "grad_norm": 0.5565125346183777, "learning_rate": 3.581908736034459e-05, "loss": 3.2115, "step": 6934 }, { "epoch": 8.86912, "grad_norm": 0.5417731404304504, "learning_rate": 3.5778705074707226e-05, "loss": 3.2296, "step": 6935 }, { "epoch": 8.8704, "grad_norm": 0.5352853536605835, "learning_rate": 3.573832278906986e-05, "loss": 3.1484, "step": 6936 }, { "epoch": 8.87168, "grad_norm": 0.5440704226493835, "learning_rate": 3.569794050343249e-05, "loss": 3.1939, "step": 6937 }, { "epoch": 8.872959999999999, "grad_norm": 0.553403913974762, "learning_rate": 3.565755821779512e-05, "loss": 3.1381, "step": 6938 }, { "epoch": 8.87424, "grad_norm": 0.5392211079597473, "learning_rate": 3.561717593215775e-05, "loss": 3.2004, "step": 6939 }, { "epoch": 8.87552, "grad_norm": 0.5346492528915405, "learning_rate": 3.557679364652039e-05, "loss": 3.2119, "step": 6940 }, { "epoch": 8.8768, "grad_norm": 0.5390061140060425, "learning_rate": 3.553641136088302e-05, "loss": 3.2046, "step": 6941 }, { "epoch": 8.87808, "grad_norm": 0.5476921796798706, "learning_rate": 3.5496029075245657e-05, "loss": 3.2266, "step": 6942 }, { "epoch": 8.87936, "grad_norm": 0.5377036333084106, "learning_rate": 3.5455646789608287e-05, "loss": 3.2189, "step": 6943 }, { "epoch": 8.88064, "grad_norm": 0.5338594317436218, "learning_rate": 3.5415264503970923e-05, "loss": 3.2326, "step": 6944 }, { "epoch": 8.881920000000001, "grad_norm": 0.5427373647689819, "learning_rate": 3.5374882218333554e-05, "loss": 3.2784, "step": 6945 }, { "epoch": 8.8832, "grad_norm": 0.5387793779373169, "learning_rate": 3.5334499932696184e-05, "loss": 3.1931, "step": 6946 }, { "epoch": 8.88448, "grad_norm": 0.5435247421264648, "learning_rate": 3.529411764705882e-05, "loss": 3.1342, "step": 6947 }, { "epoch": 8.88576, "grad_norm": 0.5299807786941528, "learning_rate": 3.525373536142145e-05, "loss": 3.1364, "step": 6948 }, { "epoch": 8.88704, "grad_norm": 0.5515544414520264, "learning_rate": 3.521335307578409e-05, "loss": 3.2109, "step": 6949 }, { "epoch": 8.88832, "grad_norm": 0.5315101146697998, "learning_rate": 3.517297079014672e-05, "loss": 3.0987, "step": 6950 }, { "epoch": 8.8896, "grad_norm": 0.5465390682220459, "learning_rate": 3.5132588504509354e-05, "loss": 3.1256, "step": 6951 }, { "epoch": 8.89088, "grad_norm": 0.5575345158576965, "learning_rate": 3.5092206218871984e-05, "loss": 3.194, "step": 6952 }, { "epoch": 8.89216, "grad_norm": 0.5439948439598083, "learning_rate": 3.5051823933234614e-05, "loss": 3.2022, "step": 6953 }, { "epoch": 8.89344, "grad_norm": 0.5493448972702026, "learning_rate": 3.501144164759725e-05, "loss": 3.2155, "step": 6954 }, { "epoch": 8.89472, "grad_norm": 0.5295661091804504, "learning_rate": 3.497105936195988e-05, "loss": 3.1321, "step": 6955 }, { "epoch": 8.896, "grad_norm": 0.5430306792259216, "learning_rate": 3.493067707632252e-05, "loss": 3.1363, "step": 6956 }, { "epoch": 8.89728, "grad_norm": 0.5532288551330566, "learning_rate": 3.489029479068515e-05, "loss": 3.2848, "step": 6957 }, { "epoch": 8.89856, "grad_norm": 0.5648680329322815, "learning_rate": 3.484991250504778e-05, "loss": 3.1427, "step": 6958 }, { "epoch": 8.89984, "grad_norm": 0.5307405591011047, "learning_rate": 3.4809530219410415e-05, "loss": 3.2077, "step": 6959 }, { "epoch": 8.90112, "grad_norm": 0.5643923282623291, "learning_rate": 3.4769147933773045e-05, "loss": 3.1602, "step": 6960 }, { "epoch": 8.9024, "grad_norm": 0.5337897539138794, "learning_rate": 3.472876564813568e-05, "loss": 3.186, "step": 6961 }, { "epoch": 8.90368, "grad_norm": 0.5449624061584473, "learning_rate": 3.468838336249831e-05, "loss": 3.223, "step": 6962 }, { "epoch": 8.904959999999999, "grad_norm": 0.5331872701644897, "learning_rate": 3.464800107686095e-05, "loss": 3.1097, "step": 6963 }, { "epoch": 8.90624, "grad_norm": 0.5646026134490967, "learning_rate": 3.460761879122358e-05, "loss": 3.203, "step": 6964 }, { "epoch": 8.90752, "grad_norm": 0.554621696472168, "learning_rate": 3.456723650558621e-05, "loss": 3.1597, "step": 6965 }, { "epoch": 8.9088, "grad_norm": 0.5355739593505859, "learning_rate": 3.4526854219948846e-05, "loss": 3.2187, "step": 6966 }, { "epoch": 8.91008, "grad_norm": 0.5411005020141602, "learning_rate": 3.4486471934311476e-05, "loss": 3.1622, "step": 6967 }, { "epoch": 8.91136, "grad_norm": 0.5598929524421692, "learning_rate": 3.444608964867411e-05, "loss": 3.1857, "step": 6968 }, { "epoch": 8.91264, "grad_norm": 0.5395918488502502, "learning_rate": 3.440570736303674e-05, "loss": 3.1413, "step": 6969 }, { "epoch": 8.91392, "grad_norm": 0.5290459394454956, "learning_rate": 3.436532507739938e-05, "loss": 3.1358, "step": 6970 }, { "epoch": 8.9152, "grad_norm": 0.5369711518287659, "learning_rate": 3.432494279176201e-05, "loss": 3.1303, "step": 6971 }, { "epoch": 8.91648, "grad_norm": 0.5442224740982056, "learning_rate": 3.428456050612464e-05, "loss": 3.1369, "step": 6972 }, { "epoch": 8.91776, "grad_norm": 0.5366105437278748, "learning_rate": 3.424417822048728e-05, "loss": 3.1323, "step": 6973 }, { "epoch": 8.91904, "grad_norm": 0.5568270087242126, "learning_rate": 3.420379593484991e-05, "loss": 3.1774, "step": 6974 }, { "epoch": 8.92032, "grad_norm": 0.5447999835014343, "learning_rate": 3.4163413649212544e-05, "loss": 3.1728, "step": 6975 }, { "epoch": 8.9216, "grad_norm": 0.5445603728294373, "learning_rate": 3.4123031363575174e-05, "loss": 3.1144, "step": 6976 }, { "epoch": 8.92288, "grad_norm": 0.5516496896743774, "learning_rate": 3.408264907793781e-05, "loss": 3.1628, "step": 6977 }, { "epoch": 8.92416, "grad_norm": 0.5592992901802063, "learning_rate": 3.404226679230044e-05, "loss": 3.1663, "step": 6978 }, { "epoch": 8.92544, "grad_norm": 0.5563246607780457, "learning_rate": 3.400188450666307e-05, "loss": 3.1543, "step": 6979 }, { "epoch": 8.92672, "grad_norm": 0.5511194467544556, "learning_rate": 3.396150222102571e-05, "loss": 3.2003, "step": 6980 }, { "epoch": 8.928, "grad_norm": 0.5466040372848511, "learning_rate": 3.392111993538834e-05, "loss": 3.1386, "step": 6981 }, { "epoch": 8.92928, "grad_norm": 0.5524888038635254, "learning_rate": 3.3880737649750975e-05, "loss": 3.0766, "step": 6982 }, { "epoch": 8.93056, "grad_norm": 0.5536906123161316, "learning_rate": 3.3840355364113605e-05, "loss": 3.1638, "step": 6983 }, { "epoch": 8.93184, "grad_norm": 0.5587788224220276, "learning_rate": 3.379997307847624e-05, "loss": 3.2024, "step": 6984 }, { "epoch": 8.93312, "grad_norm": 0.5503929257392883, "learning_rate": 3.375959079283887e-05, "loss": 3.2101, "step": 6985 }, { "epoch": 8.9344, "grad_norm": 0.5357226729393005, "learning_rate": 3.37192085072015e-05, "loss": 3.2078, "step": 6986 }, { "epoch": 8.93568, "grad_norm": 0.5584990382194519, "learning_rate": 3.367882622156414e-05, "loss": 3.186, "step": 6987 }, { "epoch": 8.93696, "grad_norm": 0.5508469939231873, "learning_rate": 3.363844393592677e-05, "loss": 3.1558, "step": 6988 }, { "epoch": 8.93824, "grad_norm": 0.5476589202880859, "learning_rate": 3.3598061650289406e-05, "loss": 3.1501, "step": 6989 }, { "epoch": 8.93952, "grad_norm": 0.5468451380729675, "learning_rate": 3.3557679364652036e-05, "loss": 3.211, "step": 6990 }, { "epoch": 8.9408, "grad_norm": 0.5536555647850037, "learning_rate": 3.351729707901467e-05, "loss": 3.1646, "step": 6991 }, { "epoch": 8.94208, "grad_norm": 0.5505756735801697, "learning_rate": 3.34769147933773e-05, "loss": 3.1244, "step": 6992 }, { "epoch": 8.94336, "grad_norm": 0.560417890548706, "learning_rate": 3.343653250773993e-05, "loss": 3.1672, "step": 6993 }, { "epoch": 8.94464, "grad_norm": 0.5642468929290771, "learning_rate": 3.339615022210257e-05, "loss": 3.1955, "step": 6994 }, { "epoch": 8.94592, "grad_norm": 0.5576351881027222, "learning_rate": 3.33557679364652e-05, "loss": 3.1591, "step": 6995 }, { "epoch": 8.9472, "grad_norm": 0.543550968170166, "learning_rate": 3.3315385650827836e-05, "loss": 3.1024, "step": 6996 }, { "epoch": 8.94848, "grad_norm": 0.5292938947677612, "learning_rate": 3.3275003365190467e-05, "loss": 3.1107, "step": 6997 }, { "epoch": 8.94976, "grad_norm": 0.5450161695480347, "learning_rate": 3.32346210795531e-05, "loss": 3.1862, "step": 6998 }, { "epoch": 8.95104, "grad_norm": 0.5542306303977966, "learning_rate": 3.3194238793915733e-05, "loss": 3.209, "step": 6999 }, { "epoch": 8.95232, "grad_norm": 0.552590548992157, "learning_rate": 3.3153856508278364e-05, "loss": 3.2279, "step": 7000 }, { "epoch": 8.9536, "grad_norm": 0.548201858997345, "learning_rate": 3.3113474222641e-05, "loss": 3.1338, "step": 7001 }, { "epoch": 8.95488, "grad_norm": 0.5407779216766357, "learning_rate": 3.307309193700363e-05, "loss": 3.1704, "step": 7002 }, { "epoch": 8.95616, "grad_norm": 0.5337063074111938, "learning_rate": 3.303270965136627e-05, "loss": 3.1294, "step": 7003 }, { "epoch": 8.95744, "grad_norm": 0.5435624718666077, "learning_rate": 3.29923273657289e-05, "loss": 3.2299, "step": 7004 }, { "epoch": 8.95872, "grad_norm": 0.5566171407699585, "learning_rate": 3.295194508009153e-05, "loss": 3.2128, "step": 7005 }, { "epoch": 8.96, "grad_norm": 0.5542343854904175, "learning_rate": 3.2911562794454164e-05, "loss": 3.1888, "step": 7006 }, { "epoch": 8.96128, "grad_norm": 0.5571079254150391, "learning_rate": 3.2871180508816794e-05, "loss": 3.1994, "step": 7007 }, { "epoch": 8.96256, "grad_norm": 0.5469478964805603, "learning_rate": 3.283079822317943e-05, "loss": 3.1812, "step": 7008 }, { "epoch": 8.96384, "grad_norm": 0.5388615727424622, "learning_rate": 3.279041593754206e-05, "loss": 3.1808, "step": 7009 }, { "epoch": 8.96512, "grad_norm": 0.5441920161247253, "learning_rate": 3.27500336519047e-05, "loss": 3.1783, "step": 7010 }, { "epoch": 8.9664, "grad_norm": 0.5513103604316711, "learning_rate": 3.270965136626733e-05, "loss": 3.1525, "step": 7011 }, { "epoch": 8.96768, "grad_norm": 0.5584927201271057, "learning_rate": 3.266926908062996e-05, "loss": 3.207, "step": 7012 }, { "epoch": 8.96896, "grad_norm": 0.5390376448631287, "learning_rate": 3.2628886794992595e-05, "loss": 3.1688, "step": 7013 }, { "epoch": 8.97024, "grad_norm": 0.5638677477836609, "learning_rate": 3.2588504509355225e-05, "loss": 3.2089, "step": 7014 }, { "epoch": 8.97152, "grad_norm": 0.5538638830184937, "learning_rate": 3.254812222371786e-05, "loss": 3.1597, "step": 7015 }, { "epoch": 8.9728, "grad_norm": 0.5503858923912048, "learning_rate": 3.250773993808049e-05, "loss": 3.1927, "step": 7016 }, { "epoch": 8.97408, "grad_norm": 0.5431429147720337, "learning_rate": 3.246735765244313e-05, "loss": 3.2362, "step": 7017 }, { "epoch": 8.97536, "grad_norm": 0.5614336133003235, "learning_rate": 3.242697536680576e-05, "loss": 3.1585, "step": 7018 }, { "epoch": 8.97664, "grad_norm": 0.540317714214325, "learning_rate": 3.238659308116839e-05, "loss": 3.2157, "step": 7019 }, { "epoch": 8.97792, "grad_norm": 0.5396054983139038, "learning_rate": 3.2346210795531026e-05, "loss": 3.1543, "step": 7020 }, { "epoch": 8.9792, "grad_norm": 0.541448712348938, "learning_rate": 3.2305828509893656e-05, "loss": 3.1709, "step": 7021 }, { "epoch": 8.98048, "grad_norm": 0.553905725479126, "learning_rate": 3.226544622425629e-05, "loss": 3.1969, "step": 7022 }, { "epoch": 8.98176, "grad_norm": 0.5426627397537231, "learning_rate": 3.222506393861892e-05, "loss": 3.2246, "step": 7023 }, { "epoch": 8.98304, "grad_norm": 0.5612876415252686, "learning_rate": 3.218468165298156e-05, "loss": 3.2059, "step": 7024 }, { "epoch": 8.98432, "grad_norm": 0.5525398850440979, "learning_rate": 3.214429936734419e-05, "loss": 3.1842, "step": 7025 }, { "epoch": 8.9856, "grad_norm": 0.5441053509712219, "learning_rate": 3.210391708170682e-05, "loss": 3.2081, "step": 7026 }, { "epoch": 8.98688, "grad_norm": 0.5565687417984009, "learning_rate": 3.206353479606946e-05, "loss": 3.2089, "step": 7027 }, { "epoch": 8.98816, "grad_norm": 0.553962230682373, "learning_rate": 3.202315251043209e-05, "loss": 3.2249, "step": 7028 }, { "epoch": 8.98944, "grad_norm": 0.5383572578430176, "learning_rate": 3.1982770224794724e-05, "loss": 3.1249, "step": 7029 }, { "epoch": 8.99072, "grad_norm": 0.5608500242233276, "learning_rate": 3.1942387939157354e-05, "loss": 3.2204, "step": 7030 }, { "epoch": 8.992, "grad_norm": 0.5564176440238953, "learning_rate": 3.190200565351999e-05, "loss": 3.222, "step": 7031 }, { "epoch": 8.99328, "grad_norm": 0.5383712649345398, "learning_rate": 3.186162336788262e-05, "loss": 3.1404, "step": 7032 }, { "epoch": 8.99456, "grad_norm": 0.5401707291603088, "learning_rate": 3.182124108224525e-05, "loss": 3.1736, "step": 7033 }, { "epoch": 8.99584, "grad_norm": 0.5486315488815308, "learning_rate": 3.178085879660789e-05, "loss": 3.1421, "step": 7034 }, { "epoch": 8.99712, "grad_norm": 0.5637720227241516, "learning_rate": 3.174047651097052e-05, "loss": 3.0799, "step": 7035 }, { "epoch": 8.9984, "grad_norm": 0.5368375182151794, "learning_rate": 3.1700094225333155e-05, "loss": 3.1562, "step": 7036 }, { "epoch": 8.99968, "grad_norm": 0.5359020233154297, "learning_rate": 3.1659711939695785e-05, "loss": 3.1867, "step": 7037 }, { "epoch": 9.0, "grad_norm": 0.9711993932723999, "learning_rate": 3.161932965405842e-05, "loss": 2.7479, "step": 7038 }, { "epoch": 9.00128, "grad_norm": 0.5583388209342957, "learning_rate": 3.1578947368421045e-05, "loss": 3.2842, "step": 7039 }, { "epoch": 9.00256, "grad_norm": 0.5551260113716125, "learning_rate": 3.153856508278368e-05, "loss": 3.2511, "step": 7040 }, { "epoch": 9.00384, "grad_norm": 0.5703940987586975, "learning_rate": 3.149818279714632e-05, "loss": 3.2821, "step": 7041 }, { "epoch": 9.00512, "grad_norm": 0.5474501848220825, "learning_rate": 3.145780051150895e-05, "loss": 3.2544, "step": 7042 }, { "epoch": 9.0064, "grad_norm": 0.5608713030815125, "learning_rate": 3.1417418225871586e-05, "loss": 3.3234, "step": 7043 }, { "epoch": 9.00768, "grad_norm": 0.5595000386238098, "learning_rate": 3.1377035940234216e-05, "loss": 3.2516, "step": 7044 }, { "epoch": 9.00896, "grad_norm": 0.5540786385536194, "learning_rate": 3.133665365459685e-05, "loss": 3.3641, "step": 7045 }, { "epoch": 9.01024, "grad_norm": 0.5481628775596619, "learning_rate": 3.1296271368959476e-05, "loss": 3.1462, "step": 7046 }, { "epoch": 9.01152, "grad_norm": 0.545194685459137, "learning_rate": 3.125588908332211e-05, "loss": 3.2292, "step": 7047 }, { "epoch": 9.0128, "grad_norm": 0.5309725403785706, "learning_rate": 3.121550679768474e-05, "loss": 3.244, "step": 7048 }, { "epoch": 9.01408, "grad_norm": 0.5396099090576172, "learning_rate": 3.117512451204738e-05, "loss": 3.3037, "step": 7049 }, { "epoch": 9.01536, "grad_norm": 0.5477572083473206, "learning_rate": 3.1134742226410016e-05, "loss": 3.1592, "step": 7050 }, { "epoch": 9.01664, "grad_norm": 0.5614184737205505, "learning_rate": 3.1094359940772646e-05, "loss": 3.1785, "step": 7051 }, { "epoch": 9.01792, "grad_norm": 0.5425687432289124, "learning_rate": 3.1053977655135277e-05, "loss": 3.2016, "step": 7052 }, { "epoch": 9.0192, "grad_norm": 0.5536109209060669, "learning_rate": 3.1013595369497907e-05, "loss": 3.247, "step": 7053 }, { "epoch": 9.02048, "grad_norm": 0.5437664985656738, "learning_rate": 3.0973213083860543e-05, "loss": 3.2428, "step": 7054 }, { "epoch": 9.02176, "grad_norm": 0.5411404371261597, "learning_rate": 3.0932830798223174e-05, "loss": 3.2555, "step": 7055 }, { "epoch": 9.02304, "grad_norm": 0.5475705862045288, "learning_rate": 3.089244851258581e-05, "loss": 3.2672, "step": 7056 }, { "epoch": 9.02432, "grad_norm": 0.5438557863235474, "learning_rate": 3.085206622694844e-05, "loss": 3.1994, "step": 7057 }, { "epoch": 9.0256, "grad_norm": 0.54364413022995, "learning_rate": 3.081168394131108e-05, "loss": 3.249, "step": 7058 }, { "epoch": 9.02688, "grad_norm": 0.5349510908126831, "learning_rate": 3.077130165567371e-05, "loss": 3.2396, "step": 7059 }, { "epoch": 9.02816, "grad_norm": 0.5500809550285339, "learning_rate": 3.073091937003634e-05, "loss": 3.2324, "step": 7060 }, { "epoch": 9.02944, "grad_norm": 0.5365533232688904, "learning_rate": 3.0690537084398974e-05, "loss": 3.1999, "step": 7061 }, { "epoch": 9.03072, "grad_norm": 0.5576481819152832, "learning_rate": 3.0650154798761604e-05, "loss": 3.3374, "step": 7062 }, { "epoch": 9.032, "grad_norm": 0.5433608293533325, "learning_rate": 3.060977251312424e-05, "loss": 3.2957, "step": 7063 }, { "epoch": 9.03328, "grad_norm": 0.5531395673751831, "learning_rate": 3.056939022748687e-05, "loss": 3.2484, "step": 7064 }, { "epoch": 9.03456, "grad_norm": 0.5414525866508484, "learning_rate": 3.052900794184951e-05, "loss": 3.218, "step": 7065 }, { "epoch": 9.03584, "grad_norm": 0.5335577726364136, "learning_rate": 3.0488625656212138e-05, "loss": 3.1524, "step": 7066 }, { "epoch": 9.03712, "grad_norm": 0.5346528887748718, "learning_rate": 3.044824337057477e-05, "loss": 3.172, "step": 7067 }, { "epoch": 9.0384, "grad_norm": 0.5395179390907288, "learning_rate": 3.0407861084937405e-05, "loss": 3.1621, "step": 7068 }, { "epoch": 9.03968, "grad_norm": 0.544449508190155, "learning_rate": 3.036747879930004e-05, "loss": 3.2452, "step": 7069 }, { "epoch": 9.04096, "grad_norm": 0.5622183680534363, "learning_rate": 3.0327096513662672e-05, "loss": 3.2826, "step": 7070 }, { "epoch": 9.04224, "grad_norm": 0.543907880783081, "learning_rate": 3.0286714228025306e-05, "loss": 3.2684, "step": 7071 }, { "epoch": 9.043520000000001, "grad_norm": 0.5473181009292603, "learning_rate": 3.024633194238794e-05, "loss": 3.2676, "step": 7072 }, { "epoch": 9.0448, "grad_norm": 0.5329403877258301, "learning_rate": 3.020594965675057e-05, "loss": 3.2042, "step": 7073 }, { "epoch": 9.04608, "grad_norm": 0.519966185092926, "learning_rate": 3.0165567371113203e-05, "loss": 3.1961, "step": 7074 }, { "epoch": 9.04736, "grad_norm": 0.5505262017250061, "learning_rate": 3.0125185085475836e-05, "loss": 3.2669, "step": 7075 }, { "epoch": 9.04864, "grad_norm": 0.5391356945037842, "learning_rate": 3.008480279983847e-05, "loss": 3.2735, "step": 7076 }, { "epoch": 9.04992, "grad_norm": 0.5384522676467896, "learning_rate": 3.0044420514201103e-05, "loss": 3.3018, "step": 7077 }, { "epoch": 9.0512, "grad_norm": 0.5468092560768127, "learning_rate": 3.0004038228563736e-05, "loss": 3.2427, "step": 7078 }, { "epoch": 9.05248, "grad_norm": 0.5462827682495117, "learning_rate": 2.9963655942926366e-05, "loss": 3.2236, "step": 7079 }, { "epoch": 9.05376, "grad_norm": 0.5457635521888733, "learning_rate": 2.9923273657289e-05, "loss": 3.1881, "step": 7080 }, { "epoch": 9.05504, "grad_norm": 0.5283154249191284, "learning_rate": 2.9882891371651633e-05, "loss": 3.2305, "step": 7081 }, { "epoch": 9.05632, "grad_norm": 0.533507227897644, "learning_rate": 2.9842509086014267e-05, "loss": 3.2302, "step": 7082 }, { "epoch": 9.0576, "grad_norm": 0.5426484942436218, "learning_rate": 2.98021268003769e-05, "loss": 3.2233, "step": 7083 }, { "epoch": 9.05888, "grad_norm": 0.5597418546676636, "learning_rate": 2.9761744514739534e-05, "loss": 3.2381, "step": 7084 }, { "epoch": 9.06016, "grad_norm": 0.5480257868766785, "learning_rate": 2.9721362229102167e-05, "loss": 3.2169, "step": 7085 }, { "epoch": 9.06144, "grad_norm": 0.5565598011016846, "learning_rate": 2.9680979943464797e-05, "loss": 3.2586, "step": 7086 }, { "epoch": 9.06272, "grad_norm": 0.5538652539253235, "learning_rate": 2.964059765782743e-05, "loss": 3.211, "step": 7087 }, { "epoch": 9.064, "grad_norm": 0.5413698554039001, "learning_rate": 2.9600215372190064e-05, "loss": 3.2349, "step": 7088 }, { "epoch": 9.06528, "grad_norm": 0.5424516797065735, "learning_rate": 2.9559833086552698e-05, "loss": 3.2359, "step": 7089 }, { "epoch": 9.06656, "grad_norm": 0.5624558925628662, "learning_rate": 2.951945080091533e-05, "loss": 3.2008, "step": 7090 }, { "epoch": 9.06784, "grad_norm": 0.5545729398727417, "learning_rate": 2.9479068515277965e-05, "loss": 3.2058, "step": 7091 }, { "epoch": 9.06912, "grad_norm": 0.5462035536766052, "learning_rate": 2.9438686229640598e-05, "loss": 3.2632, "step": 7092 }, { "epoch": 9.0704, "grad_norm": 0.5403680801391602, "learning_rate": 2.9398303944003225e-05, "loss": 3.2468, "step": 7093 }, { "epoch": 9.07168, "grad_norm": 0.5381686091423035, "learning_rate": 2.935792165836586e-05, "loss": 3.2539, "step": 7094 }, { "epoch": 9.07296, "grad_norm": 0.5552157759666443, "learning_rate": 2.9317539372728495e-05, "loss": 3.2792, "step": 7095 }, { "epoch": 9.07424, "grad_norm": 0.5554720759391785, "learning_rate": 2.927715708709113e-05, "loss": 3.1714, "step": 7096 }, { "epoch": 9.07552, "grad_norm": 0.5406906604766846, "learning_rate": 2.9236774801453762e-05, "loss": 3.1821, "step": 7097 }, { "epoch": 9.0768, "grad_norm": 0.5409752130508423, "learning_rate": 2.9196392515816396e-05, "loss": 3.2008, "step": 7098 }, { "epoch": 9.07808, "grad_norm": 0.5609580874443054, "learning_rate": 2.9156010230179022e-05, "loss": 3.2278, "step": 7099 }, { "epoch": 9.07936, "grad_norm": 0.5392387509346008, "learning_rate": 2.9115627944541656e-05, "loss": 3.2108, "step": 7100 }, { "epoch": 9.08064, "grad_norm": 0.5388525128364563, "learning_rate": 2.907524565890429e-05, "loss": 3.2953, "step": 7101 }, { "epoch": 9.08192, "grad_norm": 0.544260561466217, "learning_rate": 2.9034863373266923e-05, "loss": 3.2398, "step": 7102 }, { "epoch": 9.0832, "grad_norm": 0.5434393882751465, "learning_rate": 2.8994481087629556e-05, "loss": 3.2731, "step": 7103 }, { "epoch": 9.08448, "grad_norm": 0.5499718189239502, "learning_rate": 2.8954098801992193e-05, "loss": 3.1798, "step": 7104 }, { "epoch": 9.08576, "grad_norm": 0.53836590051651, "learning_rate": 2.8913716516354826e-05, "loss": 3.2279, "step": 7105 }, { "epoch": 9.08704, "grad_norm": 0.5385804772377014, "learning_rate": 2.8873334230717453e-05, "loss": 3.2543, "step": 7106 }, { "epoch": 9.08832, "grad_norm": 0.5505902767181396, "learning_rate": 2.8832951945080087e-05, "loss": 3.1762, "step": 7107 }, { "epoch": 9.0896, "grad_norm": 0.5531958937644958, "learning_rate": 2.879256965944272e-05, "loss": 3.197, "step": 7108 }, { "epoch": 9.09088, "grad_norm": 0.5575078725814819, "learning_rate": 2.8752187373805353e-05, "loss": 3.2469, "step": 7109 }, { "epoch": 9.09216, "grad_norm": 0.5386976003646851, "learning_rate": 2.8711805088167987e-05, "loss": 3.2092, "step": 7110 }, { "epoch": 9.09344, "grad_norm": 0.5384331345558167, "learning_rate": 2.867142280253062e-05, "loss": 3.2562, "step": 7111 }, { "epoch": 9.09472, "grad_norm": 0.549574077129364, "learning_rate": 2.8631040516893254e-05, "loss": 3.2311, "step": 7112 }, { "epoch": 9.096, "grad_norm": 0.5679108500480652, "learning_rate": 2.8590658231255884e-05, "loss": 3.3449, "step": 7113 }, { "epoch": 9.09728, "grad_norm": 0.5485793948173523, "learning_rate": 2.8550275945618517e-05, "loss": 3.2654, "step": 7114 }, { "epoch": 9.09856, "grad_norm": 0.5479905605316162, "learning_rate": 2.850989365998115e-05, "loss": 3.252, "step": 7115 }, { "epoch": 9.09984, "grad_norm": 0.5610731840133667, "learning_rate": 2.8469511374343784e-05, "loss": 3.2753, "step": 7116 }, { "epoch": 9.10112, "grad_norm": 0.5504294633865356, "learning_rate": 2.8429129088706418e-05, "loss": 3.2685, "step": 7117 }, { "epoch": 9.1024, "grad_norm": 0.5451478958129883, "learning_rate": 2.838874680306905e-05, "loss": 3.1972, "step": 7118 }, { "epoch": 9.10368, "grad_norm": 0.5582462549209595, "learning_rate": 2.8348364517431685e-05, "loss": 3.2463, "step": 7119 }, { "epoch": 9.10496, "grad_norm": 0.5593787431716919, "learning_rate": 2.8307982231794315e-05, "loss": 3.304, "step": 7120 }, { "epoch": 9.10624, "grad_norm": 0.5443241596221924, "learning_rate": 2.8267599946156948e-05, "loss": 3.1912, "step": 7121 }, { "epoch": 9.10752, "grad_norm": 0.52846759557724, "learning_rate": 2.822721766051958e-05, "loss": 3.1861, "step": 7122 }, { "epoch": 9.1088, "grad_norm": 0.5365118384361267, "learning_rate": 2.8186835374882215e-05, "loss": 3.227, "step": 7123 }, { "epoch": 9.11008, "grad_norm": 0.5490257740020752, "learning_rate": 2.814645308924485e-05, "loss": 3.2294, "step": 7124 }, { "epoch": 9.11136, "grad_norm": 0.5475121140480042, "learning_rate": 2.8106070803607482e-05, "loss": 3.2356, "step": 7125 }, { "epoch": 9.11264, "grad_norm": 0.5491126775741577, "learning_rate": 2.8065688517970112e-05, "loss": 3.2843, "step": 7126 }, { "epoch": 9.11392, "grad_norm": 0.5404393076896667, "learning_rate": 2.8025306232332746e-05, "loss": 3.2315, "step": 7127 }, { "epoch": 9.1152, "grad_norm": 0.5436875224113464, "learning_rate": 2.798492394669538e-05, "loss": 3.2436, "step": 7128 }, { "epoch": 9.11648, "grad_norm": 0.5332518219947815, "learning_rate": 2.7944541661058013e-05, "loss": 3.2059, "step": 7129 }, { "epoch": 9.11776, "grad_norm": 0.5293616056442261, "learning_rate": 2.7904159375420646e-05, "loss": 3.2773, "step": 7130 }, { "epoch": 9.11904, "grad_norm": 0.5454232692718506, "learning_rate": 2.786377708978328e-05, "loss": 3.2034, "step": 7131 }, { "epoch": 9.12032, "grad_norm": 0.5401408076286316, "learning_rate": 2.7823394804145913e-05, "loss": 3.1661, "step": 7132 }, { "epoch": 9.1216, "grad_norm": 0.5544297695159912, "learning_rate": 2.7783012518508543e-05, "loss": 3.2872, "step": 7133 }, { "epoch": 9.12288, "grad_norm": 0.5589653253555298, "learning_rate": 2.7742630232871176e-05, "loss": 3.3172, "step": 7134 }, { "epoch": 9.12416, "grad_norm": 0.5525814294815063, "learning_rate": 2.770224794723381e-05, "loss": 3.2713, "step": 7135 }, { "epoch": 9.12544, "grad_norm": 0.5349526405334473, "learning_rate": 2.7661865661596443e-05, "loss": 3.2127, "step": 7136 }, { "epoch": 9.12672, "grad_norm": 0.5305134654045105, "learning_rate": 2.7621483375959077e-05, "loss": 3.1684, "step": 7137 }, { "epoch": 9.128, "grad_norm": 0.5401234030723572, "learning_rate": 2.758110109032171e-05, "loss": 3.3105, "step": 7138 }, { "epoch": 9.12928, "grad_norm": 0.5535856485366821, "learning_rate": 2.7540718804684344e-05, "loss": 3.2225, "step": 7139 }, { "epoch": 9.13056, "grad_norm": 0.5476309061050415, "learning_rate": 2.7500336519046974e-05, "loss": 3.2879, "step": 7140 }, { "epoch": 9.13184, "grad_norm": 0.5413089394569397, "learning_rate": 2.7459954233409607e-05, "loss": 3.1868, "step": 7141 }, { "epoch": 9.13312, "grad_norm": 0.548028290271759, "learning_rate": 2.741957194777224e-05, "loss": 3.2499, "step": 7142 }, { "epoch": 9.1344, "grad_norm": 0.5540904402732849, "learning_rate": 2.7379189662134874e-05, "loss": 3.2888, "step": 7143 }, { "epoch": 9.13568, "grad_norm": 0.5426977872848511, "learning_rate": 2.7338807376497508e-05, "loss": 3.227, "step": 7144 }, { "epoch": 9.13696, "grad_norm": 0.5503297448158264, "learning_rate": 2.729842509086014e-05, "loss": 3.1835, "step": 7145 }, { "epoch": 9.13824, "grad_norm": 0.5338129997253418, "learning_rate": 2.725804280522277e-05, "loss": 3.257, "step": 7146 }, { "epoch": 9.13952, "grad_norm": 0.5652978420257568, "learning_rate": 2.7217660519585405e-05, "loss": 3.2995, "step": 7147 }, { "epoch": 9.1408, "grad_norm": 0.5556164979934692, "learning_rate": 2.7177278233948038e-05, "loss": 3.2905, "step": 7148 }, { "epoch": 9.14208, "grad_norm": 0.5410715341567993, "learning_rate": 2.713689594831067e-05, "loss": 3.3382, "step": 7149 }, { "epoch": 9.14336, "grad_norm": 0.546202540397644, "learning_rate": 2.7096513662673305e-05, "loss": 3.321, "step": 7150 }, { "epoch": 9.14464, "grad_norm": 0.5364200472831726, "learning_rate": 2.705613137703594e-05, "loss": 3.2471, "step": 7151 }, { "epoch": 9.14592, "grad_norm": 0.5380752086639404, "learning_rate": 2.7015749091398572e-05, "loss": 3.3008, "step": 7152 }, { "epoch": 9.1472, "grad_norm": 0.538922131061554, "learning_rate": 2.6975366805761202e-05, "loss": 3.2591, "step": 7153 }, { "epoch": 9.14848, "grad_norm": 0.5580816864967346, "learning_rate": 2.6934984520123836e-05, "loss": 3.2522, "step": 7154 }, { "epoch": 9.14976, "grad_norm": 0.5336586236953735, "learning_rate": 2.689460223448647e-05, "loss": 3.1542, "step": 7155 }, { "epoch": 9.15104, "grad_norm": 0.5401532649993896, "learning_rate": 2.6854219948849103e-05, "loss": 3.1945, "step": 7156 }, { "epoch": 9.15232, "grad_norm": 0.5493281483650208, "learning_rate": 2.6813837663211736e-05, "loss": 3.3135, "step": 7157 }, { "epoch": 9.1536, "grad_norm": 0.545318603515625, "learning_rate": 2.677345537757437e-05, "loss": 3.2359, "step": 7158 }, { "epoch": 9.15488, "grad_norm": 0.5534164309501648, "learning_rate": 2.6733073091937003e-05, "loss": 3.2863, "step": 7159 }, { "epoch": 9.15616, "grad_norm": 0.547370970249176, "learning_rate": 2.6692690806299633e-05, "loss": 3.2479, "step": 7160 }, { "epoch": 9.15744, "grad_norm": 0.5454353094100952, "learning_rate": 2.6652308520662266e-05, "loss": 3.2092, "step": 7161 }, { "epoch": 9.15872, "grad_norm": 0.5439633727073669, "learning_rate": 2.66119262350249e-05, "loss": 3.2338, "step": 7162 }, { "epoch": 9.16, "grad_norm": 0.5375586152076721, "learning_rate": 2.6571543949387533e-05, "loss": 3.2332, "step": 7163 }, { "epoch": 9.16128, "grad_norm": 0.5413592457771301, "learning_rate": 2.6531161663750167e-05, "loss": 3.1831, "step": 7164 }, { "epoch": 9.16256, "grad_norm": 0.5374849438667297, "learning_rate": 2.64907793781128e-05, "loss": 3.2544, "step": 7165 }, { "epoch": 9.16384, "grad_norm": 0.5411040782928467, "learning_rate": 2.6450397092475434e-05, "loss": 3.2554, "step": 7166 }, { "epoch": 9.16512, "grad_norm": 0.5422086119651794, "learning_rate": 2.6410014806838064e-05, "loss": 3.1815, "step": 7167 }, { "epoch": 9.1664, "grad_norm": 0.5530745983123779, "learning_rate": 2.6369632521200697e-05, "loss": 3.329, "step": 7168 }, { "epoch": 9.16768, "grad_norm": 0.535277247428894, "learning_rate": 2.632925023556333e-05, "loss": 3.1514, "step": 7169 }, { "epoch": 9.16896, "grad_norm": 0.5430591106414795, "learning_rate": 2.6288867949925964e-05, "loss": 3.2385, "step": 7170 }, { "epoch": 9.17024, "grad_norm": 0.5327318906784058, "learning_rate": 2.6248485664288598e-05, "loss": 3.2665, "step": 7171 }, { "epoch": 9.17152, "grad_norm": 0.5375852584838867, "learning_rate": 2.620810337865123e-05, "loss": 3.1724, "step": 7172 }, { "epoch": 9.1728, "grad_norm": 0.5394271612167358, "learning_rate": 2.616772109301386e-05, "loss": 3.1937, "step": 7173 }, { "epoch": 9.17408, "grad_norm": 0.5466704368591309, "learning_rate": 2.6127338807376495e-05, "loss": 3.1974, "step": 7174 }, { "epoch": 9.17536, "grad_norm": 0.5488426685333252, "learning_rate": 2.6086956521739128e-05, "loss": 3.2674, "step": 7175 }, { "epoch": 9.17664, "grad_norm": 0.5353536009788513, "learning_rate": 2.604657423610176e-05, "loss": 3.2576, "step": 7176 }, { "epoch": 9.17792, "grad_norm": 0.5377178192138672, "learning_rate": 2.6006191950464395e-05, "loss": 3.2777, "step": 7177 }, { "epoch": 9.1792, "grad_norm": 0.5338087677955627, "learning_rate": 2.596580966482703e-05, "loss": 3.2165, "step": 7178 }, { "epoch": 9.18048, "grad_norm": 0.5402014851570129, "learning_rate": 2.5925427379189662e-05, "loss": 3.2682, "step": 7179 }, { "epoch": 9.18176, "grad_norm": 0.5575425028800964, "learning_rate": 2.5885045093552292e-05, "loss": 3.2025, "step": 7180 }, { "epoch": 9.18304, "grad_norm": 0.5500099658966064, "learning_rate": 2.5844662807914926e-05, "loss": 3.2428, "step": 7181 }, { "epoch": 9.18432, "grad_norm": 0.5393756628036499, "learning_rate": 2.580428052227756e-05, "loss": 3.3224, "step": 7182 }, { "epoch": 9.1856, "grad_norm": 0.5401633381843567, "learning_rate": 2.5763898236640192e-05, "loss": 3.1761, "step": 7183 }, { "epoch": 9.18688, "grad_norm": 0.5423944592475891, "learning_rate": 2.5723515951002826e-05, "loss": 3.2521, "step": 7184 }, { "epoch": 9.18816, "grad_norm": 0.5446013808250427, "learning_rate": 2.568313366536546e-05, "loss": 3.2659, "step": 7185 }, { "epoch": 9.18944, "grad_norm": 0.5417162775993347, "learning_rate": 2.5642751379728093e-05, "loss": 3.2457, "step": 7186 }, { "epoch": 9.19072, "grad_norm": 0.5435442328453064, "learning_rate": 2.5602369094090723e-05, "loss": 3.2624, "step": 7187 }, { "epoch": 9.192, "grad_norm": 0.5303104519844055, "learning_rate": 2.5561986808453356e-05, "loss": 3.2494, "step": 7188 }, { "epoch": 9.19328, "grad_norm": 0.5464146137237549, "learning_rate": 2.552160452281599e-05, "loss": 3.2261, "step": 7189 }, { "epoch": 9.19456, "grad_norm": 0.5271885395050049, "learning_rate": 2.5481222237178623e-05, "loss": 3.2895, "step": 7190 }, { "epoch": 9.19584, "grad_norm": 0.5422387719154358, "learning_rate": 2.5440839951541257e-05, "loss": 3.2853, "step": 7191 }, { "epoch": 9.19712, "grad_norm": 0.533074676990509, "learning_rate": 2.540045766590389e-05, "loss": 3.2502, "step": 7192 }, { "epoch": 9.1984, "grad_norm": 0.5399423837661743, "learning_rate": 2.536007538026652e-05, "loss": 3.2562, "step": 7193 }, { "epoch": 9.19968, "grad_norm": 0.5299935936927795, "learning_rate": 2.5319693094629154e-05, "loss": 3.1815, "step": 7194 }, { "epoch": 9.20096, "grad_norm": 0.538884162902832, "learning_rate": 2.5279310808991787e-05, "loss": 3.1976, "step": 7195 }, { "epoch": 9.20224, "grad_norm": 0.551018238067627, "learning_rate": 2.523892852335442e-05, "loss": 3.2776, "step": 7196 }, { "epoch": 9.20352, "grad_norm": 0.5342896580696106, "learning_rate": 2.5198546237717054e-05, "loss": 3.1877, "step": 7197 }, { "epoch": 9.2048, "grad_norm": 0.5489623546600342, "learning_rate": 2.5158163952079688e-05, "loss": 3.2537, "step": 7198 }, { "epoch": 9.20608, "grad_norm": 0.5403774380683899, "learning_rate": 2.511778166644232e-05, "loss": 3.2378, "step": 7199 }, { "epoch": 9.20736, "grad_norm": 0.5302898287773132, "learning_rate": 2.5077399380804948e-05, "loss": 3.1942, "step": 7200 }, { "epoch": 9.20864, "grad_norm": 0.5362948179244995, "learning_rate": 2.5037017095167585e-05, "loss": 3.2845, "step": 7201 }, { "epoch": 9.20992, "grad_norm": 0.5381386280059814, "learning_rate": 2.4996634809530218e-05, "loss": 3.2257, "step": 7202 }, { "epoch": 9.2112, "grad_norm": 0.5591046214103699, "learning_rate": 2.495625252389285e-05, "loss": 3.2319, "step": 7203 }, { "epoch": 9.21248, "grad_norm": 0.5458407998085022, "learning_rate": 2.4915870238255485e-05, "loss": 3.24, "step": 7204 }, { "epoch": 9.21376, "grad_norm": 0.5337607860565186, "learning_rate": 2.487548795261812e-05, "loss": 3.3132, "step": 7205 }, { "epoch": 9.21504, "grad_norm": 0.5361645817756653, "learning_rate": 2.4835105666980752e-05, "loss": 3.2828, "step": 7206 }, { "epoch": 9.21632, "grad_norm": 0.5330992937088013, "learning_rate": 2.479472338134338e-05, "loss": 3.2427, "step": 7207 }, { "epoch": 9.2176, "grad_norm": 0.5395091772079468, "learning_rate": 2.4754341095706012e-05, "loss": 3.2042, "step": 7208 }, { "epoch": 9.21888, "grad_norm": 0.5380380749702454, "learning_rate": 2.4713958810068646e-05, "loss": 3.2095, "step": 7209 }, { "epoch": 9.22016, "grad_norm": 0.5278270244598389, "learning_rate": 2.4673576524431282e-05, "loss": 3.2077, "step": 7210 }, { "epoch": 9.22144, "grad_norm": 0.5470373034477234, "learning_rate": 2.4633194238793916e-05, "loss": 3.1724, "step": 7211 }, { "epoch": 9.22272, "grad_norm": 0.5408812165260315, "learning_rate": 2.459281195315655e-05, "loss": 3.3655, "step": 7212 }, { "epoch": 9.224, "grad_norm": 0.5187749266624451, "learning_rate": 2.4552429667519183e-05, "loss": 3.2324, "step": 7213 }, { "epoch": 9.22528, "grad_norm": 0.5471019148826599, "learning_rate": 2.451204738188181e-05, "loss": 3.227, "step": 7214 }, { "epoch": 9.22656, "grad_norm": 0.5407528281211853, "learning_rate": 2.4471665096244443e-05, "loss": 3.2282, "step": 7215 }, { "epoch": 9.22784, "grad_norm": 0.5427590608596802, "learning_rate": 2.4431282810607076e-05, "loss": 3.2541, "step": 7216 }, { "epoch": 9.22912, "grad_norm": 0.5527137517929077, "learning_rate": 2.439090052496971e-05, "loss": 3.3002, "step": 7217 }, { "epoch": 9.2304, "grad_norm": 0.5542083978652954, "learning_rate": 2.4350518239332343e-05, "loss": 3.2398, "step": 7218 }, { "epoch": 9.23168, "grad_norm": 0.5390406250953674, "learning_rate": 2.431013595369498e-05, "loss": 3.2449, "step": 7219 }, { "epoch": 9.23296, "grad_norm": 0.5343725681304932, "learning_rate": 2.4269753668057607e-05, "loss": 3.205, "step": 7220 }, { "epoch": 9.23424, "grad_norm": 0.53874671459198, "learning_rate": 2.422937138242024e-05, "loss": 3.2123, "step": 7221 }, { "epoch": 9.23552, "grad_norm": 0.5471300482749939, "learning_rate": 2.4188989096782874e-05, "loss": 3.2171, "step": 7222 }, { "epoch": 9.2368, "grad_norm": 0.5401138067245483, "learning_rate": 2.4148606811145507e-05, "loss": 3.2517, "step": 7223 }, { "epoch": 9.23808, "grad_norm": 0.5442971587181091, "learning_rate": 2.410822452550814e-05, "loss": 3.226, "step": 7224 }, { "epoch": 9.23936, "grad_norm": 0.5361701846122742, "learning_rate": 2.4067842239870774e-05, "loss": 3.2387, "step": 7225 }, { "epoch": 9.24064, "grad_norm": 0.5273582935333252, "learning_rate": 2.4027459954233408e-05, "loss": 3.2343, "step": 7226 }, { "epoch": 9.24192, "grad_norm": 0.5558703541755676, "learning_rate": 2.3987077668596038e-05, "loss": 3.291, "step": 7227 }, { "epoch": 9.2432, "grad_norm": 0.5563510656356812, "learning_rate": 2.394669538295867e-05, "loss": 3.2446, "step": 7228 }, { "epoch": 9.24448, "grad_norm": 0.5549482107162476, "learning_rate": 2.3906313097321305e-05, "loss": 3.1956, "step": 7229 }, { "epoch": 9.24576, "grad_norm": 0.5281810760498047, "learning_rate": 2.3865930811683938e-05, "loss": 3.3095, "step": 7230 }, { "epoch": 9.24704, "grad_norm": 0.5421765446662903, "learning_rate": 2.382554852604657e-05, "loss": 3.2243, "step": 7231 }, { "epoch": 9.24832, "grad_norm": 0.5279957056045532, "learning_rate": 2.3785166240409205e-05, "loss": 3.2774, "step": 7232 }, { "epoch": 9.2496, "grad_norm": 0.5412895679473877, "learning_rate": 2.374478395477184e-05, "loss": 3.195, "step": 7233 }, { "epoch": 9.25088, "grad_norm": 0.5304553508758545, "learning_rate": 2.370440166913447e-05, "loss": 3.2945, "step": 7234 }, { "epoch": 9.25216, "grad_norm": 0.536499559879303, "learning_rate": 2.3664019383497102e-05, "loss": 3.1877, "step": 7235 }, { "epoch": 9.25344, "grad_norm": 0.5327797532081604, "learning_rate": 2.3623637097859736e-05, "loss": 3.186, "step": 7236 }, { "epoch": 9.25472, "grad_norm": 0.5580500960350037, "learning_rate": 2.358325481222237e-05, "loss": 3.2472, "step": 7237 }, { "epoch": 9.256, "grad_norm": 0.5418148040771484, "learning_rate": 2.3542872526585002e-05, "loss": 3.293, "step": 7238 }, { "epoch": 9.25728, "grad_norm": 0.5535669922828674, "learning_rate": 2.3502490240947636e-05, "loss": 3.247, "step": 7239 }, { "epoch": 9.25856, "grad_norm": 0.5439664721488953, "learning_rate": 2.3462107955310266e-05, "loss": 3.2334, "step": 7240 }, { "epoch": 9.25984, "grad_norm": 0.5313609838485718, "learning_rate": 2.34217256696729e-05, "loss": 3.1706, "step": 7241 }, { "epoch": 9.26112, "grad_norm": 0.5202425122261047, "learning_rate": 2.3381343384035533e-05, "loss": 3.3, "step": 7242 }, { "epoch": 9.2624, "grad_norm": 0.5400013327598572, "learning_rate": 2.3340961098398166e-05, "loss": 3.2212, "step": 7243 }, { "epoch": 9.26368, "grad_norm": 0.5324732661247253, "learning_rate": 2.33005788127608e-05, "loss": 3.1796, "step": 7244 }, { "epoch": 9.26496, "grad_norm": 0.5566893815994263, "learning_rate": 2.3260196527123433e-05, "loss": 3.2211, "step": 7245 }, { "epoch": 9.26624, "grad_norm": 0.5362138152122498, "learning_rate": 2.3219814241486067e-05, "loss": 3.2751, "step": 7246 }, { "epoch": 9.26752, "grad_norm": 0.5280265808105469, "learning_rate": 2.3179431955848697e-05, "loss": 3.3033, "step": 7247 }, { "epoch": 9.2688, "grad_norm": 0.5292081236839294, "learning_rate": 2.313904967021133e-05, "loss": 3.2031, "step": 7248 }, { "epoch": 9.27008, "grad_norm": 0.529188334941864, "learning_rate": 2.3098667384573964e-05, "loss": 3.1846, "step": 7249 }, { "epoch": 9.27136, "grad_norm": 0.5365098118782043, "learning_rate": 2.3058285098936597e-05, "loss": 3.2667, "step": 7250 }, { "epoch": 9.272639999999999, "grad_norm": 0.5411892533302307, "learning_rate": 2.301790281329923e-05, "loss": 3.2621, "step": 7251 }, { "epoch": 9.27392, "grad_norm": 0.5581626296043396, "learning_rate": 2.2977520527661864e-05, "loss": 3.2695, "step": 7252 }, { "epoch": 9.2752, "grad_norm": 0.5382561087608337, "learning_rate": 2.2937138242024498e-05, "loss": 3.1889, "step": 7253 }, { "epoch": 9.27648, "grad_norm": 0.5616065859794617, "learning_rate": 2.2896755956387128e-05, "loss": 3.3357, "step": 7254 }, { "epoch": 9.27776, "grad_norm": 0.5404537320137024, "learning_rate": 2.285637367074976e-05, "loss": 3.2573, "step": 7255 }, { "epoch": 9.27904, "grad_norm": 0.5330450534820557, "learning_rate": 2.2815991385112395e-05, "loss": 3.1843, "step": 7256 }, { "epoch": 9.28032, "grad_norm": 0.5430814027786255, "learning_rate": 2.2775609099475028e-05, "loss": 3.1601, "step": 7257 }, { "epoch": 9.2816, "grad_norm": 0.5439402461051941, "learning_rate": 2.273522681383766e-05, "loss": 3.2283, "step": 7258 }, { "epoch": 9.28288, "grad_norm": 0.5354278087615967, "learning_rate": 2.2694844528200295e-05, "loss": 3.1896, "step": 7259 }, { "epoch": 9.28416, "grad_norm": 0.5329548120498657, "learning_rate": 2.265446224256293e-05, "loss": 3.278, "step": 7260 }, { "epoch": 9.28544, "grad_norm": 0.5459061861038208, "learning_rate": 2.261407995692556e-05, "loss": 3.2793, "step": 7261 }, { "epoch": 9.28672, "grad_norm": 0.5458545684814453, "learning_rate": 2.2573697671288192e-05, "loss": 3.2462, "step": 7262 }, { "epoch": 9.288, "grad_norm": 0.5448905229568481, "learning_rate": 2.2533315385650826e-05, "loss": 3.1968, "step": 7263 }, { "epoch": 9.28928, "grad_norm": 0.5559900403022766, "learning_rate": 2.249293310001346e-05, "loss": 3.21, "step": 7264 }, { "epoch": 9.29056, "grad_norm": 0.5367174744606018, "learning_rate": 2.2452550814376092e-05, "loss": 3.137, "step": 7265 }, { "epoch": 9.29184, "grad_norm": 0.5573728084564209, "learning_rate": 2.2412168528738726e-05, "loss": 3.2597, "step": 7266 }, { "epoch": 9.29312, "grad_norm": 0.5518680214881897, "learning_rate": 2.2371786243101356e-05, "loss": 3.3036, "step": 7267 }, { "epoch": 9.2944, "grad_norm": 0.5395819544792175, "learning_rate": 2.233140395746399e-05, "loss": 3.2503, "step": 7268 }, { "epoch": 9.29568, "grad_norm": 0.528838038444519, "learning_rate": 2.2291021671826623e-05, "loss": 3.2374, "step": 7269 }, { "epoch": 9.29696, "grad_norm": 0.5213932394981384, "learning_rate": 2.2250639386189256e-05, "loss": 3.2252, "step": 7270 }, { "epoch": 9.29824, "grad_norm": 0.5346904993057251, "learning_rate": 2.221025710055189e-05, "loss": 3.2775, "step": 7271 }, { "epoch": 9.29952, "grad_norm": 0.5518351197242737, "learning_rate": 2.2169874814914523e-05, "loss": 3.2446, "step": 7272 }, { "epoch": 9.3008, "grad_norm": 0.5426953434944153, "learning_rate": 2.2129492529277157e-05, "loss": 3.2718, "step": 7273 }, { "epoch": 9.30208, "grad_norm": 0.5330793261528015, "learning_rate": 2.2089110243639787e-05, "loss": 3.2078, "step": 7274 }, { "epoch": 9.30336, "grad_norm": 0.5355795621871948, "learning_rate": 2.204872795800242e-05, "loss": 3.2268, "step": 7275 }, { "epoch": 9.30464, "grad_norm": 0.5403215885162354, "learning_rate": 2.2008345672365054e-05, "loss": 3.2271, "step": 7276 }, { "epoch": 9.30592, "grad_norm": 0.5434122085571289, "learning_rate": 2.1967963386727687e-05, "loss": 3.2366, "step": 7277 }, { "epoch": 9.3072, "grad_norm": 0.5365995168685913, "learning_rate": 2.192758110109032e-05, "loss": 3.2595, "step": 7278 }, { "epoch": 9.30848, "grad_norm": 0.537118136882782, "learning_rate": 2.1887198815452954e-05, "loss": 3.2239, "step": 7279 }, { "epoch": 9.30976, "grad_norm": 0.5411548614501953, "learning_rate": 2.1846816529815588e-05, "loss": 3.3102, "step": 7280 }, { "epoch": 9.31104, "grad_norm": 0.5356780290603638, "learning_rate": 2.1806434244178218e-05, "loss": 3.3315, "step": 7281 }, { "epoch": 9.31232, "grad_norm": 0.5491350889205933, "learning_rate": 2.176605195854085e-05, "loss": 3.3068, "step": 7282 }, { "epoch": 9.3136, "grad_norm": 0.5464149713516235, "learning_rate": 2.1725669672903485e-05, "loss": 3.2792, "step": 7283 }, { "epoch": 9.31488, "grad_norm": 0.5239377617835999, "learning_rate": 2.1685287387266118e-05, "loss": 3.1874, "step": 7284 }, { "epoch": 9.31616, "grad_norm": 0.5476245284080505, "learning_rate": 2.164490510162875e-05, "loss": 3.2341, "step": 7285 }, { "epoch": 9.31744, "grad_norm": 0.546082079410553, "learning_rate": 2.1604522815991385e-05, "loss": 3.237, "step": 7286 }, { "epoch": 9.31872, "grad_norm": 0.5425671339035034, "learning_rate": 2.1564140530354015e-05, "loss": 3.2401, "step": 7287 }, { "epoch": 9.32, "grad_norm": 0.5253366827964783, "learning_rate": 2.152375824471665e-05, "loss": 3.2288, "step": 7288 }, { "epoch": 9.32128, "grad_norm": 0.5394009351730347, "learning_rate": 2.1483375959079282e-05, "loss": 3.2209, "step": 7289 }, { "epoch": 9.32256, "grad_norm": 0.5472865104675293, "learning_rate": 2.1442993673441915e-05, "loss": 3.2802, "step": 7290 }, { "epoch": 9.32384, "grad_norm": 0.5333254933357239, "learning_rate": 2.140261138780455e-05, "loss": 3.2069, "step": 7291 }, { "epoch": 9.32512, "grad_norm": 0.5474021434783936, "learning_rate": 2.1362229102167182e-05, "loss": 3.2606, "step": 7292 }, { "epoch": 9.3264, "grad_norm": 0.5382992625236511, "learning_rate": 2.1321846816529816e-05, "loss": 3.2327, "step": 7293 }, { "epoch": 9.32768, "grad_norm": 0.533406674861908, "learning_rate": 2.1281464530892446e-05, "loss": 3.2464, "step": 7294 }, { "epoch": 9.32896, "grad_norm": 0.5368064641952515, "learning_rate": 2.124108224525508e-05, "loss": 3.2933, "step": 7295 }, { "epoch": 9.33024, "grad_norm": 0.5427320599555969, "learning_rate": 2.1200699959617713e-05, "loss": 3.2846, "step": 7296 }, { "epoch": 9.33152, "grad_norm": 0.5371754765510559, "learning_rate": 2.1160317673980346e-05, "loss": 3.3316, "step": 7297 }, { "epoch": 9.3328, "grad_norm": 0.5448208451271057, "learning_rate": 2.111993538834298e-05, "loss": 3.2638, "step": 7298 }, { "epoch": 9.33408, "grad_norm": 0.5369369983673096, "learning_rate": 2.1079553102705613e-05, "loss": 3.172, "step": 7299 }, { "epoch": 9.33536, "grad_norm": 0.5618038773536682, "learning_rate": 2.1039170817068247e-05, "loss": 3.2398, "step": 7300 }, { "epoch": 9.33664, "grad_norm": 0.5481551289558411, "learning_rate": 2.0998788531430877e-05, "loss": 3.25, "step": 7301 }, { "epoch": 9.33792, "grad_norm": 0.5384693741798401, "learning_rate": 2.095840624579351e-05, "loss": 3.2027, "step": 7302 }, { "epoch": 9.3392, "grad_norm": 0.5455001592636108, "learning_rate": 2.0918023960156144e-05, "loss": 3.2312, "step": 7303 }, { "epoch": 9.34048, "grad_norm": 0.5411433577537537, "learning_rate": 2.0877641674518777e-05, "loss": 3.2789, "step": 7304 }, { "epoch": 9.34176, "grad_norm": 0.5493059158325195, "learning_rate": 2.083725938888141e-05, "loss": 3.2225, "step": 7305 }, { "epoch": 9.34304, "grad_norm": 0.5578423738479614, "learning_rate": 2.0796877103244044e-05, "loss": 3.2215, "step": 7306 }, { "epoch": 9.34432, "grad_norm": 0.5362764596939087, "learning_rate": 2.0756494817606678e-05, "loss": 3.1921, "step": 7307 }, { "epoch": 9.3456, "grad_norm": 0.5485124588012695, "learning_rate": 2.0716112531969308e-05, "loss": 3.2242, "step": 7308 }, { "epoch": 9.34688, "grad_norm": 0.5391074419021606, "learning_rate": 2.067573024633194e-05, "loss": 3.2832, "step": 7309 }, { "epoch": 9.34816, "grad_norm": 0.5347970128059387, "learning_rate": 2.0635347960694575e-05, "loss": 3.2889, "step": 7310 }, { "epoch": 9.34944, "grad_norm": 0.5423051714897156, "learning_rate": 2.0594965675057208e-05, "loss": 3.2891, "step": 7311 }, { "epoch": 9.35072, "grad_norm": 0.5293038487434387, "learning_rate": 2.055458338941984e-05, "loss": 3.224, "step": 7312 }, { "epoch": 9.352, "grad_norm": 0.5548036098480225, "learning_rate": 2.0514201103782475e-05, "loss": 3.2699, "step": 7313 }, { "epoch": 9.35328, "grad_norm": 0.534817636013031, "learning_rate": 2.04738188181451e-05, "loss": 3.1907, "step": 7314 }, { "epoch": 9.35456, "grad_norm": 0.5452345013618469, "learning_rate": 2.0433436532507735e-05, "loss": 3.2226, "step": 7315 }, { "epoch": 9.35584, "grad_norm": 0.5356956124305725, "learning_rate": 2.0393054246870372e-05, "loss": 3.2319, "step": 7316 }, { "epoch": 9.35712, "grad_norm": 0.5347426533699036, "learning_rate": 2.0352671961233005e-05, "loss": 3.2123, "step": 7317 }, { "epoch": 9.3584, "grad_norm": 0.533959686756134, "learning_rate": 2.031228967559564e-05, "loss": 3.213, "step": 7318 }, { "epoch": 9.35968, "grad_norm": 0.549071729183197, "learning_rate": 2.0271907389958272e-05, "loss": 3.3334, "step": 7319 }, { "epoch": 9.36096, "grad_norm": 0.5512160658836365, "learning_rate": 2.0231525104320906e-05, "loss": 3.2198, "step": 7320 }, { "epoch": 9.36224, "grad_norm": 0.5341069102287292, "learning_rate": 2.0191142818683533e-05, "loss": 3.1991, "step": 7321 }, { "epoch": 9.36352, "grad_norm": 0.536259114742279, "learning_rate": 2.0150760533046166e-05, "loss": 3.2656, "step": 7322 }, { "epoch": 9.3648, "grad_norm": 0.5421985387802124, "learning_rate": 2.01103782474088e-05, "loss": 3.1539, "step": 7323 }, { "epoch": 9.36608, "grad_norm": 0.5535771250724792, "learning_rate": 2.0069995961771433e-05, "loss": 3.2589, "step": 7324 }, { "epoch": 9.36736, "grad_norm": 0.5412119626998901, "learning_rate": 2.002961367613407e-05, "loss": 3.3689, "step": 7325 }, { "epoch": 9.36864, "grad_norm": 0.5178043842315674, "learning_rate": 1.9989231390496703e-05, "loss": 3.177, "step": 7326 }, { "epoch": 9.36992, "grad_norm": 0.5386024117469788, "learning_rate": 1.9948849104859337e-05, "loss": 3.23, "step": 7327 }, { "epoch": 9.3712, "grad_norm": 0.5399037599563599, "learning_rate": 1.9908466819221963e-05, "loss": 3.274, "step": 7328 }, { "epoch": 9.37248, "grad_norm": 0.5423693060874939, "learning_rate": 1.9868084533584597e-05, "loss": 3.1935, "step": 7329 }, { "epoch": 9.37376, "grad_norm": 0.5459950566291809, "learning_rate": 1.982770224794723e-05, "loss": 3.1973, "step": 7330 }, { "epoch": 9.37504, "grad_norm": 0.5486062169075012, "learning_rate": 1.9787319962309864e-05, "loss": 3.1877, "step": 7331 }, { "epoch": 9.37632, "grad_norm": 0.552658200263977, "learning_rate": 1.9746937676672497e-05, "loss": 3.2007, "step": 7332 }, { "epoch": 9.3776, "grad_norm": 0.5424798130989075, "learning_rate": 1.970655539103513e-05, "loss": 3.2671, "step": 7333 }, { "epoch": 9.37888, "grad_norm": 0.5408488512039185, "learning_rate": 1.9666173105397768e-05, "loss": 3.222, "step": 7334 }, { "epoch": 9.38016, "grad_norm": 0.537208080291748, "learning_rate": 1.9625790819760394e-05, "loss": 3.3263, "step": 7335 }, { "epoch": 9.38144, "grad_norm": 0.5427209734916687, "learning_rate": 1.9585408534123028e-05, "loss": 3.2509, "step": 7336 }, { "epoch": 9.38272, "grad_norm": 0.5269715189933777, "learning_rate": 1.954502624848566e-05, "loss": 3.1778, "step": 7337 }, { "epoch": 9.384, "grad_norm": 0.5249371528625488, "learning_rate": 1.9504643962848295e-05, "loss": 3.1646, "step": 7338 }, { "epoch": 9.38528, "grad_norm": 0.5350185632705688, "learning_rate": 1.9464261677210928e-05, "loss": 3.1933, "step": 7339 }, { "epoch": 9.38656, "grad_norm": 0.5533794164657593, "learning_rate": 1.942387939157356e-05, "loss": 3.3036, "step": 7340 }, { "epoch": 9.38784, "grad_norm": 0.537314236164093, "learning_rate": 1.938349710593619e-05, "loss": 3.266, "step": 7341 }, { "epoch": 9.38912, "grad_norm": 0.5408215522766113, "learning_rate": 1.9343114820298825e-05, "loss": 3.2319, "step": 7342 }, { "epoch": 9.3904, "grad_norm": 0.5351347923278809, "learning_rate": 1.930273253466146e-05, "loss": 3.2069, "step": 7343 }, { "epoch": 9.39168, "grad_norm": 0.56521075963974, "learning_rate": 1.9262350249024092e-05, "loss": 3.2888, "step": 7344 }, { "epoch": 9.39296, "grad_norm": 0.5516117811203003, "learning_rate": 1.9221967963386725e-05, "loss": 3.2223, "step": 7345 }, { "epoch": 9.39424, "grad_norm": 0.5425495505332947, "learning_rate": 1.918158567774936e-05, "loss": 3.2373, "step": 7346 }, { "epoch": 9.39552, "grad_norm": 0.5260109901428223, "learning_rate": 1.9141203392111992e-05, "loss": 3.2283, "step": 7347 }, { "epoch": 9.3968, "grad_norm": 0.530563473701477, "learning_rate": 1.9100821106474622e-05, "loss": 3.2014, "step": 7348 }, { "epoch": 9.39808, "grad_norm": 0.5480346083641052, "learning_rate": 1.9060438820837256e-05, "loss": 3.3066, "step": 7349 }, { "epoch": 9.39936, "grad_norm": 0.5326750874519348, "learning_rate": 1.902005653519989e-05, "loss": 3.2109, "step": 7350 }, { "epoch": 9.40064, "grad_norm": 0.5301714539527893, "learning_rate": 1.8979674249562523e-05, "loss": 3.2464, "step": 7351 }, { "epoch": 9.40192, "grad_norm": 0.5453739166259766, "learning_rate": 1.8939291963925156e-05, "loss": 3.2734, "step": 7352 }, { "epoch": 9.4032, "grad_norm": 0.5450626611709595, "learning_rate": 1.889890967828779e-05, "loss": 3.2435, "step": 7353 }, { "epoch": 9.40448, "grad_norm": 0.5502852201461792, "learning_rate": 1.8858527392650423e-05, "loss": 3.3081, "step": 7354 }, { "epoch": 9.40576, "grad_norm": 0.5369656682014465, "learning_rate": 1.8818145107013053e-05, "loss": 3.2784, "step": 7355 }, { "epoch": 9.40704, "grad_norm": 0.5568560361862183, "learning_rate": 1.8777762821375687e-05, "loss": 3.3034, "step": 7356 }, { "epoch": 9.40832, "grad_norm": 0.541373074054718, "learning_rate": 1.873738053573832e-05, "loss": 3.2594, "step": 7357 }, { "epoch": 9.4096, "grad_norm": 0.5414525866508484, "learning_rate": 1.8696998250100954e-05, "loss": 3.2231, "step": 7358 }, { "epoch": 9.41088, "grad_norm": 0.535603940486908, "learning_rate": 1.8656615964463587e-05, "loss": 3.2663, "step": 7359 }, { "epoch": 9.41216, "grad_norm": 0.5469928979873657, "learning_rate": 1.861623367882622e-05, "loss": 3.2296, "step": 7360 }, { "epoch": 9.41344, "grad_norm": 0.5573539733886719, "learning_rate": 1.8575851393188854e-05, "loss": 3.2841, "step": 7361 }, { "epoch": 9.414719999999999, "grad_norm": 0.5386171340942383, "learning_rate": 1.8535469107551488e-05, "loss": 3.2654, "step": 7362 }, { "epoch": 9.416, "grad_norm": 0.5473566651344299, "learning_rate": 1.8495086821914118e-05, "loss": 3.206, "step": 7363 }, { "epoch": 9.41728, "grad_norm": 0.54440838098526, "learning_rate": 1.845470453627675e-05, "loss": 3.2718, "step": 7364 }, { "epoch": 9.41856, "grad_norm": 0.5463802814483643, "learning_rate": 1.8414322250639385e-05, "loss": 3.268, "step": 7365 }, { "epoch": 9.41984, "grad_norm": 0.5378354787826538, "learning_rate": 1.8373939965002018e-05, "loss": 3.2652, "step": 7366 }, { "epoch": 9.42112, "grad_norm": 0.5412339568138123, "learning_rate": 1.833355767936465e-05, "loss": 3.1854, "step": 7367 }, { "epoch": 9.4224, "grad_norm": 0.5401472449302673, "learning_rate": 1.8293175393727285e-05, "loss": 3.3367, "step": 7368 }, { "epoch": 9.42368, "grad_norm": 0.5603813529014587, "learning_rate": 1.825279310808992e-05, "loss": 3.2289, "step": 7369 }, { "epoch": 9.42496, "grad_norm": 0.5412847399711609, "learning_rate": 1.821241082245255e-05, "loss": 3.2606, "step": 7370 }, { "epoch": 9.42624, "grad_norm": 0.5373712778091431, "learning_rate": 1.8172028536815182e-05, "loss": 3.2097, "step": 7371 }, { "epoch": 9.42752, "grad_norm": 0.5573616623878479, "learning_rate": 1.8131646251177815e-05, "loss": 3.2397, "step": 7372 }, { "epoch": 9.4288, "grad_norm": 0.5405897498130798, "learning_rate": 1.8091263965540446e-05, "loss": 3.259, "step": 7373 }, { "epoch": 9.43008, "grad_norm": 0.5547781586647034, "learning_rate": 1.8050881679903082e-05, "loss": 3.247, "step": 7374 }, { "epoch": 9.43136, "grad_norm": 0.5367437601089478, "learning_rate": 1.8010499394265716e-05, "loss": 3.25, "step": 7375 }, { "epoch": 9.43264, "grad_norm": 0.5429502725601196, "learning_rate": 1.7970117108628346e-05, "loss": 3.2392, "step": 7376 }, { "epoch": 9.43392, "grad_norm": 0.5472865700721741, "learning_rate": 1.792973482299098e-05, "loss": 3.2329, "step": 7377 }, { "epoch": 9.4352, "grad_norm": 0.542381227016449, "learning_rate": 1.7889352537353613e-05, "loss": 3.2845, "step": 7378 }, { "epoch": 9.43648, "grad_norm": 0.5476444959640503, "learning_rate": 1.7848970251716246e-05, "loss": 3.2889, "step": 7379 }, { "epoch": 9.43776, "grad_norm": 0.5548306107521057, "learning_rate": 1.7808587966078876e-05, "loss": 3.2567, "step": 7380 }, { "epoch": 9.43904, "grad_norm": 0.5338513851165771, "learning_rate": 1.776820568044151e-05, "loss": 3.1739, "step": 7381 }, { "epoch": 9.44032, "grad_norm": 0.5605148673057556, "learning_rate": 1.7727823394804143e-05, "loss": 3.2783, "step": 7382 }, { "epoch": 9.4416, "grad_norm": 0.5437026619911194, "learning_rate": 1.7687441109166777e-05, "loss": 3.1916, "step": 7383 }, { "epoch": 9.44288, "grad_norm": 0.5323542952537537, "learning_rate": 1.764705882352941e-05, "loss": 3.2599, "step": 7384 }, { "epoch": 9.44416, "grad_norm": 0.5453789830207825, "learning_rate": 1.7606676537892044e-05, "loss": 3.2619, "step": 7385 }, { "epoch": 9.44544, "grad_norm": 0.5387235283851624, "learning_rate": 1.7566294252254677e-05, "loss": 3.2936, "step": 7386 }, { "epoch": 9.44672, "grad_norm": 0.5475628972053528, "learning_rate": 1.7525911966617307e-05, "loss": 3.244, "step": 7387 }, { "epoch": 9.448, "grad_norm": 0.5442694425582886, "learning_rate": 1.748552968097994e-05, "loss": 3.2876, "step": 7388 }, { "epoch": 9.44928, "grad_norm": 0.5453842282295227, "learning_rate": 1.7445147395342574e-05, "loss": 3.2533, "step": 7389 }, { "epoch": 9.45056, "grad_norm": 0.5407310128211975, "learning_rate": 1.7404765109705208e-05, "loss": 3.2382, "step": 7390 }, { "epoch": 9.45184, "grad_norm": 0.531166672706604, "learning_rate": 1.736438282406784e-05, "loss": 3.2376, "step": 7391 }, { "epoch": 9.45312, "grad_norm": 0.5313123464584351, "learning_rate": 1.7324000538430475e-05, "loss": 3.2362, "step": 7392 }, { "epoch": 9.4544, "grad_norm": 0.5363158583641052, "learning_rate": 1.7283618252793105e-05, "loss": 3.2088, "step": 7393 }, { "epoch": 9.45568, "grad_norm": 0.5450273752212524, "learning_rate": 1.7243235967155738e-05, "loss": 3.26, "step": 7394 }, { "epoch": 9.45696, "grad_norm": 0.57196044921875, "learning_rate": 1.720285368151837e-05, "loss": 3.2556, "step": 7395 }, { "epoch": 9.45824, "grad_norm": 0.5576905012130737, "learning_rate": 1.7162471395881005e-05, "loss": 3.2441, "step": 7396 }, { "epoch": 9.45952, "grad_norm": 0.5438629388809204, "learning_rate": 1.712208911024364e-05, "loss": 3.1963, "step": 7397 }, { "epoch": 9.4608, "grad_norm": 0.5366012454032898, "learning_rate": 1.7081706824606272e-05, "loss": 3.2966, "step": 7398 }, { "epoch": 9.46208, "grad_norm": 0.5356854796409607, "learning_rate": 1.7041324538968905e-05, "loss": 3.22, "step": 7399 }, { "epoch": 9.46336, "grad_norm": 0.5522456169128418, "learning_rate": 1.7000942253331535e-05, "loss": 3.2367, "step": 7400 }, { "epoch": 9.46464, "grad_norm": 0.5475519299507141, "learning_rate": 1.696055996769417e-05, "loss": 3.2766, "step": 7401 }, { "epoch": 9.46592, "grad_norm": 0.5508553385734558, "learning_rate": 1.6920177682056802e-05, "loss": 3.2885, "step": 7402 }, { "epoch": 9.4672, "grad_norm": 0.5404077172279358, "learning_rate": 1.6879795396419436e-05, "loss": 3.2627, "step": 7403 }, { "epoch": 9.46848, "grad_norm": 0.5281044840812683, "learning_rate": 1.683941311078207e-05, "loss": 3.2113, "step": 7404 }, { "epoch": 9.46976, "grad_norm": 0.5426114201545715, "learning_rate": 1.6799030825144703e-05, "loss": 3.2066, "step": 7405 }, { "epoch": 9.47104, "grad_norm": 0.5397725701332092, "learning_rate": 1.6758648539507336e-05, "loss": 3.2879, "step": 7406 }, { "epoch": 9.47232, "grad_norm": 0.5343126058578491, "learning_rate": 1.6718266253869966e-05, "loss": 3.21, "step": 7407 }, { "epoch": 9.4736, "grad_norm": 0.527040421962738, "learning_rate": 1.66778839682326e-05, "loss": 3.2397, "step": 7408 }, { "epoch": 9.47488, "grad_norm": 0.5362902879714966, "learning_rate": 1.6637501682595233e-05, "loss": 3.2548, "step": 7409 }, { "epoch": 9.47616, "grad_norm": 0.5329915285110474, "learning_rate": 1.6597119396957867e-05, "loss": 3.2235, "step": 7410 }, { "epoch": 9.47744, "grad_norm": 0.5368509292602539, "learning_rate": 1.65567371113205e-05, "loss": 3.1814, "step": 7411 }, { "epoch": 9.47872, "grad_norm": 0.5509939193725586, "learning_rate": 1.6516354825683134e-05, "loss": 3.2374, "step": 7412 }, { "epoch": 9.48, "grad_norm": 0.5431427359580994, "learning_rate": 1.6475972540045764e-05, "loss": 3.1822, "step": 7413 }, { "epoch": 9.48128, "grad_norm": 0.5423570275306702, "learning_rate": 1.6435590254408397e-05, "loss": 3.2381, "step": 7414 }, { "epoch": 9.48256, "grad_norm": 0.5203372240066528, "learning_rate": 1.639520796877103e-05, "loss": 3.238, "step": 7415 }, { "epoch": 9.48384, "grad_norm": 0.5459218621253967, "learning_rate": 1.6354825683133664e-05, "loss": 3.2343, "step": 7416 }, { "epoch": 9.48512, "grad_norm": 0.5473526120185852, "learning_rate": 1.6314443397496298e-05, "loss": 3.263, "step": 7417 }, { "epoch": 9.4864, "grad_norm": 0.5523941516876221, "learning_rate": 1.627406111185893e-05, "loss": 3.2685, "step": 7418 }, { "epoch": 9.48768, "grad_norm": 0.541957437992096, "learning_rate": 1.6233678826221565e-05, "loss": 3.3066, "step": 7419 }, { "epoch": 9.48896, "grad_norm": 0.5321203470230103, "learning_rate": 1.6193296540584195e-05, "loss": 3.2308, "step": 7420 }, { "epoch": 9.49024, "grad_norm": 0.5349709391593933, "learning_rate": 1.6152914254946828e-05, "loss": 3.2747, "step": 7421 }, { "epoch": 9.49152, "grad_norm": 0.5336790084838867, "learning_rate": 1.611253196930946e-05, "loss": 3.2135, "step": 7422 }, { "epoch": 9.4928, "grad_norm": 0.551705539226532, "learning_rate": 1.6072149683672095e-05, "loss": 3.2396, "step": 7423 }, { "epoch": 9.49408, "grad_norm": 0.547311007976532, "learning_rate": 1.603176739803473e-05, "loss": 3.3115, "step": 7424 }, { "epoch": 9.49536, "grad_norm": 0.558070182800293, "learning_rate": 1.5991385112397362e-05, "loss": 3.2538, "step": 7425 }, { "epoch": 9.49664, "grad_norm": 0.5429165363311768, "learning_rate": 1.5951002826759995e-05, "loss": 3.1978, "step": 7426 }, { "epoch": 9.49792, "grad_norm": 0.5304774045944214, "learning_rate": 1.5910620541122625e-05, "loss": 3.2507, "step": 7427 }, { "epoch": 9.4992, "grad_norm": 0.5391626358032227, "learning_rate": 1.587023825548526e-05, "loss": 3.2713, "step": 7428 }, { "epoch": 9.50048, "grad_norm": 0.5373072028160095, "learning_rate": 1.5829855969847892e-05, "loss": 3.2066, "step": 7429 }, { "epoch": 9.50176, "grad_norm": 0.52620929479599, "learning_rate": 1.5789473684210522e-05, "loss": 3.2278, "step": 7430 }, { "epoch": 9.50304, "grad_norm": 0.5313057899475098, "learning_rate": 1.574909139857316e-05, "loss": 3.2863, "step": 7431 }, { "epoch": 9.50432, "grad_norm": 0.555780291557312, "learning_rate": 1.5708709112935793e-05, "loss": 3.3081, "step": 7432 }, { "epoch": 9.5056, "grad_norm": 0.5542112588882446, "learning_rate": 1.5668326827298426e-05, "loss": 3.2342, "step": 7433 }, { "epoch": 9.50688, "grad_norm": 0.5538204312324524, "learning_rate": 1.5627944541661056e-05, "loss": 3.2699, "step": 7434 }, { "epoch": 9.50816, "grad_norm": 0.5440041422843933, "learning_rate": 1.558756225602369e-05, "loss": 3.195, "step": 7435 }, { "epoch": 9.50944, "grad_norm": 0.529220700263977, "learning_rate": 1.5547179970386323e-05, "loss": 3.2317, "step": 7436 }, { "epoch": 9.51072, "grad_norm": 0.5430290102958679, "learning_rate": 1.5506797684748953e-05, "loss": 3.2327, "step": 7437 }, { "epoch": 9.512, "grad_norm": 0.5630149245262146, "learning_rate": 1.5466415399111587e-05, "loss": 3.3014, "step": 7438 }, { "epoch": 9.51328, "grad_norm": 0.5409126877784729, "learning_rate": 1.542603311347422e-05, "loss": 3.2794, "step": 7439 }, { "epoch": 9.51456, "grad_norm": 0.5274693369865417, "learning_rate": 1.5385650827836854e-05, "loss": 3.2504, "step": 7440 }, { "epoch": 9.51584, "grad_norm": 0.5387745499610901, "learning_rate": 1.5345268542199487e-05, "loss": 3.1959, "step": 7441 }, { "epoch": 9.51712, "grad_norm": 0.5413409471511841, "learning_rate": 1.530488625656212e-05, "loss": 3.208, "step": 7442 }, { "epoch": 9.5184, "grad_norm": 0.5244672298431396, "learning_rate": 1.5264503970924754e-05, "loss": 3.1847, "step": 7443 }, { "epoch": 9.51968, "grad_norm": 0.5482396483421326, "learning_rate": 1.5224121685287386e-05, "loss": 3.2254, "step": 7444 }, { "epoch": 9.52096, "grad_norm": 0.5391595363616943, "learning_rate": 1.518373939965002e-05, "loss": 3.174, "step": 7445 }, { "epoch": 9.52224, "grad_norm": 0.5382583737373352, "learning_rate": 1.5143357114012653e-05, "loss": 3.2514, "step": 7446 }, { "epoch": 9.52352, "grad_norm": 0.5451508164405823, "learning_rate": 1.5102974828375285e-05, "loss": 3.3266, "step": 7447 }, { "epoch": 9.5248, "grad_norm": 0.5408745408058167, "learning_rate": 1.5062592542737918e-05, "loss": 3.2239, "step": 7448 }, { "epoch": 9.52608, "grad_norm": 0.5346300601959229, "learning_rate": 1.5022210257100551e-05, "loss": 3.1678, "step": 7449 }, { "epoch": 9.52736, "grad_norm": 0.54719078540802, "learning_rate": 1.4981827971463183e-05, "loss": 3.3131, "step": 7450 }, { "epoch": 9.52864, "grad_norm": 0.5220864415168762, "learning_rate": 1.4941445685825817e-05, "loss": 3.2922, "step": 7451 }, { "epoch": 9.52992, "grad_norm": 0.538643479347229, "learning_rate": 1.490106340018845e-05, "loss": 3.2359, "step": 7452 }, { "epoch": 9.5312, "grad_norm": 0.5405166745185852, "learning_rate": 1.4860681114551084e-05, "loss": 3.1694, "step": 7453 }, { "epoch": 9.53248, "grad_norm": 0.5491073131561279, "learning_rate": 1.4820298828913715e-05, "loss": 3.2585, "step": 7454 }, { "epoch": 9.533760000000001, "grad_norm": 0.5413920879364014, "learning_rate": 1.4779916543276349e-05, "loss": 3.1619, "step": 7455 }, { "epoch": 9.53504, "grad_norm": 0.5279734134674072, "learning_rate": 1.4739534257638982e-05, "loss": 3.1632, "step": 7456 }, { "epoch": 9.53632, "grad_norm": 0.5433555245399475, "learning_rate": 1.4699151972001612e-05, "loss": 3.2692, "step": 7457 }, { "epoch": 9.5376, "grad_norm": 0.5342807173728943, "learning_rate": 1.4658769686364248e-05, "loss": 3.2832, "step": 7458 }, { "epoch": 9.53888, "grad_norm": 0.5404999256134033, "learning_rate": 1.4618387400726881e-05, "loss": 3.2091, "step": 7459 }, { "epoch": 9.54016, "grad_norm": 0.5424739718437195, "learning_rate": 1.4578005115089511e-05, "loss": 3.192, "step": 7460 }, { "epoch": 9.54144, "grad_norm": 0.5365021228790283, "learning_rate": 1.4537622829452145e-05, "loss": 3.2372, "step": 7461 }, { "epoch": 9.54272, "grad_norm": 0.5258615016937256, "learning_rate": 1.4497240543814778e-05, "loss": 3.2472, "step": 7462 }, { "epoch": 9.544, "grad_norm": 0.543051540851593, "learning_rate": 1.4456858258177413e-05, "loss": 3.2782, "step": 7463 }, { "epoch": 9.54528, "grad_norm": 0.5488424897193909, "learning_rate": 1.4416475972540043e-05, "loss": 3.2517, "step": 7464 }, { "epoch": 9.54656, "grad_norm": 0.5342796444892883, "learning_rate": 1.4376093686902677e-05, "loss": 3.2171, "step": 7465 }, { "epoch": 9.54784, "grad_norm": 0.5366568565368652, "learning_rate": 1.433571140126531e-05, "loss": 3.2127, "step": 7466 }, { "epoch": 9.54912, "grad_norm": 0.5328598022460938, "learning_rate": 1.4295329115627942e-05, "loss": 3.1996, "step": 7467 }, { "epoch": 9.5504, "grad_norm": 0.5342961549758911, "learning_rate": 1.4254946829990575e-05, "loss": 3.2439, "step": 7468 }, { "epoch": 9.55168, "grad_norm": 0.5400911569595337, "learning_rate": 1.4214564544353209e-05, "loss": 3.2113, "step": 7469 }, { "epoch": 9.55296, "grad_norm": 0.536578893661499, "learning_rate": 1.4174182258715842e-05, "loss": 3.2135, "step": 7470 }, { "epoch": 9.55424, "grad_norm": 0.5431190729141235, "learning_rate": 1.4133799973078474e-05, "loss": 3.2889, "step": 7471 }, { "epoch": 9.55552, "grad_norm": 0.554277241230011, "learning_rate": 1.4093417687441108e-05, "loss": 3.2935, "step": 7472 }, { "epoch": 9.556799999999999, "grad_norm": 0.5410692095756531, "learning_rate": 1.4053035401803741e-05, "loss": 3.2235, "step": 7473 }, { "epoch": 9.55808, "grad_norm": 0.5578665733337402, "learning_rate": 1.4012653116166373e-05, "loss": 3.246, "step": 7474 }, { "epoch": 9.55936, "grad_norm": 0.5440622568130493, "learning_rate": 1.3972270830529006e-05, "loss": 3.2371, "step": 7475 }, { "epoch": 9.56064, "grad_norm": 0.5429546236991882, "learning_rate": 1.393188854489164e-05, "loss": 3.1835, "step": 7476 }, { "epoch": 9.56192, "grad_norm": 0.5361295342445374, "learning_rate": 1.3891506259254272e-05, "loss": 3.1896, "step": 7477 }, { "epoch": 9.5632, "grad_norm": 0.5314315557479858, "learning_rate": 1.3851123973616905e-05, "loss": 3.279, "step": 7478 }, { "epoch": 9.56448, "grad_norm": 0.5387744903564453, "learning_rate": 1.3810741687979538e-05, "loss": 3.2884, "step": 7479 }, { "epoch": 9.565760000000001, "grad_norm": 0.5472334027290344, "learning_rate": 1.3770359402342172e-05, "loss": 3.2379, "step": 7480 }, { "epoch": 9.56704, "grad_norm": 0.5438302755355835, "learning_rate": 1.3729977116704804e-05, "loss": 3.2577, "step": 7481 }, { "epoch": 9.56832, "grad_norm": 0.5295417904853821, "learning_rate": 1.3689594831067437e-05, "loss": 3.3273, "step": 7482 }, { "epoch": 9.5696, "grad_norm": 0.5329756140708923, "learning_rate": 1.364921254543007e-05, "loss": 3.1549, "step": 7483 }, { "epoch": 9.57088, "grad_norm": 0.5324873328208923, "learning_rate": 1.3608830259792702e-05, "loss": 3.245, "step": 7484 }, { "epoch": 9.57216, "grad_norm": 0.5475909113883972, "learning_rate": 1.3568447974155336e-05, "loss": 3.3146, "step": 7485 }, { "epoch": 9.57344, "grad_norm": 0.552155077457428, "learning_rate": 1.352806568851797e-05, "loss": 3.22, "step": 7486 }, { "epoch": 9.57472, "grad_norm": 0.5365369915962219, "learning_rate": 1.3487683402880601e-05, "loss": 3.2338, "step": 7487 }, { "epoch": 9.576, "grad_norm": 0.5464675426483154, "learning_rate": 1.3447301117243235e-05, "loss": 3.3097, "step": 7488 }, { "epoch": 9.57728, "grad_norm": 0.5443135499954224, "learning_rate": 1.3406918831605868e-05, "loss": 3.2514, "step": 7489 }, { "epoch": 9.57856, "grad_norm": 0.5426671504974365, "learning_rate": 1.3366536545968501e-05, "loss": 3.2772, "step": 7490 }, { "epoch": 9.57984, "grad_norm": 0.5442038774490356, "learning_rate": 1.3326154260331133e-05, "loss": 3.2219, "step": 7491 }, { "epoch": 9.58112, "grad_norm": 0.5287876129150391, "learning_rate": 1.3285771974693767e-05, "loss": 3.246, "step": 7492 }, { "epoch": 9.5824, "grad_norm": 0.5401532649993896, "learning_rate": 1.32453896890564e-05, "loss": 3.2587, "step": 7493 }, { "epoch": 9.58368, "grad_norm": 0.538408637046814, "learning_rate": 1.3205007403419032e-05, "loss": 3.2394, "step": 7494 }, { "epoch": 9.58496, "grad_norm": 0.5716665387153625, "learning_rate": 1.3164625117781665e-05, "loss": 3.2705, "step": 7495 }, { "epoch": 9.58624, "grad_norm": 0.5246546864509583, "learning_rate": 1.3124242832144299e-05, "loss": 3.2281, "step": 7496 }, { "epoch": 9.58752, "grad_norm": 0.5313910245895386, "learning_rate": 1.308386054650693e-05, "loss": 3.2713, "step": 7497 }, { "epoch": 9.588799999999999, "grad_norm": 0.5274931788444519, "learning_rate": 1.3043478260869564e-05, "loss": 3.2255, "step": 7498 }, { "epoch": 9.59008, "grad_norm": 0.5328890085220337, "learning_rate": 1.3003095975232198e-05, "loss": 3.2064, "step": 7499 }, { "epoch": 9.59136, "grad_norm": 0.5413006544113159, "learning_rate": 1.2962713689594831e-05, "loss": 3.2733, "step": 7500 }, { "epoch": 9.59264, "grad_norm": 0.5474052429199219, "learning_rate": 1.2922331403957463e-05, "loss": 3.2836, "step": 7501 }, { "epoch": 9.59392, "grad_norm": 0.5338591933250427, "learning_rate": 1.2881949118320096e-05, "loss": 3.2265, "step": 7502 }, { "epoch": 9.5952, "grad_norm": 0.533933699131012, "learning_rate": 1.284156683268273e-05, "loss": 3.2688, "step": 7503 }, { "epoch": 9.59648, "grad_norm": 0.5405238270759583, "learning_rate": 1.2801184547045361e-05, "loss": 3.3086, "step": 7504 }, { "epoch": 9.59776, "grad_norm": 0.5274151563644409, "learning_rate": 1.2760802261407995e-05, "loss": 3.2516, "step": 7505 }, { "epoch": 9.59904, "grad_norm": 0.533254861831665, "learning_rate": 1.2720419975770628e-05, "loss": 3.2087, "step": 7506 }, { "epoch": 9.60032, "grad_norm": 0.5390244722366333, "learning_rate": 1.268003769013326e-05, "loss": 3.1609, "step": 7507 }, { "epoch": 9.6016, "grad_norm": 0.5491555333137512, "learning_rate": 1.2639655404495894e-05, "loss": 3.261, "step": 7508 }, { "epoch": 9.60288, "grad_norm": 0.5407535433769226, "learning_rate": 1.2599273118858527e-05, "loss": 3.2147, "step": 7509 }, { "epoch": 9.60416, "grad_norm": 0.549811065196991, "learning_rate": 1.255889083322116e-05, "loss": 3.2559, "step": 7510 }, { "epoch": 9.60544, "grad_norm": 0.5414533019065857, "learning_rate": 1.2518508547583792e-05, "loss": 3.2521, "step": 7511 }, { "epoch": 9.60672, "grad_norm": 0.5452784895896912, "learning_rate": 1.2478126261946426e-05, "loss": 3.2758, "step": 7512 }, { "epoch": 9.608, "grad_norm": 0.5487382411956787, "learning_rate": 1.243774397630906e-05, "loss": 3.2842, "step": 7513 }, { "epoch": 9.60928, "grad_norm": 0.5465408563613892, "learning_rate": 1.239736169067169e-05, "loss": 3.242, "step": 7514 }, { "epoch": 9.61056, "grad_norm": 0.5443041920661926, "learning_rate": 1.2356979405034323e-05, "loss": 3.2918, "step": 7515 }, { "epoch": 9.61184, "grad_norm": 0.5370469689369202, "learning_rate": 1.2316597119396958e-05, "loss": 3.2174, "step": 7516 }, { "epoch": 9.61312, "grad_norm": 0.5332541465759277, "learning_rate": 1.2276214833759591e-05, "loss": 3.2014, "step": 7517 }, { "epoch": 9.6144, "grad_norm": 0.5249244570732117, "learning_rate": 1.2235832548122221e-05, "loss": 3.2625, "step": 7518 }, { "epoch": 9.61568, "grad_norm": 0.5379579663276672, "learning_rate": 1.2195450262484855e-05, "loss": 3.2321, "step": 7519 }, { "epoch": 9.61696, "grad_norm": 0.5287061333656311, "learning_rate": 1.215506797684749e-05, "loss": 3.2617, "step": 7520 }, { "epoch": 9.61824, "grad_norm": 0.5358967185020447, "learning_rate": 1.211468569121012e-05, "loss": 3.2692, "step": 7521 }, { "epoch": 9.61952, "grad_norm": 0.5491884350776672, "learning_rate": 1.2074303405572754e-05, "loss": 3.208, "step": 7522 }, { "epoch": 9.6208, "grad_norm": 0.5433226227760315, "learning_rate": 1.2033921119935387e-05, "loss": 3.2463, "step": 7523 }, { "epoch": 9.62208, "grad_norm": 0.548875093460083, "learning_rate": 1.1993538834298019e-05, "loss": 3.2841, "step": 7524 }, { "epoch": 9.62336, "grad_norm": 0.5372313261032104, "learning_rate": 1.1953156548660652e-05, "loss": 3.2922, "step": 7525 }, { "epoch": 9.62464, "grad_norm": 0.5375744700431824, "learning_rate": 1.1912774263023286e-05, "loss": 3.2362, "step": 7526 }, { "epoch": 9.62592, "grad_norm": 0.5384906530380249, "learning_rate": 1.187239197738592e-05, "loss": 3.3354, "step": 7527 }, { "epoch": 9.6272, "grad_norm": 0.532680869102478, "learning_rate": 1.1832009691748551e-05, "loss": 3.2397, "step": 7528 }, { "epoch": 9.62848, "grad_norm": 0.5564149618148804, "learning_rate": 1.1791627406111185e-05, "loss": 3.2319, "step": 7529 }, { "epoch": 9.62976, "grad_norm": 0.5441488027572632, "learning_rate": 1.1751245120473818e-05, "loss": 3.2088, "step": 7530 }, { "epoch": 9.63104, "grad_norm": 0.5175425410270691, "learning_rate": 1.171086283483645e-05, "loss": 3.2514, "step": 7531 }, { "epoch": 9.63232, "grad_norm": 0.5204359292984009, "learning_rate": 1.1670480549199083e-05, "loss": 3.2264, "step": 7532 }, { "epoch": 9.6336, "grad_norm": 0.5370702743530273, "learning_rate": 1.1630098263561717e-05, "loss": 3.2164, "step": 7533 }, { "epoch": 9.63488, "grad_norm": 0.562423050403595, "learning_rate": 1.1589715977924348e-05, "loss": 3.2344, "step": 7534 }, { "epoch": 9.63616, "grad_norm": 0.5344887375831604, "learning_rate": 1.1549333692286982e-05, "loss": 3.1734, "step": 7535 }, { "epoch": 9.63744, "grad_norm": 0.5403535962104797, "learning_rate": 1.1508951406649615e-05, "loss": 3.2056, "step": 7536 }, { "epoch": 9.63872, "grad_norm": 0.5409168004989624, "learning_rate": 1.1468569121012249e-05, "loss": 3.2126, "step": 7537 }, { "epoch": 9.64, "grad_norm": 0.5526571273803711, "learning_rate": 1.142818683537488e-05, "loss": 3.2779, "step": 7538 }, { "epoch": 9.64128, "grad_norm": 0.5267760753631592, "learning_rate": 1.1387804549737514e-05, "loss": 3.2548, "step": 7539 }, { "epoch": 9.64256, "grad_norm": 0.5352032780647278, "learning_rate": 1.1347422264100148e-05, "loss": 3.2621, "step": 7540 }, { "epoch": 9.64384, "grad_norm": 0.5344724059104919, "learning_rate": 1.130703997846278e-05, "loss": 3.1993, "step": 7541 }, { "epoch": 9.64512, "grad_norm": 0.5381539463996887, "learning_rate": 1.1266657692825413e-05, "loss": 3.2864, "step": 7542 }, { "epoch": 9.6464, "grad_norm": 0.5417296886444092, "learning_rate": 1.1226275407188046e-05, "loss": 3.2113, "step": 7543 }, { "epoch": 9.64768, "grad_norm": 0.5437041521072388, "learning_rate": 1.1185893121550678e-05, "loss": 3.1934, "step": 7544 }, { "epoch": 9.64896, "grad_norm": 0.5453625321388245, "learning_rate": 1.1145510835913311e-05, "loss": 3.2435, "step": 7545 }, { "epoch": 9.65024, "grad_norm": 0.5361076593399048, "learning_rate": 1.1105128550275945e-05, "loss": 3.2032, "step": 7546 }, { "epoch": 9.65152, "grad_norm": 0.5454299449920654, "learning_rate": 1.1064746264638578e-05, "loss": 3.3266, "step": 7547 }, { "epoch": 9.6528, "grad_norm": 0.5517051219940186, "learning_rate": 1.102436397900121e-05, "loss": 3.2157, "step": 7548 }, { "epoch": 9.65408, "grad_norm": 0.5611749291419983, "learning_rate": 1.0983981693363844e-05, "loss": 3.2855, "step": 7549 }, { "epoch": 9.65536, "grad_norm": 0.5386665463447571, "learning_rate": 1.0943599407726477e-05, "loss": 3.2633, "step": 7550 }, { "epoch": 9.65664, "grad_norm": 0.545172929763794, "learning_rate": 1.0903217122089109e-05, "loss": 3.2414, "step": 7551 }, { "epoch": 9.65792, "grad_norm": 0.5379675030708313, "learning_rate": 1.0862834836451742e-05, "loss": 3.2504, "step": 7552 }, { "epoch": 9.6592, "grad_norm": 0.545353353023529, "learning_rate": 1.0822452550814376e-05, "loss": 3.2483, "step": 7553 }, { "epoch": 9.66048, "grad_norm": 0.5340268015861511, "learning_rate": 1.0782070265177008e-05, "loss": 3.2026, "step": 7554 }, { "epoch": 9.66176, "grad_norm": 0.5404039621353149, "learning_rate": 1.0741687979539641e-05, "loss": 3.2664, "step": 7555 }, { "epoch": 9.66304, "grad_norm": 0.5477861762046814, "learning_rate": 1.0701305693902274e-05, "loss": 3.2691, "step": 7556 }, { "epoch": 9.66432, "grad_norm": 0.5429335832595825, "learning_rate": 1.0660923408264908e-05, "loss": 3.2404, "step": 7557 }, { "epoch": 9.6656, "grad_norm": 0.5497188568115234, "learning_rate": 1.062054112262754e-05, "loss": 3.1482, "step": 7558 }, { "epoch": 9.66688, "grad_norm": 0.5364313721656799, "learning_rate": 1.0580158836990173e-05, "loss": 3.3282, "step": 7559 }, { "epoch": 9.66816, "grad_norm": 0.5295746326446533, "learning_rate": 1.0539776551352807e-05, "loss": 3.2642, "step": 7560 }, { "epoch": 9.66944, "grad_norm": 0.5566303133964539, "learning_rate": 1.0499394265715438e-05, "loss": 3.3054, "step": 7561 }, { "epoch": 9.67072, "grad_norm": 0.5400784611701965, "learning_rate": 1.0459011980078072e-05, "loss": 3.2225, "step": 7562 }, { "epoch": 9.672, "grad_norm": 0.546852707862854, "learning_rate": 1.0418629694440705e-05, "loss": 3.2187, "step": 7563 }, { "epoch": 9.67328, "grad_norm": 0.5458729267120361, "learning_rate": 1.0378247408803339e-05, "loss": 3.265, "step": 7564 }, { "epoch": 9.67456, "grad_norm": 0.5483717322349548, "learning_rate": 1.033786512316597e-05, "loss": 3.2788, "step": 7565 }, { "epoch": 9.67584, "grad_norm": 0.5389772057533264, "learning_rate": 1.0297482837528604e-05, "loss": 3.2733, "step": 7566 }, { "epoch": 9.67712, "grad_norm": 0.5320543646812439, "learning_rate": 1.0257100551891237e-05, "loss": 3.2597, "step": 7567 }, { "epoch": 9.6784, "grad_norm": 0.5460016131401062, "learning_rate": 1.0216718266253868e-05, "loss": 3.226, "step": 7568 }, { "epoch": 9.67968, "grad_norm": 0.5420553684234619, "learning_rate": 1.0176335980616503e-05, "loss": 3.2611, "step": 7569 }, { "epoch": 9.68096, "grad_norm": 0.5544966459274292, "learning_rate": 1.0135953694979136e-05, "loss": 3.2986, "step": 7570 }, { "epoch": 9.68224, "grad_norm": 0.5373578071594238, "learning_rate": 1.0095571409341766e-05, "loss": 3.2232, "step": 7571 }, { "epoch": 9.68352, "grad_norm": 0.5499645471572876, "learning_rate": 1.00551891237044e-05, "loss": 3.2186, "step": 7572 }, { "epoch": 9.6848, "grad_norm": 0.5397508144378662, "learning_rate": 1.0014806838067035e-05, "loss": 3.284, "step": 7573 }, { "epoch": 9.68608, "grad_norm": 0.5304678082466125, "learning_rate": 9.974424552429668e-06, "loss": 3.2274, "step": 7574 }, { "epoch": 9.68736, "grad_norm": 0.5234605073928833, "learning_rate": 9.934042266792298e-06, "loss": 3.216, "step": 7575 }, { "epoch": 9.68864, "grad_norm": 0.5570342540740967, "learning_rate": 9.893659981154932e-06, "loss": 3.2837, "step": 7576 }, { "epoch": 9.68992, "grad_norm": 0.5394232273101807, "learning_rate": 9.853277695517565e-06, "loss": 3.1624, "step": 7577 }, { "epoch": 9.6912, "grad_norm": 0.5388901829719543, "learning_rate": 9.812895409880197e-06, "loss": 3.2089, "step": 7578 }, { "epoch": 9.69248, "grad_norm": 0.5200719833374023, "learning_rate": 9.77251312424283e-06, "loss": 3.198, "step": 7579 }, { "epoch": 9.69376, "grad_norm": 0.550902247428894, "learning_rate": 9.732130838605464e-06, "loss": 3.2799, "step": 7580 }, { "epoch": 9.69504, "grad_norm": 0.531865656375885, "learning_rate": 9.691748552968096e-06, "loss": 3.242, "step": 7581 }, { "epoch": 9.69632, "grad_norm": 0.533011257648468, "learning_rate": 9.65136626733073e-06, "loss": 3.2183, "step": 7582 }, { "epoch": 9.6976, "grad_norm": 0.538938581943512, "learning_rate": 9.610983981693363e-06, "loss": 3.2257, "step": 7583 }, { "epoch": 9.698879999999999, "grad_norm": 0.5364640951156616, "learning_rate": 9.570601696055996e-06, "loss": 3.2082, "step": 7584 }, { "epoch": 9.70016, "grad_norm": 0.5598052740097046, "learning_rate": 9.530219410418628e-06, "loss": 3.3729, "step": 7585 }, { "epoch": 9.70144, "grad_norm": 0.5431958436965942, "learning_rate": 9.489837124781261e-06, "loss": 3.2299, "step": 7586 }, { "epoch": 9.70272, "grad_norm": 0.5261279344558716, "learning_rate": 9.449454839143895e-06, "loss": 3.2334, "step": 7587 }, { "epoch": 9.704, "grad_norm": 0.5326124429702759, "learning_rate": 9.409072553506527e-06, "loss": 3.2051, "step": 7588 }, { "epoch": 9.70528, "grad_norm": 0.551518440246582, "learning_rate": 9.36869026786916e-06, "loss": 3.1614, "step": 7589 }, { "epoch": 9.70656, "grad_norm": 0.5355722308158875, "learning_rate": 9.328307982231794e-06, "loss": 3.2427, "step": 7590 }, { "epoch": 9.707840000000001, "grad_norm": 0.5317171812057495, "learning_rate": 9.287925696594427e-06, "loss": 3.1419, "step": 7591 }, { "epoch": 9.70912, "grad_norm": 0.5412980914115906, "learning_rate": 9.247543410957059e-06, "loss": 3.2759, "step": 7592 }, { "epoch": 9.7104, "grad_norm": 0.546062171459198, "learning_rate": 9.207161125319692e-06, "loss": 3.2773, "step": 7593 }, { "epoch": 9.71168, "grad_norm": 0.5412468314170837, "learning_rate": 9.166778839682326e-06, "loss": 3.247, "step": 7594 }, { "epoch": 9.71296, "grad_norm": 0.5379220247268677, "learning_rate": 9.12639655404496e-06, "loss": 3.2106, "step": 7595 }, { "epoch": 9.71424, "grad_norm": 0.5455538034439087, "learning_rate": 9.086014268407591e-06, "loss": 3.2229, "step": 7596 }, { "epoch": 9.71552, "grad_norm": 0.5402326583862305, "learning_rate": 9.045631982770223e-06, "loss": 3.2599, "step": 7597 }, { "epoch": 9.7168, "grad_norm": 0.5508577823638916, "learning_rate": 9.005249697132858e-06, "loss": 3.272, "step": 7598 }, { "epoch": 9.71808, "grad_norm": 0.5405802726745605, "learning_rate": 8.96486741149549e-06, "loss": 3.2912, "step": 7599 }, { "epoch": 9.71936, "grad_norm": 0.5341350436210632, "learning_rate": 8.924485125858123e-06, "loss": 3.2955, "step": 7600 }, { "epoch": 9.72064, "grad_norm": 0.5520155429840088, "learning_rate": 8.884102840220755e-06, "loss": 3.272, "step": 7601 }, { "epoch": 9.72192, "grad_norm": 0.5376763939857483, "learning_rate": 8.843720554583388e-06, "loss": 3.215, "step": 7602 }, { "epoch": 9.7232, "grad_norm": 0.5320829749107361, "learning_rate": 8.803338268946022e-06, "loss": 3.2852, "step": 7603 }, { "epoch": 9.72448, "grad_norm": 0.5295441746711731, "learning_rate": 8.762955983308654e-06, "loss": 3.1911, "step": 7604 }, { "epoch": 9.72576, "grad_norm": 0.5424656271934509, "learning_rate": 8.722573697671287e-06, "loss": 3.2145, "step": 7605 }, { "epoch": 9.72704, "grad_norm": 0.5340449810028076, "learning_rate": 8.68219141203392e-06, "loss": 3.275, "step": 7606 }, { "epoch": 9.72832, "grad_norm": 0.532345175743103, "learning_rate": 8.641809126396552e-06, "loss": 3.1572, "step": 7607 }, { "epoch": 9.7296, "grad_norm": 0.5268622636795044, "learning_rate": 8.601426840759186e-06, "loss": 3.2741, "step": 7608 }, { "epoch": 9.730879999999999, "grad_norm": 0.5382117629051208, "learning_rate": 8.56104455512182e-06, "loss": 3.2102, "step": 7609 }, { "epoch": 9.73216, "grad_norm": 0.5525277256965637, "learning_rate": 8.520662269484453e-06, "loss": 3.3048, "step": 7610 }, { "epoch": 9.73344, "grad_norm": 0.5279812216758728, "learning_rate": 8.480279983847084e-06, "loss": 3.256, "step": 7611 }, { "epoch": 9.73472, "grad_norm": 0.5394153594970703, "learning_rate": 8.439897698209718e-06, "loss": 3.2207, "step": 7612 }, { "epoch": 9.736, "grad_norm": 0.533865213394165, "learning_rate": 8.399515412572351e-06, "loss": 3.1895, "step": 7613 }, { "epoch": 9.73728, "grad_norm": 0.5335356593132019, "learning_rate": 8.359133126934983e-06, "loss": 3.2414, "step": 7614 }, { "epoch": 9.73856, "grad_norm": 0.5455939173698425, "learning_rate": 8.318750841297617e-06, "loss": 3.2906, "step": 7615 }, { "epoch": 9.739840000000001, "grad_norm": 0.5451316237449646, "learning_rate": 8.27836855566025e-06, "loss": 3.2626, "step": 7616 }, { "epoch": 9.74112, "grad_norm": 0.5487070679664612, "learning_rate": 8.237986270022882e-06, "loss": 3.2446, "step": 7617 }, { "epoch": 9.7424, "grad_norm": 0.5351135730743408, "learning_rate": 8.197603984385515e-06, "loss": 3.2191, "step": 7618 }, { "epoch": 9.74368, "grad_norm": 0.5554419755935669, "learning_rate": 8.157221698748149e-06, "loss": 3.272, "step": 7619 }, { "epoch": 9.74496, "grad_norm": 0.5496061444282532, "learning_rate": 8.116839413110782e-06, "loss": 3.2893, "step": 7620 }, { "epoch": 9.74624, "grad_norm": 0.5537092685699463, "learning_rate": 8.076457127473414e-06, "loss": 3.3493, "step": 7621 }, { "epoch": 9.74752, "grad_norm": 0.5267013907432556, "learning_rate": 8.036074841836047e-06, "loss": 3.2105, "step": 7622 }, { "epoch": 9.7488, "grad_norm": 0.537086009979248, "learning_rate": 7.995692556198681e-06, "loss": 3.2392, "step": 7623 }, { "epoch": 9.75008, "grad_norm": 0.5526744723320007, "learning_rate": 7.955310270561313e-06, "loss": 3.2584, "step": 7624 }, { "epoch": 9.75136, "grad_norm": 0.5359621047973633, "learning_rate": 7.914927984923946e-06, "loss": 3.2106, "step": 7625 }, { "epoch": 9.75264, "grad_norm": 0.5292626619338989, "learning_rate": 7.87454569928658e-06, "loss": 3.2777, "step": 7626 }, { "epoch": 9.75392, "grad_norm": 0.5385765433311462, "learning_rate": 7.834163413649213e-06, "loss": 3.2815, "step": 7627 }, { "epoch": 9.7552, "grad_norm": 0.5359296798706055, "learning_rate": 7.793781128011845e-06, "loss": 3.2214, "step": 7628 }, { "epoch": 9.75648, "grad_norm": 0.5338163375854492, "learning_rate": 7.753398842374477e-06, "loss": 3.2129, "step": 7629 }, { "epoch": 9.75776, "grad_norm": 0.5449706315994263, "learning_rate": 7.71301655673711e-06, "loss": 3.2514, "step": 7630 }, { "epoch": 9.75904, "grad_norm": 0.5432578325271606, "learning_rate": 7.672634271099744e-06, "loss": 3.2642, "step": 7631 }, { "epoch": 9.76032, "grad_norm": 0.5408291816711426, "learning_rate": 7.632251985462377e-06, "loss": 3.2562, "step": 7632 }, { "epoch": 9.7616, "grad_norm": 0.5344693064689636, "learning_rate": 7.59186969982501e-06, "loss": 3.2389, "step": 7633 }, { "epoch": 9.76288, "grad_norm": 0.5311311483383179, "learning_rate": 7.551487414187642e-06, "loss": 3.2397, "step": 7634 }, { "epoch": 9.76416, "grad_norm": 0.5357692241668701, "learning_rate": 7.511105128550276e-06, "loss": 3.2205, "step": 7635 }, { "epoch": 9.76544, "grad_norm": 0.5423558950424194, "learning_rate": 7.470722842912908e-06, "loss": 3.2897, "step": 7636 }, { "epoch": 9.76672, "grad_norm": 0.5260913372039795, "learning_rate": 7.430340557275542e-06, "loss": 3.1949, "step": 7637 }, { "epoch": 9.768, "grad_norm": 0.5533113479614258, "learning_rate": 7.3899582716381744e-06, "loss": 3.2199, "step": 7638 }, { "epoch": 9.76928, "grad_norm": 0.5424081087112427, "learning_rate": 7.349575986000806e-06, "loss": 3.3368, "step": 7639 }, { "epoch": 9.77056, "grad_norm": 0.5297884941101074, "learning_rate": 7.3091937003634405e-06, "loss": 3.1818, "step": 7640 }, { "epoch": 9.77184, "grad_norm": 0.5412716865539551, "learning_rate": 7.268811414726072e-06, "loss": 3.219, "step": 7641 }, { "epoch": 9.77312, "grad_norm": 0.5297017693519592, "learning_rate": 7.228429129088707e-06, "loss": 3.2751, "step": 7642 }, { "epoch": 9.7744, "grad_norm": 0.5230127573013306, "learning_rate": 7.188046843451338e-06, "loss": 3.2886, "step": 7643 }, { "epoch": 9.77568, "grad_norm": 0.5418264865875244, "learning_rate": 7.147664557813971e-06, "loss": 3.2826, "step": 7644 }, { "epoch": 9.77696, "grad_norm": 0.5246114134788513, "learning_rate": 7.1072822721766044e-06, "loss": 3.2256, "step": 7645 }, { "epoch": 9.77824, "grad_norm": 0.5591127872467041, "learning_rate": 7.066899986539237e-06, "loss": 3.2657, "step": 7646 }, { "epoch": 9.77952, "grad_norm": 0.5307941436767578, "learning_rate": 7.0265177009018705e-06, "loss": 3.2904, "step": 7647 }, { "epoch": 9.7808, "grad_norm": 0.5432814359664917, "learning_rate": 6.986135415264503e-06, "loss": 3.3016, "step": 7648 }, { "epoch": 9.78208, "grad_norm": 0.53673255443573, "learning_rate": 6.945753129627136e-06, "loss": 3.1894, "step": 7649 }, { "epoch": 9.78336, "grad_norm": 0.5456019043922424, "learning_rate": 6.905370843989769e-06, "loss": 3.251, "step": 7650 }, { "epoch": 9.78464, "grad_norm": 0.5382410287857056, "learning_rate": 6.864988558352402e-06, "loss": 3.2033, "step": 7651 }, { "epoch": 9.78592, "grad_norm": 0.5357476472854614, "learning_rate": 6.824606272715035e-06, "loss": 3.2886, "step": 7652 }, { "epoch": 9.7872, "grad_norm": 0.5439364314079285, "learning_rate": 6.784223987077668e-06, "loss": 3.314, "step": 7653 }, { "epoch": 9.78848, "grad_norm": 0.5283563137054443, "learning_rate": 6.7438417014403005e-06, "loss": 3.2845, "step": 7654 }, { "epoch": 9.78976, "grad_norm": 0.5235844254493713, "learning_rate": 6.703459415802934e-06, "loss": 3.2595, "step": 7655 }, { "epoch": 9.79104, "grad_norm": 0.5161533951759338, "learning_rate": 6.663077130165567e-06, "loss": 3.2127, "step": 7656 }, { "epoch": 9.79232, "grad_norm": 0.5272259712219238, "learning_rate": 6.6226948445282e-06, "loss": 3.2172, "step": 7657 }, { "epoch": 9.7936, "grad_norm": 0.5525578260421753, "learning_rate": 6.582312558890833e-06, "loss": 3.2411, "step": 7658 }, { "epoch": 9.79488, "grad_norm": 0.5354979634284973, "learning_rate": 6.541930273253465e-06, "loss": 3.1923, "step": 7659 }, { "epoch": 9.79616, "grad_norm": 0.5332154035568237, "learning_rate": 6.501547987616099e-06, "loss": 3.204, "step": 7660 }, { "epoch": 9.79744, "grad_norm": 0.5478113889694214, "learning_rate": 6.461165701978731e-06, "loss": 3.2707, "step": 7661 }, { "epoch": 9.79872, "grad_norm": 0.5272064208984375, "learning_rate": 6.420783416341365e-06, "loss": 3.2359, "step": 7662 }, { "epoch": 9.8, "grad_norm": 0.5392633676528931, "learning_rate": 6.3804011307039975e-06, "loss": 3.1821, "step": 7663 }, { "epoch": 9.80128, "grad_norm": 0.5203908681869507, "learning_rate": 6.34001884506663e-06, "loss": 3.2749, "step": 7664 }, { "epoch": 9.80256, "grad_norm": 0.5322408080101013, "learning_rate": 6.2996365594292635e-06, "loss": 3.2694, "step": 7665 }, { "epoch": 9.80384, "grad_norm": 0.5292795896530151, "learning_rate": 6.259254273791896e-06, "loss": 3.2264, "step": 7666 }, { "epoch": 9.80512, "grad_norm": 0.5364894866943359, "learning_rate": 6.21887198815453e-06, "loss": 3.1952, "step": 7667 }, { "epoch": 9.8064, "grad_norm": 0.5564066171646118, "learning_rate": 6.178489702517161e-06, "loss": 3.3316, "step": 7668 }, { "epoch": 9.80768, "grad_norm": 0.5362501740455627, "learning_rate": 6.138107416879796e-06, "loss": 3.2155, "step": 7669 }, { "epoch": 9.80896, "grad_norm": 0.5605980753898621, "learning_rate": 6.0977251312424275e-06, "loss": 3.2895, "step": 7670 }, { "epoch": 9.81024, "grad_norm": 0.538100004196167, "learning_rate": 6.05734284560506e-06, "loss": 3.2186, "step": 7671 }, { "epoch": 9.81152, "grad_norm": 0.5335180163383484, "learning_rate": 6.0169605599676936e-06, "loss": 3.2623, "step": 7672 }, { "epoch": 9.8128, "grad_norm": 0.535961389541626, "learning_rate": 5.976578274330326e-06, "loss": 3.2338, "step": 7673 }, { "epoch": 9.81408, "grad_norm": 0.5322915315628052, "learning_rate": 5.93619598869296e-06, "loss": 3.1992, "step": 7674 }, { "epoch": 9.81536, "grad_norm": 0.52436763048172, "learning_rate": 5.895813703055592e-06, "loss": 3.257, "step": 7675 }, { "epoch": 9.81664, "grad_norm": 0.5428990125656128, "learning_rate": 5.855431417418225e-06, "loss": 3.1951, "step": 7676 }, { "epoch": 9.81792, "grad_norm": 0.5562485456466675, "learning_rate": 5.815049131780858e-06, "loss": 3.303, "step": 7677 }, { "epoch": 9.8192, "grad_norm": 0.5489171743392944, "learning_rate": 5.774666846143491e-06, "loss": 3.2214, "step": 7678 }, { "epoch": 9.82048, "grad_norm": 0.5357405543327332, "learning_rate": 5.734284560506124e-06, "loss": 3.2282, "step": 7679 }, { "epoch": 9.82176, "grad_norm": 0.5341373682022095, "learning_rate": 5.693902274868757e-06, "loss": 3.2049, "step": 7680 }, { "epoch": 9.82304, "grad_norm": 0.5386019349098206, "learning_rate": 5.65351998923139e-06, "loss": 3.2545, "step": 7681 }, { "epoch": 9.82432, "grad_norm": 0.5332474708557129, "learning_rate": 5.613137703594023e-06, "loss": 3.207, "step": 7682 }, { "epoch": 9.8256, "grad_norm": 0.5416889786720276, "learning_rate": 5.572755417956656e-06, "loss": 3.2406, "step": 7683 }, { "epoch": 9.82688, "grad_norm": 0.5246055722236633, "learning_rate": 5.532373132319289e-06, "loss": 3.3026, "step": 7684 }, { "epoch": 9.82816, "grad_norm": 0.5286515951156616, "learning_rate": 5.491990846681922e-06, "loss": 3.2483, "step": 7685 }, { "epoch": 9.82944, "grad_norm": 0.5369952917098999, "learning_rate": 5.451608561044554e-06, "loss": 3.2452, "step": 7686 }, { "epoch": 9.83072, "grad_norm": 0.5319638252258301, "learning_rate": 5.411226275407188e-06, "loss": 3.2861, "step": 7687 }, { "epoch": 9.832, "grad_norm": 0.5259891748428345, "learning_rate": 5.3708439897698205e-06, "loss": 3.2605, "step": 7688 }, { "epoch": 9.83328, "grad_norm": 0.5339581966400146, "learning_rate": 5.330461704132454e-06, "loss": 3.2469, "step": 7689 }, { "epoch": 9.83456, "grad_norm": 0.5350625514984131, "learning_rate": 5.290079418495087e-06, "loss": 3.2347, "step": 7690 }, { "epoch": 9.83584, "grad_norm": 0.5377787947654724, "learning_rate": 5.249697132857719e-06, "loss": 3.1947, "step": 7691 }, { "epoch": 9.83712, "grad_norm": 0.5324146747589111, "learning_rate": 5.209314847220353e-06, "loss": 3.2842, "step": 7692 }, { "epoch": 9.8384, "grad_norm": 0.5433526635169983, "learning_rate": 5.168932561582985e-06, "loss": 3.1448, "step": 7693 }, { "epoch": 9.83968, "grad_norm": 0.5497397184371948, "learning_rate": 5.128550275945619e-06, "loss": 3.2761, "step": 7694 }, { "epoch": 9.84096, "grad_norm": 0.5405837893486023, "learning_rate": 5.088167990308251e-06, "loss": 3.2843, "step": 7695 }, { "epoch": 9.84224, "grad_norm": 0.5413019061088562, "learning_rate": 5.047785704670883e-06, "loss": 3.2202, "step": 7696 }, { "epoch": 9.84352, "grad_norm": 0.5248864889144897, "learning_rate": 5.0074034190335174e-06, "loss": 3.234, "step": 7697 }, { "epoch": 9.8448, "grad_norm": 0.5380563139915466, "learning_rate": 4.967021133396149e-06, "loss": 3.2331, "step": 7698 }, { "epoch": 9.84608, "grad_norm": 0.5344839096069336, "learning_rate": 4.926638847758783e-06, "loss": 3.3208, "step": 7699 }, { "epoch": 9.84736, "grad_norm": 0.544310450553894, "learning_rate": 4.886256562121415e-06, "loss": 3.242, "step": 7700 }, { "epoch": 9.84864, "grad_norm": 0.5436192750930786, "learning_rate": 4.845874276484048e-06, "loss": 3.2463, "step": 7701 }, { "epoch": 9.849920000000001, "grad_norm": 0.5517845749855042, "learning_rate": 4.805491990846681e-06, "loss": 3.2587, "step": 7702 }, { "epoch": 9.8512, "grad_norm": 0.5308093428611755, "learning_rate": 4.765109705209314e-06, "loss": 3.238, "step": 7703 }, { "epoch": 9.85248, "grad_norm": 0.5324238538742065, "learning_rate": 4.7247274195719474e-06, "loss": 3.1847, "step": 7704 }, { "epoch": 9.85376, "grad_norm": 0.5277994871139526, "learning_rate": 4.68434513393458e-06, "loss": 3.1705, "step": 7705 }, { "epoch": 9.85504, "grad_norm": 0.5271157026290894, "learning_rate": 4.6439628482972135e-06, "loss": 3.1905, "step": 7706 }, { "epoch": 9.85632, "grad_norm": 0.531658947467804, "learning_rate": 4.603580562659846e-06, "loss": 3.2695, "step": 7707 }, { "epoch": 9.8576, "grad_norm": 0.5260719060897827, "learning_rate": 4.56319827702248e-06, "loss": 3.2373, "step": 7708 }, { "epoch": 9.85888, "grad_norm": 0.5422466993331909, "learning_rate": 4.522815991385111e-06, "loss": 3.2401, "step": 7709 }, { "epoch": 9.86016, "grad_norm": 0.5454497337341309, "learning_rate": 4.482433705747745e-06, "loss": 3.2724, "step": 7710 }, { "epoch": 9.86144, "grad_norm": 0.5371284484863281, "learning_rate": 4.4420514201103775e-06, "loss": 3.267, "step": 7711 }, { "epoch": 9.86272, "grad_norm": 0.5461208820343018, "learning_rate": 4.401669134473011e-06, "loss": 3.2026, "step": 7712 }, { "epoch": 9.864, "grad_norm": 0.5462895631790161, "learning_rate": 4.3612868488356435e-06, "loss": 3.1647, "step": 7713 }, { "epoch": 9.86528, "grad_norm": 0.5370737314224243, "learning_rate": 4.320904563198276e-06, "loss": 3.2734, "step": 7714 }, { "epoch": 9.86656, "grad_norm": 0.5331903100013733, "learning_rate": 4.28052227756091e-06, "loss": 3.1304, "step": 7715 }, { "epoch": 9.86784, "grad_norm": 0.5273880958557129, "learning_rate": 4.240139991923542e-06, "loss": 3.2982, "step": 7716 }, { "epoch": 9.86912, "grad_norm": 0.5413378477096558, "learning_rate": 4.199757706286176e-06, "loss": 3.1814, "step": 7717 }, { "epoch": 9.8704, "grad_norm": 0.5351929664611816, "learning_rate": 4.159375420648808e-06, "loss": 3.2605, "step": 7718 }, { "epoch": 9.87168, "grad_norm": 0.545540452003479, "learning_rate": 4.118993135011441e-06, "loss": 3.2398, "step": 7719 }, { "epoch": 9.872959999999999, "grad_norm": 0.5313085913658142, "learning_rate": 4.078610849374074e-06, "loss": 3.2208, "step": 7720 }, { "epoch": 9.87424, "grad_norm": 0.5450189709663391, "learning_rate": 4.038228563736707e-06, "loss": 3.2366, "step": 7721 }, { "epoch": 9.87552, "grad_norm": 0.5320997834205627, "learning_rate": 3.9978462780993405e-06, "loss": 3.2865, "step": 7722 }, { "epoch": 9.8768, "grad_norm": 0.5449676513671875, "learning_rate": 3.957463992461973e-06, "loss": 3.3148, "step": 7723 }, { "epoch": 9.87808, "grad_norm": 0.525892972946167, "learning_rate": 3.9170817068246066e-06, "loss": 3.1522, "step": 7724 }, { "epoch": 9.87936, "grad_norm": 0.5414352416992188, "learning_rate": 3.876699421187238e-06, "loss": 3.2166, "step": 7725 }, { "epoch": 9.88064, "grad_norm": 0.5205792188644409, "learning_rate": 3.836317135549872e-06, "loss": 3.2689, "step": 7726 }, { "epoch": 9.881920000000001, "grad_norm": 0.5402050614356995, "learning_rate": 3.795934849912505e-06, "loss": 3.2732, "step": 7727 }, { "epoch": 9.8832, "grad_norm": 0.5389068722724915, "learning_rate": 3.755552564275138e-06, "loss": 3.2579, "step": 7728 }, { "epoch": 9.88448, "grad_norm": 0.5307827591896057, "learning_rate": 3.715170278637771e-06, "loss": 3.2471, "step": 7729 }, { "epoch": 9.88576, "grad_norm": 0.5479539036750793, "learning_rate": 3.674787993000403e-06, "loss": 3.2576, "step": 7730 }, { "epoch": 9.88704, "grad_norm": 0.5323429703712463, "learning_rate": 3.634405707363036e-06, "loss": 3.2332, "step": 7731 }, { "epoch": 9.88832, "grad_norm": 0.5382372736930847, "learning_rate": 3.594023421725669e-06, "loss": 3.2954, "step": 7732 }, { "epoch": 9.8896, "grad_norm": 0.5464025735855103, "learning_rate": 3.5536411360883022e-06, "loss": 3.2248, "step": 7733 }, { "epoch": 9.89088, "grad_norm": 0.5193501114845276, "learning_rate": 3.5132588504509353e-06, "loss": 3.2544, "step": 7734 }, { "epoch": 9.89216, "grad_norm": 0.5276758074760437, "learning_rate": 3.472876564813568e-06, "loss": 3.1927, "step": 7735 }, { "epoch": 9.89344, "grad_norm": 0.5477931499481201, "learning_rate": 3.432494279176201e-06, "loss": 3.2572, "step": 7736 }, { "epoch": 9.89472, "grad_norm": 0.5434842705726624, "learning_rate": 3.392111993538834e-06, "loss": 3.3412, "step": 7737 }, { "epoch": 9.896, "grad_norm": 0.5318534970283508, "learning_rate": 3.351729707901467e-06, "loss": 3.2676, "step": 7738 }, { "epoch": 9.89728, "grad_norm": 0.5352578163146973, "learning_rate": 3.3113474222641e-06, "loss": 3.2455, "step": 7739 }, { "epoch": 9.89856, "grad_norm": 0.5339382290840149, "learning_rate": 3.2709651366267327e-06, "loss": 3.2947, "step": 7740 }, { "epoch": 9.89984, "grad_norm": 0.5460047721862793, "learning_rate": 3.2305828509893657e-06, "loss": 3.3013, "step": 7741 }, { "epoch": 9.90112, "grad_norm": 0.5352959632873535, "learning_rate": 3.1902005653519987e-06, "loss": 3.2407, "step": 7742 }, { "epoch": 9.9024, "grad_norm": 0.5354017019271851, "learning_rate": 3.1498182797146318e-06, "loss": 3.1999, "step": 7743 }, { "epoch": 9.90368, "grad_norm": 0.5433909296989441, "learning_rate": 3.109435994077265e-06, "loss": 3.1953, "step": 7744 }, { "epoch": 9.904959999999999, "grad_norm": 0.5294229388237, "learning_rate": 3.069053708439898e-06, "loss": 3.19, "step": 7745 }, { "epoch": 9.90624, "grad_norm": 0.5373355150222778, "learning_rate": 3.02867142280253e-06, "loss": 3.2349, "step": 7746 }, { "epoch": 9.90752, "grad_norm": 0.5573491454124451, "learning_rate": 2.988289137165163e-06, "loss": 3.1777, "step": 7747 }, { "epoch": 9.9088, "grad_norm": 0.54659104347229, "learning_rate": 2.947906851527796e-06, "loss": 3.2335, "step": 7748 }, { "epoch": 9.91008, "grad_norm": 0.5471231341362, "learning_rate": 2.907524565890429e-06, "loss": 3.2676, "step": 7749 }, { "epoch": 9.91136, "grad_norm": 0.5398557186126709, "learning_rate": 2.867142280253062e-06, "loss": 3.2567, "step": 7750 }, { "epoch": 9.91264, "grad_norm": 0.5324544310569763, "learning_rate": 2.826759994615695e-06, "loss": 3.2461, "step": 7751 }, { "epoch": 9.91392, "grad_norm": 0.5316013693809509, "learning_rate": 2.786377708978328e-06, "loss": 3.2755, "step": 7752 }, { "epoch": 9.9152, "grad_norm": 0.5322742462158203, "learning_rate": 2.745995423340961e-06, "loss": 3.2667, "step": 7753 }, { "epoch": 9.91648, "grad_norm": 0.5327667593955994, "learning_rate": 2.705613137703594e-06, "loss": 3.278, "step": 7754 }, { "epoch": 9.91776, "grad_norm": 0.5432612895965576, "learning_rate": 2.665230852066227e-06, "loss": 3.2554, "step": 7755 }, { "epoch": 9.91904, "grad_norm": 0.5206037163734436, "learning_rate": 2.6248485664288596e-06, "loss": 3.2419, "step": 7756 }, { "epoch": 9.92032, "grad_norm": 0.5430627465248108, "learning_rate": 2.5844662807914926e-06, "loss": 3.2313, "step": 7757 }, { "epoch": 9.9216, "grad_norm": 0.527530312538147, "learning_rate": 2.5440839951541257e-06, "loss": 3.2726, "step": 7758 }, { "epoch": 9.92288, "grad_norm": 0.5246833562850952, "learning_rate": 2.5037017095167587e-06, "loss": 3.1722, "step": 7759 }, { "epoch": 9.92416, "grad_norm": 0.5283114314079285, "learning_rate": 2.4633194238793913e-06, "loss": 3.2337, "step": 7760 }, { "epoch": 9.92544, "grad_norm": 0.5455198884010315, "learning_rate": 2.422937138242024e-06, "loss": 3.2814, "step": 7761 }, { "epoch": 9.92672, "grad_norm": 0.5268814563751221, "learning_rate": 2.382554852604657e-06, "loss": 3.2407, "step": 7762 }, { "epoch": 9.928, "grad_norm": 0.5490137338638306, "learning_rate": 2.34217256696729e-06, "loss": 3.2843, "step": 7763 }, { "epoch": 9.92928, "grad_norm": 0.525651216506958, "learning_rate": 2.301790281329923e-06, "loss": 3.2615, "step": 7764 }, { "epoch": 9.93056, "grad_norm": 0.5373636484146118, "learning_rate": 2.2614079956925557e-06, "loss": 3.2135, "step": 7765 }, { "epoch": 9.93184, "grad_norm": 0.5554747581481934, "learning_rate": 2.2210257100551887e-06, "loss": 3.2698, "step": 7766 }, { "epoch": 9.93312, "grad_norm": 0.5517007112503052, "learning_rate": 2.1806434244178218e-06, "loss": 3.3233, "step": 7767 }, { "epoch": 9.9344, "grad_norm": 0.5402884483337402, "learning_rate": 2.140261138780455e-06, "loss": 3.2164, "step": 7768 }, { "epoch": 9.93568, "grad_norm": 0.5363087058067322, "learning_rate": 2.099878853143088e-06, "loss": 3.3029, "step": 7769 }, { "epoch": 9.93696, "grad_norm": 0.547885537147522, "learning_rate": 2.0594965675057205e-06, "loss": 3.2803, "step": 7770 }, { "epoch": 9.93824, "grad_norm": 0.5402307510375977, "learning_rate": 2.0191142818683535e-06, "loss": 3.2742, "step": 7771 }, { "epoch": 9.93952, "grad_norm": 0.5349573493003845, "learning_rate": 1.9787319962309865e-06, "loss": 3.2538, "step": 7772 }, { "epoch": 9.9408, "grad_norm": 0.5216020345687866, "learning_rate": 1.938349710593619e-06, "loss": 3.1519, "step": 7773 }, { "epoch": 9.94208, "grad_norm": 0.5340332388877869, "learning_rate": 1.8979674249562524e-06, "loss": 3.2481, "step": 7774 }, { "epoch": 9.94336, "grad_norm": 0.5314613580703735, "learning_rate": 1.8575851393188855e-06, "loss": 3.2759, "step": 7775 }, { "epoch": 9.94464, "grad_norm": 0.5448631048202515, "learning_rate": 1.817202853681518e-06, "loss": 3.2906, "step": 7776 }, { "epoch": 9.94592, "grad_norm": 0.5468592047691345, "learning_rate": 1.7768205680441511e-06, "loss": 3.2446, "step": 7777 }, { "epoch": 9.9472, "grad_norm": 0.5331081748008728, "learning_rate": 1.736438282406784e-06, "loss": 3.2681, "step": 7778 }, { "epoch": 9.94848, "grad_norm": 0.5275120139122009, "learning_rate": 1.696055996769417e-06, "loss": 3.2593, "step": 7779 }, { "epoch": 9.94976, "grad_norm": 0.5483401417732239, "learning_rate": 1.65567371113205e-06, "loss": 3.2909, "step": 7780 }, { "epoch": 9.95104, "grad_norm": 0.5375556349754333, "learning_rate": 1.6152914254946828e-06, "loss": 3.219, "step": 7781 }, { "epoch": 9.95232, "grad_norm": 0.532954752445221, "learning_rate": 1.5749091398573159e-06, "loss": 3.2613, "step": 7782 }, { "epoch": 9.9536, "grad_norm": 0.534887969493866, "learning_rate": 1.534526854219949e-06, "loss": 3.2319, "step": 7783 }, { "epoch": 9.95488, "grad_norm": 0.5328458547592163, "learning_rate": 1.4941445685825815e-06, "loss": 3.2237, "step": 7784 }, { "epoch": 9.95616, "grad_norm": 0.530035674571991, "learning_rate": 1.4537622829452146e-06, "loss": 3.2497, "step": 7785 }, { "epoch": 9.95744, "grad_norm": 0.5311540365219116, "learning_rate": 1.4133799973078474e-06, "loss": 3.1767, "step": 7786 }, { "epoch": 9.95872, "grad_norm": 0.5378389358520508, "learning_rate": 1.3729977116704805e-06, "loss": 3.253, "step": 7787 }, { "epoch": 9.96, "grad_norm": 0.5423367619514465, "learning_rate": 1.3326154260331135e-06, "loss": 3.2811, "step": 7788 }, { "epoch": 9.96128, "grad_norm": 0.5347367525100708, "learning_rate": 1.2922331403957463e-06, "loss": 3.2527, "step": 7789 }, { "epoch": 9.96256, "grad_norm": 0.547399640083313, "learning_rate": 1.2518508547583794e-06, "loss": 3.2159, "step": 7790 }, { "epoch": 9.96384, "grad_norm": 0.5291503667831421, "learning_rate": 1.211468569121012e-06, "loss": 3.1828, "step": 7791 }, { "epoch": 9.96512, "grad_norm": 0.5376866459846497, "learning_rate": 1.171086283483645e-06, "loss": 3.2118, "step": 7792 }, { "epoch": 9.9664, "grad_norm": 0.5406926274299622, "learning_rate": 1.1307039978462778e-06, "loss": 3.1903, "step": 7793 }, { "epoch": 9.96768, "grad_norm": 0.5449708104133606, "learning_rate": 1.0903217122089109e-06, "loss": 3.2392, "step": 7794 }, { "epoch": 9.96896, "grad_norm": 0.5226523876190186, "learning_rate": 1.049939426571544e-06, "loss": 3.2205, "step": 7795 }, { "epoch": 9.97024, "grad_norm": 0.5401772260665894, "learning_rate": 1.0095571409341768e-06, "loss": 3.2496, "step": 7796 }, { "epoch": 9.97152, "grad_norm": 0.5378510355949402, "learning_rate": 9.691748552968096e-07, "loss": 3.1916, "step": 7797 }, { "epoch": 9.9728, "grad_norm": 0.5274825096130371, "learning_rate": 9.287925696594427e-07, "loss": 3.2405, "step": 7798 }, { "epoch": 9.97408, "grad_norm": 0.5408015251159668, "learning_rate": 8.884102840220756e-07, "loss": 3.2419, "step": 7799 }, { "epoch": 9.97536, "grad_norm": 0.5279608964920044, "learning_rate": 8.480279983847085e-07, "loss": 3.2378, "step": 7800 }, { "epoch": 9.97664, "grad_norm": 0.5284964442253113, "learning_rate": 8.076457127473414e-07, "loss": 3.2899, "step": 7801 }, { "epoch": 9.97792, "grad_norm": 0.537851095199585, "learning_rate": 7.672634271099745e-07, "loss": 3.308, "step": 7802 }, { "epoch": 9.9792, "grad_norm": 0.5350565314292908, "learning_rate": 7.268811414726073e-07, "loss": 3.1784, "step": 7803 }, { "epoch": 9.98048, "grad_norm": 0.5333359837532043, "learning_rate": 6.864988558352402e-07, "loss": 3.2109, "step": 7804 }, { "epoch": 9.98176, "grad_norm": 0.5407792925834656, "learning_rate": 6.461165701978732e-07, "loss": 3.2474, "step": 7805 }, { "epoch": 9.98304, "grad_norm": 0.545315146446228, "learning_rate": 6.05734284560506e-07, "loss": 3.2454, "step": 7806 }, { "epoch": 9.98432, "grad_norm": 0.5328691601753235, "learning_rate": 5.653519989231389e-07, "loss": 3.2104, "step": 7807 }, { "epoch": 9.9856, "grad_norm": 0.5297470092773438, "learning_rate": 5.24969713285772e-07, "loss": 3.2225, "step": 7808 }, { "epoch": 9.98688, "grad_norm": 0.5331732630729675, "learning_rate": 4.845874276484048e-07, "loss": 3.2141, "step": 7809 }, { "epoch": 9.98816, "grad_norm": 0.5387186408042908, "learning_rate": 4.442051420110378e-07, "loss": 3.2828, "step": 7810 }, { "epoch": 9.98944, "grad_norm": 0.5281053781509399, "learning_rate": 4.038228563736707e-07, "loss": 3.2234, "step": 7811 }, { "epoch": 9.99072, "grad_norm": 0.5405586957931519, "learning_rate": 3.6344057073630365e-07, "loss": 3.3145, "step": 7812 }, { "epoch": 9.992, "grad_norm": 0.5372722744941711, "learning_rate": 3.230582850989366e-07, "loss": 3.2676, "step": 7813 }, { "epoch": 9.99328, "grad_norm": 0.5271605253219604, "learning_rate": 2.8267599946156946e-07, "loss": 3.2855, "step": 7814 }, { "epoch": 9.99456, "grad_norm": 0.5379496812820435, "learning_rate": 2.422937138242024e-07, "loss": 3.1735, "step": 7815 }, { "epoch": 9.99584, "grad_norm": 0.5309686660766602, "learning_rate": 2.0191142818683536e-07, "loss": 3.2678, "step": 7816 }, { "epoch": 9.99712, "grad_norm": 0.5377652645111084, "learning_rate": 1.615291425494683e-07, "loss": 3.3243, "step": 7817 }, { "epoch": 9.9984, "grad_norm": 0.5389779806137085, "learning_rate": 1.211468569121012e-07, "loss": 3.2626, "step": 7818 }, { "epoch": 9.99968, "grad_norm": 0.54366135597229, "learning_rate": 8.076457127473414e-08, "loss": 3.2713, "step": 7819 }, { "epoch": 10.0, "grad_norm": 1.0505841970443726, "learning_rate": 4.038228563736707e-08, "loss": 3.2862, "step": 7820 } ], "logging_steps": 1, "max_steps": 7820, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5963011260928e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }