diff --git "a/plots/data.json" "b/plots/data.json" new file mode 100644--- /dev/null +++ "b/plots/data.json" @@ -0,0 +1,6367 @@ +{ + "runs": [ + { + "run_name": "pico-decoder-tiny-dolma29k-v2", + "log_file": "log_20250829_003838.log", + "training_metrics": [ + { + "step": 0, + "loss": 10.9848, + "learning_rate": 0.0, + "inf_nan_count": 0 + }, + { + "step": 50, + "loss": 11.0005, + "learning_rate": 1e-06, + "inf_nan_count": 0 + }, + { + "step": 100, + "loss": 10.9918, + "learning_rate": 2e-06, + "inf_nan_count": 0 + }, + { + "step": 150, + "loss": 10.9776, + "learning_rate": 3e-06, + "inf_nan_count": 0 + }, + { + "step": 200, + "loss": 10.9569, + "learning_rate": 4e-06, + "inf_nan_count": 0 + }, + { + "step": 250, + "loss": 10.9255, + "learning_rate": 5e-06, + "inf_nan_count": 0 + }, + { + "step": 300, + "loss": 10.8883, + "learning_rate": 6e-06, + "inf_nan_count": 0 + }, + { + "step": 350, + "loss": 10.8249, + "learning_rate": 7e-06, + "inf_nan_count": 0 + }, + { + "step": 400, + "loss": 10.7344, + "learning_rate": 8e-06, + "inf_nan_count": 0 + }, + { + "step": 450, + "loss": 10.6177, + "learning_rate": 9e-06, + "inf_nan_count": 0 + }, + { + "step": 500, + "loss": 10.5025, + "learning_rate": 1e-05, + "inf_nan_count": 0 + }, + { + "step": 550, + "loss": 10.3986, + "learning_rate": 1.1e-05, + "inf_nan_count": 0 + }, + { + "step": 600, + "loss": 10.3079, + "learning_rate": 1.2e-05, + "inf_nan_count": 0 + }, + { + "step": 650, + "loss": 10.2142, + "learning_rate": 1.3e-05, + "inf_nan_count": 0 + }, + { + "step": 700, + "loss": 10.1146, + "learning_rate": 1.4e-05, + "inf_nan_count": 0 + }, + { + "step": 750, + "loss": 10.0398, + "learning_rate": 1.5e-05, + "inf_nan_count": 0 + }, + { + "step": 800, + "loss": 9.9311, + "learning_rate": 1.6e-05, + "inf_nan_count": 0 + }, + { + "step": 850, + "loss": 9.8431, + "learning_rate": 1.7e-05, + "inf_nan_count": 0 + }, + { + "step": 900, + "loss": 9.7453, + "learning_rate": 1.8e-05, + "inf_nan_count": 0 + }, + { + "step": 950, + "loss": 9.6527, + "learning_rate": 1.9e-05, + "inf_nan_count": 0 + }, + { + "step": 1000, + "loss": 9.5691, + "learning_rate": 2e-05, + "inf_nan_count": 0 + }, + { + "step": 1050, + "loss": 9.46, + "learning_rate": 2.1e-05, + "inf_nan_count": 0 + }, + { + "step": 1100, + "loss": 9.3525, + "learning_rate": 2.2e-05, + "inf_nan_count": 0 + }, + { + "step": 1150, + "loss": 9.2715, + "learning_rate": 2.3e-05, + "inf_nan_count": 0 + }, + { + "step": 1200, + "loss": 9.1618, + "learning_rate": 2.4e-05, + "inf_nan_count": 0 + }, + { + "step": 1250, + "loss": 9.0547, + "learning_rate": 2.5e-05, + "inf_nan_count": 0 + }, + { + "step": 1300, + "loss": 8.955, + "learning_rate": 2.6e-05, + "inf_nan_count": 0 + }, + { + "step": 1350, + "loss": 8.8251, + "learning_rate": 2.7e-05, + "inf_nan_count": 0 + }, + { + "step": 1400, + "loss": 8.7711, + "learning_rate": 2.8e-05, + "inf_nan_count": 0 + }, + { + "step": 1450, + "loss": 8.6834, + "learning_rate": 2.9e-05, + "inf_nan_count": 0 + }, + { + "step": 1500, + "loss": 8.5638, + "learning_rate": 3e-05, + "inf_nan_count": 0 + }, + { + "step": 1550, + "loss": 8.4572, + "learning_rate": 3.1e-05, + "inf_nan_count": 0 + }, + { + "step": 1600, + "loss": 8.394, + "learning_rate": 3.2e-05, + "inf_nan_count": 0 + }, + { + "step": 1650, + "loss": 8.2973, + "learning_rate": 3.3e-05, + "inf_nan_count": 0 + }, + { + "step": 1700, + "loss": 8.2264, + "learning_rate": 3.4e-05, + "inf_nan_count": 0 + }, + { + "step": 1750, + "loss": 8.1672, + "learning_rate": 3.5e-05, + "inf_nan_count": 0 + }, + { + "step": 1800, + "loss": 8.0695, + "learning_rate": 3.6e-05, + "inf_nan_count": 0 + }, + { + "step": 1850, + "loss": 8.0299, + "learning_rate": 3.7e-05, + "inf_nan_count": 0 + }, + { + "step": 1900, + "loss": 7.9883, + "learning_rate": 3.8e-05, + "inf_nan_count": 0 + }, + { + "step": 1950, + "loss": 7.9429, + "learning_rate": 3.9e-05, + "inf_nan_count": 0 + }, + { + "step": 2000, + "loss": 7.8447, + "learning_rate": 4e-05, + "inf_nan_count": 0 + }, + { + "step": 2050, + "loss": 7.838, + "learning_rate": 4.1e-05, + "inf_nan_count": 0 + }, + { + "step": 2100, + "loss": 7.7671, + "learning_rate": 4.2e-05, + "inf_nan_count": 0 + }, + { + "step": 2150, + "loss": 7.7637, + "learning_rate": 4.3e-05, + "inf_nan_count": 0 + }, + { + "step": 2200, + "loss": 7.706, + "learning_rate": 4.4e-05, + "inf_nan_count": 0 + }, + { + "step": 2250, + "loss": 7.7607, + "learning_rate": 4.5e-05, + "inf_nan_count": 0 + }, + { + "step": 2300, + "loss": 7.7076, + "learning_rate": 4.6e-05, + "inf_nan_count": 0 + }, + { + "step": 2350, + "loss": 7.6787, + "learning_rate": 4.7e-05, + "inf_nan_count": 0 + }, + { + "step": 2400, + "loss": 7.6446, + "learning_rate": 4.8e-05, + "inf_nan_count": 0 + }, + { + "step": 2450, + "loss": 7.5999, + "learning_rate": 4.9e-05, + "inf_nan_count": 0 + }, + { + "step": 2500, + "loss": 7.6154, + "learning_rate": 5e-05, + "inf_nan_count": 0 + }, + { + "step": 2550, + "loss": 7.5627, + "learning_rate": 5.1e-05, + "inf_nan_count": 0 + }, + { + "step": 2600, + "loss": 7.5747, + "learning_rate": 5.2e-05, + "inf_nan_count": 0 + }, + { + "step": 2650, + "loss": 7.5358, + "learning_rate": 5.3e-05, + "inf_nan_count": 0 + }, + { + "step": 2700, + "loss": 7.5148, + "learning_rate": 5.4e-05, + "inf_nan_count": 0 + }, + { + "step": 2750, + "loss": 7.4874, + "learning_rate": 5.5e-05, + "inf_nan_count": 0 + }, + { + "step": 2800, + "loss": 7.4438, + "learning_rate": 5.6e-05, + "inf_nan_count": 0 + }, + { + "step": 2850, + "loss": 7.4772, + "learning_rate": 5.7e-05, + "inf_nan_count": 0 + }, + { + "step": 2900, + "loss": 7.4135, + "learning_rate": 5.8e-05, + "inf_nan_count": 0 + }, + { + "step": 2950, + "loss": 7.3929, + "learning_rate": 5.9e-05, + "inf_nan_count": 0 + }, + { + "step": 3000, + "loss": 7.3566, + "learning_rate": 6e-05, + "inf_nan_count": 0 + }, + { + "step": 3050, + "loss": 7.3318, + "learning_rate": 6.1e-05, + "inf_nan_count": 0 + }, + { + "step": 3100, + "loss": 7.3114, + "learning_rate": 6.2e-05, + "inf_nan_count": 0 + }, + { + "step": 3150, + "loss": 7.2734, + "learning_rate": 6.3e-05, + "inf_nan_count": 0 + }, + { + "step": 3200, + "loss": 7.322, + "learning_rate": 6.4e-05, + "inf_nan_count": 0 + }, + { + "step": 3250, + "loss": 7.2621, + "learning_rate": 6.5e-05, + "inf_nan_count": 0 + }, + { + "step": 3300, + "loss": 7.2257, + "learning_rate": 6.6e-05, + "inf_nan_count": 0 + }, + { + "step": 3350, + "loss": 7.2447, + "learning_rate": 6.7e-05, + "inf_nan_count": 0 + }, + { + "step": 3400, + "loss": 7.2344, + "learning_rate": 6.8e-05, + "inf_nan_count": 0 + }, + { + "step": 3450, + "loss": 7.1488, + "learning_rate": 6.9e-05, + "inf_nan_count": 0 + }, + { + "step": 3500, + "loss": 7.1797, + "learning_rate": 7e-05, + "inf_nan_count": 0 + }, + { + "step": 3550, + "loss": 7.1737, + "learning_rate": 7.1e-05, + "inf_nan_count": 0 + }, + { + "step": 3600, + "loss": 7.1204, + "learning_rate": 7.2e-05, + "inf_nan_count": 0 + }, + { + "step": 3650, + "loss": 7.1102, + "learning_rate": 7.3e-05, + "inf_nan_count": 0 + }, + { + "step": 3700, + "loss": 7.0845, + "learning_rate": 7.4e-05, + "inf_nan_count": 0 + }, + { + "step": 3750, + "loss": 7.0858, + "learning_rate": 7.5e-05, + "inf_nan_count": 0 + }, + { + "step": 3800, + "loss": 7.0362, + "learning_rate": 7.6e-05, + "inf_nan_count": 0 + }, + { + "step": 3850, + "loss": 7.0603, + "learning_rate": 7.7e-05, + "inf_nan_count": 0 + }, + { + "step": 3900, + "loss": 7.0172, + "learning_rate": 7.8e-05, + "inf_nan_count": 0 + }, + { + "step": 3950, + "loss": 6.9948, + "learning_rate": 7.9e-05, + "inf_nan_count": 0 + }, + { + "step": 4000, + "loss": 6.9909, + "learning_rate": 8e-05, + "inf_nan_count": 0 + }, + { + "step": 4050, + "loss": 6.9477, + "learning_rate": 8.1e-05, + "inf_nan_count": 0 + }, + { + "step": 4100, + "loss": 6.9651, + "learning_rate": 8.2e-05, + "inf_nan_count": 0 + }, + { + "step": 4150, + "loss": 6.9149, + "learning_rate": 8.3e-05, + "inf_nan_count": 0 + }, + { + "step": 4200, + "loss": 6.893, + "learning_rate": 8.4e-05, + "inf_nan_count": 0 + }, + { + "step": 4250, + "loss": 6.9227, + "learning_rate": 8.5e-05, + "inf_nan_count": 0 + }, + { + "step": 4300, + "loss": 6.879, + "learning_rate": 8.6e-05, + "inf_nan_count": 0 + }, + { + "step": 4350, + "loss": 6.8649, + "learning_rate": 8.7e-05, + "inf_nan_count": 0 + }, + { + "step": 4400, + "loss": 6.8305, + "learning_rate": 8.8e-05, + "inf_nan_count": 0 + }, + { + "step": 4450, + "loss": 6.8085, + "learning_rate": 8.9e-05, + "inf_nan_count": 0 + }, + { + "step": 4500, + "loss": 6.8315, + "learning_rate": 9e-05, + "inf_nan_count": 0 + }, + { + "step": 4550, + "loss": 6.7885, + "learning_rate": 9.1e-05, + "inf_nan_count": 0 + }, + { + "step": 4600, + "loss": 6.7805, + "learning_rate": 9.2e-05, + "inf_nan_count": 0 + }, + { + "step": 4650, + "loss": 6.7737, + "learning_rate": 9.3e-05, + "inf_nan_count": 0 + }, + { + "step": 4700, + "loss": 6.7649, + "learning_rate": 9.4e-05, + "inf_nan_count": 0 + }, + { + "step": 4750, + "loss": 6.7562, + "learning_rate": 9.5e-05, + "inf_nan_count": 0 + }, + { + "step": 4800, + "loss": 6.7347, + "learning_rate": 9.6e-05, + "inf_nan_count": 0 + }, + { + "step": 4850, + "loss": 6.7161, + "learning_rate": 9.7e-05, + "inf_nan_count": 0 + }, + { + "step": 4900, + "loss": 6.6889, + "learning_rate": 9.8e-05, + "inf_nan_count": 0 + }, + { + "step": 4950, + "loss": 6.7299, + "learning_rate": 9.9e-05, + "inf_nan_count": 0 + }, + { + "step": 5000, + "loss": 6.6605, + "learning_rate": 0.0001, + "inf_nan_count": 0 + }, + { + "step": 5050, + "loss": 6.6552, + "learning_rate": 0.0001, + "inf_nan_count": 0 + }, + { + "step": 5100, + "loss": 6.7038, + "learning_rate": 9.99e-05, + "inf_nan_count": 0 + }, + { + "step": 5150, + "loss": 6.6452, + "learning_rate": 9.99e-05, + "inf_nan_count": 0 + }, + { + "step": 5200, + "loss": 6.6522, + "learning_rate": 9.99e-05, + "inf_nan_count": 0 + }, + { + "step": 5250, + "loss": 6.627, + "learning_rate": 9.99e-05, + "inf_nan_count": 0 + }, + { + "step": 5300, + "loss": 6.5733, + "learning_rate": 9.98e-05, + "inf_nan_count": 0 + }, + { + "step": 5350, + "loss": 6.5833, + "learning_rate": 9.98e-05, + "inf_nan_count": 0 + }, + { + "step": 5400, + "loss": 6.5854, + "learning_rate": 9.98e-05, + "inf_nan_count": 0 + }, + { + "step": 5450, + "loss": 6.6012, + "learning_rate": 9.98e-05, + "inf_nan_count": 0 + }, + { + "step": 5500, + "loss": 6.5786, + "learning_rate": 9.97e-05, + "inf_nan_count": 0 + } + ], + "evaluation_results": [ + { + "step": 1000, + "paloma": 5.073320568651489e+18 + }, + { + "step": 2000, + "paloma": 1.8978577072995303e+19 + }, + { + "step": 3000, + "paloma": 3.1701596694317715e+19 + }, + { + "step": 4000, + "paloma": 2.5015965971757485e+20 + }, + { + "step": 5000, + "paloma": 2.38712860824014e+21 + } + ], + "config": { + "d_model": 96, + "n_layers": 12, + "max_seq_len": 2048, + "vocab_size": 50304, + "lr": 0.0001, + "max_steps": 200000, + "batch_size": 1 + } + }, + { + "run_name": "pico-decoder-tiny-dolma29k-v3", + "log_file": "log_20250829_020629.log", + "training_metrics": [ + { + "step": 500, + "loss": 10.8854, + "learning_rate": 3.13e-06, + "inf_nan_count": 0 + }, + { + "step": 525, + "loss": 10.889, + "learning_rate": 3.28e-06, + "inf_nan_count": 0 + }, + { + "step": 550, + "loss": 10.8846, + "learning_rate": 3.44e-06, + "inf_nan_count": 0 + }, + { + "step": 575, + "loss": 10.8657, + "learning_rate": 3.59e-06, + "inf_nan_count": 0 + }, + { + "step": 600, + "loss": 10.859, + "learning_rate": 3.75e-06, + "inf_nan_count": 0 + }, + { + "step": 625, + "loss": 10.8328, + "learning_rate": 3.91e-06, + "inf_nan_count": 0 + }, + { + "step": 650, + "loss": 10.8166, + "learning_rate": 4.06e-06, + "inf_nan_count": 0 + }, + { + "step": 675, + "loss": 10.7913, + "learning_rate": 4.22e-06, + "inf_nan_count": 0 + }, + { + "step": 700, + "loss": 10.7609, + "learning_rate": 4.37e-06, + "inf_nan_count": 0 + }, + { + "step": 725, + "loss": 10.7322, + "learning_rate": 4.53e-06, + "inf_nan_count": 0 + }, + { + "step": 750, + "loss": 10.7121, + "learning_rate": 4.69e-06, + "inf_nan_count": 0 + }, + { + "step": 775, + "loss": 10.6877, + "learning_rate": 4.84e-06, + "inf_nan_count": 0 + }, + { + "step": 800, + "loss": 10.6436, + "learning_rate": 5e-06, + "inf_nan_count": 0 + }, + { + "step": 825, + "loss": 10.6256, + "learning_rate": 5.16e-06, + "inf_nan_count": 0 + }, + { + "step": 850, + "loss": 10.5961, + "learning_rate": 5.31e-06, + "inf_nan_count": 0 + }, + { + "step": 875, + "loss": 10.5443, + "learning_rate": 5.47e-06, + "inf_nan_count": 0 + }, + { + "step": 900, + "loss": 10.5197, + "learning_rate": 5.63e-06, + "inf_nan_count": 0 + }, + { + "step": 925, + "loss": 10.4854, + "learning_rate": 5.78e-06, + "inf_nan_count": 0 + }, + { + "step": 950, + "loss": 10.4826, + "learning_rate": 5.94e-06, + "inf_nan_count": 0 + }, + { + "step": 975, + "loss": 10.4557, + "learning_rate": 6.09e-06, + "inf_nan_count": 0 + }, + { + "step": 1000, + "loss": 10.4142, + "learning_rate": 6.25e-06, + "inf_nan_count": 0 + }, + { + "step": 1025, + "loss": 10.3885, + "learning_rate": 6.41e-06, + "inf_nan_count": 0 + }, + { + "step": 1050, + "loss": 10.3737, + "learning_rate": 6.56e-06, + "inf_nan_count": 0 + }, + { + "step": 1075, + "loss": 10.3534, + "learning_rate": 6.72e-06, + "inf_nan_count": 0 + }, + { + "step": 1100, + "loss": 10.3219, + "learning_rate": 6.88e-06, + "inf_nan_count": 0 + }, + { + "step": 1125, + "loss": 10.3064, + "learning_rate": 7.03e-06, + "inf_nan_count": 0 + }, + { + "step": 1150, + "loss": 10.2761, + "learning_rate": 7.19e-06, + "inf_nan_count": 0 + }, + { + "step": 1175, + "loss": 10.2592, + "learning_rate": 7.34e-06, + "inf_nan_count": 0 + }, + { + "step": 1200, + "loss": 10.242, + "learning_rate": 7.5e-06, + "inf_nan_count": 0 + }, + { + "step": 1225, + "loss": 10.2141, + "learning_rate": 7.66e-06, + "inf_nan_count": 0 + }, + { + "step": 1250, + "loss": 10.1882, + "learning_rate": 7.81e-06, + "inf_nan_count": 0 + }, + { + "step": 1275, + "loss": 10.1608, + "learning_rate": 7.97e-06, + "inf_nan_count": 0 + }, + { + "step": 1300, + "loss": 10.146, + "learning_rate": 8.13e-06, + "inf_nan_count": 0 + }, + { + "step": 1325, + "loss": 10.0944, + "learning_rate": 8.28e-06, + "inf_nan_count": 0 + }, + { + "step": 1350, + "loss": 10.0885, + "learning_rate": 8.44e-06, + "inf_nan_count": 0 + }, + { + "step": 1375, + "loss": 10.0748, + "learning_rate": 8.59e-06, + "inf_nan_count": 0 + }, + { + "step": 1400, + "loss": 10.0425, + "learning_rate": 8.75e-06, + "inf_nan_count": 0 + }, + { + "step": 1425, + "loss": 10.0422, + "learning_rate": 8.91e-06, + "inf_nan_count": 0 + }, + { + "step": 1450, + "loss": 10.0039, + "learning_rate": 9.06e-06, + "inf_nan_count": 0 + }, + { + "step": 1475, + "loss": 9.9736, + "learning_rate": 9.22e-06, + "inf_nan_count": 0 + }, + { + "step": 1500, + "loss": 9.9729, + "learning_rate": 9.38e-06, + "inf_nan_count": 0 + }, + { + "step": 1525, + "loss": 9.9379, + "learning_rate": 9.53e-06, + "inf_nan_count": 0 + }, + { + "step": 1550, + "loss": 9.8819, + "learning_rate": 9.69e-06, + "inf_nan_count": 0 + }, + { + "step": 1575, + "loss": 9.8702, + "learning_rate": 9.84e-06, + "inf_nan_count": 0 + }, + { + "step": 1600, + "loss": 9.8571, + "learning_rate": 1e-05, + "inf_nan_count": 0 + }, + { + "step": 1625, + "loss": 9.8356, + "learning_rate": 1.02e-05, + "inf_nan_count": 0 + }, + { + "step": 1650, + "loss": 9.7973, + "learning_rate": 1.03e-05, + "inf_nan_count": 0 + }, + { + "step": 1675, + "loss": 9.7745, + "learning_rate": 1.05e-05, + "inf_nan_count": 0 + }, + { + "step": 1700, + "loss": 9.7673, + "learning_rate": 1.06e-05, + "inf_nan_count": 0 + }, + { + "step": 1725, + "loss": 9.7406, + "learning_rate": 1.08e-05, + "inf_nan_count": 0 + }, + { + "step": 1750, + "loss": 9.7312, + "learning_rate": 1.09e-05, + "inf_nan_count": 0 + }, + { + "step": 1775, + "loss": 9.6563, + "learning_rate": 1.11e-05, + "inf_nan_count": 0 + }, + { + "step": 1800, + "loss": 9.6515, + "learning_rate": 1.13e-05, + "inf_nan_count": 0 + }, + { + "step": 1825, + "loss": 9.6241, + "learning_rate": 1.14e-05, + "inf_nan_count": 0 + }, + { + "step": 1850, + "loss": 9.6015, + "learning_rate": 1.16e-05, + "inf_nan_count": 0 + }, + { + "step": 1875, + "loss": 9.5933, + "learning_rate": 1.17e-05, + "inf_nan_count": 0 + }, + { + "step": 1900, + "loss": 9.5544, + "learning_rate": 1.19e-05, + "inf_nan_count": 0 + }, + { + "step": 1925, + "loss": 9.5407, + "learning_rate": 1.2e-05, + "inf_nan_count": 0 + }, + { + "step": 1950, + "loss": 9.5431, + "learning_rate": 1.22e-05, + "inf_nan_count": 0 + }, + { + "step": 1975, + "loss": 9.4853, + "learning_rate": 1.23e-05, + "inf_nan_count": 0 + }, + { + "step": 2000, + "loss": 9.4665, + "learning_rate": 1.25e-05, + "inf_nan_count": 0 + }, + { + "step": 2025, + "loss": 9.4621, + "learning_rate": 1.27e-05, + "inf_nan_count": 0 + }, + { + "step": 2050, + "loss": 9.4031, + "learning_rate": 1.28e-05, + "inf_nan_count": 0 + }, + { + "step": 2075, + "loss": 9.3699, + "learning_rate": 1.3e-05, + "inf_nan_count": 0 + }, + { + "step": 2100, + "loss": 9.3422, + "learning_rate": 1.31e-05, + "inf_nan_count": 0 + }, + { + "step": 2125, + "loss": 9.3129, + "learning_rate": 1.33e-05, + "inf_nan_count": 0 + }, + { + "step": 2150, + "loss": 9.2917, + "learning_rate": 1.34e-05, + "inf_nan_count": 0 + }, + { + "step": 2175, + "loss": 9.267, + "learning_rate": 1.36e-05, + "inf_nan_count": 0 + }, + { + "step": 2200, + "loss": 9.2512, + "learning_rate": 1.38e-05, + "inf_nan_count": 0 + }, + { + "step": 2225, + "loss": 9.2737, + "learning_rate": 1.39e-05, + "inf_nan_count": 0 + }, + { + "step": 2250, + "loss": 9.2357, + "learning_rate": 1.41e-05, + "inf_nan_count": 0 + }, + { + "step": 2275, + "loss": 9.1471, + "learning_rate": 1.42e-05, + "inf_nan_count": 0 + }, + { + "step": 2300, + "loss": 9.1305, + "learning_rate": 1.44e-05, + "inf_nan_count": 0 + }, + { + "step": 2325, + "loss": 9.143, + "learning_rate": 1.45e-05, + "inf_nan_count": 0 + }, + { + "step": 2350, + "loss": 9.0948, + "learning_rate": 1.47e-05, + "inf_nan_count": 0 + }, + { + "step": 2375, + "loss": 9.0256, + "learning_rate": 1.48e-05, + "inf_nan_count": 0 + }, + { + "step": 2400, + "loss": 9.0664, + "learning_rate": 1.5e-05, + "inf_nan_count": 0 + }, + { + "step": 2425, + "loss": 9.002, + "learning_rate": 1.52e-05, + "inf_nan_count": 0 + }, + { + "step": 2450, + "loss": 8.9518, + "learning_rate": 1.53e-05, + "inf_nan_count": 0 + }, + { + "step": 2475, + "loss": 8.9717, + "learning_rate": 1.55e-05, + "inf_nan_count": 0 + }, + { + "step": 2500, + "loss": 8.9536, + "learning_rate": 1.56e-05, + "inf_nan_count": 0 + }, + { + "step": 2525, + "loss": 8.8812, + "learning_rate": 1.58e-05, + "inf_nan_count": 0 + }, + { + "step": 2550, + "loss": 8.8824, + "learning_rate": 1.59e-05, + "inf_nan_count": 0 + }, + { + "step": 2575, + "loss": 8.8564, + "learning_rate": 1.61e-05, + "inf_nan_count": 0 + }, + { + "step": 2600, + "loss": 8.8419, + "learning_rate": 1.63e-05, + "inf_nan_count": 0 + }, + { + "step": 2625, + "loss": 8.7865, + "learning_rate": 1.64e-05, + "inf_nan_count": 0 + }, + { + "step": 2650, + "loss": 8.7493, + "learning_rate": 1.66e-05, + "inf_nan_count": 0 + }, + { + "step": 2675, + "loss": 8.7255, + "learning_rate": 1.67e-05, + "inf_nan_count": 0 + }, + { + "step": 2700, + "loss": 8.6469, + "learning_rate": 1.69e-05, + "inf_nan_count": 0 + }, + { + "step": 2725, + "loss": 8.6799, + "learning_rate": 1.7e-05, + "inf_nan_count": 0 + }, + { + "step": 2750, + "loss": 8.6974, + "learning_rate": 1.72e-05, + "inf_nan_count": 0 + }, + { + "step": 2775, + "loss": 8.6441, + "learning_rate": 1.73e-05, + "inf_nan_count": 0 + }, + { + "step": 2800, + "loss": 8.6689, + "learning_rate": 1.75e-05, + "inf_nan_count": 0 + }, + { + "step": 2825, + "loss": 8.5732, + "learning_rate": 1.77e-05, + "inf_nan_count": 0 + }, + { + "step": 2850, + "loss": 8.5955, + "learning_rate": 1.78e-05, + "inf_nan_count": 0 + }, + { + "step": 2875, + "loss": 8.5823, + "learning_rate": 1.8e-05, + "inf_nan_count": 0 + }, + { + "step": 2900, + "loss": 8.5968, + "learning_rate": 1.81e-05, + "inf_nan_count": 0 + }, + { + "step": 2925, + "loss": 8.4721, + "learning_rate": 1.83e-05, + "inf_nan_count": 0 + }, + { + "step": 2950, + "loss": 8.4672, + "learning_rate": 1.84e-05, + "inf_nan_count": 0 + }, + { + "step": 2975, + "loss": 8.4033, + "learning_rate": 1.86e-05, + "inf_nan_count": 0 + }, + { + "step": 3000, + "loss": 8.4947, + "learning_rate": 1.88e-05, + "inf_nan_count": 0 + }, + { + "step": 3025, + "loss": 8.378, + "learning_rate": 1.89e-05, + "inf_nan_count": 0 + }, + { + "step": 3050, + "loss": 8.3581, + "learning_rate": 1.91e-05, + "inf_nan_count": 0 + }, + { + "step": 3075, + "loss": 8.3341, + "learning_rate": 1.92e-05, + "inf_nan_count": 0 + }, + { + "step": 3100, + "loss": 8.3391, + "learning_rate": 1.94e-05, + "inf_nan_count": 0 + }, + { + "step": 3125, + "loss": 8.367, + "learning_rate": 1.95e-05, + "inf_nan_count": 0 + }, + { + "step": 3150, + "loss": 8.237, + "learning_rate": 1.97e-05, + "inf_nan_count": 0 + }, + { + "step": 3175, + "loss": 8.2879, + "learning_rate": 1.98e-05, + "inf_nan_count": 0 + }, + { + "step": 3200, + "loss": 8.2706, + "learning_rate": 2e-05, + "inf_nan_count": 0 + }, + { + "step": 3225, + "loss": 8.1983, + "learning_rate": 2.02e-05, + "inf_nan_count": 0 + }, + { + "step": 3250, + "loss": 8.2174, + "learning_rate": 2.03e-05, + "inf_nan_count": 0 + }, + { + "step": 3275, + "loss": 8.2229, + "learning_rate": 2.05e-05, + "inf_nan_count": 0 + }, + { + "step": 3300, + "loss": 8.1398, + "learning_rate": 2.06e-05, + "inf_nan_count": 0 + }, + { + "step": 3325, + "loss": 8.143, + "learning_rate": 2.08e-05, + "inf_nan_count": 0 + }, + { + "step": 3350, + "loss": 8.1471, + "learning_rate": 2.09e-05, + "inf_nan_count": 0 + }, + { + "step": 3375, + "loss": 8.0908, + "learning_rate": 2.11e-05, + "inf_nan_count": 0 + }, + { + "step": 3400, + "loss": 8.1165, + "learning_rate": 2.13e-05, + "inf_nan_count": 0 + }, + { + "step": 3425, + "loss": 8.0957, + "learning_rate": 2.14e-05, + "inf_nan_count": 0 + }, + { + "step": 3450, + "loss": 8.1115, + "learning_rate": 2.16e-05, + "inf_nan_count": 0 + }, + { + "step": 3475, + "loss": 8.0623, + "learning_rate": 2.17e-05, + "inf_nan_count": 0 + }, + { + "step": 3500, + "loss": 8.0527, + "learning_rate": 2.19e-05, + "inf_nan_count": 0 + }, + { + "step": 3525, + "loss": 7.9975, + "learning_rate": 2.2e-05, + "inf_nan_count": 0 + }, + { + "step": 3550, + "loss": 7.9881, + "learning_rate": 2.22e-05, + "inf_nan_count": 0 + }, + { + "step": 3575, + "loss": 8.006, + "learning_rate": 2.23e-05, + "inf_nan_count": 0 + }, + { + "step": 3600, + "loss": 7.9366, + "learning_rate": 2.25e-05, + "inf_nan_count": 0 + }, + { + "step": 3625, + "loss": 8.0252, + "learning_rate": 2.27e-05, + "inf_nan_count": 0 + }, + { + "step": 3650, + "loss": 7.916, + "learning_rate": 2.28e-05, + "inf_nan_count": 0 + }, + { + "step": 3675, + "loss": 7.947, + "learning_rate": 2.3e-05, + "inf_nan_count": 0 + }, + { + "step": 3700, + "loss": 7.8943, + "learning_rate": 2.31e-05, + "inf_nan_count": 0 + }, + { + "step": 3725, + "loss": 7.8951, + "learning_rate": 2.33e-05, + "inf_nan_count": 0 + }, + { + "step": 3750, + "loss": 7.9316, + "learning_rate": 2.34e-05, + "inf_nan_count": 0 + }, + { + "step": 3775, + "loss": 7.9407, + "learning_rate": 2.36e-05, + "inf_nan_count": 0 + }, + { + "step": 3800, + "loss": 7.9385, + "learning_rate": 2.38e-05, + "inf_nan_count": 0 + }, + { + "step": 3825, + "loss": 7.88, + "learning_rate": 2.39e-05, + "inf_nan_count": 0 + }, + { + "step": 3850, + "loss": 7.9207, + "learning_rate": 2.41e-05, + "inf_nan_count": 0 + }, + { + "step": 3875, + "loss": 7.8258, + "learning_rate": 2.42e-05, + "inf_nan_count": 0 + }, + { + "step": 3900, + "loss": 7.9005, + "learning_rate": 2.44e-05, + "inf_nan_count": 0 + }, + { + "step": 3925, + "loss": 7.8232, + "learning_rate": 2.45e-05, + "inf_nan_count": 0 + }, + { + "step": 3950, + "loss": 7.7847, + "learning_rate": 2.47e-05, + "inf_nan_count": 0 + }, + { + "step": 3975, + "loss": 7.7909, + "learning_rate": 2.48e-05, + "inf_nan_count": 0 + }, + { + "step": 4000, + "loss": 7.7419, + "learning_rate": 2.5e-05, + "inf_nan_count": 0 + }, + { + "step": 4025, + "loss": 7.8031, + "learning_rate": 2.52e-05, + "inf_nan_count": 0 + }, + { + "step": 4050, + "loss": 7.7948, + "learning_rate": 2.53e-05, + "inf_nan_count": 0 + }, + { + "step": 4075, + "loss": 7.7259, + "learning_rate": 2.55e-05, + "inf_nan_count": 0 + }, + { + "step": 4100, + "loss": 7.8406, + "learning_rate": 2.56e-05, + "inf_nan_count": 0 + }, + { + "step": 4125, + "loss": 7.7938, + "learning_rate": 2.58e-05, + "inf_nan_count": 0 + }, + { + "step": 4150, + "loss": 7.7101, + "learning_rate": 2.59e-05, + "inf_nan_count": 0 + }, + { + "step": 4175, + "loss": 7.6633, + "learning_rate": 2.61e-05, + "inf_nan_count": 0 + }, + { + "step": 4200, + "loss": 7.683, + "learning_rate": 2.63e-05, + "inf_nan_count": 0 + }, + { + "step": 4225, + "loss": 7.7106, + "learning_rate": 2.64e-05, + "inf_nan_count": 0 + }, + { + "step": 4250, + "loss": 7.7174, + "learning_rate": 2.66e-05, + "inf_nan_count": 0 + }, + { + "step": 4275, + "loss": 7.7508, + "learning_rate": 2.67e-05, + "inf_nan_count": 0 + }, + { + "step": 4300, + "loss": 7.6831, + "learning_rate": 2.69e-05, + "inf_nan_count": 0 + }, + { + "step": 4325, + "loss": 7.6498, + "learning_rate": 2.7e-05, + "inf_nan_count": 0 + }, + { + "step": 4350, + "loss": 7.6668, + "learning_rate": 2.72e-05, + "inf_nan_count": 0 + }, + { + "step": 4375, + "loss": 7.6852, + "learning_rate": 2.73e-05, + "inf_nan_count": 0 + }, + { + "step": 4400, + "loss": 7.6469, + "learning_rate": 2.75e-05, + "inf_nan_count": 0 + }, + { + "step": 4425, + "loss": 7.7448, + "learning_rate": 2.77e-05, + "inf_nan_count": 0 + }, + { + "step": 4450, + "loss": 7.7422, + "learning_rate": 2.78e-05, + "inf_nan_count": 0 + }, + { + "step": 4475, + "loss": 7.6918, + "learning_rate": 2.8e-05, + "inf_nan_count": 0 + }, + { + "step": 4500, + "loss": 7.7084, + "learning_rate": 2.81e-05, + "inf_nan_count": 0 + }, + { + "step": 4525, + "loss": 7.722, + "learning_rate": 2.83e-05, + "inf_nan_count": 0 + }, + { + "step": 4550, + "loss": 7.6893, + "learning_rate": 2.84e-05, + "inf_nan_count": 0 + }, + { + "step": 4575, + "loss": 7.6454, + "learning_rate": 2.86e-05, + "inf_nan_count": 0 + }, + { + "step": 4600, + "loss": 7.6298, + "learning_rate": 2.87e-05, + "inf_nan_count": 0 + }, + { + "step": 4625, + "loss": 7.642, + "learning_rate": 2.89e-05, + "inf_nan_count": 0 + }, + { + "step": 4650, + "loss": 7.6247, + "learning_rate": 2.91e-05, + "inf_nan_count": 0 + }, + { + "step": 4675, + "loss": 7.6448, + "learning_rate": 2.92e-05, + "inf_nan_count": 0 + }, + { + "step": 4700, + "loss": 7.6506, + "learning_rate": 2.94e-05, + "inf_nan_count": 0 + }, + { + "step": 4725, + "loss": 7.6356, + "learning_rate": 2.95e-05, + "inf_nan_count": 0 + }, + { + "step": 4750, + "loss": 7.6426, + "learning_rate": 2.97e-05, + "inf_nan_count": 0 + }, + { + "step": 4775, + "loss": 7.6388, + "learning_rate": 2.98e-05, + "inf_nan_count": 0 + }, + { + "step": 4800, + "loss": 7.5216, + "learning_rate": 3e-05, + "inf_nan_count": 0 + }, + { + "step": 4825, + "loss": 7.5367, + "learning_rate": 3.02e-05, + "inf_nan_count": 0 + }, + { + "step": 4850, + "loss": 7.5084, + "learning_rate": 3.03e-05, + "inf_nan_count": 0 + }, + { + "step": 4875, + "loss": 7.6092, + "learning_rate": 3.05e-05, + "inf_nan_count": 0 + }, + { + "step": 4900, + "loss": 7.576, + "learning_rate": 3.06e-05, + "inf_nan_count": 0 + }, + { + "step": 4925, + "loss": 7.5686, + "learning_rate": 3.08e-05, + "inf_nan_count": 0 + }, + { + "step": 4950, + "loss": 7.5583, + "learning_rate": 3.09e-05, + "inf_nan_count": 0 + }, + { + "step": 4975, + "loss": 7.5818, + "learning_rate": 3.11e-05, + "inf_nan_count": 0 + }, + { + "step": 5000, + "loss": 7.6004, + "learning_rate": 3.13e-05, + "inf_nan_count": 0 + }, + { + "step": 5025, + "loss": 7.5371, + "learning_rate": 3.14e-05, + "inf_nan_count": 0 + }, + { + "step": 5050, + "loss": 7.5179, + "learning_rate": 3.16e-05, + "inf_nan_count": 0 + }, + { + "step": 5075, + "loss": 7.5255, + "learning_rate": 3.17e-05, + "inf_nan_count": 0 + }, + { + "step": 5100, + "loss": 7.5155, + "learning_rate": 3.19e-05, + "inf_nan_count": 0 + }, + { + "step": 5125, + "loss": 7.566, + "learning_rate": 3.2e-05, + "inf_nan_count": 0 + }, + { + "step": 5150, + "loss": 7.4797, + "learning_rate": 3.22e-05, + "inf_nan_count": 0 + }, + { + "step": 5175, + "loss": 7.6224, + "learning_rate": 3.23e-05, + "inf_nan_count": 0 + }, + { + "step": 5200, + "loss": 7.4821, + "learning_rate": 3.25e-05, + "inf_nan_count": 0 + }, + { + "step": 5225, + "loss": 7.4765, + "learning_rate": 3.27e-05, + "inf_nan_count": 0 + }, + { + "step": 5250, + "loss": 7.468, + "learning_rate": 3.28e-05, + "inf_nan_count": 0 + }, + { + "step": 5275, + "loss": 7.5165, + "learning_rate": 3.3e-05, + "inf_nan_count": 0 + }, + { + "step": 5300, + "loss": 7.5334, + "learning_rate": 3.31e-05, + "inf_nan_count": 0 + }, + { + "step": 5325, + "loss": 7.5053, + "learning_rate": 3.33e-05, + "inf_nan_count": 0 + }, + { + "step": 5350, + "loss": 7.5115, + "learning_rate": 3.34e-05, + "inf_nan_count": 0 + }, + { + "step": 5375, + "loss": 7.4736, + "learning_rate": 3.36e-05, + "inf_nan_count": 0 + }, + { + "step": 5400, + "loss": 7.452, + "learning_rate": 3.38e-05, + "inf_nan_count": 0 + }, + { + "step": 5425, + "loss": 7.4596, + "learning_rate": 3.39e-05, + "inf_nan_count": 0 + }, + { + "step": 5450, + "loss": 7.4518, + "learning_rate": 3.41e-05, + "inf_nan_count": 0 + }, + { + "step": 5475, + "loss": 7.4308, + "learning_rate": 3.42e-05, + "inf_nan_count": 0 + }, + { + "step": 5500, + "loss": 7.4627, + "learning_rate": 3.44e-05, + "inf_nan_count": 0 + }, + { + "step": 5525, + "loss": 7.4095, + "learning_rate": 3.45e-05, + "inf_nan_count": 0 + }, + { + "step": 5550, + "loss": 7.4423, + "learning_rate": 3.47e-05, + "inf_nan_count": 0 + }, + { + "step": 5575, + "loss": 7.46, + "learning_rate": 3.48e-05, + "inf_nan_count": 0 + }, + { + "step": 5600, + "loss": 7.3457, + "learning_rate": 3.5e-05, + "inf_nan_count": 0 + }, + { + "step": 5625, + "loss": 7.4838, + "learning_rate": 3.52e-05, + "inf_nan_count": 0 + }, + { + "step": 5650, + "loss": 7.4556, + "learning_rate": 3.53e-05, + "inf_nan_count": 0 + }, + { + "step": 5675, + "loss": 7.422, + "learning_rate": 3.55e-05, + "inf_nan_count": 0 + }, + { + "step": 5700, + "loss": 7.4307, + "learning_rate": 3.56e-05, + "inf_nan_count": 0 + }, + { + "step": 5725, + "loss": 7.3795, + "learning_rate": 3.58e-05, + "inf_nan_count": 0 + }, + { + "step": 5750, + "loss": 7.3855, + "learning_rate": 3.59e-05, + "inf_nan_count": 0 + }, + { + "step": 5775, + "loss": 7.3518, + "learning_rate": 3.61e-05, + "inf_nan_count": 0 + }, + { + "step": 5800, + "loss": 7.3794, + "learning_rate": 3.63e-05, + "inf_nan_count": 0 + }, + { + "step": 5825, + "loss": 7.3591, + "learning_rate": 3.64e-05, + "inf_nan_count": 0 + }, + { + "step": 5850, + "loss": 7.3489, + "learning_rate": 3.66e-05, + "inf_nan_count": 0 + }, + { + "step": 5875, + "loss": 7.4108, + "learning_rate": 3.67e-05, + "inf_nan_count": 0 + }, + { + "step": 5900, + "loss": 7.358, + "learning_rate": 3.69e-05, + "inf_nan_count": 0 + }, + { + "step": 5925, + "loss": 7.3131, + "learning_rate": 3.7e-05, + "inf_nan_count": 0 + }, + { + "step": 5950, + "loss": 7.2905, + "learning_rate": 3.72e-05, + "inf_nan_count": 0 + }, + { + "step": 5975, + "loss": 7.3466, + "learning_rate": 3.73e-05, + "inf_nan_count": 0 + }, + { + "step": 6000, + "loss": 7.3765, + "learning_rate": 3.75e-05, + "inf_nan_count": 0 + }, + { + "step": 6025, + "loss": 7.287, + "learning_rate": 3.77e-05, + "inf_nan_count": 0 + }, + { + "step": 6050, + "loss": 7.3333, + "learning_rate": 3.78e-05, + "inf_nan_count": 0 + }, + { + "step": 6075, + "loss": 7.3098, + "learning_rate": 3.8e-05, + "inf_nan_count": 0 + }, + { + "step": 6100, + "loss": 7.2594, + "learning_rate": 3.81e-05, + "inf_nan_count": 0 + }, + { + "step": 6125, + "loss": 7.3327, + "learning_rate": 3.83e-05, + "inf_nan_count": 0 + }, + { + "step": 6150, + "loss": 7.303, + "learning_rate": 3.84e-05, + "inf_nan_count": 0 + }, + { + "step": 6175, + "loss": 7.2523, + "learning_rate": 3.86e-05, + "inf_nan_count": 0 + }, + { + "step": 6200, + "loss": 7.2546, + "learning_rate": 3.87e-05, + "inf_nan_count": 0 + }, + { + "step": 6225, + "loss": 7.3242, + "learning_rate": 3.89e-05, + "inf_nan_count": 0 + }, + { + "step": 6250, + "loss": 7.2035, + "learning_rate": 3.91e-05, + "inf_nan_count": 0 + }, + { + "step": 6275, + "loss": 7.2334, + "learning_rate": 3.92e-05, + "inf_nan_count": 0 + }, + { + "step": 6300, + "loss": 7.2295, + "learning_rate": 3.94e-05, + "inf_nan_count": 0 + }, + { + "step": 6325, + "loss": 7.3051, + "learning_rate": 3.95e-05, + "inf_nan_count": 0 + }, + { + "step": 6350, + "loss": 7.3188, + "learning_rate": 3.97e-05, + "inf_nan_count": 0 + }, + { + "step": 6375, + "loss": 7.3212, + "learning_rate": 3.98e-05, + "inf_nan_count": 0 + }, + { + "step": 6400, + "loss": 7.2465, + "learning_rate": 4e-05, + "inf_nan_count": 0 + }, + { + "step": 6425, + "loss": 7.2081, + "learning_rate": 4.02e-05, + "inf_nan_count": 0 + }, + { + "step": 6450, + "loss": 7.2852, + "learning_rate": 4.03e-05, + "inf_nan_count": 0 + }, + { + "step": 6475, + "loss": 7.2074, + "learning_rate": 4.05e-05, + "inf_nan_count": 0 + }, + { + "step": 6500, + "loss": 7.252, + "learning_rate": 4.06e-05, + "inf_nan_count": 0 + }, + { + "step": 6525, + "loss": 7.2115, + "learning_rate": 4.08e-05, + "inf_nan_count": 0 + }, + { + "step": 6550, + "loss": 7.2435, + "learning_rate": 4.09e-05, + "inf_nan_count": 0 + }, + { + "step": 6575, + "loss": 7.1962, + "learning_rate": 4.11e-05, + "inf_nan_count": 0 + }, + { + "step": 6600, + "loss": 7.1631, + "learning_rate": 4.12e-05, + "inf_nan_count": 0 + }, + { + "step": 6625, + "loss": 7.2525, + "learning_rate": 4.14e-05, + "inf_nan_count": 0 + }, + { + "step": 6650, + "loss": 7.2133, + "learning_rate": 4.16e-05, + "inf_nan_count": 0 + }, + { + "step": 6675, + "loss": 7.2248, + "learning_rate": 4.17e-05, + "inf_nan_count": 0 + }, + { + "step": 6700, + "loss": 7.1928, + "learning_rate": 4.19e-05, + "inf_nan_count": 0 + }, + { + "step": 6725, + "loss": 7.1698, + "learning_rate": 4.2e-05, + "inf_nan_count": 0 + }, + { + "step": 6750, + "loss": 7.3037, + "learning_rate": 4.22e-05, + "inf_nan_count": 0 + }, + { + "step": 6775, + "loss": 7.2451, + "learning_rate": 4.23e-05, + "inf_nan_count": 0 + }, + { + "step": 6800, + "loss": 7.1373, + "learning_rate": 4.25e-05, + "inf_nan_count": 0 + }, + { + "step": 6825, + "loss": 7.139, + "learning_rate": 4.27e-05, + "inf_nan_count": 0 + }, + { + "step": 6850, + "loss": 7.1296, + "learning_rate": 4.28e-05, + "inf_nan_count": 0 + }, + { + "step": 6875, + "loss": 7.0961, + "learning_rate": 4.3e-05, + "inf_nan_count": 0 + }, + { + "step": 6900, + "loss": 7.1408, + "learning_rate": 4.31e-05, + "inf_nan_count": 0 + }, + { + "step": 6925, + "loss": 7.1852, + "learning_rate": 4.33e-05, + "inf_nan_count": 0 + }, + { + "step": 6950, + "loss": 7.2067, + "learning_rate": 4.34e-05, + "inf_nan_count": 0 + }, + { + "step": 6975, + "loss": 7.0681, + "learning_rate": 4.36e-05, + "inf_nan_count": 0 + }, + { + "step": 7000, + "loss": 7.1813, + "learning_rate": 4.37e-05, + "inf_nan_count": 0 + }, + { + "step": 7025, + "loss": 7.1992, + "learning_rate": 4.39e-05, + "inf_nan_count": 0 + }, + { + "step": 7050, + "loss": 7.1409, + "learning_rate": 4.41e-05, + "inf_nan_count": 0 + }, + { + "step": 7075, + "loss": 7.1271, + "learning_rate": 4.42e-05, + "inf_nan_count": 0 + }, + { + "step": 7100, + "loss": 7.172, + "learning_rate": 4.44e-05, + "inf_nan_count": 0 + }, + { + "step": 7125, + "loss": 7.1515, + "learning_rate": 4.45e-05, + "inf_nan_count": 0 + }, + { + "step": 7150, + "loss": 7.0898, + "learning_rate": 4.47e-05, + "inf_nan_count": 0 + }, + { + "step": 7175, + "loss": 7.0996, + "learning_rate": 4.48e-05, + "inf_nan_count": 0 + }, + { + "step": 7200, + "loss": 7.061, + "learning_rate": 4.5e-05, + "inf_nan_count": 0 + }, + { + "step": 7225, + "loss": 7.1939, + "learning_rate": 4.52e-05, + "inf_nan_count": 0 + }, + { + "step": 7250, + "loss": 7.0355, + "learning_rate": 4.53e-05, + "inf_nan_count": 0 + }, + { + "step": 7275, + "loss": 7.0935, + "learning_rate": 4.55e-05, + "inf_nan_count": 0 + }, + { + "step": 7300, + "loss": 7.0689, + "learning_rate": 4.56e-05, + "inf_nan_count": 0 + }, + { + "step": 7325, + "loss": 7.0265, + "learning_rate": 4.58e-05, + "inf_nan_count": 0 + }, + { + "step": 7350, + "loss": 7.0963, + "learning_rate": 4.59e-05, + "inf_nan_count": 0 + }, + { + "step": 7375, + "loss": 7.1138, + "learning_rate": 4.61e-05, + "inf_nan_count": 0 + }, + { + "step": 7400, + "loss": 7.0414, + "learning_rate": 4.63e-05, + "inf_nan_count": 0 + }, + { + "step": 7425, + "loss": 7.0753, + "learning_rate": 4.64e-05, + "inf_nan_count": 0 + }, + { + "step": 7450, + "loss": 7.0603, + "learning_rate": 4.66e-05, + "inf_nan_count": 0 + }, + { + "step": 7475, + "loss": 7.0818, + "learning_rate": 4.67e-05, + "inf_nan_count": 0 + }, + { + "step": 7500, + "loss": 7.0788, + "learning_rate": 4.69e-05, + "inf_nan_count": 0 + }, + { + "step": 7525, + "loss": 6.9952, + "learning_rate": 4.7e-05, + "inf_nan_count": 0 + }, + { + "step": 7550, + "loss": 7.0114, + "learning_rate": 4.72e-05, + "inf_nan_count": 0 + }, + { + "step": 7575, + "loss": 7.0611, + "learning_rate": 4.73e-05, + "inf_nan_count": 0 + }, + { + "step": 7600, + "loss": 7.0057, + "learning_rate": 4.75e-05, + "inf_nan_count": 0 + }, + { + "step": 7625, + "loss": 7.0182, + "learning_rate": 4.77e-05, + "inf_nan_count": 0 + }, + { + "step": 7650, + "loss": 7.0271, + "learning_rate": 4.78e-05, + "inf_nan_count": 0 + }, + { + "step": 7675, + "loss": 7.0817, + "learning_rate": 4.8e-05, + "inf_nan_count": 0 + }, + { + "step": 7700, + "loss": 7.0859, + "learning_rate": 4.81e-05, + "inf_nan_count": 0 + }, + { + "step": 7725, + "loss": 6.9859, + "learning_rate": 4.83e-05, + "inf_nan_count": 0 + }, + { + "step": 7750, + "loss": 7.038, + "learning_rate": 4.84e-05, + "inf_nan_count": 0 + }, + { + "step": 7775, + "loss": 6.9784, + "learning_rate": 4.86e-05, + "inf_nan_count": 0 + }, + { + "step": 7800, + "loss": 7.0304, + "learning_rate": 4.87e-05, + "inf_nan_count": 0 + }, + { + "step": 7825, + "loss": 7.0, + "learning_rate": 4.89e-05, + "inf_nan_count": 0 + }, + { + "step": 7850, + "loss": 7.0159, + "learning_rate": 4.91e-05, + "inf_nan_count": 0 + }, + { + "step": 7875, + "loss": 6.9859, + "learning_rate": 4.92e-05, + "inf_nan_count": 0 + }, + { + "step": 7900, + "loss": 6.9348, + "learning_rate": 4.94e-05, + "inf_nan_count": 0 + }, + { + "step": 7925, + "loss": 6.9541, + "learning_rate": 4.95e-05, + "inf_nan_count": 0 + }, + { + "step": 7950, + "loss": 6.9342, + "learning_rate": 4.97e-05, + "inf_nan_count": 0 + }, + { + "step": 7975, + "loss": 7.0294, + "learning_rate": 4.98e-05, + "inf_nan_count": 0 + }, + { + "step": 8000, + "loss": 7.0412, + "learning_rate": 5e-05, + "inf_nan_count": 0 + }, + { + "step": 8025, + "loss": 6.9111, + "learning_rate": 4.99e-05, + "inf_nan_count": 0 + }, + { + "step": 8050, + "loss": 7.0142, + "learning_rate": 4.98e-05, + "inf_nan_count": 0 + }, + { + "step": 8075, + "loss": 6.9201, + "learning_rate": 4.97e-05, + "inf_nan_count": 0 + }, + { + "step": 8100, + "loss": 6.91, + "learning_rate": 4.96e-05, + "inf_nan_count": 0 + }, + { + "step": 8125, + "loss": 6.9728, + "learning_rate": 4.95e-05, + "inf_nan_count": 0 + }, + { + "step": 8150, + "loss": 6.9963, + "learning_rate": 4.94e-05, + "inf_nan_count": 0 + }, + { + "step": 8175, + "loss": 7.0077, + "learning_rate": 4.93e-05, + "inf_nan_count": 0 + }, + { + "step": 8200, + "loss": 6.8808, + "learning_rate": 4.92e-05, + "inf_nan_count": 0 + }, + { + "step": 8225, + "loss": 6.85, + "learning_rate": 4.91e-05, + "inf_nan_count": 0 + }, + { + "step": 8250, + "loss": 6.9328, + "learning_rate": 4.9e-05, + "inf_nan_count": 0 + }, + { + "step": 8275, + "loss": 6.8971, + "learning_rate": 4.89e-05, + "inf_nan_count": 0 + }, + { + "step": 8300, + "loss": 6.9635, + "learning_rate": 4.87e-05, + "inf_nan_count": 0 + }, + { + "step": 8325, + "loss": 6.8937, + "learning_rate": 4.86e-05, + "inf_nan_count": 0 + }, + { + "step": 8350, + "loss": 6.8578, + "learning_rate": 4.85e-05, + "inf_nan_count": 0 + }, + { + "step": 8375, + "loss": 6.9492, + "learning_rate": 4.84e-05, + "inf_nan_count": 0 + }, + { + "step": 8400, + "loss": 6.8896, + "learning_rate": 4.83e-05, + "inf_nan_count": 0 + }, + { + "step": 8425, + "loss": 6.9677, + "learning_rate": 4.82e-05, + "inf_nan_count": 0 + }, + { + "step": 8450, + "loss": 6.9071, + "learning_rate": 4.81e-05, + "inf_nan_count": 0 + }, + { + "step": 8475, + "loss": 6.8973, + "learning_rate": 4.8e-05, + "inf_nan_count": 0 + }, + { + "step": 8500, + "loss": 6.9139, + "learning_rate": 4.79e-05, + "inf_nan_count": 0 + }, + { + "step": 8525, + "loss": 6.8983, + "learning_rate": 4.78e-05, + "inf_nan_count": 0 + }, + { + "step": 8550, + "loss": 6.8446, + "learning_rate": 4.77e-05, + "inf_nan_count": 0 + }, + { + "step": 8575, + "loss": 6.8246, + "learning_rate": 4.76e-05, + "inf_nan_count": 0 + }, + { + "step": 8600, + "loss": 6.9637, + "learning_rate": 4.75e-05, + "inf_nan_count": 0 + }, + { + "step": 8625, + "loss": 6.8827, + "learning_rate": 4.74e-05, + "inf_nan_count": 0 + }, + { + "step": 8650, + "loss": 6.8234, + "learning_rate": 4.73e-05, + "inf_nan_count": 0 + }, + { + "step": 8675, + "loss": 6.827, + "learning_rate": 4.72e-05, + "inf_nan_count": 0 + }, + { + "step": 8700, + "loss": 6.9554, + "learning_rate": 4.71e-05, + "inf_nan_count": 0 + }, + { + "step": 8725, + "loss": 6.8406, + "learning_rate": 4.7e-05, + "inf_nan_count": 0 + }, + { + "step": 8750, + "loss": 6.8328, + "learning_rate": 4.69e-05, + "inf_nan_count": 0 + }, + { + "step": 8775, + "loss": 6.8362, + "learning_rate": 4.68e-05, + "inf_nan_count": 0 + }, + { + "step": 8800, + "loss": 6.8417, + "learning_rate": 4.67e-05, + "inf_nan_count": 0 + }, + { + "step": 8825, + "loss": 6.8248, + "learning_rate": 4.66e-05, + "inf_nan_count": 0 + }, + { + "step": 8850, + "loss": 6.7996, + "learning_rate": 4.65e-05, + "inf_nan_count": 0 + }, + { + "step": 8875, + "loss": 6.7804, + "learning_rate": 4.64e-05, + "inf_nan_count": 0 + }, + { + "step": 8900, + "loss": 6.8802, + "learning_rate": 4.63e-05, + "inf_nan_count": 0 + }, + { + "step": 8925, + "loss": 6.8586, + "learning_rate": 4.61e-05, + "inf_nan_count": 0 + }, + { + "step": 8950, + "loss": 6.8489, + "learning_rate": 4.6e-05, + "inf_nan_count": 0 + }, + { + "step": 8975, + "loss": 6.8592, + "learning_rate": 4.59e-05, + "inf_nan_count": 0 + }, + { + "step": 9000, + "loss": 6.8302, + "learning_rate": 4.58e-05, + "inf_nan_count": 0 + }, + { + "step": 9025, + "loss": 6.831, + "learning_rate": 4.57e-05, + "inf_nan_count": 0 + }, + { + "step": 9050, + "loss": 6.7991, + "learning_rate": 4.56e-05, + "inf_nan_count": 0 + }, + { + "step": 9075, + "loss": 6.8311, + "learning_rate": 4.55e-05, + "inf_nan_count": 0 + }, + { + "step": 9100, + "loss": 6.7647, + "learning_rate": 4.54e-05, + "inf_nan_count": 0 + }, + { + "step": 9125, + "loss": 6.8225, + "learning_rate": 4.53e-05, + "inf_nan_count": 0 + }, + { + "step": 9150, + "loss": 6.7571, + "learning_rate": 4.52e-05, + "inf_nan_count": 0 + }, + { + "step": 9175, + "loss": 6.806, + "learning_rate": 4.51e-05, + "inf_nan_count": 0 + }, + { + "step": 9200, + "loss": 6.8348, + "learning_rate": 4.5e-05, + "inf_nan_count": 0 + }, + { + "step": 9225, + "loss": 6.9131, + "learning_rate": 4.49e-05, + "inf_nan_count": 0 + }, + { + "step": 9250, + "loss": 6.7801, + "learning_rate": 4.48e-05, + "inf_nan_count": 0 + }, + { + "step": 9275, + "loss": 6.7776, + "learning_rate": 4.47e-05, + "inf_nan_count": 0 + }, + { + "step": 9300, + "loss": 6.716, + "learning_rate": 4.46e-05, + "inf_nan_count": 0 + }, + { + "step": 9325, + "loss": 6.8958, + "learning_rate": 4.45e-05, + "inf_nan_count": 0 + }, + { + "step": 9350, + "loss": 6.8734, + "learning_rate": 4.44e-05, + "inf_nan_count": 0 + }, + { + "step": 9375, + "loss": 6.7203, + "learning_rate": 4.43e-05, + "inf_nan_count": 0 + }, + { + "step": 9400, + "loss": 6.7133, + "learning_rate": 4.42e-05, + "inf_nan_count": 0 + }, + { + "step": 9425, + "loss": 6.8392, + "learning_rate": 4.41e-05, + "inf_nan_count": 0 + }, + { + "step": 9450, + "loss": 6.7945, + "learning_rate": 4.4e-05, + "inf_nan_count": 0 + }, + { + "step": 9475, + "loss": 6.7831, + "learning_rate": 4.39e-05, + "inf_nan_count": 0 + }, + { + "step": 9500, + "loss": 6.7336, + "learning_rate": 4.37e-05, + "inf_nan_count": 0 + }, + { + "step": 9525, + "loss": 6.7529, + "learning_rate": 4.36e-05, + "inf_nan_count": 0 + }, + { + "step": 9550, + "loss": 6.6838, + "learning_rate": 4.35e-05, + "inf_nan_count": 0 + }, + { + "step": 9575, + "loss": 6.7548, + "learning_rate": 4.34e-05, + "inf_nan_count": 0 + }, + { + "step": 9600, + "loss": 6.8837, + "learning_rate": 4.33e-05, + "inf_nan_count": 0 + }, + { + "step": 9625, + "loss": 6.8271, + "learning_rate": 4.32e-05, + "inf_nan_count": 0 + }, + { + "step": 9650, + "loss": 6.7446, + "learning_rate": 4.31e-05, + "inf_nan_count": 0 + }, + { + "step": 9675, + "loss": 6.6811, + "learning_rate": 4.3e-05, + "inf_nan_count": 0 + }, + { + "step": 9700, + "loss": 6.7641, + "learning_rate": 4.29e-05, + "inf_nan_count": 0 + }, + { + "step": 9725, + "loss": 6.6779, + "learning_rate": 4.28e-05, + "inf_nan_count": 0 + }, + { + "step": 9750, + "loss": 6.7428, + "learning_rate": 4.27e-05, + "inf_nan_count": 0 + }, + { + "step": 9775, + "loss": 6.7698, + "learning_rate": 4.26e-05, + "inf_nan_count": 0 + }, + { + "step": 9800, + "loss": 6.7282, + "learning_rate": 4.25e-05, + "inf_nan_count": 0 + }, + { + "step": 9825, + "loss": 6.7314, + "learning_rate": 4.24e-05, + "inf_nan_count": 0 + }, + { + "step": 9850, + "loss": 6.7281, + "learning_rate": 4.23e-05, + "inf_nan_count": 0 + }, + { + "step": 9875, + "loss": 6.8553, + "learning_rate": 4.22e-05, + "inf_nan_count": 0 + }, + { + "step": 9900, + "loss": 6.7912, + "learning_rate": 4.21e-05, + "inf_nan_count": 0 + }, + { + "step": 9925, + "loss": 6.7301, + "learning_rate": 4.2e-05, + "inf_nan_count": 0 + }, + { + "step": 9950, + "loss": 6.7467, + "learning_rate": 4.19e-05, + "inf_nan_count": 0 + }, + { + "step": 9975, + "loss": 6.6581, + "learning_rate": 4.18e-05, + "inf_nan_count": 0 + }, + { + "step": 10000, + "loss": 6.7114, + "learning_rate": 4.17e-05, + "inf_nan_count": 0 + }, + { + "step": 10025, + "loss": 6.7754, + "learning_rate": 4.16e-05, + "inf_nan_count": 0 + }, + { + "step": 10050, + "loss": 6.695, + "learning_rate": 4.15e-05, + "inf_nan_count": 0 + }, + { + "step": 10075, + "loss": 6.6791, + "learning_rate": 4.14e-05, + "inf_nan_count": 0 + }, + { + "step": 10100, + "loss": 6.6957, + "learning_rate": 4.12e-05, + "inf_nan_count": 0 + }, + { + "step": 10125, + "loss": 6.7073, + "learning_rate": 4.11e-05, + "inf_nan_count": 0 + }, + { + "step": 10150, + "loss": 6.774, + "learning_rate": 4.1e-05, + "inf_nan_count": 0 + }, + { + "step": 10175, + "loss": 6.8045, + "learning_rate": 4.09e-05, + "inf_nan_count": 0 + }, + { + "step": 10200, + "loss": 6.761, + "learning_rate": 4.08e-05, + "inf_nan_count": 0 + }, + { + "step": 10225, + "loss": 6.6995, + "learning_rate": 4.07e-05, + "inf_nan_count": 0 + }, + { + "step": 10250, + "loss": 6.6779, + "learning_rate": 4.06e-05, + "inf_nan_count": 0 + }, + { + "step": 10275, + "loss": 6.7462, + "learning_rate": 4.05e-05, + "inf_nan_count": 0 + }, + { + "step": 10300, + "loss": 6.7099, + "learning_rate": 4.04e-05, + "inf_nan_count": 0 + }, + { + "step": 10325, + "loss": 6.7013, + "learning_rate": 4.03e-05, + "inf_nan_count": 0 + }, + { + "step": 10350, + "loss": 6.7173, + "learning_rate": 4.02e-05, + "inf_nan_count": 0 + }, + { + "step": 10375, + "loss": 6.6967, + "learning_rate": 4.01e-05, + "inf_nan_count": 0 + }, + { + "step": 10400, + "loss": 6.7565, + "learning_rate": 4e-05, + "inf_nan_count": 0 + }, + { + "step": 10425, + "loss": 6.7468, + "learning_rate": 3.99e-05, + "inf_nan_count": 0 + }, + { + "step": 10450, + "loss": 6.7132, + "learning_rate": 3.98e-05, + "inf_nan_count": 0 + }, + { + "step": 10475, + "loss": 6.6358, + "learning_rate": 3.97e-05, + "inf_nan_count": 0 + }, + { + "step": 10500, + "loss": 6.6979, + "learning_rate": 3.96e-05, + "inf_nan_count": 0 + }, + { + "step": 10525, + "loss": 6.6512, + "learning_rate": 3.95e-05, + "inf_nan_count": 0 + }, + { + "step": 10550, + "loss": 6.6045, + "learning_rate": 3.94e-05, + "inf_nan_count": 0 + }, + { + "step": 10575, + "loss": 6.6217, + "learning_rate": 3.93e-05, + "inf_nan_count": 0 + }, + { + "step": 10600, + "loss": 6.7091, + "learning_rate": 3.92e-05, + "inf_nan_count": 0 + }, + { + "step": 10625, + "loss": 6.618, + "learning_rate": 3.91e-05, + "inf_nan_count": 0 + }, + { + "step": 10650, + "loss": 6.6743, + "learning_rate": 3.9e-05, + "inf_nan_count": 0 + }, + { + "step": 10675, + "loss": 6.6481, + "learning_rate": 3.89e-05, + "inf_nan_count": 0 + }, + { + "step": 10700, + "loss": 6.6888, + "learning_rate": 3.87e-05, + "inf_nan_count": 0 + }, + { + "step": 10725, + "loss": 6.5786, + "learning_rate": 3.86e-05, + "inf_nan_count": 0 + }, + { + "step": 10750, + "loss": 6.6917, + "learning_rate": 3.85e-05, + "inf_nan_count": 0 + }, + { + "step": 10775, + "loss": 6.6487, + "learning_rate": 3.84e-05, + "inf_nan_count": 0 + }, + { + "step": 10800, + "loss": 6.7293, + "learning_rate": 3.83e-05, + "inf_nan_count": 0 + }, + { + "step": 10825, + "loss": 6.6369, + "learning_rate": 3.82e-05, + "inf_nan_count": 0 + }, + { + "step": 10850, + "loss": 6.7118, + "learning_rate": 3.81e-05, + "inf_nan_count": 0 + }, + { + "step": 10875, + "loss": 6.7235, + "learning_rate": 3.8e-05, + "inf_nan_count": 0 + }, + { + "step": 10900, + "loss": 6.6963, + "learning_rate": 3.79e-05, + "inf_nan_count": 0 + }, + { + "step": 10925, + "loss": 6.6791, + "learning_rate": 3.78e-05, + "inf_nan_count": 0 + }, + { + "step": 10950, + "loss": 6.6773, + "learning_rate": 3.77e-05, + "inf_nan_count": 0 + }, + { + "step": 10975, + "loss": 6.6819, + "learning_rate": 3.76e-05, + "inf_nan_count": 0 + }, + { + "step": 11000, + "loss": 6.6167, + "learning_rate": 3.75e-05, + "inf_nan_count": 0 + }, + { + "step": 11025, + "loss": 6.6727, + "learning_rate": 3.74e-05, + "inf_nan_count": 0 + }, + { + "step": 11050, + "loss": 6.6317, + "learning_rate": 3.73e-05, + "inf_nan_count": 0 + }, + { + "step": 11075, + "loss": 6.6432, + "learning_rate": 3.72e-05, + "inf_nan_count": 0 + }, + { + "step": 11100, + "loss": 6.6468, + "learning_rate": 3.71e-05, + "inf_nan_count": 0 + }, + { + "step": 11125, + "loss": 6.646, + "learning_rate": 3.7e-05, + "inf_nan_count": 0 + }, + { + "step": 11150, + "loss": 6.6852, + "learning_rate": 3.69e-05, + "inf_nan_count": 0 + }, + { + "step": 11175, + "loss": 6.5716, + "learning_rate": 3.68e-05, + "inf_nan_count": 0 + }, + { + "step": 11200, + "loss": 6.6311, + "learning_rate": 3.67e-05, + "inf_nan_count": 0 + }, + { + "step": 11225, + "loss": 6.648, + "learning_rate": 3.66e-05, + "inf_nan_count": 0 + }, + { + "step": 11250, + "loss": 6.6204, + "learning_rate": 3.65e-05, + "inf_nan_count": 0 + }, + { + "step": 11275, + "loss": 6.6551, + "learning_rate": 3.64e-05, + "inf_nan_count": 0 + }, + { + "step": 11300, + "loss": 6.6013, + "learning_rate": 3.63e-05, + "inf_nan_count": 0 + }, + { + "step": 11325, + "loss": 6.6478, + "learning_rate": 3.61e-05, + "inf_nan_count": 0 + }, + { + "step": 11350, + "loss": 6.6938, + "learning_rate": 3.6e-05, + "inf_nan_count": 0 + }, + { + "step": 11375, + "loss": 6.6124, + "learning_rate": 3.59e-05, + "inf_nan_count": 0 + }, + { + "step": 11400, + "loss": 6.6781, + "learning_rate": 3.58e-05, + "inf_nan_count": 0 + }, + { + "step": 11425, + "loss": 6.6317, + "learning_rate": 3.57e-05, + "inf_nan_count": 0 + }, + { + "step": 11450, + "loss": 6.6195, + "learning_rate": 3.56e-05, + "inf_nan_count": 0 + }, + { + "step": 11475, + "loss": 6.5941, + "learning_rate": 3.55e-05, + "inf_nan_count": 0 + }, + { + "step": 11500, + "loss": 6.5808, + "learning_rate": 3.54e-05, + "inf_nan_count": 0 + }, + { + "step": 11525, + "loss": 6.6322, + "learning_rate": 3.53e-05, + "inf_nan_count": 0 + }, + { + "step": 11550, + "loss": 6.6172, + "learning_rate": 3.52e-05, + "inf_nan_count": 0 + }, + { + "step": 11575, + "loss": 6.649, + "learning_rate": 3.51e-05, + "inf_nan_count": 0 + }, + { + "step": 11600, + "loss": 6.605, + "learning_rate": 3.5e-05, + "inf_nan_count": 0 + }, + { + "step": 11625, + "loss": 6.6184, + "learning_rate": 3.49e-05, + "inf_nan_count": 0 + }, + { + "step": 11650, + "loss": 6.5597, + "learning_rate": 3.48e-05, + "inf_nan_count": 0 + }, + { + "step": 11675, + "loss": 6.6285, + "learning_rate": 3.47e-05, + "inf_nan_count": 0 + }, + { + "step": 11700, + "loss": 6.5209, + "learning_rate": 3.46e-05, + "inf_nan_count": 0 + }, + { + "step": 11725, + "loss": 6.5505, + "learning_rate": 3.45e-05, + "inf_nan_count": 0 + }, + { + "step": 11750, + "loss": 6.671, + "learning_rate": 3.44e-05, + "inf_nan_count": 0 + }, + { + "step": 11775, + "loss": 6.6403, + "learning_rate": 3.43e-05, + "inf_nan_count": 0 + }, + { + "step": 11800, + "loss": 6.5738, + "learning_rate": 3.42e-05, + "inf_nan_count": 0 + }, + { + "step": 11825, + "loss": 6.608, + "learning_rate": 3.41e-05, + "inf_nan_count": 0 + }, + { + "step": 11850, + "loss": 6.6406, + "learning_rate": 3.4e-05, + "inf_nan_count": 0 + }, + { + "step": 11875, + "loss": 6.6299, + "learning_rate": 3.39e-05, + "inf_nan_count": 0 + }, + { + "step": 11900, + "loss": 6.5781, + "learning_rate": 3.38e-05, + "inf_nan_count": 0 + }, + { + "step": 11925, + "loss": 6.5003, + "learning_rate": 3.36e-05, + "inf_nan_count": 0 + }, + { + "step": 11950, + "loss": 6.635, + "learning_rate": 3.35e-05, + "inf_nan_count": 0 + }, + { + "step": 11975, + "loss": 6.618, + "learning_rate": 3.34e-05, + "inf_nan_count": 0 + }, + { + "step": 12000, + "loss": 6.6603, + "learning_rate": 3.33e-05, + "inf_nan_count": 0 + }, + { + "step": 12025, + "loss": 6.5507, + "learning_rate": 3.32e-05, + "inf_nan_count": 0 + }, + { + "step": 12050, + "loss": 6.5878, + "learning_rate": 3.31e-05, + "inf_nan_count": 0 + }, + { + "step": 12075, + "loss": 6.5245, + "learning_rate": 3.3e-05, + "inf_nan_count": 0 + }, + { + "step": 12100, + "loss": 6.5629, + "learning_rate": 3.29e-05, + "inf_nan_count": 0 + }, + { + "step": 12125, + "loss": 6.6181, + "learning_rate": 3.28e-05, + "inf_nan_count": 0 + }, + { + "step": 12150, + "loss": 6.578, + "learning_rate": 3.27e-05, + "inf_nan_count": 0 + }, + { + "step": 12175, + "loss": 6.5753, + "learning_rate": 3.26e-05, + "inf_nan_count": 0 + }, + { + "step": 12200, + "loss": 6.6071, + "learning_rate": 3.25e-05, + "inf_nan_count": 0 + }, + { + "step": 12225, + "loss": 6.5885, + "learning_rate": 3.24e-05, + "inf_nan_count": 0 + }, + { + "step": 12250, + "loss": 6.5413, + "learning_rate": 3.23e-05, + "inf_nan_count": 0 + }, + { + "step": 12275, + "loss": 6.6635, + "learning_rate": 3.22e-05, + "inf_nan_count": 0 + }, + { + "step": 12300, + "loss": 6.6304, + "learning_rate": 3.21e-05, + "inf_nan_count": 0 + }, + { + "step": 12325, + "loss": 6.5078, + "learning_rate": 3.2e-05, + "inf_nan_count": 0 + }, + { + "step": 12350, + "loss": 6.5712, + "learning_rate": 3.19e-05, + "inf_nan_count": 0 + }, + { + "step": 12375, + "loss": 6.6284, + "learning_rate": 3.18e-05, + "inf_nan_count": 0 + }, + { + "step": 12400, + "loss": 6.5837, + "learning_rate": 3.17e-05, + "inf_nan_count": 0 + }, + { + "step": 12425, + "loss": 6.5354, + "learning_rate": 3.16e-05, + "inf_nan_count": 0 + }, + { + "step": 12450, + "loss": 6.6125, + "learning_rate": 3.15e-05, + "inf_nan_count": 0 + }, + { + "step": 12475, + "loss": 6.5477, + "learning_rate": 3.14e-05, + "inf_nan_count": 0 + }, + { + "step": 12500, + "loss": 6.5827, + "learning_rate": 3.13e-05, + "inf_nan_count": 0 + }, + { + "step": 12525, + "loss": 6.5874, + "learning_rate": 3.11e-05, + "inf_nan_count": 0 + }, + { + "step": 12550, + "loss": 6.5437, + "learning_rate": 3.1e-05, + "inf_nan_count": 0 + }, + { + "step": 12575, + "loss": 6.582, + "learning_rate": 3.09e-05, + "inf_nan_count": 0 + }, + { + "step": 12600, + "loss": 6.5286, + "learning_rate": 3.08e-05, + "inf_nan_count": 0 + }, + { + "step": 12625, + "loss": 6.5144, + "learning_rate": 3.07e-05, + "inf_nan_count": 0 + }, + { + "step": 12650, + "loss": 6.5327, + "learning_rate": 3.06e-05, + "inf_nan_count": 0 + }, + { + "step": 12675, + "loss": 6.6058, + "learning_rate": 3.05e-05, + "inf_nan_count": 0 + }, + { + "step": 12700, + "loss": 6.5626, + "learning_rate": 3.04e-05, + "inf_nan_count": 0 + }, + { + "step": 12725, + "loss": 6.4589, + "learning_rate": 3.03e-05, + "inf_nan_count": 0 + }, + { + "step": 12750, + "loss": 6.5629, + "learning_rate": 3.02e-05, + "inf_nan_count": 0 + }, + { + "step": 12775, + "loss": 6.4815, + "learning_rate": 3.01e-05, + "inf_nan_count": 0 + }, + { + "step": 12800, + "loss": 6.5651, + "learning_rate": 3e-05, + "inf_nan_count": 0 + }, + { + "step": 12825, + "loss": 6.6164, + "learning_rate": 2.99e-05, + "inf_nan_count": 0 + }, + { + "step": 12850, + "loss": 6.6102, + "learning_rate": 2.98e-05, + "inf_nan_count": 0 + }, + { + "step": 12875, + "loss": 6.4871, + "learning_rate": 2.97e-05, + "inf_nan_count": 0 + }, + { + "step": 12900, + "loss": 6.49, + "learning_rate": 2.96e-05, + "inf_nan_count": 0 + }, + { + "step": 12925, + "loss": 6.6028, + "learning_rate": 2.95e-05, + "inf_nan_count": 0 + }, + { + "step": 12950, + "loss": 6.5509, + "learning_rate": 2.94e-05, + "inf_nan_count": 0 + }, + { + "step": 12975, + "loss": 6.5454, + "learning_rate": 2.93e-05, + "inf_nan_count": 0 + }, + { + "step": 13000, + "loss": 6.5587, + "learning_rate": 2.92e-05, + "inf_nan_count": 0 + }, + { + "step": 13025, + "loss": 6.5862, + "learning_rate": 2.91e-05, + "inf_nan_count": 0 + }, + { + "step": 13050, + "loss": 6.5668, + "learning_rate": 2.9e-05, + "inf_nan_count": 0 + }, + { + "step": 13075, + "loss": 6.522, + "learning_rate": 2.89e-05, + "inf_nan_count": 0 + }, + { + "step": 13100, + "loss": 6.5044, + "learning_rate": 2.87e-05, + "inf_nan_count": 0 + }, + { + "step": 13125, + "loss": 6.6356, + "learning_rate": 2.86e-05, + "inf_nan_count": 0 + }, + { + "step": 13150, + "loss": 6.4772, + "learning_rate": 2.85e-05, + "inf_nan_count": 0 + }, + { + "step": 13175, + "loss": 6.5504, + "learning_rate": 2.84e-05, + "inf_nan_count": 0 + }, + { + "step": 13200, + "loss": 6.5415, + "learning_rate": 2.83e-05, + "inf_nan_count": 0 + }, + { + "step": 13225, + "loss": 6.4651, + "learning_rate": 2.82e-05, + "inf_nan_count": 0 + }, + { + "step": 13250, + "loss": 6.5536, + "learning_rate": 2.81e-05, + "inf_nan_count": 0 + }, + { + "step": 13275, + "loss": 6.4861, + "learning_rate": 2.8e-05, + "inf_nan_count": 0 + }, + { + "step": 13300, + "loss": 6.4688, + "learning_rate": 2.79e-05, + "inf_nan_count": 0 + }, + { + "step": 13325, + "loss": 6.5549, + "learning_rate": 2.78e-05, + "inf_nan_count": 0 + }, + { + "step": 13350, + "loss": 6.4589, + "learning_rate": 2.77e-05, + "inf_nan_count": 0 + }, + { + "step": 13375, + "loss": 6.4644, + "learning_rate": 2.76e-05, + "inf_nan_count": 0 + }, + { + "step": 13400, + "loss": 6.5937, + "learning_rate": 2.75e-05, + "inf_nan_count": 0 + }, + { + "step": 13425, + "loss": 6.5798, + "learning_rate": 2.74e-05, + "inf_nan_count": 0 + }, + { + "step": 13450, + "loss": 6.4615, + "learning_rate": 2.73e-05, + "inf_nan_count": 0 + }, + { + "step": 13475, + "loss": 6.5173, + "learning_rate": 2.72e-05, + "inf_nan_count": 0 + }, + { + "step": 13500, + "loss": 6.4795, + "learning_rate": 2.71e-05, + "inf_nan_count": 0 + }, + { + "step": 13525, + "loss": 6.4789, + "learning_rate": 2.7e-05, + "inf_nan_count": 0 + }, + { + "step": 13550, + "loss": 6.4835, + "learning_rate": 2.69e-05, + "inf_nan_count": 0 + }, + { + "step": 13575, + "loss": 6.5405, + "learning_rate": 2.68e-05, + "inf_nan_count": 0 + }, + { + "step": 13600, + "loss": 6.4616, + "learning_rate": 2.67e-05, + "inf_nan_count": 0 + }, + { + "step": 13625, + "loss": 6.4578, + "learning_rate": 2.66e-05, + "inf_nan_count": 0 + }, + { + "step": 13650, + "loss": 6.4083, + "learning_rate": 2.65e-05, + "inf_nan_count": 0 + }, + { + "step": 13675, + "loss": 6.561, + "learning_rate": 2.64e-05, + "inf_nan_count": 0 + }, + { + "step": 13700, + "loss": 6.5432, + "learning_rate": 2.63e-05, + "inf_nan_count": 0 + }, + { + "step": 13725, + "loss": 6.5119, + "learning_rate": 2.61e-05, + "inf_nan_count": 0 + }, + { + "step": 13750, + "loss": 6.454, + "learning_rate": 2.6e-05, + "inf_nan_count": 0 + }, + { + "step": 13775, + "loss": 6.44, + "learning_rate": 2.59e-05, + "inf_nan_count": 0 + }, + { + "step": 13800, + "loss": 6.4767, + "learning_rate": 2.58e-05, + "inf_nan_count": 0 + }, + { + "step": 13825, + "loss": 6.4765, + "learning_rate": 2.57e-05, + "inf_nan_count": 0 + }, + { + "step": 13850, + "loss": 6.5018, + "learning_rate": 2.56e-05, + "inf_nan_count": 0 + }, + { + "step": 13875, + "loss": 6.5011, + "learning_rate": 2.55e-05, + "inf_nan_count": 0 + }, + { + "step": 13900, + "loss": 6.4283, + "learning_rate": 2.54e-05, + "inf_nan_count": 0 + }, + { + "step": 13925, + "loss": 6.519, + "learning_rate": 2.53e-05, + "inf_nan_count": 0 + }, + { + "step": 13950, + "loss": 6.4388, + "learning_rate": 2.52e-05, + "inf_nan_count": 0 + }, + { + "step": 13975, + "loss": 6.455, + "learning_rate": 2.51e-05, + "inf_nan_count": 0 + }, + { + "step": 14000, + "loss": 6.3491, + "learning_rate": 2.5e-05, + "inf_nan_count": 0 + }, + { + "step": 14025, + "loss": 6.5285, + "learning_rate": 2.49e-05, + "inf_nan_count": 0 + }, + { + "step": 14050, + "loss": 6.5082, + "learning_rate": 2.48e-05, + "inf_nan_count": 0 + }, + { + "step": 14075, + "loss": 6.5451, + "learning_rate": 2.47e-05, + "inf_nan_count": 0 + }, + { + "step": 14100, + "loss": 6.4753, + "learning_rate": 2.46e-05, + "inf_nan_count": 0 + }, + { + "step": 14125, + "loss": 6.6011, + "learning_rate": 2.45e-05, + "inf_nan_count": 0 + }, + { + "step": 14150, + "loss": 6.4885, + "learning_rate": 2.44e-05, + "inf_nan_count": 0 + }, + { + "step": 14175, + "loss": 6.4635, + "learning_rate": 2.43e-05, + "inf_nan_count": 0 + }, + { + "step": 14200, + "loss": 6.5519, + "learning_rate": 2.42e-05, + "inf_nan_count": 0 + }, + { + "step": 14225, + "loss": 6.4356, + "learning_rate": 2.41e-05, + "inf_nan_count": 0 + }, + { + "step": 14250, + "loss": 6.4552, + "learning_rate": 2.4e-05, + "inf_nan_count": 0 + }, + { + "step": 14275, + "loss": 6.4613, + "learning_rate": 2.39e-05, + "inf_nan_count": 0 + }, + { + "step": 14300, + "loss": 6.4411, + "learning_rate": 2.38e-05, + "inf_nan_count": 0 + }, + { + "step": 14325, + "loss": 6.557, + "learning_rate": 2.36e-05, + "inf_nan_count": 0 + }, + { + "step": 14350, + "loss": 6.4476, + "learning_rate": 2.35e-05, + "inf_nan_count": 0 + }, + { + "step": 14375, + "loss": 6.5895, + "learning_rate": 2.34e-05, + "inf_nan_count": 0 + }, + { + "step": 14400, + "loss": 6.4836, + "learning_rate": 2.33e-05, + "inf_nan_count": 0 + }, + { + "step": 14425, + "loss": 6.4175, + "learning_rate": 2.32e-05, + "inf_nan_count": 0 + }, + { + "step": 14450, + "loss": 6.4971, + "learning_rate": 2.31e-05, + "inf_nan_count": 0 + }, + { + "step": 14475, + "loss": 6.4897, + "learning_rate": 2.3e-05, + "inf_nan_count": 0 + }, + { + "step": 14500, + "loss": 6.455, + "learning_rate": 2.29e-05, + "inf_nan_count": 0 + }, + { + "step": 14525, + "loss": 6.4688, + "learning_rate": 2.28e-05, + "inf_nan_count": 0 + }, + { + "step": 14550, + "loss": 6.5494, + "learning_rate": 2.27e-05, + "inf_nan_count": 0 + }, + { + "step": 14575, + "loss": 6.4501, + "learning_rate": 2.26e-05, + "inf_nan_count": 0 + }, + { + "step": 14600, + "loss": 6.5142, + "learning_rate": 2.25e-05, + "inf_nan_count": 0 + }, + { + "step": 14625, + "loss": 6.4891, + "learning_rate": 2.24e-05, + "inf_nan_count": 0 + }, + { + "step": 14650, + "loss": 6.4274, + "learning_rate": 2.23e-05, + "inf_nan_count": 0 + }, + { + "step": 14675, + "loss": 6.5277, + "learning_rate": 2.22e-05, + "inf_nan_count": 0 + }, + { + "step": 14700, + "loss": 6.4472, + "learning_rate": 2.21e-05, + "inf_nan_count": 0 + }, + { + "step": 14725, + "loss": 6.4328, + "learning_rate": 2.2e-05, + "inf_nan_count": 0 + }, + { + "step": 14750, + "loss": 6.4928, + "learning_rate": 2.19e-05, + "inf_nan_count": 0 + }, + { + "step": 14775, + "loss": 6.552, + "learning_rate": 2.18e-05, + "inf_nan_count": 0 + }, + { + "step": 14800, + "loss": 6.5474, + "learning_rate": 2.17e-05, + "inf_nan_count": 0 + }, + { + "step": 14825, + "loss": 6.4394, + "learning_rate": 2.16e-05, + "inf_nan_count": 0 + }, + { + "step": 14850, + "loss": 6.5234, + "learning_rate": 2.15e-05, + "inf_nan_count": 0 + }, + { + "step": 14875, + "loss": 6.4369, + "learning_rate": 2.14e-05, + "inf_nan_count": 0 + }, + { + "step": 14900, + "loss": 6.4694, + "learning_rate": 2.13e-05, + "inf_nan_count": 0 + }, + { + "step": 14925, + "loss": 6.5837, + "learning_rate": 2.11e-05, + "inf_nan_count": 0 + }, + { + "step": 14950, + "loss": 6.4841, + "learning_rate": 2.1e-05, + "inf_nan_count": 0 + }, + { + "step": 14975, + "loss": 6.4347, + "learning_rate": 2.09e-05, + "inf_nan_count": 0 + }, + { + "step": 15000, + "loss": 6.5816, + "learning_rate": 2.08e-05, + "inf_nan_count": 0 + }, + { + "step": 15025, + "loss": 6.5337, + "learning_rate": 2.07e-05, + "inf_nan_count": 0 + }, + { + "step": 15050, + "loss": 6.5131, + "learning_rate": 2.06e-05, + "inf_nan_count": 0 + }, + { + "step": 15075, + "loss": 6.4669, + "learning_rate": 2.05e-05, + "inf_nan_count": 0 + }, + { + "step": 15100, + "loss": 6.5141, + "learning_rate": 2.04e-05, + "inf_nan_count": 0 + }, + { + "step": 15125, + "loss": 6.438, + "learning_rate": 2.03e-05, + "inf_nan_count": 0 + }, + { + "step": 15150, + "loss": 6.4036, + "learning_rate": 2.02e-05, + "inf_nan_count": 0 + }, + { + "step": 15175, + "loss": 6.4517, + "learning_rate": 2.01e-05, + "inf_nan_count": 0 + }, + { + "step": 15200, + "loss": 6.477, + "learning_rate": 2e-05, + "inf_nan_count": 0 + }, + { + "step": 15225, + "loss": 6.4317, + "learning_rate": 1.99e-05, + "inf_nan_count": 0 + }, + { + "step": 15250, + "loss": 6.488, + "learning_rate": 1.98e-05, + "inf_nan_count": 0 + }, + { + "step": 15275, + "loss": 6.4466, + "learning_rate": 1.97e-05, + "inf_nan_count": 0 + }, + { + "step": 15300, + "loss": 6.4248, + "learning_rate": 1.96e-05, + "inf_nan_count": 0 + }, + { + "step": 15325, + "loss": 6.3834, + "learning_rate": 1.95e-05, + "inf_nan_count": 0 + }, + { + "step": 15350, + "loss": 6.4272, + "learning_rate": 1.94e-05, + "inf_nan_count": 0 + }, + { + "step": 15375, + "loss": 6.4834, + "learning_rate": 1.93e-05, + "inf_nan_count": 0 + }, + { + "step": 15400, + "loss": 6.405, + "learning_rate": 1.92e-05, + "inf_nan_count": 0 + }, + { + "step": 15425, + "loss": 6.4264, + "learning_rate": 1.91e-05, + "inf_nan_count": 0 + }, + { + "step": 15450, + "loss": 6.4941, + "learning_rate": 1.9e-05, + "inf_nan_count": 0 + }, + { + "step": 15475, + "loss": 6.4755, + "learning_rate": 1.89e-05, + "inf_nan_count": 0 + }, + { + "step": 15500, + "loss": 6.5459, + "learning_rate": 1.88e-05, + "inf_nan_count": 0 + }, + { + "step": 15525, + "loss": 6.3772, + "learning_rate": 1.86e-05, + "inf_nan_count": 0 + }, + { + "step": 15550, + "loss": 6.443, + "learning_rate": 1.85e-05, + "inf_nan_count": 0 + }, + { + "step": 15575, + "loss": 6.3931, + "learning_rate": 1.84e-05, + "inf_nan_count": 0 + }, + { + "step": 15600, + "loss": 6.4087, + "learning_rate": 1.83e-05, + "inf_nan_count": 0 + }, + { + "step": 15625, + "loss": 6.4743, + "learning_rate": 1.82e-05, + "inf_nan_count": 0 + }, + { + "step": 15650, + "loss": 6.4575, + "learning_rate": 1.81e-05, + "inf_nan_count": 0 + }, + { + "step": 15675, + "loss": 6.4971, + "learning_rate": 1.8e-05, + "inf_nan_count": 0 + }, + { + "step": 15700, + "loss": 6.438, + "learning_rate": 1.79e-05, + "inf_nan_count": 0 + }, + { + "step": 15725, + "loss": 6.5071, + "learning_rate": 1.78e-05, + "inf_nan_count": 0 + }, + { + "step": 15750, + "loss": 6.391, + "learning_rate": 1.77e-05, + "inf_nan_count": 0 + }, + { + "step": 15775, + "loss": 6.4386, + "learning_rate": 1.76e-05, + "inf_nan_count": 0 + }, + { + "step": 15800, + "loss": 6.4268, + "learning_rate": 1.75e-05, + "inf_nan_count": 0 + }, + { + "step": 15825, + "loss": 6.5534, + "learning_rate": 1.74e-05, + "inf_nan_count": 0 + }, + { + "step": 15850, + "loss": 6.4422, + "learning_rate": 1.73e-05, + "inf_nan_count": 0 + }, + { + "step": 15875, + "loss": 6.4075, + "learning_rate": 1.72e-05, + "inf_nan_count": 0 + }, + { + "step": 15900, + "loss": 6.4458, + "learning_rate": 1.71e-05, + "inf_nan_count": 0 + }, + { + "step": 15925, + "loss": 6.3855, + "learning_rate": 1.7e-05, + "inf_nan_count": 0 + }, + { + "step": 15950, + "loss": 6.3659, + "learning_rate": 1.69e-05, + "inf_nan_count": 0 + }, + { + "step": 15975, + "loss": 6.5396, + "learning_rate": 1.68e-05, + "inf_nan_count": 0 + }, + { + "step": 16000, + "loss": 6.4974, + "learning_rate": 1.67e-05, + "inf_nan_count": 0 + }, + { + "step": 16025, + "loss": 6.4785, + "learning_rate": 1.66e-05, + "inf_nan_count": 0 + }, + { + "step": 16050, + "loss": 6.4341, + "learning_rate": 1.65e-05, + "inf_nan_count": 0 + }, + { + "step": 16075, + "loss": 6.3709, + "learning_rate": 1.64e-05, + "inf_nan_count": 0 + }, + { + "step": 16100, + "loss": 6.3707, + "learning_rate": 1.63e-05, + "inf_nan_count": 0 + }, + { + "step": 16125, + "loss": 6.4206, + "learning_rate": 1.61e-05, + "inf_nan_count": 0 + }, + { + "step": 16150, + "loss": 6.397, + "learning_rate": 1.6e-05, + "inf_nan_count": 0 + }, + { + "step": 16175, + "loss": 6.4617, + "learning_rate": 1.59e-05, + "inf_nan_count": 0 + }, + { + "step": 16200, + "loss": 6.5586, + "learning_rate": 1.58e-05, + "inf_nan_count": 0 + }, + { + "step": 16225, + "loss": 6.4248, + "learning_rate": 1.57e-05, + "inf_nan_count": 0 + }, + { + "step": 16250, + "loss": 6.4204, + "learning_rate": 1.56e-05, + "inf_nan_count": 0 + }, + { + "step": 16275, + "loss": 6.4632, + "learning_rate": 1.55e-05, + "inf_nan_count": 0 + }, + { + "step": 16300, + "loss": 6.4491, + "learning_rate": 1.54e-05, + "inf_nan_count": 0 + }, + { + "step": 16325, + "loss": 6.4412, + "learning_rate": 1.53e-05, + "inf_nan_count": 0 + }, + { + "step": 16350, + "loss": 6.4144, + "learning_rate": 1.52e-05, + "inf_nan_count": 0 + }, + { + "step": 16375, + "loss": 6.466, + "learning_rate": 1.51e-05, + "inf_nan_count": 0 + }, + { + "step": 16400, + "loss": 6.4246, + "learning_rate": 1.5e-05, + "inf_nan_count": 0 + }, + { + "step": 16425, + "loss": 6.4571, + "learning_rate": 1.49e-05, + "inf_nan_count": 0 + }, + { + "step": 16450, + "loss": 6.3903, + "learning_rate": 1.48e-05, + "inf_nan_count": 0 + }, + { + "step": 16475, + "loss": 6.4141, + "learning_rate": 1.47e-05, + "inf_nan_count": 0 + }, + { + "step": 16500, + "loss": 6.4467, + "learning_rate": 1.46e-05, + "inf_nan_count": 0 + }, + { + "step": 16525, + "loss": 6.356, + "learning_rate": 1.45e-05, + "inf_nan_count": 0 + }, + { + "step": 16550, + "loss": 6.4049, + "learning_rate": 1.44e-05, + "inf_nan_count": 0 + }, + { + "step": 16575, + "loss": 6.4103, + "learning_rate": 1.43e-05, + "inf_nan_count": 0 + }, + { + "step": 16600, + "loss": 6.4282, + "learning_rate": 1.42e-05, + "inf_nan_count": 0 + }, + { + "step": 16625, + "loss": 6.5397, + "learning_rate": 1.41e-05, + "inf_nan_count": 0 + }, + { + "step": 16650, + "loss": 6.3862, + "learning_rate": 1.4e-05, + "inf_nan_count": 0 + }, + { + "step": 16675, + "loss": 6.4291, + "learning_rate": 1.39e-05, + "inf_nan_count": 0 + }, + { + "step": 16700, + "loss": 6.433, + "learning_rate": 1.38e-05, + "inf_nan_count": 0 + }, + { + "step": 16725, + "loss": 6.3934, + "learning_rate": 1.36e-05, + "inf_nan_count": 0 + }, + { + "step": 16750, + "loss": 6.4042, + "learning_rate": 1.35e-05, + "inf_nan_count": 0 + }, + { + "step": 16775, + "loss": 6.4187, + "learning_rate": 1.34e-05, + "inf_nan_count": 0 + }, + { + "step": 16800, + "loss": 6.4455, + "learning_rate": 1.33e-05, + "inf_nan_count": 0 + }, + { + "step": 16825, + "loss": 6.424, + "learning_rate": 1.32e-05, + "inf_nan_count": 0 + }, + { + "step": 16850, + "loss": 6.4491, + "learning_rate": 1.31e-05, + "inf_nan_count": 0 + }, + { + "step": 16875, + "loss": 6.3993, + "learning_rate": 1.3e-05, + "inf_nan_count": 0 + }, + { + "step": 16900, + "loss": 6.4393, + "learning_rate": 1.29e-05, + "inf_nan_count": 0 + }, + { + "step": 16925, + "loss": 6.3705, + "learning_rate": 1.28e-05, + "inf_nan_count": 0 + }, + { + "step": 16950, + "loss": 6.4404, + "learning_rate": 1.27e-05, + "inf_nan_count": 0 + }, + { + "step": 16975, + "loss": 6.4507, + "learning_rate": 1.26e-05, + "inf_nan_count": 0 + }, + { + "step": 17000, + "loss": 6.3821, + "learning_rate": 1.25e-05, + "inf_nan_count": 0 + }, + { + "step": 17025, + "loss": 6.4234, + "learning_rate": 1.24e-05, + "inf_nan_count": 0 + }, + { + "step": 17050, + "loss": 6.4235, + "learning_rate": 1.23e-05, + "inf_nan_count": 0 + }, + { + "step": 17075, + "loss": 6.4856, + "learning_rate": 1.22e-05, + "inf_nan_count": 0 + }, + { + "step": 17100, + "loss": 6.4877, + "learning_rate": 1.21e-05, + "inf_nan_count": 0 + }, + { + "step": 17125, + "loss": 6.3683, + "learning_rate": 1.2e-05, + "inf_nan_count": 0 + }, + { + "step": 17150, + "loss": 6.4225, + "learning_rate": 1.19e-05, + "inf_nan_count": 0 + }, + { + "step": 17175, + "loss": 6.2573, + "learning_rate": 1.18e-05, + "inf_nan_count": 0 + }, + { + "step": 17200, + "loss": 6.3946, + "learning_rate": 1.17e-05, + "inf_nan_count": 0 + }, + { + "step": 17225, + "loss": 6.4607, + "learning_rate": 1.16e-05, + "inf_nan_count": 0 + }, + { + "step": 17250, + "loss": 6.4407, + "learning_rate": 1.15e-05, + "inf_nan_count": 0 + }, + { + "step": 17275, + "loss": 6.4333, + "learning_rate": 1.14e-05, + "inf_nan_count": 0 + }, + { + "step": 17300, + "loss": 6.3782, + "learning_rate": 1.13e-05, + "inf_nan_count": 0 + }, + { + "step": 17325, + "loss": 6.3665, + "learning_rate": 1.11e-05, + "inf_nan_count": 0 + }, + { + "step": 17350, + "loss": 6.4329, + "learning_rate": 1.1e-05, + "inf_nan_count": 0 + }, + { + "step": 17375, + "loss": 6.5107, + "learning_rate": 1.09e-05, + "inf_nan_count": 0 + }, + { + "step": 17400, + "loss": 6.5076, + "learning_rate": 1.08e-05, + "inf_nan_count": 0 + }, + { + "step": 17425, + "loss": 6.4936, + "learning_rate": 1.07e-05, + "inf_nan_count": 0 + }, + { + "step": 17450, + "loss": 6.4119, + "learning_rate": 1.06e-05, + "inf_nan_count": 0 + }, + { + "step": 17475, + "loss": 6.4032, + "learning_rate": 1.05e-05, + "inf_nan_count": 0 + }, + { + "step": 17500, + "loss": 6.3962, + "learning_rate": 1.04e-05, + "inf_nan_count": 0 + }, + { + "step": 17525, + "loss": 6.4288, + "learning_rate": 1.03e-05, + "inf_nan_count": 0 + }, + { + "step": 17550, + "loss": 6.4021, + "learning_rate": 1.02e-05, + "inf_nan_count": 0 + }, + { + "step": 17575, + "loss": 6.367, + "learning_rate": 1.01e-05, + "inf_nan_count": 0 + }, + { + "step": 17600, + "loss": 6.3904, + "learning_rate": 1e-05, + "inf_nan_count": 0 + }, + { + "step": 17625, + "loss": 6.5059, + "learning_rate": 9.9e-06, + "inf_nan_count": 0 + }, + { + "step": 17650, + "loss": 6.4225, + "learning_rate": 9.79e-06, + "inf_nan_count": 0 + }, + { + "step": 17675, + "loss": 6.4422, + "learning_rate": 9.69e-06, + "inf_nan_count": 0 + }, + { + "step": 17700, + "loss": 6.457, + "learning_rate": 9.58e-06, + "inf_nan_count": 0 + }, + { + "step": 17725, + "loss": 6.4475, + "learning_rate": 9.48e-06, + "inf_nan_count": 0 + }, + { + "step": 17750, + "loss": 6.3786, + "learning_rate": 9.38e-06, + "inf_nan_count": 0 + }, + { + "step": 17775, + "loss": 6.4145, + "learning_rate": 9.27e-06, + "inf_nan_count": 0 + }, + { + "step": 17800, + "loss": 6.3543, + "learning_rate": 9.17e-06, + "inf_nan_count": 0 + }, + { + "step": 17825, + "loss": 6.5116, + "learning_rate": 9.06e-06, + "inf_nan_count": 0 + }, + { + "step": 17850, + "loss": 6.4101, + "learning_rate": 8.96e-06, + "inf_nan_count": 0 + }, + { + "step": 17875, + "loss": 6.4014, + "learning_rate": 8.85e-06, + "inf_nan_count": 0 + }, + { + "step": 17900, + "loss": 6.4216, + "learning_rate": 8.75e-06, + "inf_nan_count": 0 + }, + { + "step": 17925, + "loss": 6.4539, + "learning_rate": 8.65e-06, + "inf_nan_count": 0 + }, + { + "step": 17950, + "loss": 6.4205, + "learning_rate": 8.54e-06, + "inf_nan_count": 0 + }, + { + "step": 17975, + "loss": 6.3865, + "learning_rate": 8.44e-06, + "inf_nan_count": 0 + }, + { + "step": 18000, + "loss": 6.4347, + "learning_rate": 8.33e-06, + "inf_nan_count": 0 + }, + { + "step": 18025, + "loss": 6.4313, + "learning_rate": 8.23e-06, + "inf_nan_count": 0 + }, + { + "step": 18050, + "loss": 6.3868, + "learning_rate": 8.13e-06, + "inf_nan_count": 0 + }, + { + "step": 18075, + "loss": 6.3703, + "learning_rate": 8.02e-06, + "inf_nan_count": 0 + }, + { + "step": 18100, + "loss": 6.3747, + "learning_rate": 7.92e-06, + "inf_nan_count": 0 + }, + { + "step": 18125, + "loss": 6.4228, + "learning_rate": 7.81e-06, + "inf_nan_count": 0 + }, + { + "step": 18150, + "loss": 6.349, + "learning_rate": 7.71e-06, + "inf_nan_count": 0 + }, + { + "step": 18175, + "loss": 6.4522, + "learning_rate": 7.6e-06, + "inf_nan_count": 0 + }, + { + "step": 18200, + "loss": 6.3354, + "learning_rate": 7.5e-06, + "inf_nan_count": 0 + }, + { + "step": 18225, + "loss": 6.4663, + "learning_rate": 7.4e-06, + "inf_nan_count": 0 + }, + { + "step": 18250, + "loss": 6.4155, + "learning_rate": 7.29e-06, + "inf_nan_count": 0 + }, + { + "step": 18275, + "loss": 6.4584, + "learning_rate": 7.19e-06, + "inf_nan_count": 0 + }, + { + "step": 18300, + "loss": 6.3637, + "learning_rate": 7.08e-06, + "inf_nan_count": 0 + }, + { + "step": 18325, + "loss": 6.3583, + "learning_rate": 6.98e-06, + "inf_nan_count": 0 + }, + { + "step": 18350, + "loss": 6.4469, + "learning_rate": 6.88e-06, + "inf_nan_count": 0 + }, + { + "step": 18375, + "loss": 6.3768, + "learning_rate": 6.77e-06, + "inf_nan_count": 0 + }, + { + "step": 18400, + "loss": 6.3179, + "learning_rate": 6.67e-06, + "inf_nan_count": 0 + }, + { + "step": 18425, + "loss": 6.4046, + "learning_rate": 6.56e-06, + "inf_nan_count": 0 + }, + { + "step": 18450, + "loss": 6.3435, + "learning_rate": 6.46e-06, + "inf_nan_count": 0 + }, + { + "step": 18475, + "loss": 6.3454, + "learning_rate": 6.35e-06, + "inf_nan_count": 0 + }, + { + "step": 18500, + "loss": 6.3922, + "learning_rate": 6.25e-06, + "inf_nan_count": 0 + }, + { + "step": 18525, + "loss": 6.3459, + "learning_rate": 6.15e-06, + "inf_nan_count": 0 + }, + { + "step": 18550, + "loss": 6.3591, + "learning_rate": 6.04e-06, + "inf_nan_count": 0 + }, + { + "step": 18575, + "loss": 6.4337, + "learning_rate": 5.94e-06, + "inf_nan_count": 0 + }, + { + "step": 18600, + "loss": 6.3962, + "learning_rate": 5.83e-06, + "inf_nan_count": 0 + }, + { + "step": 18625, + "loss": 6.3425, + "learning_rate": 5.73e-06, + "inf_nan_count": 0 + }, + { + "step": 18650, + "loss": 6.4022, + "learning_rate": 5.63e-06, + "inf_nan_count": 0 + }, + { + "step": 18675, + "loss": 6.4513, + "learning_rate": 5.52e-06, + "inf_nan_count": 0 + }, + { + "step": 18700, + "loss": 6.4284, + "learning_rate": 5.42e-06, + "inf_nan_count": 0 + }, + { + "step": 18725, + "loss": 6.3879, + "learning_rate": 5.31e-06, + "inf_nan_count": 0 + }, + { + "step": 18750, + "loss": 6.4009, + "learning_rate": 5.21e-06, + "inf_nan_count": 0 + }, + { + "step": 18775, + "loss": 6.3713, + "learning_rate": 5.1e-06, + "inf_nan_count": 0 + }, + { + "step": 18800, + "loss": 6.3752, + "learning_rate": 5e-06, + "inf_nan_count": 0 + }, + { + "step": 18825, + "loss": 6.4265, + "learning_rate": 4.9e-06, + "inf_nan_count": 0 + }, + { + "step": 18850, + "loss": 6.3709, + "learning_rate": 4.79e-06, + "inf_nan_count": 0 + }, + { + "step": 18875, + "loss": 6.3316, + "learning_rate": 4.69e-06, + "inf_nan_count": 0 + }, + { + "step": 18900, + "loss": 6.4479, + "learning_rate": 4.58e-06, + "inf_nan_count": 0 + }, + { + "step": 18925, + "loss": 6.4247, + "learning_rate": 4.48e-06, + "inf_nan_count": 0 + }, + { + "step": 18950, + "loss": 6.4126, + "learning_rate": 4.37e-06, + "inf_nan_count": 0 + }, + { + "step": 18975, + "loss": 6.3489, + "learning_rate": 4.27e-06, + "inf_nan_count": 0 + }, + { + "step": 19000, + "loss": 6.325, + "learning_rate": 4.17e-06, + "inf_nan_count": 0 + }, + { + "step": 19025, + "loss": 6.3306, + "learning_rate": 4.06e-06, + "inf_nan_count": 0 + }, + { + "step": 19050, + "loss": 6.387, + "learning_rate": 3.96e-06, + "inf_nan_count": 0 + }, + { + "step": 19075, + "loss": 6.4133, + "learning_rate": 3.85e-06, + "inf_nan_count": 0 + }, + { + "step": 19100, + "loss": 6.334, + "learning_rate": 3.75e-06, + "inf_nan_count": 0 + }, + { + "step": 19125, + "loss": 6.3034, + "learning_rate": 3.65e-06, + "inf_nan_count": 0 + }, + { + "step": 19150, + "loss": 6.4097, + "learning_rate": 3.54e-06, + "inf_nan_count": 0 + }, + { + "step": 19175, + "loss": 6.442, + "learning_rate": 3.44e-06, + "inf_nan_count": 0 + }, + { + "step": 19200, + "loss": 6.3756, + "learning_rate": 3.33e-06, + "inf_nan_count": 0 + }, + { + "step": 19225, + "loss": 6.4037, + "learning_rate": 3.23e-06, + "inf_nan_count": 0 + }, + { + "step": 19250, + "loss": 6.3974, + "learning_rate": 3.13e-06, + "inf_nan_count": 0 + }, + { + "step": 19275, + "loss": 6.3933, + "learning_rate": 3.02e-06, + "inf_nan_count": 0 + }, + { + "step": 19300, + "loss": 6.3269, + "learning_rate": 2.92e-06, + "inf_nan_count": 0 + }, + { + "step": 19325, + "loss": 6.3907, + "learning_rate": 2.81e-06, + "inf_nan_count": 0 + }, + { + "step": 19350, + "loss": 6.3955, + "learning_rate": 2.71e-06, + "inf_nan_count": 0 + }, + { + "step": 19375, + "loss": 6.3972, + "learning_rate": 2.6e-06, + "inf_nan_count": 0 + }, + { + "step": 19400, + "loss": 6.3896, + "learning_rate": 2.5e-06, + "inf_nan_count": 0 + }, + { + "step": 19425, + "loss": 6.3425, + "learning_rate": 2.4e-06, + "inf_nan_count": 0 + }, + { + "step": 19450, + "loss": 6.3587, + "learning_rate": 2.29e-06, + "inf_nan_count": 0 + }, + { + "step": 19475, + "loss": 6.4179, + "learning_rate": 2.19e-06, + "inf_nan_count": 0 + }, + { + "step": 19500, + "loss": 6.4192, + "learning_rate": 2.08e-06, + "inf_nan_count": 0 + }, + { + "step": 19525, + "loss": 6.4252, + "learning_rate": 1.98e-06, + "inf_nan_count": 0 + }, + { + "step": 19550, + "loss": 6.3349, + "learning_rate": 1.88e-06, + "inf_nan_count": 0 + }, + { + "step": 19575, + "loss": 6.4042, + "learning_rate": 1.77e-06, + "inf_nan_count": 0 + }, + { + "step": 19600, + "loss": 6.3567, + "learning_rate": 1.67e-06, + "inf_nan_count": 0 + }, + { + "step": 19625, + "loss": 6.3912, + "learning_rate": 1.56e-06, + "inf_nan_count": 0 + }, + { + "step": 19650, + "loss": 6.3113, + "learning_rate": 1.46e-06, + "inf_nan_count": 0 + }, + { + "step": 19675, + "loss": 6.3756, + "learning_rate": 1.35e-06, + "inf_nan_count": 0 + }, + { + "step": 19700, + "loss": 6.385, + "learning_rate": 1.25e-06, + "inf_nan_count": 0 + }, + { + "step": 19725, + "loss": 6.3631, + "learning_rate": 1.15e-06, + "inf_nan_count": 0 + }, + { + "step": 19750, + "loss": 6.4564, + "learning_rate": 1.04e-06, + "inf_nan_count": 0 + }, + { + "step": 19775, + "loss": 6.3258, + "learning_rate": 9.38e-07, + "inf_nan_count": 0 + }, + { + "step": 19800, + "loss": 6.4682, + "learning_rate": 8.33e-07, + "inf_nan_count": 0 + }, + { + "step": 19825, + "loss": 6.4421, + "learning_rate": 7.29e-07, + "inf_nan_count": 0 + }, + { + "step": 19850, + "loss": 6.4342, + "learning_rate": 6.25e-07, + "inf_nan_count": 0 + }, + { + "step": 19875, + "loss": 6.4182, + "learning_rate": 5.21e-07, + "inf_nan_count": 0 + }, + { + "step": 19900, + "loss": 6.3203, + "learning_rate": 4.17e-07, + "inf_nan_count": 0 + }, + { + "step": 19925, + "loss": 6.4339, + "learning_rate": 3.13e-07, + "inf_nan_count": 0 + }, + { + "step": 19950, + "loss": 6.4095, + "learning_rate": 2.08e-07, + "inf_nan_count": 0 + }, + { + "step": 19975, + "loss": 6.4814, + "learning_rate": 1.04e-07, + "inf_nan_count": 0 + } + ], + "evaluation_results": [ + { + "step": 1000, + "paloma": 7.125172406420199e+27 + }, + { + "step": 1500, + "paloma": 6.5469212698356e+18 + }, + { + "step": 2000, + "paloma": 5.118641309912889e+18 + }, + { + "step": 2500, + "paloma": 3.37924315167126e+18 + }, + { + "step": 3000, + "paloma": 6.892747900243237e+18 + }, + { + "step": 3500, + "paloma": 2.0436832271954907e+19 + }, + { + "step": 4000, + "paloma": 4.1410268232311005e+19 + }, + { + "step": 4500, + "paloma": 3.4524340411684053e+19 + }, + { + "step": 5000, + "paloma": 2.320698426399461e+19 + }, + { + "step": 5500, + "paloma": 3.1834097890526753e+19 + }, + { + "step": 6000, + "paloma": 4.457139025979801e+19 + }, + { + "step": 6500, + "paloma": 7.3062353841856406e+19 + }, + { + "step": 7000, + "paloma": 1.2357969480287024e+20 + }, + { + "step": 7500, + "paloma": 2.7199371732053928e+20 + }, + { + "step": 8000, + "paloma": 7.181862506006892e+20 + }, + { + "step": 8500, + "paloma": 1.5123285241831744e+21 + }, + { + "step": 9000, + "paloma": 3.573074534351724e+21 + }, + { + "step": 9500, + "paloma": 7.403721262078652e+21 + }, + { + "step": 10000, + "paloma": 1.0650515380055143e+22 + }, + { + "step": 10500, + "paloma": 2.1077589258137904e+22 + }, + { + "step": 11000, + "paloma": 2.712416409262884e+22 + }, + { + "step": 11500, + "paloma": 4.877238989481918e+22 + }, + { + "step": 12000, + "paloma": 7.219509956260661e+22 + }, + { + "step": 12500, + "paloma": 1.1729325953411656e+23 + }, + { + "step": 13000, + "paloma": 1.729306754923583e+23 + }, + { + "step": 13500, + "paloma": 2.4018454768029128e+23 + }, + { + "step": 14000, + "paloma": 3.247328955167052e+23 + }, + { + "step": 14500, + "paloma": 4.43239578722337e+23 + }, + { + "step": 15000, + "paloma": 5.215164570276226e+23 + }, + { + "step": 15500, + "paloma": 6.102665947946271e+23 + }, + { + "step": 16000, + "paloma": 8.874629945146669e+23 + }, + { + "step": 16500, + "paloma": 9.981607121011733e+23 + }, + { + "step": 17000, + "paloma": 1.1075349421086151e+24 + }, + { + "step": 17500, + "paloma": 1.1064948792133394e+24 + }, + { + "step": 18000, + "paloma": 1.340918782615931e+24 + }, + { + "step": 18500, + "paloma": 1.4325241176004668e+24 + }, + { + "step": 19000, + "paloma": 1.5360601246943468e+24 + }, + { + "step": 19500, + "paloma": 1.6346615942991742e+24 + }, + { + "step": 20000, + "paloma": 1.645368302099182e+24 + } + ], + "config": { + "d_model": 96, + "n_layers": 12, + "max_seq_len": 2048, + "vocab_size": 50304, + "lr": 5e-05, + "max_steps": 20000, + "batch_size": 1 + } + }, + { + "run_name": "pico-decoder-tiny-dolma29k-v1", + "log_file": "log_20250828_225300.log", + "training_metrics": [ + { + "step": 1000, + "loss": 7.7657, + "learning_rate": 0.00012, + "inf_nan_count": 0 + }, + { + "step": 1100, + "loss": 7.6733, + "learning_rate": 0.000132, + "inf_nan_count": 0 + }, + { + "step": 1200, + "loss": 7.5969, + "learning_rate": 0.000144, + "inf_nan_count": 0 + }, + { + "step": 1300, + "loss": 7.4765, + "learning_rate": 0.000156, + "inf_nan_count": 0 + }, + { + "step": 1400, + "loss": 7.3686, + "learning_rate": 0.000168, + "inf_nan_count": 0 + }, + { + "step": 1500, + "loss": 7.3251, + "learning_rate": 0.00018, + "inf_nan_count": 0 + }, + { + "step": 1600, + "loss": 7.184, + "learning_rate": 0.000192, + "inf_nan_count": 0 + }, + { + "step": 1700, + "loss": 7.1116, + "learning_rate": 0.000204, + "inf_nan_count": 0 + }, + { + "step": 1800, + "loss": 7.0565, + "learning_rate": 0.000216, + "inf_nan_count": 0 + }, + { + "step": 1900, + "loss": 6.9964, + "learning_rate": 0.000228, + "inf_nan_count": 0 + }, + { + "step": 2000, + "loss": 6.969, + "learning_rate": 0.00024, + "inf_nan_count": 0 + }, + { + "step": 2100, + "loss": 6.884, + "learning_rate": 0.000252, + "inf_nan_count": 0 + }, + { + "step": 2200, + "loss": 6.8334, + "learning_rate": 0.000264, + "inf_nan_count": 0 + }, + { + "step": 2300, + "loss": 6.815, + "learning_rate": 0.000276, + "inf_nan_count": 0 + }, + { + "step": 2400, + "loss": 6.7519, + "learning_rate": 0.000288, + "inf_nan_count": 0 + }, + { + "step": 2500, + "loss": 6.6908, + "learning_rate": 0.0003, + "inf_nan_count": 0 + }, + { + "step": 2600, + "loss": 6.6351, + "learning_rate": 0.0003, + "inf_nan_count": 0 + }, + { + "step": 2700, + "loss": 6.5568, + "learning_rate": 0.0003, + "inf_nan_count": 0 + }, + { + "step": 2800, + "loss": 6.5799, + "learning_rate": 0.0003, + "inf_nan_count": 0 + }, + { + "step": 2900, + "loss": 6.5467, + "learning_rate": 0.000299, + "inf_nan_count": 0 + }, + { + "step": 3000, + "loss": 6.4865, + "learning_rate": 0.000299, + "inf_nan_count": 0 + }, + { + "step": 3100, + "loss": 6.4604, + "learning_rate": 0.000299, + "inf_nan_count": 0 + }, + { + "step": 3200, + "loss": 6.4205, + "learning_rate": 0.000299, + "inf_nan_count": 0 + }, + { + "step": 3300, + "loss": 6.4127, + "learning_rate": 0.000299, + "inf_nan_count": 0 + }, + { + "step": 3400, + "loss": 6.3692, + "learning_rate": 0.000299, + "inf_nan_count": 0 + }, + { + "step": 3500, + "loss": 6.3761, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 3600, + "loss": 6.2796, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 3700, + "loss": 6.2988, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 3800, + "loss": 6.2673, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 3900, + "loss": 6.2715, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 4000, + "loss": 6.189, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 4100, + "loss": 6.1832, + "learning_rate": 0.000298, + "inf_nan_count": 0 + }, + { + "step": 4200, + "loss": 6.1553, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4300, + "loss": 6.1629, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4400, + "loss": 6.1061, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4500, + "loss": 6.1601, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4600, + "loss": 6.0963, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4700, + "loss": 6.078, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4800, + "loss": 6.0835, + "learning_rate": 0.000297, + "inf_nan_count": 0 + }, + { + "step": 4900, + "loss": 6.0519, + "learning_rate": 0.000296, + "inf_nan_count": 0 + }, + { + "step": 5000, + "loss": 6.0661, + "learning_rate": 0.000296, + "inf_nan_count": 0 + }, + { + "step": 5100, + "loss": 6.0121, + "learning_rate": 0.000296, + "inf_nan_count": 0 + }, + { + "step": 5200, + "loss": 6.0544, + "learning_rate": 0.000296, + "inf_nan_count": 0 + }, + { + "step": 5300, + "loss": 6.0224, + "learning_rate": 0.000296, + "inf_nan_count": 0 + }, + { + "step": 5400, + "loss": 5.9831, + "learning_rate": 0.000296, + "inf_nan_count": 0 + }, + { + "step": 5500, + "loss": 5.9553, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 5600, + "loss": 5.9493, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 5700, + "loss": 5.9943, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 5800, + "loss": 5.963, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 5900, + "loss": 5.9349, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 6000, + "loss": 5.9087, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 6100, + "loss": 5.8818, + "learning_rate": 0.000295, + "inf_nan_count": 0 + }, + { + "step": 6200, + "loss": 5.8535, + "learning_rate": 0.000294, + "inf_nan_count": 0 + }, + { + "step": 6300, + "loss": 5.8896, + "learning_rate": 0.000294, + "inf_nan_count": 0 + }, + { + "step": 6400, + "loss": 5.9007, + "learning_rate": 0.000294, + "inf_nan_count": 0 + }, + { + "step": 6500, + "loss": 5.8617, + "learning_rate": 0.000294, + "inf_nan_count": 0 + }, + { + "step": 6600, + "loss": 5.8201, + "learning_rate": 0.000294, + "inf_nan_count": 0 + }, + { + "step": 6700, + "loss": 5.8544, + "learning_rate": 0.000294, + "inf_nan_count": 0 + }, + { + "step": 6800, + "loss": 5.8532, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 6900, + "loss": 5.795, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 7000, + "loss": 5.8146, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 7100, + "loss": 5.793, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 7200, + "loss": 5.7827, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 7300, + "loss": 5.7816, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 7400, + "loss": 5.73, + "learning_rate": 0.000293, + "inf_nan_count": 0 + }, + { + "step": 7500, + "loss": 5.767, + "learning_rate": 0.000292, + "inf_nan_count": 0 + }, + { + "step": 7600, + "loss": 5.745, + "learning_rate": 0.000292, + "inf_nan_count": 0 + }, + { + "step": 7700, + "loss": 5.7499, + "learning_rate": 0.000292, + "inf_nan_count": 0 + }, + { + "step": 7800, + "loss": 5.7233, + "learning_rate": 0.000292, + "inf_nan_count": 0 + }, + { + "step": 7900, + "loss": 5.7219, + "learning_rate": 0.000292, + "inf_nan_count": 0 + }, + { + "step": 8000, + "loss": 5.7523, + "learning_rate": 0.000292, + "inf_nan_count": 0 + }, + { + "step": 8100, + "loss": 5.7145, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8200, + "loss": 5.7469, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8300, + "loss": 5.7363, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8400, + "loss": 5.6938, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8500, + "loss": 5.6994, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8600, + "loss": 5.6583, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8700, + "loss": 5.6885, + "learning_rate": 0.000291, + "inf_nan_count": 0 + }, + { + "step": 8800, + "loss": 5.6313, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 8900, + "loss": 5.6314, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 9000, + "loss": 5.6501, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 9100, + "loss": 5.6357, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 9200, + "loss": 5.6045, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 9300, + "loss": 5.6405, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 9400, + "loss": 5.6241, + "learning_rate": 0.00029, + "inf_nan_count": 0 + }, + { + "step": 9500, + "loss": 5.6247, + "learning_rate": 0.000289, + "inf_nan_count": 0 + }, + { + "step": 9600, + "loss": 5.5983, + "learning_rate": 0.000289, + "inf_nan_count": 0 + }, + { + "step": 9700, + "loss": 5.5978, + "learning_rate": 0.000289, + "inf_nan_count": 0 + }, + { + "step": 9800, + "loss": 5.5746, + "learning_rate": 0.000289, + "inf_nan_count": 0 + } + ], + "evaluation_results": [ + { + "step": 1000, + "paloma": 2.5468931158531133e+19 + }, + { + "step": 2000, + "paloma": 3.627192449295412e+21 + }, + { + "step": 3000, + "paloma": 9.90975658825673e+22 + }, + { + "step": 4000, + "paloma": 2.6252526658823776e+24 + }, + { + "step": 5000, + "paloma": 7.294956881845611e+25 + }, + { + "step": 6000, + "paloma": 1.6856570425562805e+27 + }, + { + "step": 7000, + "paloma": 9.22180682233585e+28 + }, + { + "step": 8000, + "paloma": 3.1300823362207656e+29 + }, + { + "step": 9000, + "paloma": 4.983924509492406e+30 + } + ], + "config": { + "d_model": 96, + "n_layers": 12, + "max_seq_len": 2048, + "vocab_size": 50304, + "lr": 0.0003, + "max_steps": 200000, + "batch_size": 1 + } + }, + { + "run_name": "pico-decoder-tiny-dolma-teensy-v0", + "log_file": "log_20250828_210922.log", + "training_metrics": [ + { + "step": 0, + "loss": 10.9914, + "learning_rate": 0.0, + "inf_nan_count": 0 + } + ], + "evaluation_results": [ + { + "step": 0, + "paloma": 59434.76600609756 + }, + { + "step": 27, + "paloma": 59120.39268292683 + } + ], + "config": { + "d_model": 96, + "n_layers": 12, + "max_seq_len": 2048, + "vocab_size": 50304, + "lr": 0.0003, + "max_steps": 200000, + "batch_size": 8 + } + }, + { + "run_name": "pico-decoder-tiny-dolma-teensy-v1", + "log_file": "log_20250828_220514.log", + "training_metrics": [ + { + "step": 0, + "loss": 10.9886, + "learning_rate": 0.0, + "inf_nan_count": 0 + }, + { + "step": 100, + "loss": 10.9373, + "learning_rate": 1.2e-05, + "inf_nan_count": 0 + }, + { + "step": 200, + "loss": 10.5423, + "learning_rate": 2.4e-05, + "inf_nan_count": 0 + }, + { + "step": 300, + "loss": 9.9452, + "learning_rate": 3.6e-05, + "inf_nan_count": 0 + }, + { + "step": 400, + "loss": 9.449, + "learning_rate": 4.8e-05, + "inf_nan_count": 0 + }, + { + "step": 500, + "loss": 8.8455, + "learning_rate": 6e-05, + "inf_nan_count": 0 + }, + { + "step": 600, + "loss": 8.1482, + "learning_rate": 7.2e-05, + "inf_nan_count": 0 + }, + { + "step": 700, + "loss": 7.4303, + "learning_rate": 8.4e-05, + "inf_nan_count": 0 + }, + { + "step": 800, + "loss": 7.0363, + "learning_rate": 9.6e-05, + "inf_nan_count": 0 + }, + { + "step": 900, + "loss": 6.9702, + "learning_rate": 0.000108, + "inf_nan_count": 0 + }, + { + "step": 1000, + "loss": 6.8975, + "learning_rate": 0.00012, + "inf_nan_count": 0 + }, + { + "step": 1100, + "loss": 6.892, + "learning_rate": 0.000132, + "inf_nan_count": 0 + }, + { + "step": 1200, + "loss": 6.6684, + "learning_rate": 0.000144, + "inf_nan_count": 0 + }, + { + "step": 1300, + "loss": 6.4754, + "learning_rate": 0.000156, + "inf_nan_count": 0 + }, + { + "step": 1400, + "loss": 6.3649, + "learning_rate": 0.000168, + "inf_nan_count": 0 + }, + { + "step": 1500, + "loss": 6.2981, + "learning_rate": 0.00018, + "inf_nan_count": 0 + }, + { + "step": 1600, + "loss": 6.1551, + "learning_rate": 0.000192, + "inf_nan_count": 0 + }, + { + "step": 1700, + "loss": 5.9163, + "learning_rate": 0.000204, + "inf_nan_count": 0 + } + ], + "evaluation_results": [ + { + "step": 1000, + "paloma": 9.54583880403771e+19 + }, + { + "step": 1755, + "paloma": 2.945795672816324e+21 + } + ], + "config": { + "d_model": 96, + "n_layers": 12, + "max_seq_len": 2048, + "vocab_size": 50304, + "lr": 0.0003, + "max_steps": 200000, + "batch_size": 4 + } + }, + { + "run_name": "pico-decoder-tiny-dolma5M-v1", + "log_file": "log_20250830_014108.log", + "training_metrics": [ + { + "step": 32000, + "loss": 6.3376, + "learning_rate": 7.32e-06, + "inf_nan_count": 0 + }, + { + "step": 32025, + "loss": 6.1999, + "learning_rate": 7.28e-06, + "inf_nan_count": 0 + }, + { + "step": 32050, + "loss": 6.1488, + "learning_rate": 7.24e-06, + "inf_nan_count": 0 + }, + { + "step": 32075, + "loss": 6.046, + "learning_rate": 7.19e-06, + "inf_nan_count": 0 + } + ], + "evaluation_results": [ + { + "step": 32000, + "paloma": 2.977755235898109e+26 + } + ], + "config": { + "d_model": 96, + "n_layers": 12, + "max_seq_len": 2048, + "vocab_size": 50304, + "lr": 5e-05, + "max_steps": 20000, + "batch_size": 1 + } + } + ], + "summary": { + "total_runs": 6, + "run_names": [ + "pico-decoder-tiny-dolma29k-v2", + "pico-decoder-tiny-dolma29k-v3", + "pico-decoder-tiny-dolma29k-v1", + "pico-decoder-tiny-dolma-teensy-v0", + "pico-decoder-tiny-dolma-teensy-v1", + "pico-decoder-tiny-dolma5M-v1" + ] + } +} \ No newline at end of file