{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99290780141844, "eval_steps": 1000, "global_step": 951, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03152088258471237, "grad_norm": 30.522348403930664, "learning_rate": 9.375000000000001e-06, "loss": 8.3857, "step": 10 }, { "epoch": 0.06304176516942474, "grad_norm": 9.116207122802734, "learning_rate": 1.9791666666666665e-05, "loss": 5.2184, "step": 20 }, { "epoch": 0.09456264775413711, "grad_norm": 15.578251838684082, "learning_rate": 3.0208333333333334e-05, "loss": 4.6933, "step": 30 }, { "epoch": 0.12608353033884948, "grad_norm": 8.683645248413086, "learning_rate": 4.0625000000000005e-05, "loss": 4.4078, "step": 40 }, { "epoch": 0.15760441292356187, "grad_norm": 6.897328853607178, "learning_rate": 4.999984870177963e-05, "loss": 4.3808, "step": 50 }, { "epoch": 0.18912529550827423, "grad_norm": 6.390584468841553, "learning_rate": 4.9981695131093794e-05, "loss": 4.2976, "step": 60 }, { "epoch": 0.22064617809298662, "grad_norm": 4.875730991363525, "learning_rate": 4.9933307091588796e-05, "loss": 4.1837, "step": 70 }, { "epoch": 0.25216706067769895, "grad_norm": 5.509742736816406, "learning_rate": 4.985474314561054e-05, "loss": 4.165, "step": 80 }, { "epoch": 0.28368794326241137, "grad_norm": 36.74412536621094, "learning_rate": 4.974609837634544e-05, "loss": 4.4715, "step": 90 }, { "epoch": 0.31520882584712373, "grad_norm": 4.898118495941162, "learning_rate": 4.9607504272744575e-05, "loss": 4.3442, "step": 100 }, { "epoch": 0.3467297084318361, "grad_norm": 5.879344940185547, "learning_rate": 4.943912857038719e-05, "loss": 4.0963, "step": 110 }, { "epoch": 0.37825059101654845, "grad_norm": 5.230091094970703, "learning_rate": 4.9241175048476275e-05, "loss": 4.2056, "step": 120 }, { "epoch": 0.4097714736012608, "grad_norm": 4.855470180511475, "learning_rate": 4.90138832832117e-05, "loss": 4.066, "step": 130 }, { "epoch": 0.44129235618597323, "grad_norm": 5.235150337219238, "learning_rate": 4.8757528357839564e-05, "loss": 4.2, "step": 140 }, { "epoch": 0.4728132387706856, "grad_norm": 5.098632335662842, "learning_rate": 4.847242052972859e-05, "loss": 4.1053, "step": 150 }, { "epoch": 0.5043341213553979, "grad_norm": 5.152740001678467, "learning_rate": 4.8158904854876555e-05, "loss": 4.2672, "step": 160 }, { "epoch": 0.5358550039401103, "grad_norm": 4.3414082527160645, "learning_rate": 4.781736077030106e-05, "loss": 3.9562, "step": 170 }, { "epoch": 0.5673758865248227, "grad_norm": 4.447638511657715, "learning_rate": 4.744820163482029e-05, "loss": 3.829, "step": 180 }, { "epoch": 0.598896769109535, "grad_norm": 4.406827926635742, "learning_rate": 4.70518742287793e-05, "loss": 3.7732, "step": 190 }, { "epoch": 0.6304176516942475, "grad_norm": 4.291316986083984, "learning_rate": 4.66288582133275e-05, "loss": 3.748, "step": 200 }, { "epoch": 0.6619385342789598, "grad_norm": 5.285233974456787, "learning_rate": 4.6179665549901506e-05, "loss": 3.7187, "step": 210 }, { "epoch": 0.6934594168636722, "grad_norm": 4.025429725646973, "learning_rate": 4.5704839880616296e-05, "loss": 3.8554, "step": 220 }, { "epoch": 0.7249802994483846, "grad_norm": 4.376532554626465, "learning_rate": 4.520495587031414e-05, "loss": 3.6829, "step": 230 }, { "epoch": 0.7565011820330969, "grad_norm": 4.8744306564331055, "learning_rate": 4.468061851106797e-05, "loss": 3.7738, "step": 240 }, { "epoch": 0.7880220646178093, "grad_norm": 20.5545597076416, "learning_rate": 4.413246238998069e-05, "loss": 3.7111, "step": 250 }, { "epoch": 0.8195429472025216, "grad_norm": 4.567636489868164, "learning_rate": 4.356115092116668e-05, "loss": 3.5261, "step": 260 }, { "epoch": 0.851063829787234, "grad_norm": 4.672543048858643, "learning_rate": 4.2967375542844974e-05, "loss": 3.5459, "step": 270 }, { "epoch": 0.8825847123719465, "grad_norm": 4.592697620391846, "learning_rate": 4.235185488051585e-05, "loss": 3.6955, "step": 280 }, { "epoch": 0.9141055949566588, "grad_norm": 4.082319259643555, "learning_rate": 4.171533387723362e-05, "loss": 3.6012, "step": 290 }, { "epoch": 0.9456264775413712, "grad_norm": 4.013556003570557, "learning_rate": 4.1058582892028175e-05, "loss": 3.4834, "step": 300 }, { "epoch": 0.9771473601260835, "grad_norm": 4.339940547943115, "learning_rate": 4.0382396767566536e-05, "loss": 3.687, "step": 310 }, { "epoch": 1.0063041765169425, "grad_norm": 4.115149021148682, "learning_rate": 3.968759386818259e-05, "loss": 3.016, "step": 320 }, { "epoch": 1.037825059101655, "grad_norm": 3.8200573921203613, "learning_rate": 3.897501508943955e-05, "loss": 2.0546, "step": 330 }, { "epoch": 1.0693459416863673, "grad_norm": 3.4586374759674072, "learning_rate": 3.824552284042351e-05, "loss": 1.9928, "step": 340 }, { "epoch": 1.1008668242710795, "grad_norm": 4.013545513153076, "learning_rate": 3.7500000000000003e-05, "loss": 2.0627, "step": 350 }, { "epoch": 1.132387706855792, "grad_norm": 3.6197803020477295, "learning_rate": 3.673934884829673e-05, "loss": 1.9238, "step": 360 }, { "epoch": 1.1639085894405043, "grad_norm": 3.7971880435943604, "learning_rate": 3.5964489974705553e-05, "loss": 1.9624, "step": 370 }, { "epoch": 1.1954294720252168, "grad_norm": 3.554382562637329, "learning_rate": 3.517636116372546e-05, "loss": 2.0095, "step": 380 }, { "epoch": 1.226950354609929, "grad_norm": 4.342985153198242, "learning_rate": 3.437591625999497e-05, "loss": 1.8905, "step": 390 }, { "epoch": 1.2584712371946414, "grad_norm": 3.6071176528930664, "learning_rate": 3.356412401388732e-05, "loss": 2.0062, "step": 400 }, { "epoch": 1.2899921197793538, "grad_norm": 4.067968845367432, "learning_rate": 3.274196690906602e-05, "loss": 1.9529, "step": 410 }, { "epoch": 1.3215130023640662, "grad_norm": 3.6475672721862793, "learning_rate": 3.1910439973419294e-05, "loss": 1.9173, "step": 420 }, { "epoch": 1.3530338849487786, "grad_norm": 3.6837363243103027, "learning_rate": 3.107054957481271e-05, "loss": 1.9451, "step": 430 }, { "epoch": 1.384554767533491, "grad_norm": 4.136275768280029, "learning_rate": 3.022331220311756e-05, "loss": 1.9187, "step": 440 }, { "epoch": 1.4160756501182032, "grad_norm": 3.433151960372925, "learning_rate": 2.93697532399888e-05, "loss": 1.7977, "step": 450 }, { "epoch": 1.4475965327029157, "grad_norm": 3.2098097801208496, "learning_rate": 2.8510905717881614e-05, "loss": 1.8138, "step": 460 }, { "epoch": 1.479117415287628, "grad_norm": 3.2374401092529297, "learning_rate": 2.76478090698085e-05, "loss": 1.8966, "step": 470 }, { "epoch": 1.5106382978723403, "grad_norm": 3.700331449508667, "learning_rate": 2.6781507871349993e-05, "loss": 2.0714, "step": 480 }, { "epoch": 1.5421591804570527, "grad_norm": 4.613447666168213, "learning_rate": 2.5913050576441477e-05, "loss": 1.79, "step": 490 }, { "epoch": 1.573680063041765, "grad_norm": 3.2395973205566406, "learning_rate": 2.5043488248466184e-05, "loss": 1.8355, "step": 500 }, { "epoch": 1.6052009456264775, "grad_norm": 3.447166681289673, "learning_rate": 2.4173873288190114e-05, "loss": 1.8917, "step": 510 }, { "epoch": 1.63672182821119, "grad_norm": 3.5781800746917725, "learning_rate": 2.3305258160078274e-05, "loss": 1.8153, "step": 520 }, { "epoch": 1.6682427107959024, "grad_norm": 3.1937813758850098, "learning_rate": 2.2438694118533875e-05, "loss": 1.8061, "step": 530 }, { "epoch": 1.6997635933806148, "grad_norm": 3.4112651348114014, "learning_rate": 2.1575229935602086e-05, "loss": 1.7042, "step": 540 }, { "epoch": 1.731284475965327, "grad_norm": 3.6018483638763428, "learning_rate": 2.0715910631677968e-05, "loss": 1.6401, "step": 550 }, { "epoch": 1.7628053585500394, "grad_norm": 3.27825927734375, "learning_rate": 1.986177621075499e-05, "loss": 1.7344, "step": 560 }, { "epoch": 1.7943262411347518, "grad_norm": 3.4082956314086914, "learning_rate": 1.9013860401744716e-05, "loss": 1.8261, "step": 570 }, { "epoch": 1.825847123719464, "grad_norm": 3.231468439102173, "learning_rate": 1.817318940739098e-05, "loss": 1.6267, "step": 580 }, { "epoch": 1.8573680063041764, "grad_norm": 3.3175694942474365, "learning_rate": 1.7340780662292677e-05, "loss": 1.7841, "step": 590 }, { "epoch": 1.8888888888888888, "grad_norm": 3.1403086185455322, "learning_rate": 1.651764160153844e-05, "loss": 1.7242, "step": 600 }, { "epoch": 1.9204097714736013, "grad_norm": 3.0145950317382812, "learning_rate": 1.570476844144329e-05, "loss": 1.7107, "step": 610 }, { "epoch": 1.9519306540583137, "grad_norm": 3.2950940132141113, "learning_rate": 1.4903144973862973e-05, "loss": 1.5581, "step": 620 }, { "epoch": 1.983451536643026, "grad_norm": 3.2230618000030518, "learning_rate": 1.4113741375545222e-05, "loss": 1.5261, "step": 630 }, { "epoch": 2.012608353033885, "grad_norm": 2.3006889820098877, "learning_rate": 1.3337513033958904e-05, "loss": 1.0825, "step": 640 }, { "epoch": 2.0441292356185974, "grad_norm": 2.841254711151123, "learning_rate": 1.2575399391022061e-05, "loss": 0.6505, "step": 650 }, { "epoch": 2.07565011820331, "grad_norm": 2.3877882957458496, "learning_rate": 1.1828322806128373e-05, "loss": 0.7065, "step": 660 }, { "epoch": 2.107171000788022, "grad_norm": 3.346599578857422, "learning_rate": 1.1097187439847939e-05, "loss": 0.6801, "step": 670 }, { "epoch": 2.1386918833727346, "grad_norm": 2.6883623600006104, "learning_rate": 1.0382878159653447e-05, "loss": 0.7106, "step": 680 }, { "epoch": 2.1702127659574466, "grad_norm": 2.5878891944885254, "learning_rate": 9.686259468996153e-06, "loss": 0.6787, "step": 690 }, { "epoch": 2.201733648542159, "grad_norm": 2.9516632556915283, "learning_rate": 9.008174461027724e-06, "loss": 0.7173, "step": 700 }, { "epoch": 2.2332545311268714, "grad_norm": 2.9583373069763184, "learning_rate": 8.349443798234116e-06, "loss": 0.6442, "step": 710 }, { "epoch": 2.264775413711584, "grad_norm": 2.751403331756592, "learning_rate": 7.710864719216637e-06, "loss": 0.6828, "step": 720 }, { "epoch": 2.2962962962962963, "grad_norm": 2.852689743041992, "learning_rate": 7.0932100738220265e-06, "loss": 0.703, "step": 730 }, { "epoch": 2.3278171788810087, "grad_norm": 2.892123222351074, "learning_rate": 6.497227387789392e-06, "loss": 0.6609, "step": 740 }, { "epoch": 2.359338061465721, "grad_norm": 2.5285375118255615, "learning_rate": 5.923637958046058e-06, "loss": 0.6635, "step": 750 }, { "epoch": 2.3908589440504335, "grad_norm": 2.6019787788391113, "learning_rate": 5.373135979747227e-06, "loss": 0.6354, "step": 760 }, { "epoch": 2.422379826635146, "grad_norm": 2.3159821033477783, "learning_rate": 4.846387706115932e-06, "loss": 0.5731, "step": 770 }, { "epoch": 2.453900709219858, "grad_norm": 2.7990407943725586, "learning_rate": 4.344030642100133e-06, "loss": 0.6754, "step": 780 }, { "epoch": 2.4854215918045703, "grad_norm": 2.336843967437744, "learning_rate": 3.866672772822863e-06, "loss": 0.6382, "step": 790 }, { "epoch": 2.5169424743892828, "grad_norm": 2.7701914310455322, "learning_rate": 3.4148918277592005e-06, "loss": 0.6316, "step": 800 }, { "epoch": 2.548463356973995, "grad_norm": 2.583923101425171, "learning_rate": 2.989234581530509e-06, "loss": 0.631, "step": 810 }, { "epoch": 2.5799842395587076, "grad_norm": 2.658202648162842, "learning_rate": 2.5902161921623454e-06, "loss": 0.6692, "step": 820 }, { "epoch": 2.61150512214342, "grad_norm": 3.21925687789917, "learning_rate": 2.218319577606778e-06, "loss": 0.6858, "step": 830 }, { "epoch": 2.6430260047281324, "grad_norm": 2.6975653171539307, "learning_rate": 1.8739948312837014e-06, "loss": 0.6282, "step": 840 }, { "epoch": 2.674546887312845, "grad_norm": 2.680405855178833, "learning_rate": 1.5576586773486195e-06, "loss": 0.6061, "step": 850 }, { "epoch": 2.7060677698975573, "grad_norm": 2.4803316593170166, "learning_rate": 1.2696939663459973e-06, "loss": 0.6601, "step": 860 }, { "epoch": 2.7375886524822697, "grad_norm": 3.0240488052368164, "learning_rate": 1.0104492118586773e-06, "loss": 0.6088, "step": 870 }, { "epoch": 2.769109535066982, "grad_norm": 2.943082809448242, "learning_rate": 7.802381687141535e-07, "loss": 0.66, "step": 880 }, { "epoch": 2.8006304176516945, "grad_norm": 2.768092632293701, "learning_rate": 5.793394532580765e-07, "loss": 0.6184, "step": 890 }, { "epoch": 2.8321513002364065, "grad_norm": 2.631650924682617, "learning_rate": 4.079962061546955e-07, "loss": 0.5972, "step": 900 }, { "epoch": 2.863672182821119, "grad_norm": 2.7756528854370117, "learning_rate": 2.664157981222437e-07, "loss": 0.6151, "step": 910 }, { "epoch": 2.8951930654058313, "grad_norm": 2.6522891521453857, "learning_rate": 1.5476957895942946e-07, "loss": 0.6093, "step": 920 }, { "epoch": 2.9267139479905437, "grad_norm": 2.9670333862304688, "learning_rate": 7.319267016678288e-08, "loss": 0.6333, "step": 930 }, { "epoch": 2.958234830575256, "grad_norm": 2.2578587532043457, "learning_rate": 2.1783801413866046e-08, "loss": 0.5926, "step": 940 }, { "epoch": 2.9897557131599686, "grad_norm": 2.546121120452881, "learning_rate": 6.05191050198628e-10, "loss": 0.6528, "step": 950 }, { "epoch": 2.99290780141844, "step": 951, "total_flos": 6.011828060016345e+18, "train_loss": 2.2139622458774837, "train_runtime": 221179.8915, "train_samples_per_second": 0.551, "train_steps_per_second": 0.004 } ], "logging_steps": 10, "max_steps": 951, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.011828060016345e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }