{
  "results": {
    "hendrycksTest-abstract_algebra": {
      "acc": 0.28,
      "acc_stderr": 0.04512608598542128,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.04512608598542128
    },
    "hendrycksTest-anatomy": {
      "acc": 0.4740740740740741,
      "acc_stderr": 0.04313531696750574,
      "acc_norm": 0.4740740740740741,
      "acc_norm_stderr": 0.04313531696750574
    },
    "hendrycksTest-astronomy": {
      "acc": 0.5131578947368421,
      "acc_stderr": 0.04067533136309174,
      "acc_norm": 0.5131578947368421,
      "acc_norm_stderr": 0.04067533136309174
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.49,
      "acc_stderr": 0.05024183937956912,
      "acc_norm": 0.49,
      "acc_norm_stderr": 0.05024183937956912
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.4867924528301887,
      "acc_stderr": 0.030762134874500482,
      "acc_norm": 0.4867924528301887,
      "acc_norm_stderr": 0.030762134874500482
    },
    "hendrycksTest-college_biology": {
      "acc": 0.4652777777777778,
      "acc_stderr": 0.04171115858181618,
      "acc_norm": 0.4652777777777778,
      "acc_norm_stderr": 0.04171115858181618
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.32,
      "acc_stderr": 0.046882617226215034,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.046882617226215034
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.36,
      "acc_stderr": 0.04824181513244218,
      "acc_norm": 0.36,
      "acc_norm_stderr": 0.04824181513244218
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.26,
      "acc_stderr": 0.04408440022768078,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.04408440022768078
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.37572254335260113,
      "acc_stderr": 0.036928207672648664,
      "acc_norm": 0.37572254335260113,
      "acc_norm_stderr": 0.036928207672648664
    },
    "hendrycksTest-college_physics": {
      "acc": 0.24509803921568626,
      "acc_stderr": 0.04280105837364395,
      "acc_norm": 0.24509803921568626,
      "acc_norm_stderr": 0.04280105837364395
    },
    "hendrycksTest-computer_security": {
      "acc": 0.56,
      "acc_stderr": 0.04988876515698589,
      "acc_norm": 0.56,
      "acc_norm_stderr": 0.04988876515698589
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.3872340425531915,
      "acc_stderr": 0.03184389265339526,
      "acc_norm": 0.3872340425531915,
      "acc_norm_stderr": 0.03184389265339526
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2719298245614035,
      "acc_stderr": 0.04185774424022056,
      "acc_norm": 0.2719298245614035,
      "acc_norm_stderr": 0.04185774424022056
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.4206896551724138,
      "acc_stderr": 0.0411391498118926,
      "acc_norm": 0.4206896551724138,
      "acc_norm_stderr": 0.0411391498118926
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.30687830687830686,
      "acc_stderr": 0.023752928712112143,
      "acc_norm": 0.30687830687830686,
      "acc_norm_stderr": 0.023752928712112143
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.2698412698412698,
      "acc_stderr": 0.03970158273235173,
      "acc_norm": 0.2698412698412698,
      "acc_norm_stderr": 0.03970158273235173
    },
    "hendrycksTest-global_facts": {
      "acc": 0.28,
      "acc_stderr": 0.04512608598542127,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.04512608598542127
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.5225806451612903,
      "acc_stderr": 0.02841498501970786,
      "acc_norm": 0.5225806451612903,
      "acc_norm_stderr": 0.02841498501970786
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.3054187192118227,
      "acc_stderr": 0.03240661565868408,
      "acc_norm": 0.3054187192118227,
      "acc_norm_stderr": 0.03240661565868408
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.43,
      "acc_stderr": 0.049756985195624284,
      "acc_norm": 0.43,
      "acc_norm_stderr": 0.049756985195624284
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.5636363636363636,
      "acc_stderr": 0.03872592983524754,
      "acc_norm": 0.5636363636363636,
      "acc_norm_stderr": 0.03872592983524754
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.48484848484848486,
      "acc_stderr": 0.03560716516531061,
      "acc_norm": 0.48484848484848486,
      "acc_norm_stderr": 0.03560716516531061
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.6373056994818653,
      "acc_stderr": 0.03469713791704371,
      "acc_norm": 0.6373056994818653,
      "acc_norm_stderr": 0.03469713791704371
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.4282051282051282,
      "acc_stderr": 0.02508830145469483,
      "acc_norm": 0.4282051282051282,
      "acc_norm_stderr": 0.02508830145469483
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.26296296296296295,
      "acc_stderr": 0.026842057873833706,
      "acc_norm": 0.26296296296296295,
      "acc_norm_stderr": 0.026842057873833706
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.40756302521008403,
      "acc_stderr": 0.03191863374478465,
      "acc_norm": 0.40756302521008403,
      "acc_norm_stderr": 0.03191863374478465
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.2781456953642384,
      "acc_stderr": 0.03658603262763744,
      "acc_norm": 0.2781456953642384,
      "acc_norm_stderr": 0.03658603262763744
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.5559633027522936,
      "acc_stderr": 0.02130262121165452,
      "acc_norm": 0.5559633027522936,
      "acc_norm_stderr": 0.02130262121165452
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.37037037037037035,
      "acc_stderr": 0.03293377139415191,
      "acc_norm": 0.37037037037037035,
      "acc_norm_stderr": 0.03293377139415191
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.6029411764705882,
      "acc_stderr": 0.0343413116471913,
      "acc_norm": 0.6029411764705882,
      "acc_norm_stderr": 0.0343413116471913
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.6582278481012658,
      "acc_stderr": 0.03087453753755362,
      "acc_norm": 0.6582278481012658,
      "acc_norm_stderr": 0.03087453753755362
    },
    "hendrycksTest-human_aging": {
      "acc": 0.5515695067264574,
      "acc_stderr": 0.033378837362550984,
      "acc_norm": 0.5515695067264574,
      "acc_norm_stderr": 0.033378837362550984
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.5343511450381679,
      "acc_stderr": 0.04374928560599738,
      "acc_norm": 0.5343511450381679,
      "acc_norm_stderr": 0.04374928560599738
    },
    "hendrycksTest-international_law": {
      "acc": 0.6776859504132231,
      "acc_stderr": 0.04266416363352167,
      "acc_norm": 0.6776859504132231,
      "acc_norm_stderr": 0.04266416363352167
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.5,
      "acc_stderr": 0.04833682445228318,
      "acc_norm": 0.5,
      "acc_norm_stderr": 0.04833682445228318
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.4601226993865031,
      "acc_stderr": 0.03915857291436972,
      "acc_norm": 0.4601226993865031,
      "acc_norm_stderr": 0.03915857291436972
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.30357142857142855,
      "acc_stderr": 0.04364226155841044,
      "acc_norm": 0.30357142857142855,
      "acc_norm_stderr": 0.04364226155841044
    },
    "hendrycksTest-management": {
      "acc": 0.5922330097087378,
      "acc_stderr": 0.048657775704107696,
      "acc_norm": 0.5922330097087378,
      "acc_norm_stderr": 0.048657775704107696
    },
    "hendrycksTest-marketing": {
      "acc": 0.6324786324786325,
      "acc_stderr": 0.031585391577456365,
      "acc_norm": 0.6324786324786325,
      "acc_norm_stderr": 0.031585391577456365
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.45,
      "acc_stderr": 0.05,
      "acc_norm": 0.45,
      "acc_norm_stderr": 0.05
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.632183908045977,
      "acc_stderr": 0.017243828891846263,
      "acc_norm": 0.632183908045977,
      "acc_norm_stderr": 0.017243828891846263
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.49710982658959535,
      "acc_stderr": 0.026918645383239015,
      "acc_norm": 0.49710982658959535,
      "acc_norm_stderr": 0.026918645383239015
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.2748603351955307,
      "acc_stderr": 0.01493131670322051,
      "acc_norm": 0.2748603351955307,
      "acc_norm_stderr": 0.01493131670322051
    },
    "hendrycksTest-nutrition": {
      "acc": 0.477124183006536,
      "acc_stderr": 0.028599936776089782,
      "acc_norm": 0.477124183006536,
      "acc_norm_stderr": 0.028599936776089782
    },
    "hendrycksTest-philosophy": {
      "acc": 0.5562700964630225,
      "acc_stderr": 0.028217683556652315,
      "acc_norm": 0.5562700964630225,
      "acc_norm_stderr": 0.028217683556652315
    },
    "hendrycksTest-prehistory": {
      "acc": 0.5123456790123457,
      "acc_stderr": 0.027812262269327242,
      "acc_norm": 0.5123456790123457,
      "acc_norm_stderr": 0.027812262269327242
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.35106382978723405,
      "acc_stderr": 0.028473501272963764,
      "acc_norm": 0.35106382978723405,
      "acc_norm_stderr": 0.028473501272963764
    },
    "hendrycksTest-professional_law": {
      "acc": 0.3683181225554107,
      "acc_stderr": 0.012319403369564639,
      "acc_norm": 0.3683181225554107,
      "acc_norm_stderr": 0.012319403369564639
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.4264705882352941,
      "acc_stderr": 0.030042615832714867,
      "acc_norm": 0.4264705882352941,
      "acc_norm_stderr": 0.030042615832714867
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.4411764705882353,
      "acc_stderr": 0.020087362076702853,
      "acc_norm": 0.4411764705882353,
      "acc_norm_stderr": 0.020087362076702853
    },
    "hendrycksTest-public_relations": {
      "acc": 0.5272727272727272,
      "acc_stderr": 0.04782001791380061,
      "acc_norm": 0.5272727272727272,
      "acc_norm_stderr": 0.04782001791380061
    },
    "hendrycksTest-security_studies": {
      "acc": 0.5265306122448979,
      "acc_stderr": 0.03196412734523272,
      "acc_norm": 0.5265306122448979,
      "acc_norm_stderr": 0.03196412734523272
    },
    "hendrycksTest-sociology": {
      "acc": 0.582089552238806,
      "acc_stderr": 0.034875586404620636,
      "acc_norm": 0.582089552238806,
      "acc_norm_stderr": 0.034875586404620636
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.71,
      "acc_stderr": 0.045604802157206845,
      "acc_norm": 0.71,
      "acc_norm_stderr": 0.045604802157206845
    },
    "hendrycksTest-virology": {
      "acc": 0.41566265060240964,
      "acc_stderr": 0.03836722176598052,
      "acc_norm": 0.41566265060240964,
      "acc_norm_stderr": 0.03836722176598052
    },
    "hendrycksTest-world_religions": {
      "acc": 0.6783625730994152,
      "acc_stderr": 0.03582529442573122,
      "acc_norm": 0.6783625730994152,
      "acc_norm_stderr": 0.03582529442573122
    }
  },
  "versions": {
    "hendrycksTest-abstract_algebra": 1,
    "hendrycksTest-anatomy": 1,
    "hendrycksTest-astronomy": 1,
    "hendrycksTest-business_ethics": 1,
    "hendrycksTest-clinical_knowledge": 1,
    "hendrycksTest-college_biology": 1,
    "hendrycksTest-college_chemistry": 1,
    "hendrycksTest-college_computer_science": 1,
    "hendrycksTest-college_mathematics": 1,
    "hendrycksTest-college_medicine": 1,
    "hendrycksTest-college_physics": 1,
    "hendrycksTest-computer_security": 1,
    "hendrycksTest-conceptual_physics": 1,
    "hendrycksTest-econometrics": 1,
    "hendrycksTest-electrical_engineering": 1,
    "hendrycksTest-elementary_mathematics": 1,
    "hendrycksTest-formal_logic": 1,
    "hendrycksTest-global_facts": 1,
    "hendrycksTest-high_school_biology": 1,
    "hendrycksTest-high_school_chemistry": 1,
    "hendrycksTest-high_school_computer_science": 1,
    "hendrycksTest-high_school_european_history": 1,
    "hendrycksTest-high_school_geography": 1,
    "hendrycksTest-high_school_government_and_politics": 1,
    "hendrycksTest-high_school_macroeconomics": 1,
    "hendrycksTest-high_school_mathematics": 1,
    "hendrycksTest-high_school_microeconomics": 1,
    "hendrycksTest-high_school_physics": 1,
    "hendrycksTest-high_school_psychology": 1,
    "hendrycksTest-high_school_statistics": 1,
    "hendrycksTest-high_school_us_history": 1,
    "hendrycksTest-high_school_world_history": 1,
    "hendrycksTest-human_aging": 1,
    "hendrycksTest-human_sexuality": 1,
    "hendrycksTest-international_law": 1,
    "hendrycksTest-jurisprudence": 1,
    "hendrycksTest-logical_fallacies": 1,
    "hendrycksTest-machine_learning": 1,
    "hendrycksTest-management": 1,
    "hendrycksTest-marketing": 1,
    "hendrycksTest-medical_genetics": 1,
    "hendrycksTest-miscellaneous": 1,
    "hendrycksTest-moral_disputes": 1,
    "hendrycksTest-moral_scenarios": 1,
    "hendrycksTest-nutrition": 1,
    "hendrycksTest-philosophy": 1,
    "hendrycksTest-prehistory": 1,
    "hendrycksTest-professional_accounting": 1,
    "hendrycksTest-professional_law": 1,
    "hendrycksTest-professional_medicine": 1,
    "hendrycksTest-professional_psychology": 1,
    "hendrycksTest-public_relations": 1,
    "hendrycksTest-security_studies": 1,
    "hendrycksTest-sociology": 1,
    "hendrycksTest-us_foreign_policy": 1,
    "hendrycksTest-virology": 1,
    "hendrycksTest-world_religions": 1
  },
  "config": {
    "model": "sparseml",
    "model_args": "pretrained=/cache/shubhra/models/platypus_dolphin/cerebras/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2_quant_loge,trust_remote_code=True,dtype=bfloat16",
    "num_fewshot": 5,
    "batch_size": "8",
    "batch_sizes": [],
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}