From 404aa083bed64d1ad50c2859303892bd48514cad Mon Sep 17 00:00:00 2001 From: 4pdadmin <> Date: Tue, 3 Feb 2026 18:16:20 +0000 Subject: [PATCH] commit file to repo --- .gitattributes | 1 + .gitignore | 0 configs/20260204_013552.py | 1347 +++++++++++++++++ .../GaokaoBench_2010-2013_English_MCQs.out | 7 + logs/eval/public/qwen3-0-6b@main/lambada.out | 7 + logs/eval/public/qwen3-0-6b@main/triviaqa.out | 7 + .../GaokaoBench_2010-2013_English_MCQs.out | 10 + .../public/qwen3-0-6b@main/lambada_0.out | 10 + .../public/qwen3-0-6b@main/lambada_1.out | 10 + .../public/qwen3-0-6b@main/lambada_2.out | 10 + .../public/qwen3-0-6b@main/triviaqa_0.out | 10 + .../public/qwen3-0-6b@main/triviaqa_1.out | 10 + .../public/qwen3-0-6b@main/triviaqa_2.out | 10 + .../public/qwen3-0-6b@main/triviaqa_3.out | 10 + .../public/qwen3-0-6b@main/triviaqa_4.out | 10 + .../GaokaoBench_2010-2013_English_MCQs.json | 3 + .../public/qwen3-0-6b@main/lambada_0.json | 3 + .../public/qwen3-0-6b@main/lambada_1.json | 3 + .../public/qwen3-0-6b@main/lambada_2.json | 3 + .../public/qwen3-0-6b@main/triviaqa_0.json | 3 + .../public/qwen3-0-6b@main/triviaqa_1.json | 3 + .../public/qwen3-0-6b@main/triviaqa_2.json | 3 + .../public/qwen3-0-6b@main/triviaqa_3.json | 3 + .../public/qwen3-0-6b@main/triviaqa_4.json | 3 + .../GaokaoBench_2010-2013_English_MCQs.json | 3 + results/public/qwen3-0-6b@main/lambada.json | 3 + results/public/qwen3-0-6b@main/triviaqa.json | 3 + summary/summary_20260204_013552.csv | 87 ++ summary/summary_20260204_013552.txt | 197 +++ 29 files changed, 1779 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 configs/20260204_013552.py create mode 100644 logs/eval/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out create mode 100644 logs/eval/public/qwen3-0-6b@main/lambada.out create mode 100644 logs/eval/public/qwen3-0-6b@main/triviaqa.out create mode 100644 logs/infer/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out create mode 100644 logs/infer/public/qwen3-0-6b@main/lambada_0.out create mode 100644 logs/infer/public/qwen3-0-6b@main/lambada_1.out create mode 100644 logs/infer/public/qwen3-0-6b@main/lambada_2.out create mode 100644 logs/infer/public/qwen3-0-6b@main/triviaqa_0.out create mode 100644 logs/infer/public/qwen3-0-6b@main/triviaqa_1.out create mode 100644 logs/infer/public/qwen3-0-6b@main/triviaqa_2.out create mode 100644 logs/infer/public/qwen3-0-6b@main/triviaqa_3.out create mode 100644 logs/infer/public/qwen3-0-6b@main/triviaqa_4.out create mode 100644 predictions/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.json create mode 100644 predictions/public/qwen3-0-6b@main/lambada_0.json create mode 100644 predictions/public/qwen3-0-6b@main/lambada_1.json create mode 100644 predictions/public/qwen3-0-6b@main/lambada_2.json create mode 100644 predictions/public/qwen3-0-6b@main/triviaqa_0.json create mode 100644 predictions/public/qwen3-0-6b@main/triviaqa_1.json create mode 100644 predictions/public/qwen3-0-6b@main/triviaqa_2.json create mode 100644 predictions/public/qwen3-0-6b@main/triviaqa_3.json create mode 100644 predictions/public/qwen3-0-6b@main/triviaqa_4.json create mode 100644 results/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.json create mode 100644 results/public/qwen3-0-6b@main/lambada.json create mode 100644 results/public/qwen3-0-6b@main/triviaqa.json create mode 100644 summary/summary_20260204_013552.csv create mode 100644 summary/summary_20260204_013552.txt diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7fe70d7 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.json filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/configs/20260204_013552.py b/configs/20260204_013552.py new file mode 100644 index 0000000..402a882 --- /dev/null +++ b/configs/20260204_013552.py @@ -0,0 +1,1347 @@ +datasets=[ + dict(abbr='GaokaoBench_2010-2013_English_MCQs', + eval_cfg=dict( + evaluator=dict( + type='GaokaoBenchEvaluator_single_choice'), + pred_role='BOT'), + infer_cfg=dict( + ice_template=dict( + ice_token='', + template=dict( + round=[ + dict(prompt='请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和之间。\n例如:【答案】: A \n完整的题目回答的格式如下:\n【解析】 ... \n【答案】 ... \n请你严格按照上述格式作答。\n题目如下:{question}', + role='HUMAN'), + ]), + type='opencompass.openicl.icl_prompt_template.PromptTemplate'), + inferencer=dict( + max_out_len=1024, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + path='./data/GAOKAO-BENCH/data/Multiple-choice_Questions/2010-2013_English_MCQs.json', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer'), + type='opencompass.datasets.GaokaoBenchDataset'), + dict(abbr='lambada', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.LambadaEvaluator')), + infer_cfg=dict( + inferencer=dict( + max_out_len=5, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict( + round=[ + dict(prompt='Please complete the following sentence:\n{prompt}', + role='HUMAN'), + ]), + type='opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + path='./data/lambada/test/data-00000-of-00001.arrow', + reader_cfg=dict( + input_columns=[ + 'prompt', + ], + output_column='label', + test_split='test', + train_split='test'), + type='opencompass.datasets.lambadaDataset'), + dict(abbr='triviaqa', + eval_cfg=dict( + evaluator=dict( + type='opencompass.datasets.TriviaQAEvaluator'), + pred_role='BOT'), + infer_cfg=dict( + inferencer=dict( + max_out_len=50, + type='opencompass.openicl.icl_inferencer.GenInferencer'), + prompt_template=dict( + template=dict( + round=[ + dict(prompt="Answer these questions, your answer should be as simple as possible, start your answer with the prompt 'The answer is '.\nQ: {question}?", + role='HUMAN'), + dict(prompt='A:', + role='BOT'), + ]), + type='opencompass.openicl.icl_prompt_template.PromptTemplate'), + retriever=dict( + type='opencompass.openicl.icl_retriever.ZeroRetriever')), + path='./data/triviaqa/', + reader_cfg=dict( + input_columns=[ + 'question', + ], + output_column='answer', + test_split='dev', + train_split='dev'), + type='opencompass.datasets.TriviaQADataset'), + ] +models=[ + dict(abbr='{{$MODEL_ID:public/qwen3-0-6b@main}}', + batch_size=1, + key='fee1ce7f2b0843368012dfa938b261db', + max_out_len=100, + max_seq_len=2048, + openai_api_base='{{$MODEL_URL:http://modelhu-b0f7ds-nginx/learnware/models/openai/4pd/api/v1/chat/completions}}', + path='{{$MODEL_ID:public/qwen3-0-6b@main}}', + temperature=0.95, + type='opencompass.models.OpenAI'), + ] +summarizer=dict( + dataset_abbrs=[ + '--------- 考试 Exam ---------', + 'ceval', + 'agieval', + 'mmlu', + 'GaokaoBench', + 'ARC-c', + '--------- 语言 Language ---------', + 'WiC', + 'summedits', + 'chid-dev', + 'afqmc-dev', + 'bustm-dev', + 'cluewsc-dev', + 'WSC', + 'winogrande', + 'flores_100', + '--------- 知识 Knowledge ---------', + 'BoolQ', + 'commonsense_qa', + 'nq', + 'triviaqa', + '--------- 推理 Reasoning ---------', + 'cmnli', + 'ocnli', + 'ocnli_fc-dev', + 'AX_b', + 'AX_g', + 'CB', + 'RTE', + 'story_cloze', + 'COPA', + 'ReCoRD', + 'hellaswag', + 'piqa', + 'siqa', + 'strategyqa', + 'math', + 'gsm8k', + 'TheoremQA', + 'openai_humaneval', + 'mbpp', + 'cmmlu', + 'bbh', + '--------- 理解 Understanding ---------', + 'C3', + 'CMRC_dev', + 'DRCD_dev', + 'MultiRC', + 'race-middle', + 'race-high', + 'openbookqa_fact', + 'csl_dev', + 'lcsts', + 'Xsum', + 'eprstmt-dev', + 'lambada', + 'tnews-dev', + '--------- 安全 Safety ---------', + 'crows_pairs', + '--------- LEval Exact Match (Acc) ---------', + 'LEval_coursera', + 'LEval_gsm100', + 'LEval_quality', + 'LEval_tpo', + 'LEval_topic_retrieval', + '--------- LEval Gen (ROUGE) ---------', + 'LEval_financialqa', + 'LEval_gov_report_summ', + 'LEval_legal_contract_qa', + 'LEval_meeting_summ', + 'LEval_multidocqa', + 'LEval_narrativeqa', + 'LEval_nq', + 'LEval_news_summ', + 'LEval_paper_assistant', + 'LEval_patent_summ', + 'LEval_review_summ', + 'LEval_scientificqa', + 'LEval_tvshow_summ--------- 长文本 LongBench ---------', + 'longbench_lsht', + 'longbench_vcsum', + 'longbench_dureader', + 'longbench_multifieldqa_zh', + 'longbench_passage_retrieval_zh', + '--------- 单选 自定义数据 ---------', + 'SageBench-exam', + ], + prompt_db=dict( + blacklist='.promptignore', + config_dir='configs/datasets', + database_path='configs/datasets/log.json'), + summary_groups=[ + dict(name='agieval-chinese', + subsets=[ + 'agieval-gaokao-chinese', + 'agieval-gaokao-english', + 'agieval-gaokao-geography', + 'agieval-gaokao-history', + 'agieval-gaokao-biology', + 'agieval-gaokao-chemistry', + 'agieval-gaokao-physics', + 'agieval-gaokao-mathqa', + 'agieval-logiqa-zh', + 'agieval-jec-qa-kd', + 'agieval-jec-qa-ca', + 'agieval-gaokao-mathcloze', + ]), + dict(name='agieval-english', + subsets=[ + 'agieval-lsat-ar', + 'agieval-lsat-lr', + 'agieval-lsat-rc', + 'agieval-logiqa-en', + 'agieval-sat-math', + 'agieval-sat-en', + 'agieval-sat-en-without-passage', + 'agieval-aqua-rat', + 'agieval-math', + ]), + dict(name='agieval-gaokao', + subsets=[ + 'agieval-gaokao-chinese', + 'agieval-gaokao-english', + 'agieval-gaokao-geography', + 'agieval-gaokao-history', + 'agieval-gaokao-biology', + 'agieval-gaokao-chemistry', + 'agieval-gaokao-physics', + 'agieval-gaokao-mathqa', + 'agieval-gaokao-mathcloze', + ]), + dict(name='agieval', + subsets=[ + 'agieval-gaokao-chinese', + 'agieval-gaokao-english', + 'agieval-gaokao-geography', + 'agieval-gaokao-history', + 'agieval-gaokao-biology', + 'agieval-gaokao-chemistry', + 'agieval-gaokao-physics', + 'agieval-gaokao-mathqa', + 'agieval-logiqa-zh', + 'agieval-lsat-ar', + 'agieval-lsat-lr', + 'agieval-lsat-rc', + 'agieval-logiqa-en', + 'agieval-sat-math', + 'agieval-sat-en', + 'agieval-sat-en-without-passage', + 'agieval-aqua-rat', + 'agieval-jec-qa-kd', + 'agieval-jec-qa-ca', + 'agieval-gaokao-mathcloze', + 'agieval-math', + ]), + dict(name='mmlu-humanities', + subsets=[ + 'lukaemon_mmlu_formal_logic', + 'lukaemon_mmlu_high_school_european_history', + 'lukaemon_mmlu_high_school_us_history', + 'lukaemon_mmlu_high_school_world_history', + 'lukaemon_mmlu_international_law', + 'lukaemon_mmlu_jurisprudence', + 'lukaemon_mmlu_logical_fallacies', + 'lukaemon_mmlu_moral_disputes', + 'lukaemon_mmlu_moral_scenarios', + 'lukaemon_mmlu_philosophy', + 'lukaemon_mmlu_prehistory', + 'lukaemon_mmlu_professional_law', + 'lukaemon_mmlu_world_religions', + ]), + dict(name='mmlu-stem', + subsets=[ + 'lukaemon_mmlu_abstract_algebra', + 'lukaemon_mmlu_anatomy', + 'lukaemon_mmlu_astronomy', + 'lukaemon_mmlu_college_biology', + 'lukaemon_mmlu_college_chemistry', + 'lukaemon_mmlu_college_computer_science', + 'lukaemon_mmlu_college_mathematics', + 'lukaemon_mmlu_college_physics', + 'lukaemon_mmlu_computer_security', + 'lukaemon_mmlu_conceptual_physics', + 'lukaemon_mmlu_electrical_engineering', + 'lukaemon_mmlu_elementary_mathematics', + 'lukaemon_mmlu_high_school_biology', + 'lukaemon_mmlu_high_school_chemistry', + 'lukaemon_mmlu_high_school_computer_science', + 'lukaemon_mmlu_high_school_mathematics', + 'lukaemon_mmlu_high_school_physics', + 'lukaemon_mmlu_high_school_statistics', + 'lukaemon_mmlu_machine_learning', + ]), + dict(name='mmlu-social-science', + subsets=[ + 'lukaemon_mmlu_econometrics', + 'lukaemon_mmlu_high_school_geography', + 'lukaemon_mmlu_high_school_government_and_politics', + 'lukaemon_mmlu_high_school_macroeconomics', + 'lukaemon_mmlu_high_school_microeconomics', + 'lukaemon_mmlu_high_school_psychology', + 'lukaemon_mmlu_human_sexuality', + 'lukaemon_mmlu_professional_psychology', + 'lukaemon_mmlu_public_relations', + 'lukaemon_mmlu_security_studies', + 'lukaemon_mmlu_sociology', + 'lukaemon_mmlu_us_foreign_policy', + ]), + dict(name='mmlu-other', + subsets=[ + 'lukaemon_mmlu_business_ethics', + 'lukaemon_mmlu_clinical_knowledge', + 'lukaemon_mmlu_college_medicine', + 'lukaemon_mmlu_global_facts', + 'lukaemon_mmlu_human_aging', + 'lukaemon_mmlu_management', + 'lukaemon_mmlu_marketing', + 'lukaemon_mmlu_medical_genetics', + 'lukaemon_mmlu_miscellaneous', + 'lukaemon_mmlu_nutrition', + 'lukaemon_mmlu_professional_accounting', + 'lukaemon_mmlu_professional_medicine', + 'lukaemon_mmlu_virology', + ]), + dict(name='mmlu', + subsets=[ + 'lukaemon_mmlu_formal_logic', + 'lukaemon_mmlu_high_school_european_history', + 'lukaemon_mmlu_high_school_us_history', + 'lukaemon_mmlu_high_school_world_history', + 'lukaemon_mmlu_international_law', + 'lukaemon_mmlu_jurisprudence', + 'lukaemon_mmlu_logical_fallacies', + 'lukaemon_mmlu_moral_disputes', + 'lukaemon_mmlu_moral_scenarios', + 'lukaemon_mmlu_philosophy', + 'lukaemon_mmlu_prehistory', + 'lukaemon_mmlu_professional_law', + 'lukaemon_mmlu_world_religions', + 'lukaemon_mmlu_abstract_algebra', + 'lukaemon_mmlu_anatomy', + 'lukaemon_mmlu_astronomy', + 'lukaemon_mmlu_college_biology', + 'lukaemon_mmlu_college_chemistry', + 'lukaemon_mmlu_college_computer_science', + 'lukaemon_mmlu_college_mathematics', + 'lukaemon_mmlu_college_physics', + 'lukaemon_mmlu_computer_security', + 'lukaemon_mmlu_conceptual_physics', + 'lukaemon_mmlu_electrical_engineering', + 'lukaemon_mmlu_elementary_mathematics', + 'lukaemon_mmlu_high_school_biology', + 'lukaemon_mmlu_high_school_chemistry', + 'lukaemon_mmlu_high_school_computer_science', + 'lukaemon_mmlu_high_school_mathematics', + 'lukaemon_mmlu_high_school_physics', + 'lukaemon_mmlu_high_school_statistics', + 'lukaemon_mmlu_machine_learning', + 'lukaemon_mmlu_econometrics', + 'lukaemon_mmlu_high_school_geography', + 'lukaemon_mmlu_high_school_government_and_politics', + 'lukaemon_mmlu_high_school_macroeconomics', + 'lukaemon_mmlu_high_school_microeconomics', + 'lukaemon_mmlu_high_school_psychology', + 'lukaemon_mmlu_human_sexuality', + 'lukaemon_mmlu_professional_psychology', + 'lukaemon_mmlu_public_relations', + 'lukaemon_mmlu_security_studies', + 'lukaemon_mmlu_sociology', + 'lukaemon_mmlu_us_foreign_policy', + 'lukaemon_mmlu_business_ethics', + 'lukaemon_mmlu_clinical_knowledge', + 'lukaemon_mmlu_college_medicine', + 'lukaemon_mmlu_global_facts', + 'lukaemon_mmlu_human_aging', + 'lukaemon_mmlu_management', + 'lukaemon_mmlu_marketing', + 'lukaemon_mmlu_medical_genetics', + 'lukaemon_mmlu_miscellaneous', + 'lukaemon_mmlu_nutrition', + 'lukaemon_mmlu_professional_accounting', + 'lukaemon_mmlu_professional_medicine', + 'lukaemon_mmlu_virology', + ]), + dict(name='mmlu-weighted', + subsets=[ + 'lukaemon_mmlu_formal_logic', + 'lukaemon_mmlu_high_school_european_history', + 'lukaemon_mmlu_high_school_us_history', + 'lukaemon_mmlu_high_school_world_history', + 'lukaemon_mmlu_international_law', + 'lukaemon_mmlu_jurisprudence', + 'lukaemon_mmlu_logical_fallacies', + 'lukaemon_mmlu_moral_disputes', + 'lukaemon_mmlu_moral_scenarios', + 'lukaemon_mmlu_philosophy', + 'lukaemon_mmlu_prehistory', + 'lukaemon_mmlu_professional_law', + 'lukaemon_mmlu_world_religions', + 'lukaemon_mmlu_abstract_algebra', + 'lukaemon_mmlu_anatomy', + 'lukaemon_mmlu_astronomy', + 'lukaemon_mmlu_college_biology', + 'lukaemon_mmlu_college_chemistry', + 'lukaemon_mmlu_college_computer_science', + 'lukaemon_mmlu_college_mathematics', + 'lukaemon_mmlu_college_physics', + 'lukaemon_mmlu_computer_security', + 'lukaemon_mmlu_conceptual_physics', + 'lukaemon_mmlu_electrical_engineering', + 'lukaemon_mmlu_elementary_mathematics', + 'lukaemon_mmlu_high_school_biology', + 'lukaemon_mmlu_high_school_chemistry', + 'lukaemon_mmlu_high_school_computer_science', + 'lukaemon_mmlu_high_school_mathematics', + 'lukaemon_mmlu_high_school_physics', + 'lukaemon_mmlu_high_school_statistics', + 'lukaemon_mmlu_machine_learning', + 'lukaemon_mmlu_econometrics', + 'lukaemon_mmlu_high_school_geography', + 'lukaemon_mmlu_high_school_government_and_politics', + 'lukaemon_mmlu_high_school_macroeconomics', + 'lukaemon_mmlu_high_school_microeconomics', + 'lukaemon_mmlu_high_school_psychology', + 'lukaemon_mmlu_human_sexuality', + 'lukaemon_mmlu_professional_psychology', + 'lukaemon_mmlu_public_relations', + 'lukaemon_mmlu_security_studies', + 'lukaemon_mmlu_sociology', + 'lukaemon_mmlu_us_foreign_policy', + 'lukaemon_mmlu_business_ethics', + 'lukaemon_mmlu_clinical_knowledge', + 'lukaemon_mmlu_college_medicine', + 'lukaemon_mmlu_global_facts', + 'lukaemon_mmlu_human_aging', + 'lukaemon_mmlu_management', + 'lukaemon_mmlu_marketing', + 'lukaemon_mmlu_medical_genetics', + 'lukaemon_mmlu_miscellaneous', + 'lukaemon_mmlu_nutrition', + 'lukaemon_mmlu_professional_accounting', + 'lukaemon_mmlu_professional_medicine', + 'lukaemon_mmlu_virology', + ], + weights=dict( + lukaemon_mmlu_abstract_algebra=100, + lukaemon_mmlu_anatomy=135, + lukaemon_mmlu_astronomy=152, + lukaemon_mmlu_business_ethics=100, + lukaemon_mmlu_clinical_knowledge=265, + lukaemon_mmlu_college_biology=144, + lukaemon_mmlu_college_chemistry=100, + lukaemon_mmlu_college_computer_science=100, + lukaemon_mmlu_college_mathematics=100, + lukaemon_mmlu_college_medicine=173, + lukaemon_mmlu_college_physics=102, + lukaemon_mmlu_computer_security=100, + lukaemon_mmlu_conceptual_physics=235, + lukaemon_mmlu_econometrics=114, + lukaemon_mmlu_electrical_engineering=145, + lukaemon_mmlu_elementary_mathematics=378, + lukaemon_mmlu_formal_logic=126, + lukaemon_mmlu_global_facts=100, + lukaemon_mmlu_high_school_biology=310, + lukaemon_mmlu_high_school_chemistry=203, + lukaemon_mmlu_high_school_computer_science=100, + lukaemon_mmlu_high_school_european_history=165, + lukaemon_mmlu_high_school_geography=198, + lukaemon_mmlu_high_school_government_and_politics=193, + lukaemon_mmlu_high_school_macroeconomics=390, + lukaemon_mmlu_high_school_mathematics=270, + lukaemon_mmlu_high_school_microeconomics=238, + lukaemon_mmlu_high_school_physics=151, + lukaemon_mmlu_high_school_psychology=545, + lukaemon_mmlu_high_school_statistics=216, + lukaemon_mmlu_high_school_us_history=204, + lukaemon_mmlu_high_school_world_history=237, + lukaemon_mmlu_human_aging=223, + lukaemon_mmlu_human_sexuality=131, + lukaemon_mmlu_international_law=121, + lukaemon_mmlu_jurisprudence=108, + lukaemon_mmlu_logical_fallacies=163, + lukaemon_mmlu_machine_learning=112, + lukaemon_mmlu_management=103, + lukaemon_mmlu_marketing=234, + lukaemon_mmlu_medical_genetics=100, + lukaemon_mmlu_miscellaneous=783, + lukaemon_mmlu_moral_disputes=346, + lukaemon_mmlu_moral_scenarios=895, + lukaemon_mmlu_nutrition=306, + lukaemon_mmlu_philosophy=311, + lukaemon_mmlu_prehistory=324, + lukaemon_mmlu_professional_accounting=282, + lukaemon_mmlu_professional_law=1534, + lukaemon_mmlu_professional_medicine=272, + lukaemon_mmlu_professional_psychology=612, + lukaemon_mmlu_public_relations=110, + lukaemon_mmlu_security_studies=245, + lukaemon_mmlu_sociology=201, + lukaemon_mmlu_us_foreign_policy=100, + lukaemon_mmlu_virology=166, + lukaemon_mmlu_world_religions=171)), + dict(name='cmmlu-humanities', + subsets=[ + 'cmmlu-arts', + 'cmmlu-chinese_history', + 'cmmlu-chinese_literature', + 'cmmlu-college_law', + 'cmmlu-global_facts', + 'cmmlu-international_law', + 'cmmlu-jurisprudence', + 'cmmlu-logical', + 'cmmlu-marxist_theory', + 'cmmlu-philosophy', + 'cmmlu-professional_law', + 'cmmlu-world_history', + 'cmmlu-world_religions', + ]), + dict(name='cmmlu-stem', + subsets=[ + 'cmmlu-anatomy', + 'cmmlu-astronomy', + 'cmmlu-college_actuarial_science', + 'cmmlu-college_engineering_hydrology', + 'cmmlu-college_mathematics', + 'cmmlu-college_medical_statistics', + 'cmmlu-computer_science', + 'cmmlu-conceptual_physics', + 'cmmlu-electrical_engineering', + 'cmmlu-elementary_mathematics', + 'cmmlu-genetics', + 'cmmlu-high_school_biology', + 'cmmlu-high_school_chemistry', + 'cmmlu-high_school_mathematics', + 'cmmlu-high_school_physics', + 'cmmlu-machine_learning', + 'cmmlu-virology', + ]), + dict(name='cmmlu-social-science', + subsets=[ + 'cmmlu-ancient_chinese', + 'cmmlu-business_ethics', + 'cmmlu-chinese_civil_service_exam', + 'cmmlu-chinese_food_culture', + 'cmmlu-chinese_foreign_policy', + 'cmmlu-chinese_teacher_qualification', + 'cmmlu-college_education', + 'cmmlu-economics', + 'cmmlu-education', + 'cmmlu-elementary_chinese', + 'cmmlu-ethnology', + 'cmmlu-high_school_geography', + 'cmmlu-high_school_politics', + 'cmmlu-journalism', + 'cmmlu-management', + 'cmmlu-marketing', + 'cmmlu-modern_chinese', + 'cmmlu-professional_accounting', + 'cmmlu-professional_psychology', + 'cmmlu-public_relations', + 'cmmlu-security_study', + 'cmmlu-sociology', + ]), + dict(name='cmmlu-other', + subsets=[ + 'cmmlu-agronomy', + 'cmmlu-chinese_driving_rule', + 'cmmlu-clinical_knowledge', + 'cmmlu-college_medicine', + 'cmmlu-computer_security', + 'cmmlu-construction_project_management', + 'cmmlu-elementary_commonsense', + 'cmmlu-elementary_information_and_technology', + 'cmmlu-food_science', + 'cmmlu-human_sexuality', + 'cmmlu-legal_and_moral_basis', + 'cmmlu-nutrition', + 'cmmlu-professional_medicine', + 'cmmlu-sports_science', + 'cmmlu-traditional_chinese_medicine', + ]), + dict(name='cmmlu-china-specific', + subsets=[ + 'cmmlu-ancient_chinese', + 'cmmlu-chinese_civil_service_exam', + 'cmmlu-chinese_driving_rule', + 'cmmlu-chinese_food_culture', + 'cmmlu-chinese_foreign_policy', + 'cmmlu-chinese_history', + 'cmmlu-chinese_literature', + 'cmmlu-chinese_teacher_qualification', + 'cmmlu-construction_project_management', + 'cmmlu-elementary_chinese', + 'cmmlu-elementary_commonsense', + 'cmmlu-ethnology', + 'cmmlu-high_school_politics', + 'cmmlu-modern_chinese', + 'cmmlu-traditional_chinese_medicine', + ]), + dict(name='cmmlu', + subsets=[ + 'cmmlu-agronomy', + 'cmmlu-anatomy', + 'cmmlu-ancient_chinese', + 'cmmlu-arts', + 'cmmlu-astronomy', + 'cmmlu-business_ethics', + 'cmmlu-chinese_civil_service_exam', + 'cmmlu-chinese_driving_rule', + 'cmmlu-chinese_food_culture', + 'cmmlu-chinese_foreign_policy', + 'cmmlu-chinese_history', + 'cmmlu-chinese_literature', + 'cmmlu-chinese_teacher_qualification', + 'cmmlu-college_actuarial_science', + 'cmmlu-college_education', + 'cmmlu-college_engineering_hydrology', + 'cmmlu-college_law', + 'cmmlu-college_mathematics', + 'cmmlu-college_medical_statistics', + 'cmmlu-clinical_knowledge', + 'cmmlu-college_medicine', + 'cmmlu-computer_science', + 'cmmlu-computer_security', + 'cmmlu-conceptual_physics', + 'cmmlu-construction_project_management', + 'cmmlu-economics', + 'cmmlu-education', + 'cmmlu-elementary_chinese', + 'cmmlu-elementary_commonsense', + 'cmmlu-elementary_information_and_technology', + 'cmmlu-electrical_engineering', + 'cmmlu-elementary_mathematics', + 'cmmlu-ethnology', + 'cmmlu-food_science', + 'cmmlu-genetics', + 'cmmlu-global_facts', + 'cmmlu-high_school_biology', + 'cmmlu-high_school_chemistry', + 'cmmlu-high_school_geography', + 'cmmlu-high_school_mathematics', + 'cmmlu-high_school_physics', + 'cmmlu-high_school_politics', + 'cmmlu-human_sexuality', + 'cmmlu-international_law', + 'cmmlu-journalism', + 'cmmlu-jurisprudence', + 'cmmlu-legal_and_moral_basis', + 'cmmlu-logical', + 'cmmlu-machine_learning', + 'cmmlu-management', + 'cmmlu-marketing', + 'cmmlu-marxist_theory', + 'cmmlu-modern_chinese', + 'cmmlu-nutrition', + 'cmmlu-philosophy', + 'cmmlu-professional_accounting', + 'cmmlu-professional_law', + 'cmmlu-professional_medicine', + 'cmmlu-professional_psychology', + 'cmmlu-public_relations', + 'cmmlu-security_study', + 'cmmlu-sociology', + 'cmmlu-sports_science', + 'cmmlu-traditional_chinese_medicine', + 'cmmlu-virology', + 'cmmlu-world_history', + 'cmmlu-world_religions', + ]), + dict(name='ceval-stem', + subsets=[ + 'ceval-computer_network', + 'ceval-operating_system', + 'ceval-computer_architecture', + 'ceval-college_programming', + 'ceval-college_physics', + 'ceval-college_chemistry', + 'ceval-advanced_mathematics', + 'ceval-probability_and_statistics', + 'ceval-discrete_mathematics', + 'ceval-electrical_engineer', + 'ceval-metrology_engineer', + 'ceval-high_school_mathematics', + 'ceval-high_school_physics', + 'ceval-high_school_chemistry', + 'ceval-high_school_biology', + 'ceval-middle_school_mathematics', + 'ceval-middle_school_biology', + 'ceval-middle_school_physics', + 'ceval-middle_school_chemistry', + 'ceval-veterinary_medicine', + ]), + dict(name='ceval-social-science', + subsets=[ + 'ceval-college_economics', + 'ceval-business_administration', + 'ceval-marxism', + 'ceval-mao_zedong_thought', + 'ceval-education_science', + 'ceval-teacher_qualification', + 'ceval-high_school_politics', + 'ceval-high_school_geography', + 'ceval-middle_school_politics', + 'ceval-middle_school_geography', + ]), + dict(name='ceval-humanities', + subsets=[ + 'ceval-modern_chinese_history', + 'ceval-ideological_and_moral_cultivation', + 'ceval-logic', + 'ceval-law', + 'ceval-chinese_language_and_literature', + 'ceval-art_studies', + 'ceval-professional_tour_guide', + 'ceval-legal_professional', + 'ceval-high_school_chinese', + 'ceval-high_school_history', + 'ceval-middle_school_history', + ]), + dict(name='ceval-other', + subsets=[ + 'ceval-civil_servant', + 'ceval-sports_science', + 'ceval-plant_protection', + 'ceval-basic_medicine', + 'ceval-clinical_medicine', + 'ceval-urban_and_rural_planner', + 'ceval-accountant', + 'ceval-fire_engineer', + 'ceval-environmental_impact_assessment_engineer', + 'ceval-tax_accountant', + 'ceval-physician', + ]), + dict(name='ceval-hard', + subsets=[ + 'ceval-advanced_mathematics', + 'ceval-discrete_mathematics', + 'ceval-probability_and_statistics', + 'ceval-college_chemistry', + 'ceval-college_physics', + 'ceval-high_school_mathematics', + 'ceval-high_school_chemistry', + 'ceval-high_school_physics', + ]), + dict(name='ceval', + subsets=[ + 'ceval-computer_network', + 'ceval-operating_system', + 'ceval-computer_architecture', + 'ceval-college_programming', + 'ceval-college_physics', + 'ceval-college_chemistry', + 'ceval-advanced_mathematics', + 'ceval-probability_and_statistics', + 'ceval-discrete_mathematics', + 'ceval-electrical_engineer', + 'ceval-metrology_engineer', + 'ceval-high_school_mathematics', + 'ceval-high_school_physics', + 'ceval-high_school_chemistry', + 'ceval-high_school_biology', + 'ceval-middle_school_mathematics', + 'ceval-middle_school_biology', + 'ceval-middle_school_physics', + 'ceval-middle_school_chemistry', + 'ceval-veterinary_medicine', + 'ceval-college_economics', + 'ceval-business_administration', + 'ceval-marxism', + 'ceval-mao_zedong_thought', + 'ceval-education_science', + 'ceval-teacher_qualification', + 'ceval-high_school_politics', + 'ceval-high_school_geography', + 'ceval-middle_school_politics', + 'ceval-middle_school_geography', + 'ceval-modern_chinese_history', + 'ceval-ideological_and_moral_cultivation', + 'ceval-logic', + 'ceval-law', + 'ceval-chinese_language_and_literature', + 'ceval-art_studies', + 'ceval-professional_tour_guide', + 'ceval-legal_professional', + 'ceval-high_school_chinese', + 'ceval-high_school_history', + 'ceval-middle_school_history', + 'ceval-civil_servant', + 'ceval-sports_science', + 'ceval-plant_protection', + 'ceval-basic_medicine', + 'ceval-clinical_medicine', + 'ceval-urban_and_rural_planner', + 'ceval-accountant', + 'ceval-fire_engineer', + 'ceval-environmental_impact_assessment_engineer', + 'ceval-tax_accountant', + 'ceval-physician', + ]), + dict(name='bbh', + subsets=[ + 'bbh-temporal_sequences', + 'bbh-disambiguation_qa', + 'bbh-date_understanding', + 'bbh-tracking_shuffled_objects_three_objects', + 'bbh-penguins_in_a_table', + 'bbh-geometric_shapes', + 'bbh-snarks', + 'bbh-ruin_names', + 'bbh-tracking_shuffled_objects_seven_objects', + 'bbh-tracking_shuffled_objects_five_objects', + 'bbh-logical_deduction_three_objects', + 'bbh-hyperbaton', + 'bbh-logical_deduction_five_objects', + 'bbh-logical_deduction_seven_objects', + 'bbh-movie_recommendation', + 'bbh-salient_translation_error_detection', + 'bbh-reasoning_about_colored_objects', + 'bbh-multistep_arithmetic_two', + 'bbh-navigate', + 'bbh-dyck_languages', + 'bbh-word_sorting', + 'bbh-sports_understanding', + 'bbh-boolean_expressions', + 'bbh-object_counting', + 'bbh-formal_fallacies', + 'bbh-causal_judgement', + 'bbh-web_of_lies', + ]), + dict(name='GaokaoBench', + subsets=[ + 'GaokaoBench_2010-2022_Math_II_MCQs', + 'GaokaoBench_2010-2022_Math_I_MCQs', + 'GaokaoBench_2010-2022_History_MCQs', + 'GaokaoBench_2010-2022_Biology_MCQs', + 'GaokaoBench_2010-2022_Political_Science_MCQs', + 'GaokaoBench_2010-2022_Physics_MCQs', + 'GaokaoBench_2010-2022_Chemistry_MCQs', + 'GaokaoBench_2010-2013_English_MCQs', + 'GaokaoBench_2010-2022_Chinese_Modern_Lit', + 'GaokaoBench_2010-2022_English_Fill_in_Blanks', + 'GaokaoBench_2012-2022_English_Cloze_Test', + 'GaokaoBench_2010-2022_Geography_MCQs', + 'GaokaoBench_2010-2022_English_Reading_Comp', + 'GaokaoBench_2010-2022_Chinese_Lang_and_Usage_MCQs', + ], + weights=dict( + {'GaokaoBench_2010-2013_English_MCQs': 105, + 'GaokaoBench_2010-2022_Biology_MCQs': 900, + 'GaokaoBench_2010-2022_Chemistry_MCQs': 744, + 'GaokaoBench_2010-2022_Chinese_Lang_and_Usage_MCQs': 240, + 'GaokaoBench_2010-2022_Chinese_Modern_Lit': 261, + 'GaokaoBench_2010-2022_English_Fill_in_Blanks': 900.0, + 'GaokaoBench_2010-2022_English_Reading_Comp': 940, + 'GaokaoBench_2010-2022_Geography_MCQs': 380, + 'GaokaoBench_2010-2022_History_MCQs': 1148, + 'GaokaoBench_2010-2022_Math_II_MCQs': 1090, + 'GaokaoBench_2010-2022_Math_I_MCQs': 1070, + 'GaokaoBench_2010-2022_Physics_MCQs': 384, + 'GaokaoBench_2010-2022_Political_Science_MCQs': 1280, + 'GaokaoBench_2012-2022_English_Cloze_Test': 260})), + dict(name='flores_100_Indo-European-Germanic_English', + subsets=[ + 'flores_100_afr-eng', + 'flores_100_dan-eng', + 'flores_100_deu-eng', + 'flores_100_isl-eng', + 'flores_100_ltz-eng', + 'flores_100_nld-eng', + 'flores_100_nob-eng', + 'flores_100_swe-eng', + ]), + dict(name='flores_100_English_Indo-European-Germanic', + subsets=[ + 'flores_100_eng-afr', + 'flores_100_eng-dan', + 'flores_100_eng-deu', + 'flores_100_eng-isl', + 'flores_100_eng-ltz', + 'flores_100_eng-nld', + 'flores_100_eng-nob', + 'flores_100_eng-swe', + ]), + dict(name='flores_100_Indo-European-Romance_English', + subsets=[ + 'flores_100_ast-eng', + 'flores_100_cat-eng', + 'flores_100_fra-eng', + 'flores_100_glg-eng', + 'flores_100_oci-eng', + 'flores_100_por-eng', + 'flores_100_ron-eng', + 'flores_100_spa-eng', + ]), + dict(name='flores_100_English_Indo-European-Romance', + subsets=[ + 'flores_100_eng-ast', + 'flores_100_eng-cat', + 'flores_100_eng-fra', + 'flores_100_eng-glg', + 'flores_100_eng-oci', + 'flores_100_eng-por', + 'flores_100_eng-ron', + 'flores_100_eng-spa', + ]), + dict(name='flores_100_Indo-European-Slavic_English', + subsets=[ + 'flores_100_bel-eng', + 'flores_100_bos-eng', + 'flores_100_bul-eng', + 'flores_100_ces-eng', + 'flores_100_hrv-eng', + 'flores_100_mkd-eng', + 'flores_100_pol-eng', + 'flores_100_rus-eng', + 'flores_100_slk-eng', + 'flores_100_slv-eng', + 'flores_100_srp-eng', + 'flores_100_ukr-eng', + ]), + dict(name='flores_100_English_Indo-European-Slavic', + subsets=[ + 'flores_100_eng-bel', + 'flores_100_eng-bos', + 'flores_100_eng-bul', + 'flores_100_eng-ces', + 'flores_100_eng-hrv', + 'flores_100_eng-mkd', + 'flores_100_eng-pol', + 'flores_100_eng-rus', + 'flores_100_eng-slk', + 'flores_100_eng-slv', + 'flores_100_eng-srp', + 'flores_100_eng-ukr', + ]), + dict(name='flores_100_Indo-European-Indo-Aryan_English', + subsets=[ + 'flores_100_asm-eng', + 'flores_100_ben-eng', + 'flores_100_guj-eng', + 'flores_100_hin-eng', + 'flores_100_mar-eng', + 'flores_100_npi-eng', + 'flores_100_ory-eng', + 'flores_100_pan-eng', + 'flores_100_snd-eng', + 'flores_100_urd-eng', + ]), + dict(name='flores_100_English_Indo-European-Indo-Aryan', + subsets=[ + 'flores_100_eng-asm', + 'flores_100_eng-ben', + 'flores_100_eng-guj', + 'flores_100_eng-hin', + 'flores_100_eng-mar', + 'flores_100_eng-npi', + 'flores_100_eng-ory', + 'flores_100_eng-pan', + 'flores_100_eng-snd', + 'flores_100_eng-urd', + ]), + dict(name='flores_100_Indo-European-Other_English', + subsets=[ + 'flores_100_ckb-eng', + 'flores_100_cym-eng', + 'flores_100_ell-eng', + 'flores_100_fas-eng', + 'flores_100_gle-eng', + 'flores_100_hye-eng', + 'flores_100_ita-eng', + 'flores_100_lav-eng', + 'flores_100_lit-eng', + 'flores_100_pus-eng', + 'flores_100_tgk-eng', + ]), + dict(name='flores_100_English_Indo-European-Other', + subsets=[ + 'flores_100_eng-ckb', + 'flores_100_eng-cym', + 'flores_100_eng-ell', + 'flores_100_eng-fas', + 'flores_100_eng-gle', + 'flores_100_eng-hye', + 'flores_100_eng-ita', + 'flores_100_eng-lav', + 'flores_100_eng-lit', + 'flores_100_eng-pus', + 'flores_100_eng-tgk', + ]), + dict(name='flores_100_Austronesian_English', + subsets=[ + 'flores_100_ceb-eng', + 'flores_100_ind-eng', + 'flores_100_jav-eng', + 'flores_100_mri-eng', + 'flores_100_msa-eng', + 'flores_100_tgl-eng', + ]), + dict(name='flores_100_English_Austronesian', + subsets=[ + 'flores_100_eng-ceb', + 'flores_100_eng-ind', + 'flores_100_eng-jav', + 'flores_100_eng-mri', + 'flores_100_eng-msa', + 'flores_100_eng-tgl', + ]), + dict(name='flores_100_Atlantic-Congo_English', + subsets=[ + 'flores_100_ibo-eng', + 'flores_100_kam-eng', + 'flores_100_kea-eng', + 'flores_100_lin-eng', + 'flores_100_lug-eng', + 'flores_100_nso-eng', + 'flores_100_nya-eng', + 'flores_100_sna-eng', + 'flores_100_swh-eng', + 'flores_100_umb-eng', + 'flores_100_wol-eng', + 'flores_100_xho-eng', + 'flores_100_yor-eng', + 'flores_100_zul-eng', + ]), + dict(name='flores_100_English_Atlantic-Congo', + subsets=[ + 'flores_100_eng-ibo', + 'flores_100_eng-kam', + 'flores_100_eng-kea', + 'flores_100_eng-lin', + 'flores_100_eng-lug', + 'flores_100_eng-nso', + 'flores_100_eng-nya', + 'flores_100_eng-sna', + 'flores_100_eng-swh', + 'flores_100_eng-umb', + 'flores_100_eng-wol', + 'flores_100_eng-xho', + 'flores_100_eng-yor', + 'flores_100_eng-zul', + ]), + dict(name='flores_100_Afro-Asiatic_English', + subsets=[ + 'flores_100_amh-eng', + 'flores_100_ara-eng', + 'flores_100_ful-eng', + 'flores_100_mlt-eng', + 'flores_100_orm-eng', + 'flores_100_som-eng', + ]), + dict(name='flores_100_English_Afro-Asiatic', + subsets=[ + 'flores_100_eng-amh', + 'flores_100_eng-ara', + 'flores_100_eng-ful', + 'flores_100_eng-mlt', + 'flores_100_eng-orm', + 'flores_100_eng-som', + ]), + dict(name='flores_100_Turkic_English', + subsets=[ + 'flores_100_azj-eng', + 'flores_100_kaz-eng', + 'flores_100_kir-eng', + 'flores_100_tur-eng', + 'flores_100_uzb-eng', + ]), + dict(name='flores_100_English_Turkic', + subsets=[ + 'flores_100_eng-azj', + 'flores_100_eng-kaz', + 'flores_100_eng-kir', + 'flores_100_eng-tur', + 'flores_100_eng-uzb', + ]), + dict(name='flores_100_Dravidian_English', + subsets=[ + 'flores_100_kan-eng', + 'flores_100_mal-eng', + 'flores_100_tam-eng', + 'flores_100_tel-eng', + ]), + dict(name='flores_100_English_Dravidian', + subsets=[ + 'flores_100_eng-kan', + 'flores_100_eng-mal', + 'flores_100_eng-tam', + 'flores_100_eng-tel', + ]), + dict(name='flores_100_Sino-Tibetan_English', + subsets=[ + 'flores_100_mya-eng', + 'flores_100_zho_simpl-eng', + 'flores_100_zho_trad-eng', + ]), + dict(name='flores_100_English_Sino-Tibetan', + subsets=[ + 'flores_100_eng-mya', + 'flores_100_eng-zho_simpl', + 'flores_100_eng-zho_trad', + ]), + dict(name='flores_100_Other_English', + subsets=[ + 'flores_100_est-eng', + 'flores_100_fin-eng', + 'flores_100_hau-eng', + 'flores_100_heb-eng', + 'flores_100_hun-eng', + 'flores_100_jpn-eng', + 'flores_100_kat-eng', + 'flores_100_khm-eng', + 'flores_100_kor-eng', + 'flores_100_lao-eng', + 'flores_100_luo-eng', + 'flores_100_mon-eng', + 'flores_100_tha-eng', + 'flores_100_vie-eng', + ]), + dict(name='flores_100_English_Other', + subsets=[ + 'flores_100_eng-est', + 'flores_100_eng-fin', + 'flores_100_eng-hau', + 'flores_100_eng-heb', + 'flores_100_eng-hun', + 'flores_100_eng-jpn', + 'flores_100_eng-kat', + 'flores_100_eng-khm', + 'flores_100_eng-kor', + 'flores_100_eng-lao', + 'flores_100_eng-luo', + 'flores_100_eng-mon', + 'flores_100_eng-tha', + 'flores_100_eng-vie', + ]), + dict(name='flores_100', + subsets=[ + 'flores_100_afr-eng', + 'flores_100_dan-eng', + 'flores_100_deu-eng', + 'flores_100_isl-eng', + 'flores_100_ltz-eng', + 'flores_100_nld-eng', + 'flores_100_nob-eng', + 'flores_100_swe-eng', + 'flores_100_ast-eng', + 'flores_100_cat-eng', + 'flores_100_fra-eng', + 'flores_100_glg-eng', + 'flores_100_oci-eng', + 'flores_100_por-eng', + 'flores_100_ron-eng', + 'flores_100_spa-eng', + 'flores_100_bel-eng', + 'flores_100_bos-eng', + 'flores_100_bul-eng', + 'flores_100_ces-eng', + 'flores_100_hrv-eng', + 'flores_100_mkd-eng', + 'flores_100_pol-eng', + 'flores_100_rus-eng', + 'flores_100_slk-eng', + 'flores_100_slv-eng', + 'flores_100_srp-eng', + 'flores_100_ukr-eng', + 'flores_100_asm-eng', + 'flores_100_ben-eng', + 'flores_100_guj-eng', + 'flores_100_hin-eng', + 'flores_100_mar-eng', + 'flores_100_npi-eng', + 'flores_100_ory-eng', + 'flores_100_pan-eng', + 'flores_100_snd-eng', + 'flores_100_urd-eng', + 'flores_100_ckb-eng', + 'flores_100_cym-eng', + 'flores_100_ell-eng', + 'flores_100_fas-eng', + 'flores_100_gle-eng', + 'flores_100_hye-eng', + 'flores_100_ita-eng', + 'flores_100_lav-eng', + 'flores_100_lit-eng', + 'flores_100_pus-eng', + 'flores_100_tgk-eng', + 'flores_100_ceb-eng', + 'flores_100_ind-eng', + 'flores_100_jav-eng', + 'flores_100_mri-eng', + 'flores_100_msa-eng', + 'flores_100_tgl-eng', + 'flores_100_ibo-eng', + 'flores_100_kam-eng', + 'flores_100_kea-eng', + 'flores_100_lin-eng', + 'flores_100_lug-eng', + 'flores_100_nso-eng', + 'flores_100_nya-eng', + 'flores_100_sna-eng', + 'flores_100_swh-eng', + 'flores_100_umb-eng', + 'flores_100_wol-eng', + 'flores_100_xho-eng', + 'flores_100_yor-eng', + 'flores_100_zul-eng', + 'flores_100_amh-eng', + 'flores_100_ara-eng', + 'flores_100_ful-eng', + 'flores_100_mlt-eng', + 'flores_100_orm-eng', + 'flores_100_som-eng', + 'flores_100_azj-eng', + 'flores_100_kaz-eng', + 'flores_100_kir-eng', + 'flores_100_tur-eng', + 'flores_100_uzb-eng', + 'flores_100_kan-eng', + 'flores_100_mal-eng', + 'flores_100_tam-eng', + 'flores_100_tel-eng', + 'flores_100_mya-eng', + 'flores_100_zho_simpl-eng', + 'flores_100_zho_trad-eng', + 'flores_100_est-eng', + 'flores_100_fin-eng', + 'flores_100_hau-eng', + 'flores_100_heb-eng', + 'flores_100_hun-eng', + 'flores_100_jpn-eng', + 'flores_100_kat-eng', + 'flores_100_khm-eng', + 'flores_100_kor-eng', + 'flores_100_lao-eng', + 'flores_100_luo-eng', + 'flores_100_mon-eng', + 'flores_100_tha-eng', + 'flores_100_vie-eng', + 'flores_100_eng-afr', + 'flores_100_eng-dan', + 'flores_100_eng-deu', + 'flores_100_eng-isl', + 'flores_100_eng-ltz', + 'flores_100_eng-nld', + 'flores_100_eng-nob', + 'flores_100_eng-swe', + 'flores_100_eng-ast', + 'flores_100_eng-cat', + 'flores_100_eng-fra', + 'flores_100_eng-glg', + 'flores_100_eng-oci', + 'flores_100_eng-por', + 'flores_100_eng-ron', + 'flores_100_eng-spa', + 'flores_100_eng-bel', + 'flores_100_eng-bos', + 'flores_100_eng-bul', + 'flores_100_eng-ces', + 'flores_100_eng-hrv', + 'flores_100_eng-mkd', + 'flores_100_eng-pol', + 'flores_100_eng-rus', + 'flores_100_eng-slk', + 'flores_100_eng-slv', + 'flores_100_eng-srp', + 'flores_100_eng-ukr', + 'flores_100_eng-asm', + 'flores_100_eng-ben', + 'flores_100_eng-guj', + 'flores_100_eng-hin', + 'flores_100_eng-mar', + 'flores_100_eng-npi', + 'flores_100_eng-ory', + 'flores_100_eng-pan', + 'flores_100_eng-snd', + 'flores_100_eng-urd', + 'flores_100_eng-ckb', + 'flores_100_eng-cym', + 'flores_100_eng-ell', + 'flores_100_eng-fas', + 'flores_100_eng-gle', + 'flores_100_eng-hye', + 'flores_100_eng-ita', + 'flores_100_eng-lav', + 'flores_100_eng-lit', + 'flores_100_eng-pus', + 'flores_100_eng-tgk', + 'flores_100_eng-ceb', + 'flores_100_eng-ind', + 'flores_100_eng-jav', + 'flores_100_eng-mri', + 'flores_100_eng-msa', + 'flores_100_eng-tgl', + 'flores_100_eng-ibo', + 'flores_100_eng-kam', + 'flores_100_eng-kea', + 'flores_100_eng-lin', + 'flores_100_eng-lug', + 'flores_100_eng-nso', + 'flores_100_eng-nya', + 'flores_100_eng-sna', + 'flores_100_eng-swh', + 'flores_100_eng-umb', + 'flores_100_eng-wol', + 'flores_100_eng-xho', + 'flores_100_eng-yor', + 'flores_100_eng-zul', + 'flores_100_eng-amh', + 'flores_100_eng-ara', + 'flores_100_eng-ful', + 'flores_100_eng-mlt', + 'flores_100_eng-orm', + 'flores_100_eng-som', + 'flores_100_eng-azj', + 'flores_100_eng-kaz', + 'flores_100_eng-kir', + 'flores_100_eng-tur', + 'flores_100_eng-uzb', + 'flores_100_eng-kan', + 'flores_100_eng-mal', + 'flores_100_eng-tam', + 'flores_100_eng-tel', + 'flores_100_eng-mya', + 'flores_100_eng-zho_simpl', + 'flores_100_eng-zho_trad', + 'flores_100_eng-est', + 'flores_100_eng-fin', + 'flores_100_eng-hau', + 'flores_100_eng-heb', + 'flores_100_eng-hun', + 'flores_100_eng-jpn', + 'flores_100_eng-kat', + 'flores_100_eng-khm', + 'flores_100_eng-kor', + 'flores_100_eng-lao', + 'flores_100_eng-luo', + 'flores_100_eng-mon', + 'flores_100_eng-tha', + 'flores_100_eng-vie', + ]), + dict(name='jigsaw_multilingual', + subsets=[ + 'jigsaw_multilingual_es', + 'jigsaw_multilingual_fr', + 'jigsaw_multilingual_it', + 'jigsaw_multilingual_pt', + 'jigsaw_multilingual_ru', + 'jigsaw_multilingual_tr', + ]), + ]) +work_dir='outputs/demo/20260204_013552' \ No newline at end of file diff --git a/logs/eval/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out b/logs/eval/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out new file mode 100644 index 0000000..7a99586 --- /dev/null +++ b/logs/eval/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out @@ -0,0 +1,7 @@ +[RISE-CORE Msg(16239:140152262265856:libvgpu.c:901)]: Initializing..... +[RISE-CORE ERROR (pid:16239 thread=140152262265856 libvgpu.c:958)]: cuInit failed:100 +02/04 02:16:10 - OpenCompass - INFO - Task [public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs]: {'score': 40.0} +02/04 02:16:10 - OpenCompass - INFO - time elapsed: 2.35s +/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') +[RISE-CORE Msg(16239:140152262265856:multiprocess_memory_limit.c:504)]: Calling exit handler 16239 diff --git a/logs/eval/public/qwen3-0-6b@main/lambada.out b/logs/eval/public/qwen3-0-6b@main/lambada.out new file mode 100644 index 0000000..68c35b3 --- /dev/null +++ b/logs/eval/public/qwen3-0-6b@main/lambada.out @@ -0,0 +1,7 @@ +[RISE-CORE Msg(16051:140385105300480:libvgpu.c:901)]: Initializing..... +[RISE-CORE ERROR (pid:16051 thread=140385105300480 libvgpu.c:958)]: cuInit failed:100 +02/04 02:15:59 - OpenCompass - INFO - Task [public/qwen3-0-6b@main/lambada]: {'accuracy': 0.038812342324859306} +02/04 02:15:59 - OpenCompass - INFO - time elapsed: 2.27s +/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') +[RISE-CORE Msg(16051:140385105300480:multiprocess_memory_limit.c:504)]: Calling exit handler 16051 diff --git a/logs/eval/public/qwen3-0-6b@main/triviaqa.out b/logs/eval/public/qwen3-0-6b@main/triviaqa.out new file mode 100644 index 0000000..cd26c43 --- /dev/null +++ b/logs/eval/public/qwen3-0-6b@main/triviaqa.out @@ -0,0 +1,7 @@ +[RISE-CORE Msg(16236:140076091423744:libvgpu.c:901)]: Initializing..... +[RISE-CORE ERROR (pid:16236 thread=140076091423744 libvgpu.c:958)]: cuInit failed:100 +02/04 02:16:11 - OpenCompass - INFO - Task [public/qwen3-0-6b@main/triviaqa]: {'score': 0.011316057485572028} +02/04 02:16:11 - OpenCompass - INFO - time elapsed: 3.99s +/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') +[RISE-CORE Msg(16236:140076091423744:multiprocess_memory_limit.c:504)]: Calling exit handler 16236 diff --git a/logs/infer/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out b/logs/infer/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out new file mode 100644 index 0000000..a1f9959 --- /dev/null +++ b/logs/infer/public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs.out @@ -0,0 +1,10 @@ +[RISE-CORE Msg(15373:140578603224064:libvgpu.c:901)]: Initializing..... +[RISE-CORE ERROR (pid:15373 thread=140578603224064 libvgpu.c:958)]: cuInit failed:100 +02/04 02:04:59 - OpenCompass - INFO - Task [public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs] +02/04 02:05:01 - OpenCompass - INFO - Start inferencing [public/qwen3-0-6b@main/GaokaoBench_2010-2013_English_MCQs] +/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') +[2026-02-04 02:05:01,165] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process... + 0%| | 0/105 [00:00