commit file to repo

This commit is contained in:
4pdadmin 2026-04-08 09:30:46 +00:00 committed by root
commit 4034d5e6eb
12 changed files with 22217 additions and 0 deletions

0
.gitignore vendored Normal file
View File

1292
configs/20260408_170120.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
04/08 17:30:36 - OpenCompass - INFO - Task [public-minimax-m2-5-main/lambada]: {'accuracy': 0.038812342324859306}
04/08 17:30:36 - OpenCompass - INFO - time elapsed: 2.08s
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
{"accuracy": 0.038812342324859306}

View File

@ -0,0 +1,87 @@
dataset,version,metric,mode,public-minimax-m2-5-main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,217e11,accuracy,gen,0.04
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
1 dataset version metric mode public-minimax-m2-5-main
2 --------- 考试 Exam --------- - - - -
3 ceval - - - -
4 agieval - - - -
5 mmlu - - - -
6 GaokaoBench - - - -
7 ARC-c - - - -
8 --------- 语言 Language --------- - - - -
9 WiC - - - -
10 summedits - - - -
11 chid-dev - - - -
12 afqmc-dev - - - -
13 bustm-dev - - - -
14 cluewsc-dev - - - -
15 WSC - - - -
16 winogrande - - - -
17 flores_100 - - - -
18 --------- 知识 Knowledge --------- - - - -
19 BoolQ - - - -
20 commonsense_qa - - - -
21 nq - - - -
22 triviaqa - - - -
23 --------- 推理 Reasoning --------- - - - -
24 cmnli - - - -
25 ocnli - - - -
26 ocnli_fc-dev - - - -
27 AX_b - - - -
28 AX_g - - - -
29 CB - - - -
30 RTE - - - -
31 story_cloze - - - -
32 COPA - - - -
33 ReCoRD - - - -
34 hellaswag - - - -
35 piqa - - - -
36 siqa - - - -
37 strategyqa - - - -
38 math - - - -
39 gsm8k - - - -
40 TheoremQA - - - -
41 openai_humaneval - - - -
42 mbpp - - - -
43 cmmlu - - - -
44 bbh - - - -
45 --------- 理解 Understanding --------- - - - -
46 C3 - - - -
47 CMRC_dev - - - -
48 DRCD_dev - - - -
49 MultiRC - - - -
50 race-middle - - - -
51 race-high - - - -
52 openbookqa_fact - - - -
53 csl_dev - - - -
54 lcsts - - - -
55 Xsum - - - -
56 eprstmt-dev - - - -
57 lambada 217e11 accuracy gen 0.04
58 tnews-dev - - - -
59 --------- 安全 Safety --------- - - - -
60 crows_pairs - - - -
61 --------- LEval Exact Match (Acc) --------- - - - -
62 LEval_coursera - - - -
63 LEval_gsm100 - - - -
64 LEval_quality - - - -
65 LEval_tpo - - - -
66 LEval_topic_retrieval - - - -
67 --------- LEval Gen (ROUGE) --------- - - - -
68 LEval_financialqa - - - -
69 LEval_gov_report_summ - - - -
70 LEval_legal_contract_qa - - - -
71 LEval_meeting_summ - - - -
72 LEval_multidocqa - - - -
73 LEval_narrativeqa - - - -
74 LEval_nq - - - -
75 LEval_news_summ - - - -
76 LEval_paper_assistant - - - -
77 LEval_patent_summ - - - -
78 LEval_review_summ - - - -
79 LEval_scientificqa - - - -
80 LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
81 longbench_lsht - - - -
82 longbench_vcsum - - - -
83 longbench_dureader - - - -
84 longbench_multifieldqa_zh - - - -
85 longbench_passage_retrieval_zh - - - -
86 --------- 单选 自定义数据 --------- - - - -
87 SageBench-exam - - - -

View File

@ -0,0 +1,194 @@
20260408_170120
tabulate format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset version metric mode public-minimax-m2-5-main
----------------------------------------------------- --------- -------- ------ --------------------------
--------- 考试 Exam --------- - - - -
ceval - - - -
agieval - - - -
mmlu - - - -
GaokaoBench - - - -
ARC-c - - - -
--------- 语言 Language --------- - - - -
WiC - - - -
summedits - - - -
chid-dev - - - -
afqmc-dev - - - -
bustm-dev - - - -
cluewsc-dev - - - -
WSC - - - -
winogrande - - - -
flores_100 - - - -
--------- 知识 Knowledge --------- - - - -
BoolQ - - - -
commonsense_qa - - - -
nq - - - -
triviaqa - - - -
--------- 推理 Reasoning --------- - - - -
cmnli - - - -
ocnli - - - -
ocnli_fc-dev - - - -
AX_b - - - -
AX_g - - - -
CB - - - -
RTE - - - -
story_cloze - - - -
COPA - - - -
ReCoRD - - - -
hellaswag - - - -
piqa - - - -
siqa - - - -
strategyqa - - - -
math - - - -
gsm8k - - - -
TheoremQA - - - -
openai_humaneval - - - -
mbpp - - - -
cmmlu - - - -
bbh - - - -
--------- 理解 Understanding --------- - - - -
C3 - - - -
CMRC_dev - - - -
DRCD_dev - - - -
MultiRC - - - -
race-middle - - - -
race-high - - - -
openbookqa_fact - - - -
csl_dev - - - -
lcsts - - - -
Xsum - - - -
eprstmt-dev - - - -
lambada 217e11 accuracy gen 0.04
tnews-dev - - - -
--------- 安全 Safety --------- - - - -
crows_pairs - - - -
--------- LEval Exact Match (Acc) --------- - - - -
LEval_coursera - - - -
LEval_gsm100 - - - -
LEval_quality - - - -
LEval_tpo - - - -
LEval_topic_retrieval - - - -
--------- LEval Gen (ROUGE) --------- - - - -
LEval_financialqa - - - -
LEval_gov_report_summ - - - -
LEval_legal_contract_qa - - - -
LEval_meeting_summ - - - -
LEval_multidocqa - - - -
LEval_narrativeqa - - - -
LEval_nq - - - -
LEval_news_summ - - - -
LEval_paper_assistant - - - -
LEval_patent_summ - - - -
LEval_review_summ - - - -
LEval_scientificqa - - - -
LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
longbench_lsht - - - -
longbench_vcsum - - - -
longbench_dureader - - - -
longbench_multifieldqa_zh - - - -
longbench_passage_retrieval_zh - - - -
--------- 单选 自定义数据 --------- - - - -
SageBench-exam - - - -
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
csv format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset,version,metric,mode,public-minimax-m2-5-main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,217e11,accuracy,gen,0.04
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
raw format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-------------------------------
Model: public-minimax-m2-5-main
lambada: {'accuracy': 0.038812342324859306}
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$