Compare commits

...

No commits in common. "main" and "main-260410-004040" have entirely different histories.

13 changed files with 22217 additions and 1 deletions

0
.gitignore vendored Normal file
View File

View File

@ -1 +0,0 @@
This is a repo for model evaluation.

1292
configs/20260410_004154.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,4 @@
04/10 01:10:59 - OpenCompass - INFO - Task [public-qwen3-06b-main/lambada]: {'accuracy': 0.038812342324859306}
04/10 01:10:59 - OpenCompass - INFO - time elapsed: 1.87s
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@
{"accuracy": 0.038812342324859306}

View File

@ -0,0 +1,87 @@
dataset,version,metric,mode,public-qwen3-06b-main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,217e11,accuracy,gen,0.04
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
1 dataset version metric mode public-qwen3-06b-main
2 --------- 考试 Exam --------- - - - -
3 ceval - - - -
4 agieval - - - -
5 mmlu - - - -
6 GaokaoBench - - - -
7 ARC-c - - - -
8 --------- 语言 Language --------- - - - -
9 WiC - - - -
10 summedits - - - -
11 chid-dev - - - -
12 afqmc-dev - - - -
13 bustm-dev - - - -
14 cluewsc-dev - - - -
15 WSC - - - -
16 winogrande - - - -
17 flores_100 - - - -
18 --------- 知识 Knowledge --------- - - - -
19 BoolQ - - - -
20 commonsense_qa - - - -
21 nq - - - -
22 triviaqa - - - -
23 --------- 推理 Reasoning --------- - - - -
24 cmnli - - - -
25 ocnli - - - -
26 ocnli_fc-dev - - - -
27 AX_b - - - -
28 AX_g - - - -
29 CB - - - -
30 RTE - - - -
31 story_cloze - - - -
32 COPA - - - -
33 ReCoRD - - - -
34 hellaswag - - - -
35 piqa - - - -
36 siqa - - - -
37 strategyqa - - - -
38 math - - - -
39 gsm8k - - - -
40 TheoremQA - - - -
41 openai_humaneval - - - -
42 mbpp - - - -
43 cmmlu - - - -
44 bbh - - - -
45 --------- 理解 Understanding --------- - - - -
46 C3 - - - -
47 CMRC_dev - - - -
48 DRCD_dev - - - -
49 MultiRC - - - -
50 race-middle - - - -
51 race-high - - - -
52 openbookqa_fact - - - -
53 csl_dev - - - -
54 lcsts - - - -
55 Xsum - - - -
56 eprstmt-dev - - - -
57 lambada 217e11 accuracy gen 0.04
58 tnews-dev - - - -
59 --------- 安全 Safety --------- - - - -
60 crows_pairs - - - -
61 --------- LEval Exact Match (Acc) --------- - - - -
62 LEval_coursera - - - -
63 LEval_gsm100 - - - -
64 LEval_quality - - - -
65 LEval_tpo - - - -
66 LEval_topic_retrieval - - - -
67 --------- LEval Gen (ROUGE) --------- - - - -
68 LEval_financialqa - - - -
69 LEval_gov_report_summ - - - -
70 LEval_legal_contract_qa - - - -
71 LEval_meeting_summ - - - -
72 LEval_multidocqa - - - -
73 LEval_narrativeqa - - - -
74 LEval_nq - - - -
75 LEval_news_summ - - - -
76 LEval_paper_assistant - - - -
77 LEval_patent_summ - - - -
78 LEval_review_summ - - - -
79 LEval_scientificqa - - - -
80 LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
81 longbench_lsht - - - -
82 longbench_vcsum - - - -
83 longbench_dureader - - - -
84 longbench_multifieldqa_zh - - - -
85 longbench_passage_retrieval_zh - - - -
86 --------- 单选 自定义数据 --------- - - - -
87 SageBench-exam - - - -

View File

@ -0,0 +1,194 @@
20260410_004154
tabulate format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset version metric mode public-qwen3-06b-main
----------------------------------------------------- --------- -------- ------ -----------------------
--------- 考试 Exam --------- - - - -
ceval - - - -
agieval - - - -
mmlu - - - -
GaokaoBench - - - -
ARC-c - - - -
--------- 语言 Language --------- - - - -
WiC - - - -
summedits - - - -
chid-dev - - - -
afqmc-dev - - - -
bustm-dev - - - -
cluewsc-dev - - - -
WSC - - - -
winogrande - - - -
flores_100 - - - -
--------- 知识 Knowledge --------- - - - -
BoolQ - - - -
commonsense_qa - - - -
nq - - - -
triviaqa - - - -
--------- 推理 Reasoning --------- - - - -
cmnli - - - -
ocnli - - - -
ocnli_fc-dev - - - -
AX_b - - - -
AX_g - - - -
CB - - - -
RTE - - - -
story_cloze - - - -
COPA - - - -
ReCoRD - - - -
hellaswag - - - -
piqa - - - -
siqa - - - -
strategyqa - - - -
math - - - -
gsm8k - - - -
TheoremQA - - - -
openai_humaneval - - - -
mbpp - - - -
cmmlu - - - -
bbh - - - -
--------- 理解 Understanding --------- - - - -
C3 - - - -
CMRC_dev - - - -
DRCD_dev - - - -
MultiRC - - - -
race-middle - - - -
race-high - - - -
openbookqa_fact - - - -
csl_dev - - - -
lcsts - - - -
Xsum - - - -
eprstmt-dev - - - -
lambada 217e11 accuracy gen 0.04
tnews-dev - - - -
--------- 安全 Safety --------- - - - -
crows_pairs - - - -
--------- LEval Exact Match (Acc) --------- - - - -
LEval_coursera - - - -
LEval_gsm100 - - - -
LEval_quality - - - -
LEval_tpo - - - -
LEval_topic_retrieval - - - -
--------- LEval Gen (ROUGE) --------- - - - -
LEval_financialqa - - - -
LEval_gov_report_summ - - - -
LEval_legal_contract_qa - - - -
LEval_meeting_summ - - - -
LEval_multidocqa - - - -
LEval_narrativeqa - - - -
LEval_nq - - - -
LEval_news_summ - - - -
LEval_paper_assistant - - - -
LEval_patent_summ - - - -
LEval_review_summ - - - -
LEval_scientificqa - - - -
LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
longbench_lsht - - - -
longbench_vcsum - - - -
longbench_dureader - - - -
longbench_multifieldqa_zh - - - -
longbench_passage_retrieval_zh - - - -
--------- 单选 自定义数据 --------- - - - -
SageBench-exam - - - -
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
csv format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset,version,metric,mode,public-qwen3-06b-main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,217e11,accuracy,gen,0.04
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
raw format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-------------------------------
Model: public-qwen3-06b-main
lambada: {'accuracy': 0.038812342324859306}
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$