Compare commits

...

No commits in common. "main" and "main-251125-155217" have entirely different histories.

14 changed files with 1623 additions and 1 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.json filter=lfs diff=lfs merge=lfs -text

0
.gitignore vendored Normal file
View File

View File

@ -1 +0,0 @@
This is a repo for model evaluation.

1292
configs/20251125_155438.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
[RISE-CORE Msg(5980:140293153455104:libvgpu.c:900)]: Initializing.....
[RISE-CORE ERROR (pid:5980 thread=140293153455104 libvgpu.c:958)]: cuInit failed:100
11/25 16:23:35 - OpenCompass - INFO - Task [public/qwen3-0-6b@main/lambada]: {'accuracy': 0.038812342324859306}
11/25 16:23:35 - OpenCompass - INFO - time elapsed: 2.22s
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
[RISE-CORE Msg(5980:140293153455104:multiprocess_memory_limit.c:504)]: Calling exit handler 5980

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

BIN
predictions/public/qwen3-0-6b@main/lambada_0.json (Stored with Git LFS) Normal file

Binary file not shown.

BIN
predictions/public/qwen3-0-6b@main/lambada_1.json (Stored with Git LFS) Normal file

Binary file not shown.

BIN
predictions/public/qwen3-0-6b@main/lambada_2.json (Stored with Git LFS) Normal file

Binary file not shown.

BIN
results/public/qwen3-0-6b@main/lambada.json (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,87 @@
dataset,version,metric,mode,public/qwen3-0-6b@main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,217e11,accuracy,gen,0.04
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
1 dataset version metric mode public/qwen3-0-6b@main
2 --------- 考试 Exam --------- - - - -
3 ceval - - - -
4 agieval - - - -
5 mmlu - - - -
6 GaokaoBench - - - -
7 ARC-c - - - -
8 --------- 语言 Language --------- - - - -
9 WiC - - - -
10 summedits - - - -
11 chid-dev - - - -
12 afqmc-dev - - - -
13 bustm-dev - - - -
14 cluewsc-dev - - - -
15 WSC - - - -
16 winogrande - - - -
17 flores_100 - - - -
18 --------- 知识 Knowledge --------- - - - -
19 BoolQ - - - -
20 commonsense_qa - - - -
21 nq - - - -
22 triviaqa - - - -
23 --------- 推理 Reasoning --------- - - - -
24 cmnli - - - -
25 ocnli - - - -
26 ocnli_fc-dev - - - -
27 AX_b - - - -
28 AX_g - - - -
29 CB - - - -
30 RTE - - - -
31 story_cloze - - - -
32 COPA - - - -
33 ReCoRD - - - -
34 hellaswag - - - -
35 piqa - - - -
36 siqa - - - -
37 strategyqa - - - -
38 math - - - -
39 gsm8k - - - -
40 TheoremQA - - - -
41 openai_humaneval - - - -
42 mbpp - - - -
43 cmmlu - - - -
44 bbh - - - -
45 --------- 理解 Understanding --------- - - - -
46 C3 - - - -
47 CMRC_dev - - - -
48 DRCD_dev - - - -
49 MultiRC - - - -
50 race-middle - - - -
51 race-high - - - -
52 openbookqa_fact - - - -
53 csl_dev - - - -
54 lcsts - - - -
55 Xsum - - - -
56 eprstmt-dev - - - -
57 lambada 217e11 accuracy gen 0.04
58 tnews-dev - - - -
59 --------- 安全 Safety --------- - - - -
60 crows_pairs - - - -
61 --------- LEval Exact Match (Acc) --------- - - - -
62 LEval_coursera - - - -
63 LEval_gsm100 - - - -
64 LEval_quality - - - -
65 LEval_tpo - - - -
66 LEval_topic_retrieval - - - -
67 --------- LEval Gen (ROUGE) --------- - - - -
68 LEval_financialqa - - - -
69 LEval_gov_report_summ - - - -
70 LEval_legal_contract_qa - - - -
71 LEval_meeting_summ - - - -
72 LEval_multidocqa - - - -
73 LEval_narrativeqa - - - -
74 LEval_nq - - - -
75 LEval_news_summ - - - -
76 LEval_paper_assistant - - - -
77 LEval_patent_summ - - - -
78 LEval_review_summ - - - -
79 LEval_scientificqa - - - -
80 LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
81 longbench_lsht - - - -
82 longbench_vcsum - - - -
83 longbench_dureader - - - -
84 longbench_multifieldqa_zh - - - -
85 longbench_passage_retrieval_zh - - - -
86 --------- 单选 自定义数据 --------- - - - -
87 SageBench-exam - - - -

View File

@ -0,0 +1,194 @@
20251125_155438
tabulate format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset version metric mode public/qwen3-0-6b@main
----------------------------------------------------- --------- -------- ------ ------------------------
--------- 考试 Exam --------- - - - -
ceval - - - -
agieval - - - -
mmlu - - - -
GaokaoBench - - - -
ARC-c - - - -
--------- 语言 Language --------- - - - -
WiC - - - -
summedits - - - -
chid-dev - - - -
afqmc-dev - - - -
bustm-dev - - - -
cluewsc-dev - - - -
WSC - - - -
winogrande - - - -
flores_100 - - - -
--------- 知识 Knowledge --------- - - - -
BoolQ - - - -
commonsense_qa - - - -
nq - - - -
triviaqa - - - -
--------- 推理 Reasoning --------- - - - -
cmnli - - - -
ocnli - - - -
ocnli_fc-dev - - - -
AX_b - - - -
AX_g - - - -
CB - - - -
RTE - - - -
story_cloze - - - -
COPA - - - -
ReCoRD - - - -
hellaswag - - - -
piqa - - - -
siqa - - - -
strategyqa - - - -
math - - - -
gsm8k - - - -
TheoremQA - - - -
openai_humaneval - - - -
mbpp - - - -
cmmlu - - - -
bbh - - - -
--------- 理解 Understanding --------- - - - -
C3 - - - -
CMRC_dev - - - -
DRCD_dev - - - -
MultiRC - - - -
race-middle - - - -
race-high - - - -
openbookqa_fact - - - -
csl_dev - - - -
lcsts - - - -
Xsum - - - -
eprstmt-dev - - - -
lambada 217e11 accuracy gen 0.04
tnews-dev - - - -
--------- 安全 Safety --------- - - - -
crows_pairs - - - -
--------- LEval Exact Match (Acc) --------- - - - -
LEval_coursera - - - -
LEval_gsm100 - - - -
LEval_quality - - - -
LEval_tpo - - - -
LEval_topic_retrieval - - - -
--------- LEval Gen (ROUGE) --------- - - - -
LEval_financialqa - - - -
LEval_gov_report_summ - - - -
LEval_legal_contract_qa - - - -
LEval_meeting_summ - - - -
LEval_multidocqa - - - -
LEval_narrativeqa - - - -
LEval_nq - - - -
LEval_news_summ - - - -
LEval_paper_assistant - - - -
LEval_patent_summ - - - -
LEval_review_summ - - - -
LEval_scientificqa - - - -
LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
longbench_lsht - - - -
longbench_vcsum - - - -
longbench_dureader - - - -
longbench_multifieldqa_zh - - - -
longbench_passage_retrieval_zh - - - -
--------- 单选 自定义数据 --------- - - - -
SageBench-exam - - - -
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
csv format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset,version,metric,mode,public/qwen3-0-6b@main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,217e11,accuracy,gen,0.04
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
raw format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-------------------------------
Model: public/qwen3-0-6b@main
lambada: {'accuracy': 0.038812342324859306}
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$