Compare commits

...

No commits in common. "main" and "main-251010-194152" have entirely different histories.

10 changed files with 1679 additions and 1 deletion

.gitattributes (vendored, Normal file, +1)

@@ -0,0 +1 @@
*.json filter=lfs diff=lfs merge=lfs -text
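An entry like the one above is normally written by git-lfs itself rather than by hand; a minimal sketch of generating it (assuming git-lfs is installed and this runs from the repository root):

    import subprocess

    # Register the LFS filters once per machine, then track JSON files;
    # `git lfs track` appends the rule shown above to .gitattributes.
    subprocess.run(["git", "lfs", "install"], check=True)
    subprocess.run(["git", "lfs", "track", "*.json"], check=True)

    with open(".gitattributes", encoding="utf-8") as fh:
        print(fh.read())  # expect: *.json filter=lfs diff=lfs merge=lfs -text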

.gitignore (vendored, Normal file, +0)

@@ -1 +0,0 @@
This is a repo for model evaluation.

configs/20251010_200419.py (Normal file, +1292)

File diff suppressed because it is too large.
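The suppressed config presumably wires the evaluated model to an OpenAI-compatible endpoint, given the openai_api.py traceback in the logs below. A rough sketch of the kind of model entry OpenCompass's OpenAI wrapper takes; every value here is a placeholder, none is taken from the suppressed file:

    from opencompass.models import OpenAI

    models = [
        dict(
            abbr='sense-voice-small',         # label used in the result summary
            type=OpenAI,                      # OpenAI-compatible API client
            path='public/sense-voice-small',  # model name sent to the endpoint (placeholder)
            key='EMPTY',                      # API key, if the endpoint requires one
            openai_api_base='http://localhost:8000/v1/chat/completions',  # placeholder URL
            max_out_len=100,
            max_seq_len=2048,
            batch_size=8,
            retry=2,                          # matches the "retrying for 2 times" seen in the logs
        ),
    ]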

@@ -0,0 +1,7 @@
[RISE-CORE Msg(1147:139828651981824:libvgpu.c:900)]: Initializing.....
[RISE-CORE ERROR (pid:1147 thread=139828651981824 libvgpu.c:958)]: cuInit failed:100
10/10 20:04:34 - OpenCompass - ERROR - /models/opencompass/opencompass/tasks/openicl_eval.py - _score - 163 - Task [public/sense-voice-small@main/lambada]: No predictions found.
10/10 20:04:34 - OpenCompass - INFO - time elapsed: 2.04s
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
[RISE-CORE Msg(1147:139828651981824:multiprocess_memory_limit.c:504)]: Calling exit handler 1147
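The cuInit return code 100 reported by the vGPU shim is CUDA_ERROR_NO_DEVICE, i.e. no GPU is visible to the process, and the scorer finds no predictions because the inference shards below fail before writing any. A minimal probe of the same driver call (a sketch, assuming libcuda.so.1 is on the loader path):

    import ctypes

    def probe_cuda_driver() -> None:
        """Call cuInit(0) directly and print the driver's return code."""
        try:
            libcuda = ctypes.CDLL("libcuda.so.1")
        except OSError as exc:
            print(f"CUDA driver library not found: {exc}")
            return
        rc = libcuda.cuInit(0)
        # 0 is CUDA_SUCCESS; 100 is CUDA_ERROR_NO_DEVICE, matching the log above.
        print(f"cuInit returned {rc}")

    if __name__ == "__main__":
        probe_cuda_driver()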

@@ -0,0 +1,33 @@
[RISE-CORE Msg(356:140591983016960:libvgpu.c:900)]: Initializing.....
[RISE-CORE ERROR (pid:356 thread=140591983016960 libvgpu.c:958)]: cuInit failed:100
10/10 20:04:25 - OpenCompass - INFO - Task [public/sense-voice-small@main/lambada_0]
10/10 20:04:27 - OpenCompass - INFO - Start inferencing [public/sense-voice-small@main/lambada_0]
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
[2025-10-10 20:04:27,544] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
  0%|          | 0/1718 [00:01<?, ?it/s]
Traceback (most recent call last):
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 147, in <module>
    inferencer.run()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 76, in run
    self._inference()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 119, in _inference
    inferencer.inference(retriever,
  File "/models/opencompass/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py", line 122, in inference
    results = self.model.generate_from_template(
  File "/models/opencompass/opencompass/models/base.py", line 117, in generate_from_template
    return self.generate(inputs, max_out_len=max_out_len, **kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 123, in generate
    results = list(
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 250, in _generate
    raise RuntimeError('Calling OpenAI failed after retrying for '
RuntimeError: Calling OpenAI failed after retrying for 2 times. Check the logs for details.
[RISE-CORE Msg(356:140591983016960:multiprocess_memory_limit.c:504)]: Calling exit handler 356
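The worker exhausts OpenCompass's retry limit (two attempts here) against the OpenAI-style endpoint and aborts the whole lambada_0 shard. A standalone connectivity probe can separate endpoint problems from harness problems; the URL and model name below are placeholders, not values from this run's config:

    import json
    import urllib.request

    API_URL = "http://localhost:8000/v1/chat/completions"  # hypothetical endpoint
    payload = {
        "model": "sense-voice-small",                      # hypothetical model name
        "messages": [{"role": "user", "content": "ping"}],
        "max_tokens": 8,
    }
    request = urllib.request.Request(
        API_URL,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as resp:
            print(resp.status, resp.read()[:200])
    except Exception as exc:
        # The same class of failure the inferencer hits before raising its RuntimeError.
        print("request failed:", exc)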

@@ -0,0 +1,33 @@
[RISE-CORE Msg(357:139961932819456:libvgpu.c:900)]: Initializing.....
[RISE-CORE ERROR (pid:357 thread=139961932819456 libvgpu.c:958)]: cuInit failed:100
10/10 20:04:24 - OpenCompass - INFO - Task [public/sense-voice-small@main/lambada_1]
10/10 20:04:27 - OpenCompass - INFO - Start inferencing [public/sense-voice-small@main/lambada_1]
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
[2025-10-10 20:04:27,228] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
  0%|          | 0/1718 [00:01<?, ?it/s]
Traceback (most recent call last):
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 147, in <module>
    inferencer.run()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 76, in run
    self._inference()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 119, in _inference
    inferencer.inference(retriever,
  File "/models/opencompass/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py", line 122, in inference
    results = self.model.generate_from_template(
  File "/models/opencompass/opencompass/models/base.py", line 117, in generate_from_template
    return self.generate(inputs, max_out_len=max_out_len, **kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 123, in generate
    results = list(
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 250, in _generate
    raise RuntimeError('Calling OpenAI failed after retrying for '
RuntimeError: Calling OpenAI failed after retrying for 2 times. Check the logs for details.
[RISE-CORE Msg(357:139961932819456:multiprocess_memory_limit.c:504)]: Calling exit handler 357

@@ -0,0 +1,33 @@
[RISE-CORE Msg(358:140415675620352:libvgpu.c:900)]: Initializing.....
[RISE-CORE ERROR (pid:358 thread=140415675620352 libvgpu.c:958)]: cuInit failed:100
10/10 20:04:25 - OpenCompass - INFO - Task [public/sense-voice-small@main/lambada_2]
10/10 20:04:27 - OpenCompass - INFO - Start inferencing [public/sense-voice-small@main/lambada_2]
/opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
[2025-10-10 20:04:27,578] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
  0%|          | 0/1717 [00:01<?, ?it/s]
Traceback (most recent call last):
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 147, in <module>
    inferencer.run()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 76, in run
    self._inference()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 119, in _inference
    inferencer.inference(retriever,
  File "/models/opencompass/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py", line 122, in inference
    results = self.model.generate_from_template(
  File "/models/opencompass/opencompass/models/base.py", line 117, in generate_from_template
    return self.generate(inputs, max_out_len=max_out_len, **kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 123, in generate
    results = list(
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 250, in _generate
    raise RuntimeError('Calling OpenAI failed after retrying for '
RuntimeError: Calling OpenAI failed after retrying for 2 times. Check the logs for details.
[RISE-CORE Msg(358:140415675620352:multiprocess_memory_limit.c:504)]: Calling exit handler 358

@@ -0,0 +1,87 @@
dataset,version,metric,mode,public/sense-voice-small@main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,-,-,-,-
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
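Every score in this summary is "-", consistent with the failed inference shards above. A small sanity check that counts scored rows (a sketch, assuming the CSV above is saved locally as summary.csv):

    import csv

    MODEL_COLUMN = "public/sense-voice-small@main"

    with open("summary.csv", newline="", encoding="utf-8") as fh:
        rows = list(csv.DictReader(fh))

    # Section headers ("--------- ... ---------") are ordinary rows in this file,
    # so simply count any row whose model column holds something other than "-".
    scored = [row for row in rows if row[MODEL_COLUMN] not in ("-", "")]
    print(f"{len(scored)} of {len(rows)} rows carry a score")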

@@ -0,0 +1,193 @@
20251010_200419
tabulate format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset version metric mode public/sense-voice-small@main
----------------------------------------------------- --------- -------- ------ -------------------------------
--------- 考试 Exam --------- - - - -
ceval - - - -
agieval - - - -
mmlu - - - -
GaokaoBench - - - -
ARC-c - - - -
--------- 语言 Language --------- - - - -
WiC - - - -
summedits - - - -
chid-dev - - - -
afqmc-dev - - - -
bustm-dev - - - -
cluewsc-dev - - - -
WSC - - - -
winogrande - - - -
flores_100 - - - -
--------- 知识 Knowledge --------- - - - -
BoolQ - - - -
commonsense_qa - - - -
nq - - - -
triviaqa - - - -
--------- 推理 Reasoning --------- - - - -
cmnli - - - -
ocnli - - - -
ocnli_fc-dev - - - -
AX_b - - - -
AX_g - - - -
CB - - - -
RTE - - - -
story_cloze - - - -
COPA - - - -
ReCoRD - - - -
hellaswag - - - -
piqa - - - -
siqa - - - -
strategyqa - - - -
math - - - -
gsm8k - - - -
TheoremQA - - - -
openai_humaneval - - - -
mbpp - - - -
cmmlu - - - -
bbh - - - -
--------- 理解 Understanding --------- - - - -
C3 - - - -
CMRC_dev - - - -
DRCD_dev - - - -
MultiRC - - - -
race-middle - - - -
race-high - - - -
openbookqa_fact - - - -
csl_dev - - - -
lcsts - - - -
Xsum - - - -
eprstmt-dev - - - -
lambada - - - -
tnews-dev - - - -
--------- 安全 Safety --------- - - - -
crows_pairs - - - -
--------- LEval Exact Match (Acc) --------- - - - -
LEval_coursera - - - -
LEval_gsm100 - - - -
LEval_quality - - - -
LEval_tpo - - - -
LEval_topic_retrieval - - - -
--------- LEval Gen (ROUGE) --------- - - - -
LEval_financialqa - - - -
LEval_gov_report_summ - - - -
LEval_legal_contract_qa - - - -
LEval_meeting_summ - - - -
LEval_multidocqa - - - -
LEval_narrativeqa - - - -
LEval_nq - - - -
LEval_news_summ - - - -
LEval_paper_assistant - - - -
LEval_patent_summ - - - -
LEval_review_summ - - - -
LEval_scientificqa - - - -
LEval_tvshow_summ--------- 长文本 LongBench --------- - - - -
longbench_lsht - - - -
longbench_vcsum - - - -
longbench_dureader - - - -
longbench_multifieldqa_zh - - - -
longbench_passage_retrieval_zh - - - -
--------- 单选 自定义数据 --------- - - - -
SageBench-exam - - - -
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
csv format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
dataset,version,metric,mode,public/sense-voice-small@main
--------- 考试 Exam ---------,-,-,-,-
ceval,-,-,-,-
agieval,-,-,-,-
mmlu,-,-,-,-
GaokaoBench,-,-,-,-
ARC-c,-,-,-,-
--------- 语言 Language ---------,-,-,-,-
WiC,-,-,-,-
summedits,-,-,-,-
chid-dev,-,-,-,-
afqmc-dev,-,-,-,-
bustm-dev,-,-,-,-
cluewsc-dev,-,-,-,-
WSC,-,-,-,-
winogrande,-,-,-,-
flores_100,-,-,-,-
--------- 知识 Knowledge ---------,-,-,-,-
BoolQ,-,-,-,-
commonsense_qa,-,-,-,-
nq,-,-,-,-
triviaqa,-,-,-,-
--------- 推理 Reasoning ---------,-,-,-,-
cmnli,-,-,-,-
ocnli,-,-,-,-
ocnli_fc-dev,-,-,-,-
AX_b,-,-,-,-
AX_g,-,-,-,-
CB,-,-,-,-
RTE,-,-,-,-
story_cloze,-,-,-,-
COPA,-,-,-,-
ReCoRD,-,-,-,-
hellaswag,-,-,-,-
piqa,-,-,-,-
siqa,-,-,-,-
strategyqa,-,-,-,-
math,-,-,-,-
gsm8k,-,-,-,-
TheoremQA,-,-,-,-
openai_humaneval,-,-,-,-
mbpp,-,-,-,-
cmmlu,-,-,-,-
bbh,-,-,-,-
--------- 理解 Understanding ---------,-,-,-,-
C3,-,-,-,-
CMRC_dev,-,-,-,-
DRCD_dev,-,-,-,-
MultiRC,-,-,-,-
race-middle,-,-,-,-
race-high,-,-,-,-
openbookqa_fact,-,-,-,-
csl_dev,-,-,-,-
lcsts,-,-,-,-
Xsum,-,-,-,-
eprstmt-dev,-,-,-,-
lambada,-,-,-,-
tnews-dev,-,-,-,-
--------- 安全 Safety ---------,-,-,-,-
crows_pairs,-,-,-,-
--------- LEval Exact Match (Acc) ---------,-,-,-,-
LEval_coursera,-,-,-,-
LEval_gsm100,-,-,-,-
LEval_quality,-,-,-,-
LEval_tpo,-,-,-,-
LEval_topic_retrieval,-,-,-,-
--------- LEval Gen (ROUGE) ---------,-,-,-,-
LEval_financialqa,-,-,-,-
LEval_gov_report_summ,-,-,-,-
LEval_legal_contract_qa,-,-,-,-
LEval_meeting_summ,-,-,-,-
LEval_multidocqa,-,-,-,-
LEval_narrativeqa,-,-,-,-
LEval_nq,-,-,-,-
LEval_news_summ,-,-,-,-
LEval_paper_assistant,-,-,-,-
LEval_patent_summ,-,-,-,-
LEval_review_summ,-,-,-,-
LEval_scientificqa,-,-,-,-
LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
longbench_lsht,-,-,-,-
longbench_vcsum,-,-,-,-
longbench_dureader,-,-,-,-
longbench_multifieldqa_zh,-,-,-,-
longbench_passage_retrieval_zh,-,-,-,-
--------- 单选 自定义数据 ---------,-,-,-,-
SageBench-exam,-,-,-,-
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
raw format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-------------------------------
Model: public/sense-voice-small@main
$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$