10 changed files with 1723 additions and 1 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1 @@
 *.json filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
--- a/README.md
+++ b/README.md
@ -1 +0,0 @@
 This is a repo for model evaluation.
--- a/configs/20250725_182625.py
+++ b/configs/20250725_182625.py
--- a/logs/eval/public/qwen2-5-vl-7b-instruct-awq@main/lambada.out
+++ b/logs/eval/public/qwen2-5-vl-7b-instruct-awq@main/lambada.out
@ -0,0 +1,9 @@
 [4pdvGPU Msg(1173:140612078611456:libvgpu.c:873)]: Initializing.....
 [4pdvGPU Msg(1173:140612078611456:multiprocess_memory_limit.c:144)]: uuid GPU-9a16bbfd-e4c2-d946-8cf6-81879301e66c validated
 [4pdvGPU Msg(1173:140612078611456:multiprocess_memory_limit.c:144)]: uuid GPU-845ac5d5-6827-1c5e-8cd0-275d4ba08b97 validated
 [4pdvGPU ERROR (pid:1173 thread=140612078611456 libvgpu.c:924)]: cuInit failed:100
 07/25 18:26:43 - OpenCompass - ERROR - /models/opencompass/opencompass/tasks/openicl_eval.py - _score - 163 - Task [public/qwen2-5-vl-7b-instruct-awq@main/lambada]: No predictions found.
 07/25 18:26:43 - OpenCompass - INFO - time elapsed: 1.99s
 /opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
 [4pdvGPU Msg(1173:140612078611456:multiprocess_memory_limit.c:543)]: Calling exit handler 1173
--- a/logs/infer/public/qwen2-5-vl-7b-instruct-awq@main/lambada_0.out
+++ b/logs/infer/public/qwen2-5-vl-7b-instruct-awq@main/lambada_0.out
@ -0,0 +1,47 @@
 [4pdvGPU Msg(368:139999047420928:libvgpu.c:873)]: Initializing.....
 [4pdvGPU Msg(368:139999047420928:multiprocess_memory_limit.c:144)]: uuid GPU-9a16bbfd-e4c2-d946-8cf6-81879301e66c validated
 [4pdvGPU Msg(368:139999047420928:multiprocess_memory_limit.c:144)]: uuid GPU-845ac5d5-6827-1c5e-8cd0-275d4ba08b97 validated
 [4pdvGPU ERROR (pid:368 thread=139999047420928 libvgpu.c:924)]: cuInit failed:100
 07/25 18:26:33 - OpenCompass - INFO - Task [public/qwen2-5-vl-7b-instruct-awq@main/lambada_0]
 07/25 18:26:35 - OpenCompass - INFO - Start inferencing [public/qwen2-5-vl-7b-instruct-awq@main/lambada_0]
 /opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
 [2025-07-25 18:26:35,690] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
 
  0%|          | 0/1718 [00:00<?, ?it/s]
  0%|          | 0/1718 [00:00<?, ?it/s]
 Traceback (most recent call last):
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 147, in <module>
    inferencer.run()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 76, in run
    self._inference()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 119, in _inference
    inferencer.inference(retriever,
  File "/models/opencompass/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py", line 122, in inference
    results = self.model.generate_from_template(
  File "/models/opencompass/opencompass/models/base.py", line 117, in generate_from_template
    return self.generate(inputs, max_out_len=max_out_len, **kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 123, in generate
    results = list(
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 222, in _generate
    raw_response = requests.post(self.url,
  File "/opt/conda/lib/python3.8/site-packages/requests/api.py", line 115, in post
    return request("post", url, data=data, json=json, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/requests/sessions.py", line 575, in request
    prep = self.prepare_request(req)
  File "/opt/conda/lib/python3.8/site-packages/requests/sessions.py", line 486, in prepare_request
    p.prepare(
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 368, in prepare
    self.prepare_url(url, params)
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 445, in prepare_url
    raise InvalidURL(f"Invalid URL {url!r}: No host supplied")
 requests.exceptions.InvalidURL: Invalid URL 'http:///learnware/models/openai/4pd/api/v1/chat/completions': No host supplied
 [4pdvGPU Msg(368:139999047420928:multiprocess_memory_limit.c:543)]: Calling exit handler 368
--- a/logs/infer/public/qwen2-5-vl-7b-instruct-awq@main/lambada_1.out
+++ b/logs/infer/public/qwen2-5-vl-7b-instruct-awq@main/lambada_1.out
@ -0,0 +1,47 @@
 [4pdvGPU Msg(369:140636706909184:libvgpu.c:873)]: Initializing.....
 [4pdvGPU Msg(369:140636706909184:multiprocess_memory_limit.c:144)]: uuid GPU-9a16bbfd-e4c2-d946-8cf6-81879301e66c validated
 [4pdvGPU Msg(369:140636706909184:multiprocess_memory_limit.c:144)]: uuid GPU-845ac5d5-6827-1c5e-8cd0-275d4ba08b97 validated
 [4pdvGPU ERROR (pid:369 thread=140636706909184 libvgpu.c:924)]: cuInit failed:100
 07/25 18:26:33 - OpenCompass - INFO - Task [public/qwen2-5-vl-7b-instruct-awq@main/lambada_1]
 07/25 18:26:35 - OpenCompass - INFO - Start inferencing [public/qwen2-5-vl-7b-instruct-awq@main/lambada_1]
 /opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
 [2025-07-25 18:26:35,414] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
 
  0%|          | 0/1718 [00:00<?, ?it/s]
  0%|          | 0/1718 [00:00<?, ?it/s]
 Traceback (most recent call last):
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 147, in <module>
    inferencer.run()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 76, in run
    self._inference()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 119, in _inference
    inferencer.inference(retriever,
  File "/models/opencompass/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py", line 122, in inference
    results = self.model.generate_from_template(
  File "/models/opencompass/opencompass/models/base.py", line 117, in generate_from_template
    return self.generate(inputs, max_out_len=max_out_len, **kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 123, in generate
    results = list(
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 222, in _generate
    raw_response = requests.post(self.url,
  File "/opt/conda/lib/python3.8/site-packages/requests/api.py", line 115, in post
    return request("post", url, data=data, json=json, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/requests/sessions.py", line 575, in request
    prep = self.prepare_request(req)
  File "/opt/conda/lib/python3.8/site-packages/requests/sessions.py", line 486, in prepare_request
    p.prepare(
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 368, in prepare
    self.prepare_url(url, params)
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 445, in prepare_url
    raise InvalidURL(f"Invalid URL {url!r}: No host supplied")
 requests.exceptions.InvalidURL: Invalid URL 'http:///learnware/models/openai/4pd/api/v1/chat/completions': No host supplied
 [4pdvGPU Msg(369:140636706909184:multiprocess_memory_limit.c:543)]: Calling exit handler 369
--- a/logs/infer/public/qwen2-5-vl-7b-instruct-awq@main/lambada_2.out
+++ b/logs/infer/public/qwen2-5-vl-7b-instruct-awq@main/lambada_2.out
@ -0,0 +1,47 @@
 [4pdvGPU Msg(500:139660419451904:libvgpu.c:873)]: Initializing.....
 [4pdvGPU Msg(500:139660419451904:multiprocess_memory_limit.c:144)]: uuid GPU-9a16bbfd-e4c2-d946-8cf6-81879301e66c validated
 [4pdvGPU Msg(500:139660419451904:multiprocess_memory_limit.c:144)]: uuid GPU-845ac5d5-6827-1c5e-8cd0-275d4ba08b97 validated
 [4pdvGPU ERROR (pid:500 thread=139660419451904 libvgpu.c:924)]: cuInit failed:100
 07/25 18:26:33 - OpenCompass - INFO - Task [public/qwen2-5-vl-7b-instruct-awq@main/lambada_2]
 07/25 18:26:35 - OpenCompass - INFO - Start inferencing [public/qwen2-5-vl-7b-instruct-awq@main/lambada_2]
 /opt/conda/lib/python3.8/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
 [2025-07-25 18:26:35,731] [opencompass.openicl.icl_inferencer.icl_gen_inferencer] [INFO] Starting inference process...
 
  0%|          | 0/1717 [00:00<?, ?it/s]
  0%|          | 0/1717 [00:00<?, ?it/s]
 Traceback (most recent call last):
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 147, in <module>
    inferencer.run()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 76, in run
    self._inference()
  File "/models/opencompass/opencompass/tasks/openicl_infer.py", line 119, in _inference
    inferencer.inference(retriever,
  File "/models/opencompass/opencompass/openicl/icl_inferencer/icl_gen_inferencer.py", line 122, in inference
    results = self.model.generate_from_template(
  File "/models/opencompass/opencompass/models/base.py", line 117, in generate_from_template
    return self.generate(inputs, max_out_len=max_out_len, **kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 123, in generate
    results = list(
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 619, in result_iterator
    yield fs.pop().result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 444, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.8/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.8/concurrent/futures/thread.py", line 57, in run
    result = self.fn(*self.args, **self.kwargs)
  File "/models/opencompass/opencompass/models/openai_api.py", line 222, in _generate
    raw_response = requests.post(self.url,
  File "/opt/conda/lib/python3.8/site-packages/requests/api.py", line 115, in post
    return request("post", url, data=data, json=json, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "/opt/conda/lib/python3.8/site-packages/requests/sessions.py", line 575, in request
    prep = self.prepare_request(req)
  File "/opt/conda/lib/python3.8/site-packages/requests/sessions.py", line 486, in prepare_request
    p.prepare(
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 368, in prepare
    self.prepare_url(url, params)
  File "/opt/conda/lib/python3.8/site-packages/requests/models.py", line 445, in prepare_url
    raise InvalidURL(f"Invalid URL {url!r}: No host supplied")
 requests.exceptions.InvalidURL: Invalid URL 'http:///learnware/models/openai/4pd/api/v1/chat/completions': No host supplied
 [4pdvGPU Msg(500:139660419451904:multiprocess_memory_limit.c:543)]: Calling exit handler 500
--- a/summary/summary_20250725_182625.csv
+++ b/summary/summary_20250725_182625.csv
@ -0,0 +1,87 @@
 dataset,version,metric,mode,public/qwen2-5-vl-7b-instruct-awq@main
 --------- 考试 Exam ---------,-,-,-,-
 ceval,-,-,-,-
 agieval,-,-,-,-
 mmlu,-,-,-,-
 GaokaoBench,-,-,-,-
 ARC-c,-,-,-,-
 --------- 语言 Language ---------,-,-,-,-
 WiC,-,-,-,-
 summedits,-,-,-,-
 chid-dev,-,-,-,-
 afqmc-dev,-,-,-,-
 bustm-dev,-,-,-,-
 cluewsc-dev,-,-,-,-
 WSC,-,-,-,-
 winogrande,-,-,-,-
 flores_100,-,-,-,-
 --------- 知识 Knowledge ---------,-,-,-,-
 BoolQ,-,-,-,-
 commonsense_qa,-,-,-,-
 nq,-,-,-,-
 triviaqa,-,-,-,-
 --------- 推理 Reasoning ---------,-,-,-,-
 cmnli,-,-,-,-
 ocnli,-,-,-,-
 ocnli_fc-dev,-,-,-,-
 AX_b,-,-,-,-
 AX_g,-,-,-,-
 CB,-,-,-,-
 RTE,-,-,-,-
 story_cloze,-,-,-,-
 COPA,-,-,-,-
 ReCoRD,-,-,-,-
 hellaswag,-,-,-,-
 piqa,-,-,-,-
 siqa,-,-,-,-
 strategyqa,-,-,-,-
 math,-,-,-,-
 gsm8k,-,-,-,-
 TheoremQA,-,-,-,-
 openai_humaneval,-,-,-,-
 mbpp,-,-,-,-
 cmmlu,-,-,-,-
 bbh,-,-,-,-
 --------- 理解 Understanding ---------,-,-,-,-
 C3,-,-,-,-
 CMRC_dev,-,-,-,-
 DRCD_dev,-,-,-,-
 MultiRC,-,-,-,-
 race-middle,-,-,-,-
 race-high,-,-,-,-
 openbookqa_fact,-,-,-,-
 csl_dev,-,-,-,-
 lcsts,-,-,-,-
 Xsum,-,-,-,-
 eprstmt-dev,-,-,-,-
 lambada,-,-,-,-
 tnews-dev,-,-,-,-
 --------- 安全 Safety ---------,-,-,-,-
 crows_pairs,-,-,-,-
 --------- LEval Exact Match (Acc) ---------,-,-,-,-
 LEval_coursera,-,-,-,-
 LEval_gsm100,-,-,-,-
 LEval_quality,-,-,-,-
 LEval_tpo,-,-,-,-
 LEval_topic_retrieval,-,-,-,-
 --------- LEval Gen (ROUGE) ---------,-,-,-,-
 LEval_financialqa,-,-,-,-
 LEval_gov_report_summ,-,-,-,-
 LEval_legal_contract_qa,-,-,-,-
 LEval_meeting_summ,-,-,-,-
 LEval_multidocqa,-,-,-,-
 LEval_narrativeqa,-,-,-,-
 LEval_nq,-,-,-,-
 LEval_news_summ,-,-,-,-
 LEval_paper_assistant,-,-,-,-
 LEval_patent_summ,-,-,-,-
 LEval_review_summ,-,-,-,-
 LEval_scientificqa,-,-,-,-
 LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
 longbench_lsht,-,-,-,-
 longbench_vcsum,-,-,-,-
 longbench_dureader,-,-,-,-
 longbench_multifieldqa_zh,-,-,-,-
 longbench_passage_retrieval_zh,-,-,-,-
 --------- 单选 自定义数据 ---------,-,-,-,-
 SageBench-exam,-,-,-,-
--- a/summary/summary_20250725_182625.txt
+++ b/summary/summary_20250725_182625.txt
@ -0,0 +1,193 @@
 20250725_182625
 tabulate format
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 dataset                                                version    metric    mode    public/qwen2-5-vl-7b-instruct-awq@main
 -----------------------------------------------------  ---------  --------  ------  ----------------------------------------
 --------- 考试 Exam ---------                          -          -         -       -
 ceval                                                  -          -         -       -
 agieval                                                -          -         -       -
 mmlu                                                   -          -         -       -
 GaokaoBench                                            -          -         -       -
 ARC-c                                                  -          -         -       -
 --------- 语言 Language ---------                      -          -         -       -
 WiC                                                    -          -         -       -
 summedits                                              -          -         -       -
 chid-dev                                               -          -         -       -
 afqmc-dev                                              -          -         -       -
 bustm-dev                                              -          -         -       -
 cluewsc-dev                                            -          -         -       -
 WSC                                                    -          -         -       -
 winogrande                                             -          -         -       -
 flores_100                                             -          -         -       -
 --------- 知识 Knowledge ---------                     -          -         -       -
 BoolQ                                                  -          -         -       -
 commonsense_qa                                         -          -         -       -
 nq                                                     -          -         -       -
 triviaqa                                               -          -         -       -
 --------- 推理 Reasoning ---------                     -          -         -       -
 cmnli                                                  -          -         -       -
 ocnli                                                  -          -         -       -
 ocnli_fc-dev                                           -          -         -       -
 AX_b                                                   -          -         -       -
 AX_g                                                   -          -         -       -
 CB                                                     -          -         -       -
 RTE                                                    -          -         -       -
 story_cloze                                            -          -         -       -
 COPA                                                   -          -         -       -
 ReCoRD                                                 -          -         -       -
 hellaswag                                              -          -         -       -
 piqa                                                   -          -         -       -
 siqa                                                   -          -         -       -
 strategyqa                                             -          -         -       -
 math                                                   -          -         -       -
 gsm8k                                                  -          -         -       -
 TheoremQA                                              -          -         -       -
 openai_humaneval                                       -          -         -       -
 mbpp                                                   -          -         -       -
 cmmlu                                                  -          -         -       -
 bbh                                                    -          -         -       -
 --------- 理解 Understanding ---------                 -          -         -       -
 C3                                                     -          -         -       -
 CMRC_dev                                               -          -         -       -
 DRCD_dev                                               -          -         -       -
 MultiRC                                                -          -         -       -
 race-middle                                            -          -         -       -
 race-high                                              -          -         -       -
 openbookqa_fact                                        -          -         -       -
 csl_dev                                                -          -         -       -
 lcsts                                                  -          -         -       -
 Xsum                                                   -          -         -       -
 eprstmt-dev                                            -          -         -       -
 lambada                                                -          -         -       -
 tnews-dev                                              -          -         -       -
 --------- 安全 Safety ---------                        -          -         -       -
 crows_pairs                                            -          -         -       -
 --------- LEval Exact Match (Acc) ---------            -          -         -       -
 LEval_coursera                                         -          -         -       -
 LEval_gsm100                                           -          -         -       -
 LEval_quality                                          -          -         -       -
 LEval_tpo                                              -          -         -       -
 LEval_topic_retrieval                                  -          -         -       -
 --------- LEval Gen (ROUGE) ---------                  -          -         -       -
 LEval_financialqa                                      -          -         -       -
 LEval_gov_report_summ                                  -          -         -       -
 LEval_legal_contract_qa                                -          -         -       -
 LEval_meeting_summ                                     -          -         -       -
 LEval_multidocqa                                       -          -         -       -
 LEval_narrativeqa                                      -          -         -       -
 LEval_nq                                               -          -         -       -
 LEval_news_summ                                        -          -         -       -
 LEval_paper_assistant                                  -          -         -       -
 LEval_patent_summ                                      -          -         -       -
 LEval_review_summ                                      -          -         -       -
 LEval_scientificqa                                     -          -         -       -
 LEval_tvshow_summ--------- 长文本 LongBench ---------  -          -         -       -
 longbench_lsht                                         -          -         -       -
 longbench_vcsum                                        -          -         -       -
 longbench_dureader                                     -          -         -       -
 longbench_multifieldqa_zh                              -          -         -       -
 longbench_passage_retrieval_zh                         -          -         -       -
 --------- 单选 自定义数据 ---------                    -          -         -       -
 SageBench-exam                                         -          -         -       -
 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 -------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
 csv format
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 dataset,version,metric,mode,public/qwen2-5-vl-7b-instruct-awq@main
 --------- 考试 Exam ---------,-,-,-,-
 ceval,-,-,-,-
 agieval,-,-,-,-
 mmlu,-,-,-,-
 GaokaoBench,-,-,-,-
 ARC-c,-,-,-,-
 --------- 语言 Language ---------,-,-,-,-
 WiC,-,-,-,-
 summedits,-,-,-,-
 chid-dev,-,-,-,-
 afqmc-dev,-,-,-,-
 bustm-dev,-,-,-,-
 cluewsc-dev,-,-,-,-
 WSC,-,-,-,-
 winogrande,-,-,-,-
 flores_100,-,-,-,-
 --------- 知识 Knowledge ---------,-,-,-,-
 BoolQ,-,-,-,-
 commonsense_qa,-,-,-,-
 nq,-,-,-,-
 triviaqa,-,-,-,-
 --------- 推理 Reasoning ---------,-,-,-,-
 cmnli,-,-,-,-
 ocnli,-,-,-,-
 ocnli_fc-dev,-,-,-,-
 AX_b,-,-,-,-
 AX_g,-,-,-,-
 CB,-,-,-,-
 RTE,-,-,-,-
 story_cloze,-,-,-,-
 COPA,-,-,-,-
 ReCoRD,-,-,-,-
 hellaswag,-,-,-,-
 piqa,-,-,-,-
 siqa,-,-,-,-
 strategyqa,-,-,-,-
 math,-,-,-,-
 gsm8k,-,-,-,-
 TheoremQA,-,-,-,-
 openai_humaneval,-,-,-,-
 mbpp,-,-,-,-
 cmmlu,-,-,-,-
 bbh,-,-,-,-
 --------- 理解 Understanding ---------,-,-,-,-
 C3,-,-,-,-
 CMRC_dev,-,-,-,-
 DRCD_dev,-,-,-,-
 MultiRC,-,-,-,-
 race-middle,-,-,-,-
 race-high,-,-,-,-
 openbookqa_fact,-,-,-,-
 csl_dev,-,-,-,-
 lcsts,-,-,-,-
 Xsum,-,-,-,-
 eprstmt-dev,-,-,-,-
 lambada,-,-,-,-
 tnews-dev,-,-,-,-
 --------- 安全 Safety ---------,-,-,-,-
 crows_pairs,-,-,-,-
 --------- LEval Exact Match (Acc) ---------,-,-,-,-
 LEval_coursera,-,-,-,-
 LEval_gsm100,-,-,-,-
 LEval_quality,-,-,-,-
 LEval_tpo,-,-,-,-
 LEval_topic_retrieval,-,-,-,-
 --------- LEval Gen (ROUGE) ---------,-,-,-,-
 LEval_financialqa,-,-,-,-
 LEval_gov_report_summ,-,-,-,-
 LEval_legal_contract_qa,-,-,-,-
 LEval_meeting_summ,-,-,-,-
 LEval_multidocqa,-,-,-,-
 LEval_narrativeqa,-,-,-,-
 LEval_nq,-,-,-,-
 LEval_news_summ,-,-,-,-
 LEval_paper_assistant,-,-,-,-
 LEval_patent_summ,-,-,-,-
 LEval_review_summ,-,-,-,-
 LEval_scientificqa,-,-,-,-
 LEval_tvshow_summ--------- 长文本 LongBench ---------,-,-,-,-
 longbench_lsht,-,-,-,-
 longbench_vcsum,-,-,-,-
 longbench_dureader,-,-,-,-
 longbench_multifieldqa_zh,-,-,-,-
 longbench_passage_retrieval_zh,-,-,-,-
 --------- 单选 自定义数据 ---------,-,-,-,-
 SageBench-exam,-,-,-,-
 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 -------------------------------------------------------------------------------------------------------------------------------- THIS IS A DIVIDER --------------------------------------------------------------------------------------------------------------------------------
 raw format
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 -------------------------------
 Model: public/qwen2-5-vl-7b-instruct-awq@main
 $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$