89 lines
2.4 KiB
Python
89 lines
2.4 KiB
Python
|
|
import re
|
|||
|
|
|
|||
|
|
from opencompass.registry import TEXT_POSTPROCESSORS
|
|||
|
|
|
|||
|
|
|
|||
|
|
@TEXT_POSTPROCESSORS.register_module('general')
|
|||
|
|
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer into a short, comparable string.

    The text is cut at the first newline, period, or comma; punctuation
    and the English articles "a", "an" and "the" are removed; runs of
    whitespace are collapsed to a single space and the ends are stripped.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text, or '' if nothing is left after cleaning.
    """
    # Cut off at the first newline, period, or comma. ``maxsplit`` is
    # passed by keyword because the positional form is deprecated since
    # Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    # Remove punctuation (anything that is neither a word character nor
    # whitespace).
    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    # Remove articles, matched case-insensitively as whole words only.
    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    # Remove duplicated blank spaces, including the gaps the removed
    # articles leave behind.
    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    return cleaned_text
|
|||
|
|
|
|||
|
|
|
|||
|
|
@TEXT_POSTPROCESSORS.register_module('general_cn')
|
|||
|
|
def general_cn_postprocess(text: str) -> str:
    """Clean an answer and segment it into space-separated jieba tokens.

    The text is cut at the first newline, ASCII period, or ASCII comma;
    punctuation and the English articles "a", "an" and "the" are removed;
    whitespace is collapsed; the cleaned text is then segmented with
    ``jieba.cut`` and the tokens are joined with single spaces.

    Args:
        text: Raw text to clean and segment.

    Returns:
        The tokens of the cleaned text joined by spaces.
    """
    # Imported lazily so jieba is only required when this post-processor
    # is actually called.
    import jieba

    # ``maxsplit`` is passed by keyword because the positional form is
    # deprecated since Python 3.13.
    truncated_text = re.split(r'[\n.,]', text, maxsplit=1)[0]

    no_punctuation = re.sub(r'[^\w\s]', '', truncated_text)

    no_articles = re.sub(r'\b(a|an|the)\b',
                         '',
                         no_punctuation,
                         flags=re.IGNORECASE)

    cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()

    # Segment the cleaned text rather than the raw input; segmenting
    # ``text`` would silently discard every cleaning step above.
    return ' '.join(jieba.cut(cleaned_text))
|
|||
|
|
|
|||
|
|
|
|||
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital')
|
|||
|
|
def first_capital_postprocess(text: str) -> str:
    """Return the first uppercase character of ``text``.

    Args:
        text: Text to scan from left to right.

    Returns:
        The first character for which ``str.isupper`` is true, or '' when
        the text contains no such character.
    """
    uppercase_chars = (char for char in text if char.isupper())
    return next(uppercase_chars, '')
|
|||
|
|
|
|||
|
|
|
|||
|
|
def first_option_postprocess(text: str, options: str) -> str:
    """Find first valid option for text.

    Patterns are tried in priority order, from explicit answer phrases
    (English and Chinese) down to any bare option character. The option
    captured by the first pattern that matches is returned.

    Args:
        text: Text to search for an answer.
        options: The valid option characters, e.g. ``'ABCD'``. Every
            character is matched literally.

    Returns:
        The selected option character, or '' if no pattern matches.
    """
    # Capture the option itself so it can be returned directly. Scanning
    # the whole match for option characters would also hit letters of the
    # answer phrase (e.g. the 'T' of "The answer is U"). ``re.escape``
    # keeps characters such as ']' or '^' from breaking the class.
    option = f'([{re.escape(options)}])'

    # Raw f-strings: ``\s`` must reach the regex engine untouched instead
    # of being an invalid string escape.
    patterns = [
        rf'[Tt]he answer is {option}',
        rf'[Tt]he correct answer is {option}',
        rf'答案是.*?{option}',
        rf'答案为.*?{option}',
        # '固选' is kept for compatibility next to the correct '故选'.
        rf'[固故]选.*?{option}',
        rf'答案应该是.*?{option}',
        # A stand-alone option: preceded by whitespace or start of text
        # and followed by whitespace, punctuation, a literal '$', or the
        # end of the text ('$' inside a class is not an anchor). The
        # full-width comma is accepted alongside the ASCII one.
        rf'(?:\s|^){option}(?:[\s。,，.$]|$)',
        # Last resort: the first option character anywhere in the text.
        option,
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)
    return ''
|
|||
|
|
|
|||
|
|
|
|||
|
|
@TEXT_POSTPROCESSORS.register_module('first-capital-multi')
|
|||
|
|
def first_capital_postprocess_multi(text: str) -> str:
    """Return the first run of consecutive letters in the range A-D.

    Args:
        text: Text to search.

    Returns:
        The leftmost maximal run of the characters 'A' to 'D', or '' when
        the text contains none of them.
    """
    found = re.search(r'([A-D]+)', text)
    return found.group(1) if found else ''
|
|||
|
|
|
|||
|
|
|
|||
|
|
def last_option_postprocess(text: str, options: str) -> str:
    """Find the last valid option in text.

    Args:
        text: Text to search.
        options: Characters placed inside a regex character class, e.g.
            ``'ABCD'``.

    Returns:
        The last character of ``text`` matched by that class, or '' when
        nothing matches.
    """
    hits = re.findall(rf'([{options}])', text)
    if not hits:
        return ''
    return hits[-1]
|