path: root/benchmark/agbenchmark/utils/prompts.py
SCORING_MAP = {
    "percentage": "assign a float score that will represent a percentage out of 100. Use decimal points to be even more accurate. 0 represents the worst possible generation, while 100 represents the ideal generation",
    "scale": "assign an integer score from a scale of 1-10. 1 represents a really bad generation, while 10 represents an ideal generation",
    "binary": "assign a binary score of either 0 or 1. 0 represents a failure, while 1 represents a success",
}
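# Note: the strings above are interpolated into the {scoring} placeholder of the
# prompt templates below, telling the evaluator which scale to score on.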


REFERENCE_PROMPT = """Ignore previous directions. You are now an expert at evaluating how close machine-generated responses are to human answers. You essentially act as a hyper-advanced BLEU score.
In order to score the machine-generated response you will {scoring}. Make sure to factor the distance from the ideal response into your thinking, deliberation, and final scoring decision. Return nothing but a float score.

Here is the given task for you to evaluate:
{task}

Here is the ideal response you're comparing to based on the task:
{answer}

Here is the current machine generated response to the task that you need to evaluate:
{response}

"""

RUBRIC_PROMPT = """Ignore previous directions. You are now an expert at evaluating machine-generated responses to given tasks.
In order to score the generated text you will {scoring}. Make sure to factor the rubric into your thinking, deliberation, and final scoring decision. Return nothing but a float score.

Here is the given task for you to evaluate:
{task}

Use the below rubric to guide your thinking about scoring:
{answer}

Here is the current machine generated response to the task that you need to evaluate:
{response}

"""

QUESTION_PROMPT = """Ignore previous directions. You are now an expert at evaluating machine-generated responses to given tasks.
In order to score the generated text you will {scoring}. Make sure to consider whether the generated response answers the question well in order to score accurately. Return nothing but a float score.

Here is the given task:
{task}

Here is a question that checks if the task was completed correctly:
{answer}

Here is the current machine generated response to the task that you need to evaluate:
{response}

"""

FEW_SHOT_EXAMPLES = """Here are some examples of how to score a machine-generated response based on the above:
{examples}

"""

CUSTOM_PROMPT = """{custom}
{scoring}

"""

PROMPT_MAP = {
    "rubric": RUBRIC_PROMPT,
    "reference": REFERENCE_PROMPT,
    "question": QUESTION_PROMPT,
    "custom": CUSTOM_PROMPT,
}

END_PROMPT = """Remember to always end your response with nothing but a float score.
Float score:"""
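

# Usage sketch (hypothetical, for illustration only): the templates above are
# filled in by agbenchmark's evaluation code elsewhere; the helper below is an
# assumed example of how the pieces could fit together, not this module's
# actual API. Note that CUSTOM_PROMPT expects a {custom} field rather than
# {task}/{answer}/{response}, so it would need separate handling.
def build_eval_prompt(
    prompt_type: str, scoring_type: str, task: str, answer: str, response: str
) -> str:
    """Assemble a full evaluation prompt from the "rubric", "reference", or
    "question" templates defined in this module."""
    template = PROMPT_MAP[prompt_type]   # e.g. "rubric", "reference", "question"
    scoring = SCORING_MAP[scoring_type]  # e.g. "percentage", "scale", "binary"
    body = template.format(
        scoring=scoring, task=task, answer=answer, response=response
    )
    return body + END_PROMPT


# Example (illustrative):
#   prompt = build_eval_prompt(
#       "reference", "scale", task, ideal_answer, agent_response
#   )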