From da2cc6ed70dbb38d9de8a16134eab51a00157a60 Mon Sep 17 00:00:00 2001
From: Maximilian Schmidt <maximilian.schmidt@ims.uni-stuttgart.de>
Date: Thu, 22 Aug 2024 15:01:52 +0200
Subject: [PATCH] Add judgement using judge model and update editor rendering

---
 evoprompt/optimization.py | 71 ++++++++++++++++++++++++++++++++-------
 1 file changed, 59 insertions(+), 12 deletions(-)

diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index 94ef340..4f41949 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -1,11 +1,15 @@
 import json
 import logging
 from pathlib import Path
+import re
 from typing import Any, Literal, NamedTuple, Optional, TypedDict
 
 from textual.app import App, ComposeResult
 from textual.binding import Binding
-from textual.widgets import Collapsible, Footer, Label, TextArea
+from textual.containers import ScrollableContainer
+from textual.widgets import Collapsible, Footer, Label, TextArea, Static
+from rich.panel import Panel
+from rich.rule import Rule
 from tqdm import tqdm, trange
 
 from evoprompt.models import ChatMessages, LLMModel
@@ -40,21 +44,36 @@ class ResponseEditor(App):
         ),
     ]
 
-    def __init__(self, instruction: str, original_response: str, history: ChatMessages):
+    def __init__(
+        self,
+        instruction: str,
+        original_response: str,
+        history: ChatMessages,
+        judge_response: str,
+    ):
         self.instruction = instruction
         self.response = original_response
         self.history = history
+        self.judge_response = judge_response
         super().__init__()
 
     def compose(self) -> ComposeResult:
-        self.text_area = TextArea.code_editor(self.response)
-        for idx, message in enumerate(self.history[:-1]):
-            yield Collapsible(
-                Label(message["content"]),
-                title=message["role"],
-                collapsed=idx != len(self.history) - 2,
+        self.text_area = TextArea.code_editor(self.response, soft_wrap=True)
+        yield ScrollableContainer(
+            *(
+                Collapsible(
+                    Static(message["content"]),
+                    title=message["role"],
+                    collapsed=idx != len(self.history) - 1,
+                )
+                for idx, message in enumerate(self.history)
             )
-        yield self.text_area
+        )
+        yield ScrollableContainer(
+            Label(Panel(self.judge_response, title="Judge response")),
+            Label(Rule(title="Response to edit"), expand=True),
+            self.text_area,
+        )
         yield Footer()
 
     @property
@@ -281,18 +300,46 @@ class PromptOptimization:
         if self.judge_model is None:
             return Judgement(response, response, happy=None)
 
-        # TODO: judge the actual response
-        judge_happy = False
+        # judge the actual response
+        prompt = f"Instruction: {instruction}\nResponse: {response}"
+        system_message = (
+            "You are acting as a judge. Please read the instruction and the response and decide "
+            "if the response follows the instruction. "
+            "If it does, answer 'good'. If it does not, answer 'bad'. "
+            "Wrap the answer with tags <judgement> and </judgement>. "
+            "Please also add an explanation for your judgement."
+        )
+        judgement_response, _, _ = self.judge_model.create_completion(
+            system_message=system_message,
+            prompt=prompt,
+        )
+        matches = re.findall(
+            # regex that matches `good` and `bad` between <judgement> and </judgement> where additional characters can be present, e.g., whitespace
+            r"<judgement>.*(good|bad).*</judgement>",
+            judgement_response,
+            flags=(re.IGNORECASE),
+        )
+        # parse the judgement response
+        if matches:
+            judge_happy = True if matches[0].lower() == "good" else False
+        else:
+            judge_happy = None
 
         logger.info(
             f"{self.judge_model.__class__.__name__} judged the response as {'good' if judge_happy else 'bad'}"
         )
+
         if judge_happy:
             return Judgement(response, response, happy=True)
 
         logger.info(f"Prompt judged as bad. Letting User change the prompt.")
 
-        editor = ResponseEditor(instruction, response, history)
+        editor = ResponseEditor(
+            instruction,
+            response,
+            history[:-1],
+            judge_response=judgement_response,
+        )
         editor.run()
 
         return Judgement(response, editor.modified_response, happy=False)
-- 
GitLab
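Note (not part of the patch): a minimal standalone sketch of the <judgement> tag
parsing this commit introduces, for trying the regex in isolation. The helper name
parse_judgement and the sample judge replies are illustrative assumptions, not
identifiers from the repository.

    import re

    # Mirrors the parsing step added in the patch: True for "good",
    # False for "bad", None when no verdict tag can be found.
    def parse_judgement(judgement_response: str) -> bool | None:
        matches = re.findall(
            r"<judgement>.*(good|bad).*</judgement>",
            judgement_response,
            flags=re.IGNORECASE,
        )
        if matches:
            return matches[0].lower() == "good"
        return None

    print(parse_judgement("The response follows the task. <judgement>good</judgement>"))  # True
    print(parse_judgement("No tags in this reply"))  # None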