diff --git a/evoprompt/evolution/evolution.py b/evoprompt/evolution/evolution.py
index 7eae7c2af18d1fbd7fc1eb4e6852f64a66f13e28..d3e0ec338c8356d3062868034770abd29ecd3fa6 100644
--- a/evoprompt/evolution/evolution.py
+++ b/evoprompt/evolution/evolution.py
@@ -108,6 +108,7 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
         end_token: str = "</prompt>",
         allow_missing_end_token: bool = True,
     ):
+        # TODO another option would be to select the first match that is not equal to " and " (which is part of the instruction and usually repeated in the response)
         matches = re.findall(
             # regex that matches any characters between last pair of `start_token` and `end_token`, and optionally allow missing `end_token`
             rf"{start_token}(?!.*{start_token})(?:(.*){end_token}{"|(.*)" if allow_missing_end_token else ""})",
@@ -297,7 +298,7 @@ class GeneticAlgorithm(EvolutionAlgorithm):
         )
 
         judgement = self.judge_and_correct_step(
-            filled_prompt, response, history, recent_turn
+            filled_prompt, response, history=None, recent_turn=recent_turn
         )
 
         if judgement.skip:
@@ -410,11 +411,11 @@ class DifferentialEvolution(EvolutionAlgorithm):
         )
 
         judgement = self.judge_and_correct_step(
-            filled_prompt, response, history, recent_turn
+            filled_prompt, response, history=None, recent_turn=recent_turn
         )
 
         if judgement.skip:
-            # skip this prompt, for DE this means using the basic prompt
+            # user asked to skip this prompt, for DE this means using the basic prompt
             return (
                 prompts_current_evolution[current_iteration].content,
                 [judgement],
@@ -532,11 +533,11 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
                 self.evolution_model._get_user_message(filled_prompt)
             )
             # TODO Shall we still use only a single turn containing all messages if we do not use demonstrations for evolution?
-            messages = self.condense_messages(evolutions_steps)
+            prompt = self.condense_messages(evolutions_steps, return_str=True)
             response, history, recent_turn, usage = (
                 self.evolution_model.create_completion(
                     system_message=SYSTEM_MESSAGE,
-                    messages=messages,
+                    prompt=prompt,
                     history=messages_demos,
                     # the models often repeat the instruction which could also contain </prompt> therefore we should not stop early
                     stop=None,
@@ -552,13 +553,18 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
                 history + recent_turn,
                 response,
             )
+            # TODO use serialized messages as prompt or use previous evolution steps as history?
             judgement = self.judge_and_correct_step(
-                filled_prompt, response, history=history, recent_turn=recent_turn
+                filled_prompt,
+                response,
+                history=evolutions_steps[:-2],
+                recent_turn=recent_turn,
+                # prompt, response, history=None, recent_turn=recent_turn
             )
             judgements.append(judgement)
 
             if judgement.skip:
-                # skip this prompt, for DE this means using the basic prompt
+                # user asked to skip this prompt, for DE this means using the basic prompt
                 return (
                     prompts_current_evolution[current_iteration].content,
                     judgements,
@@ -595,8 +601,12 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
 
         return evolved_prompt, judgements, usage
 
-    def condense_messages(self, messages: list[ChatMessages]) -> list[dict]:
+    def condense_messages(
+        self, messages: list[ChatMessages], return_str: bool = False
+    ) -> list[dict] | str:
         if not messages:
+            if return_str:
+                return ""
             return []
 
         if messages[-1]["role"] == "assistant":
@@ -606,6 +616,12 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
             assistant_turn = None
             user_turn = "\n\n".join(message["content"] for message in messages)
 
+        if return_str:
+            assert (
+                assistant_turn is None
+            ), "Cannot return string if most recent turn is from assistant."
+            return user_turn
+
         messages = [self.evolution_model._get_user_message(user_turn)]
         if assistant_turn is not None:
             messages.append(assistant_turn)
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index ffeded99da45d76fd73728b9f9b9a174ef4a5e82..de7ffb5d3748ca6851b9456286708f08dd5aa5b6 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -67,16 +67,17 @@ class ResponseEditor(App):
 
     def compose(self) -> ComposeResult:
         self.text_area = TextArea.code_editor(self.response, soft_wrap=True)
-        yield ScrollableContainer(
-            *(
-                Collapsible(
-                    Static(message["content"]),
-                    title=message["role"],
-                    collapsed=idx != len(self.history) - 1,
+        if self.history is not None:
+            yield ScrollableContainer(
+                *(
+                    Collapsible(
+                        Static(message["content"]),
+                        title=message["role"],
+                        collapsed=idx != len(self.history) - 1,
+                    )
+                    for idx, message in enumerate(self.history)
                 )
-                for idx, message in enumerate(self.history)
             )
-        )
         yield ScrollableContainer(
             Label(Panel(self.judge_response, title="Judge response")),
             Label(Rule(title="Response to edit"), expand=True),
@@ -347,7 +348,7 @@ class PromptOptimization:
         self,
         instruction: str,
         response: str,
-        history: ChatMessages,
+        history: ChatMessages | None,
         recent_turn: ChatMessages,
     ) -> Judgement:
         # TODO potentially move to separate class wrapping the judge model and related functionality
@@ -357,27 +358,37 @@ class PromptOptimization:
         # judge the actual response
         # concatenate all user and assistant messages to provide context
-        history_str = "\n".join(
-            message["content"]
-            for message in history
-            if message["role"] in ["user", "assistant"]
-        )
-        # TODO What if the history does not exist (is empty), i.e., for the first step in de-cot?
-        prompt = (
-            f"Context: {history_str}\nInstruction: {instruction}\nResponse: {response}"
-        )
-        system_message = (
-            "You are acting as a judge. Please read the context, the instruction and the response "
-            "and decide if the response follows the instruction. "
-            "If it does, answer 'good'. If it does not, answer 'bad'. "
-            "Wrap the answer with tags <judgement> and </judgement>. "
-            "Please also add an explanation for your judgement."
-        )
+        # if there is no history, only show instruction and response
+        if history:
+            history_str = "\n".join(
+                message["content"]
+                for message in history
+                if message["role"] in ["user", "assistant"]
+            )
+            prompt = f"Context:\n{history_str}\n\nInstruction:\n{instruction}\n\nResponse:\n{response}"
+            system_message = (
+                "You are acting as a judge. Please read the context, the instruction and the response "
+                "and decide if the response follows the instruction. "
+                "If it does, answer 'good'. If it does not, answer 'bad'. "
+                "Wrap the answer with tags <judgement> and </judgement>. "
+                "Please also add an explanation for your judgement."
+            )
+        else:
+            prompt = f"Instruction:\n{instruction}\n\nResponse:\n{response}"
+            system_message = (
+                "You are acting as a judge. Please read the instruction and the response "
+                "and decide if the response follows the instruction. "
+                "If it does, answer 'good'. If it does not, answer 'bad'. "
+                "Wrap the answer with tags <judgement> and </judgement>. "
+                "Please also add an explanation for your judgement."
+            )
+        # input(f"System message:\n{system_message}\n\nPrompt:\n{prompt}\n")
 
         judgement_response, _, _, _ = self.judge_model.create_completion(
             system_message=system_message,
             prompt=prompt,
         )
+        # input(f"Judgement response:\n{judgement_response}\n")
 
         matches = re.findall(
             # regex that matches `good` and `bad` between <judgement> and </judgement> where additional characters can be present, e.g., whitespace
             r"<judgement>.*(good|bad).*</judgement>",
@@ -407,7 +418,7 @@ class PromptOptimization:
         editor = ResponseEditor(
             instruction,
             response,
-            history[:-1],
+            history[:-1] if history is not None else None,
             judge_response=judgement_response,
         )
         editor.run()
@@ -418,13 +429,12 @@ class PromptOptimization:
         delta = Differ().compare(
             response.splitlines(), editor.modified_response.splitlines()
         )
+        delta = [
+            line for line in delta if line.startswith("+") or line.startswith("-")
+        ]
         logger.info(
-            "User corrected prompt (delta):\n%s",
-            "\n".join(
-                line
-                for line in delta
-                if line.startswith("+") or line.startswith("-")
-            ),
+            "User corrected prompt (delta):%s",
+            ("\n" + "\n".join(delta)) if delta else " No changes",
             # "User corrected prompt:\n'%s'\n -> \n'%s'",
             # response,
             # editor.modified_response,
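# ---------------------------------------------------------------------------
# Reviewer note (not part of the patch): a minimal, self-contained sketch of
# the `return_str` behavior this diff adds to `condense_messages`, assuming
# messages are plain role/content dicts. `ChatMessage` and
# `condense_messages_sketch` are hypothetical stand-ins for the project's
# actual types and method; only the control flow mirrors the patched code.

ChatMessage = dict[str, str]


def condense_messages_sketch(
    messages: list[ChatMessage], return_str: bool = False
) -> list[ChatMessage] | str:
    # empty history: string mode yields "", list mode yields []
    if not messages:
        return "" if return_str else []

    # split off a trailing assistant turn so only user content is condensed
    if messages[-1]["role"] == "assistant":
        assistant_turn = messages[-1]
        messages = messages[:-1]
    else:
        assistant_turn = None

    # join all remaining turns into a single user message
    user_turn = "\n\n".join(message["content"] for message in messages)

    if return_str:
        # mirrors the assert in the patch: string mode requires a user-final history
        assert assistant_turn is None, "Cannot return string if most recent turn is from assistant."
        return user_turn

    condensed = [{"role": "user", "content": user_turn}]
    if assistant_turn is not None:
        condensed.append(assistant_turn)
    return condensed


if __name__ == "__main__":
    steps = [
        {"role": "user", "content": "Evolve prompt A."},
        {"role": "user", "content": "Evolve prompt B."},
    ]
    # string mode, as used above for the new `prompt=` argument to create_completion
    print(condense_messages_sketch(steps, return_str=True))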