diff --git a/evoprompt/evolution.py b/evoprompt/evolution.py
index 492947684ed4c07de34a38f41bca9b00ae4675b0..a187a6724f6c04ac946bce69667a50a6c34e504e 100644
--- a/evoprompt/evolution.py
+++ b/evoprompt/evolution.py
@@ -16,9 +16,7 @@ from evoprompt.utils import get_all_subclasses, log_calls
 
 logger = logging.getLogger(__name__)
 
-SYSTEM_MESSAGE = (
-    "Please follow the instruction step-by-step to generate a better prompt."
-)
+SYSTEM_MESSAGE = "Please carefully follow the instruction step-by-step."
 
 GA_PROMPT = """
 1. Cross over the following prompts and generate a new prompt:
@@ -28,6 +26,14 @@
 Prompt 2: {prompt2}
 """
 
+DE_COT_PROMPTS = [
+    "Step 1: Identify the different parts between the Prompt 1 and Prompt 2:\nPrompt 1: {prompt1}\nPrompt 2: {prompt2}",
+    "Step 2: Randomly mutate the different parts",
+    "Step 3: Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.\nPrompt 3: {prompt3}",
+    "Step 4: Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\nBasic Prompt: {basic_prompt}",
+]
+
+
 class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
     shorthand: str
 
@@ -131,7 +137,7 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
         # Line 8: Return the best prompt, p∗, among the final population P_T:
         # p∗ ← argmax_{p ∈ P_T} f(p, D)
         p = max(self.P[-1], key=lambda prompt: self.all_prompts[prompt.id].score)
-        logger.info(f"Best prompt: {p}")
+        logger.info("Best prompt with score %.2f: %s", p.score, p)
 
         # We pick the prompt with the highest score on the development set and report its score on the test set.
         test_performance, _, _ = self.task.evaluate_test(p.content)
@@ -149,7 +155,7 @@
 
 
 class GeneticAlgorithm(EvolutionAlgorithm):
-    """The genetic algorithm implemented using LLMs."""
+    """The genetic algorithm (GA) implemented using LLMs."""
 
     shorthand = "ga"
 
@@ -171,7 +177,7 @@ class GeneticAlgorithm(EvolutionAlgorithm):
         # Based on this two-step process, we design instructions, guiding LLMs to
         # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
 
-        evolved_prompt, usage = self.evolution_model(
+        evolved_prompt, _, usage = self.evolution_model(
             system_message=SYSTEM_MESSAGE,
             prompt=GA_PROMPT.format(prompt1=prompt_1, prompt2=prompt_2),
         )
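To make the DE-CoT templates above concrete, here is a minimal sketch of the four user turns they produce once formatted. The sample prompt strings are invented; in the real flow each turn is sent to `self.evolution_model` with the accumulated `history`, so the model sees its own earlier answers:

```python
# Hypothetical walk-through of the DE-CoT turns. The templates are the
# DE_COT_PROMPTS defined above; the concrete prompt strings are made up.
DE_COT_PROMPTS = [
    "Step 1: Identify the different parts between the Prompt 1 and Prompt 2:\nPrompt 1: {prompt1}\nPrompt 2: {prompt2}",
    "Step 2: Randomly mutate the different parts",
    "Step 3: Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.\nPrompt 3: {prompt3}",
    "Step 4: Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\nBasic Prompt: {basic_prompt}",
]

turns = [
    DE_COT_PROMPTS[0].format(
        prompt1="Classify the sentiment of the text.",
        prompt2="Label the sentiment expressed in the review.",
    ),
    DE_COT_PROMPTS[1],
    DE_COT_PROMPTS[2].format(prompt3="Decide whether the review is positive or negative."),
    DE_COT_PROMPTS[3].format(basic_prompt="Classify the sentiment of the text."),
]
for i, turn in enumerate(turns, start=1):
    print(f"--- user turn {i} ---\n{turn}\n")
```

The final assistant answer is then trimmed to the text between `<prompt>` and `</prompt>`, as the step-4 template requests.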
@@ -211,11 +217,11 @@ class GeneticAlgorithm(EvolutionAlgorithm):
 
 
 class DifferentialEvolution(EvolutionAlgorithm):
-    """The genetic algorithm implemented using LLMs."""
+    """The differential evolution algorithm (DE) implemented using LLMs."""
 
     shorthand = "de"
 
-    @log_calls("Performing prompt evolution using GA")
+    @log_calls("Performing prompt evolution using DE")
     def evolve(
         self,
         prompt_1: str,
@@ -231,7 +237,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
             prompts_current_evolution, key=lambda prompt: prompt.score
         )
 
-        evolved_prompt, usage = self.evolution_model(
+        evolved_prompt, _, usage = self.evolution_model(
             system_message=SYSTEM_MESSAGE,
             prompt=get_de_prompt_template(self.use_evolution_demo, self.task).format(
                 prompt1=prompt_1,
@@ -254,7 +260,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
 
         return evolved_prompt, usage
 
-    @log_calls("Performing update for GA")
+    @log_calls("Performing update for DE")
     def update(
         self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
     ):
@@ -269,13 +275,81 @@
         return population
 
 
+class DifferentialEvolutionWithCot(DifferentialEvolution):
+    """The differential evolution algorithm with Chain-of-Thought (DE-CoT) implemented using LLMs."""
+
+    shorthand = "de-cot"
+
+    @log_calls("Performing prompt evolution using DE-CoT")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        # TODO add description from paper
+
+        # DE needs the best prompt of the current evolution for step 3
+        best_prompt_current_evolution = max(
+            prompts_current_evolution, key=lambda prompt: prompt.score
+        )
+
+        # step 1: let the model spell out the differing parts of the two prompts
+        response, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[0].format(
+                prompt1=prompt_1,
+                prompt2=prompt_2,
+            ),
+        )
+        # step 2: mutate the differing parts, continuing the same conversation
+        response, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[1],
+            history=messages,
+        )
+        # step 3: combine the mutated parts with the best prompt of this evolution
+        response, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[2].format(
+                prompt3=best_prompt_current_evolution,
+                basic_prompt=prompts_current_evolution[current_iteration],
+            ),
+            history=messages,
+        )
+        # step 4: cross over with the basic prompt; generation stops at </prompt>
+        evolved_prompt, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[3].format(
+                basic_prompt=prompts_current_evolution[current_iteration],
+            ),
+            history=messages,
+            stop="</prompt>",
+        )
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+
+        logger.info(
+            "DE-CoT-evolved prompts '%s', '%s' and '%s' with basic prompt '%s' into '%s'",
+            prompt_1,
+            prompt_2,
+            best_prompt_current_evolution,
+            prompts_current_evolution[current_iteration],
+            evolved_prompt,
+        )
+
+        return evolved_prompt, usage
+
+
 optimizers = {
     algorithm.shorthand: algorithm
     for algorithm in get_all_subclasses(EvolutionAlgorithm)
 }
 
 
-def get_optimizer_class(name: str):
+def get_optimizer_class(name: str) -> type[EvolutionAlgorithm]:
     if name not in optimizers:
         raise ValueError("Optimization Algorithm %s does not exist", name)
     return optimizers[name]
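For reference, the `optimizers` registry relies on recursive subclass discovery, which is what lets `DifferentialEvolutionWithCot` register itself even though it derives from `DifferentialEvolution` rather than from `EvolutionAlgorithm` directly (the module-level duplicate of `get_all_subclasses` was dropped above, since the function is already imported from `evoprompt.utils`). A self-contained sketch of the pattern; the class names here are illustrative, only the `shorthand` convention mirrors the real code:

```python
from abc import ABCMeta


def get_all_subclasses(cls):
    # walk the inheritance tree recursively, like evoprompt.utils.get_all_subclasses
    return set(cls.__subclasses__()).union(
        s for c in cls.__subclasses__() for s in get_all_subclasses(c)
    )


class Algorithm(metaclass=ABCMeta):
    shorthand: str


class Genetic(Algorithm):
    shorthand = "ga"


class Differential(Algorithm):
    shorthand = "de"


class DifferentialCot(Differential):  # a grandchild is still discovered
    shorthand = "de-cot"


registry = {algorithm.shorthand: algorithm for algorithm in get_all_subclasses(Algorithm)}
assert registry["de-cot"] is DifferentialCot
```

A plain `cls.__subclasses__()` would miss the grandchild, so the recursive walk is what keeps new algorithm variants registerable without touching the registry code.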
diff --git a/evoprompt/models.py b/evoprompt/models.py
index e29b212fb18fdcfbf85d212f9a9ecb8e4b64e3f1..43f39f54b0e0b9e4891aa1ec4f33c716c58476b4 100644
--- a/evoprompt/models.py
+++ b/evoprompt/models.py
@@ -114,8 +114,9 @@ class Llama(LLMModel):
         prompt_prefix: str = "",
         prompt_suffix: str = "",
         chat: bool | None = None,
-        stop: str = "</prompt>",
+        stop: str | None = None,
         max_tokens: int = 200,
+        history: list[dict] | None = None,
         **kwargs: Any,
-    ) -> tuple[str, ModelUsage]:
+    ) -> tuple[str, list[dict], ModelUsage]:
         if chat is None:
@@ -128,7 +129,11 @@ class Llama(LLMModel):
                 "content": prompt_prefix + prompt + prompt_suffix + prompt_appendix,
             }
         ]
-        if system_message:
+        # a history is prepended to the messages; we assume it already includes a system message, so we never add one in this case
+        # TODO is it better to check for a system message?
+        if history is not None:
+            messages = history + messages
+        elif system_message:
             messages.insert(
                 0,
                 {
@@ -144,20 +149,37 @@ class Llama(LLMModel):
             )
             response_text = response["choices"][0]["message"]["content"]
+            # record the assistant turn as well so chat histories can be chained
+            messages.append({"role": "assistant", "content": response_text})
         else:
-            response = self.model.create_completion(
-                prompt=(system_message if system_message else "")
+            prompt = (
+                (system_message if system_message else "")
                 + prompt_prefix
                 + prompt
                 + prompt_suffix
-                + prompt_appendix,
+                + prompt_appendix
+            )
+            response = self.model.create_completion(
+                prompt=prompt,
                 stop=stop,
                 max_tokens=max_tokens,
                 **kwargs,
             )
             response_text = response["choices"][0]["text"]
+            messages = [
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ]
+            messages.append(
+                {
+                    "role": "assistant",
+                    "content": response_text,
+                }
+            )
         # input(f"Response: {response_text}")
         usage = ModelUsage(**response["usage"])
-        return response_text, usage
+        return response_text, messages, usage
 
     @classmethod
     def register_arguments(cls, parser: ArgumentParser):
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index 1a8ef797acf6739dc6fb1521a8e91b82e4cd0f1a..5c0eda4b4fcbb118607c272b9f214945c67400fb 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -34,7 +34,7 @@ def paraphrase_prompts(
         if num_tries >= max_tries:
             break
         num_tries += 1
-        paraphrase, usage = model(
+        paraphrase, _, usage = model(
            system_message=PARAPHRASE_PROMPT,
            prompt=prompt,
            prompt_prefix=' Instruction: "',
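The widened return value `(response_text, messages, usage)` is what enables the multi-turn DE-CoT flow: callers thread the returned `messages` back in as `history` on the next call. Note that, as diffed, `history` is only prepended on the chat path; the plain-completion path builds its prompt string without it. A rough sketch of the calling convention with a stubbed model; the stub is hypothetical, only the signature and return shape mirror the diff:

```python
def stub_model(system_message, prompt, history=None):
    # stand-in for Llama.__call__ returning (response_text, messages, usage)
    if history is not None:
        messages = list(history)  # assume the history already carries the system message
    elif system_message:
        messages = [{"role": "system", "content": system_message}]
    else:
        messages = []
    messages.append({"role": "user", "content": prompt})
    response_text = f"<answer to {prompt!r}>"
    messages.append({"role": "assistant", "content": response_text})
    return response_text, messages, {"total_tokens": 0}


_, messages, _ = stub_model("You are helpful.", "Step 1: compare the prompts")
_, messages, _ = stub_model("You are helpful.", "Step 2: mutate the differences", history=messages)
# the transcript now carries both exchanges in order
assert [m["role"] for m in messages] == ["system", "user", "assistant", "user", "assistant"]
```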
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index 58d8c77c22bdf590451d04216b20091061444ca1..4d7450e47e12ac8d7f5b94251e8d1cd75c1e59c6 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -319,8 +319,7 @@ class Task(metaclass=ABCMeta):
     def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
         # run model for inference using grammar to constrain output
         # TODO grammar also depends on prompt and vice-versa -> what are good labels?
-
-        response, usage = self.model(
+        response, _, usage = self.model(
             system_message=SYSTEM_MESSAGE,
             prompt=prompt,
             prompt_appendix=self._get_prompt_text_for_datum(datum),
diff --git a/evoprompt/utils.py b/evoprompt/utils.py
index 3eefa2e9c709a75bff442f70880df3f1c0336347..16efd8886426ed7d18444a9c5786280b3cd85515 100644
--- a/evoprompt/utils.py
+++ b/evoprompt/utils.py
@@ -49,9 +49,9 @@ def initialize_run_directory(model: Callable):
     if file_handler is not None:
         logger.removeHandler(file_handler)
 
-    response, usage = model(None, run_name_prompt)
+    response, _, _ = model(None, run_name_prompt)
     run_name_match = re.search(r"^\w+$", response, re.MULTILINE)
-    existing_run_names = os.listdir(RUNS_DIR)
+    existing_run_names = os.listdir(RUNS_DIR) if RUNS_DIR.exists() else []
     if run_name_match is None or run_name_match.group(0) in existing_run_names:
         run_name = uuid4().hex
     else:
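Finally, the utils.py hunk guards against a missing runs directory; around it, `initialize_run_directory` accepts a model-proposed run name only if it is a single word and not already taken, otherwise it falls back to a random hex name. A condensed sketch of that selection logic, with invented sample responses:

```python
import re
from uuid import uuid4


def choose_run_name(response: str, existing_run_names: list[str]) -> str:
    # accept a line of the model response that is a single \w+ token
    match = re.search(r"^\w+$", response, re.MULTILINE)
    if match is None or match.group(0) in existing_run_names:
        return uuid4().hex  # collision or no usable name: fall back to a unique hex id
    return match.group(0)


print(choose_run_name("sunny_delta\n", []))               # -> sunny_delta
print(choose_run_name("sunny_delta\n", ["sunny_delta"]))  # -> random 32-char hex
```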