From 8943980c379e50654c8c7ed2b8fc983a17388aa8 Mon Sep 17 00:00:00 2001
From: Maximilian Schmidt <maximilian.schmidt@ims.uni-stuttgart.de>
Date: Thu, 29 Aug 2024 12:33:41 +0200
Subject: [PATCH] Add demonstration data for GA and DE-CoT

---
 evoprompt/api/backend.py               |   3 +-
 evoprompt/evolution.py                 |   2 +-
 evoprompt/evolution/__init__.py        |  29 ++
 evoprompt/evolution/evolution.py       | 437 +++++++++++++++++++++++++
 evoprompt/evolution/template.py        |   6 +
 evoprompt/evolution/template_de.py     |  67 ++++
 evoprompt/evolution/template_de_cot.py |  97 ++++++
 evoprompt/evolution/template_ga.py     |  29 ++
 evoprompt/template_de.py               | 115 -------
 9 files changed, 667 insertions(+), 118 deletions(-)
 create mode 100644 evoprompt/evolution/__init__.py
 create mode 100644 evoprompt/evolution/evolution.py
 create mode 100644 evoprompt/evolution/template.py
 create mode 100644 evoprompt/evolution/template_de.py
 create mode 100644 evoprompt/evolution/template_de_cot.py
 create mode 100644 evoprompt/evolution/template_ga.py
 delete mode 100644 evoprompt/template_de.py

diff --git a/evoprompt/api/backend.py b/evoprompt/api/backend.py
index 2c9e0eb..6248660 100644
--- a/evoprompt/api/backend.py
+++ b/evoprompt/api/backend.py
@@ -1,5 +1,4 @@
 import asyncio
-from argparse import Namespace
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import asynccontextmanager
 from typing import ClassVar
@@ -8,7 +7,7 @@ from fastapi import FastAPI
 
 from evoprompt.api import config
 from evoprompt.cli import argument_parser
-from evoprompt.evolution import GeneticAlgorithm
+from evoprompt.evolution.evolution import GeneticAlgorithm
 from evoprompt.models import Llama, LLMModel
 from evoprompt.task.sentiment_analysis import SST2
 
diff --git a/evoprompt/evolution.py b/evoprompt/evolution.py
index 96b072f..0bf36db 100644
--- a/evoprompt/evolution.py
+++ b/evoprompt/evolution.py
@@ -6,11 +6,11 @@ from typing import Any
 from tqdm import trange
 
 from evoprompt.cli import argument_parser
+from evoprompt.evolution.template_de import get_de_prompt_template
 from evoprompt.models import LLMModel
 from evoprompt.opt_types import ModelUsage, Prompt
 from evoprompt.optimization import Judgement, PromptOptimization
 from evoprompt.task import Task
-from evoprompt.template_de import get_de_prompt_template
 from evoprompt.utils import get_all_subclasses, get_rng, log_calls
 
 logger = logging.getLogger(__name__)
diff --git a/evoprompt/evolution/__init__.py b/evoprompt/evolution/__init__.py
new file mode 100644
index 0000000..9da4b2c
--- /dev/null
+++ b/evoprompt/evolution/__init__.py
@@ -0,0 +1,29 @@
+from evoprompt.cli import argument_parser
+from evoprompt.evolution.evolution import (
+    DifferentialEvolution,
+    DifferentialEvolutionWithCot,
+    EvolutionAlgorithm,
+    GeneticAlgorithm,
+)
+from evoprompt.utils import get_all_subclasses
+
+optimizers = {
+    algorithm.shorthand: algorithm
+    for algorithm in get_all_subclasses(EvolutionAlgorithm)
+}
+
+
+def get_optimizer_class(name: str) -> type[EvolutionAlgorithm]:
+    if name not in optimizers:
+        raise ValueError("Optimization Algorithm %s does not exist", name)
+    return optimizers[name]
+
+
+argument_parser.add_argument(
+    "--evolution-algorithm", "-a", type=str, choices=optimizers.keys(), default="ga"
+)
+argument_parser.add_argument(
+    "--use-evolution-demo",
+    action="store_true",
+    help="Whether to prepend a single demonstration example for evolution or not",
+)
diff --git a/evoprompt/evolution/evolution.py b/evoprompt/evolution/evolution.py
new file mode 100644
index 0000000..eb969f8
--- /dev/null
+++ b/evoprompt/evolution/evolution.py
@@ -0,0 +1,437 @@
+import logging
+import re
+from abc import ABCMeta, abstractmethod
+from typing import Any
+
+from tqdm import trange
+
+from evoprompt.evolution.template import get_demonstration_prompt_template
+from evoprompt.evolution.template_de import (
+    DE_DEMONSTRATION_DATA_CLS,
+    DE_DEMONSTRATION_DATA_SIM,
+    DE_PROMPT,
+)
+from evoprompt.evolution.template_de_cot import (
+    DE_COT_DEMONSTRATION_DATA_CLS,
+    DE_COT_DEMONSTRATION_DATA_SIM,
+    DE_COT_PROMPTS,
+)
+from evoprompt.evolution.template_ga import (
+    GA_DEMONSTRATION_DATA_CLS,
+    GA_DEMONSTRATION_DATA_SIM,
+    GA_PROMPT,
+)
+from evoprompt.models import LLMModel
+from evoprompt.opt_types import ModelUsage, Prompt
+from evoprompt.optimization import Judgement, PromptOptimization
+from evoprompt.task import Task
+from evoprompt.task.simplification import Simplification
+from evoprompt.task.summarization import Summarization
+from evoprompt.task.text_classification import TextClassification
+from evoprompt.utils import get_rng, log_calls
+
+logger = logging.getLogger(__name__)
+
+
+SYSTEM_MESSAGE = "Please carefully follow the instruction step-by-step."
+
+
+class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
+    shorthand: str
+
+    """The super class for all evolution algorithms containing shared parameters."""
+
+    def __init__(
+        self,
+        population_size: int,
+        *,
+        task: Task,
+        evolution_model: LLMModel,
+        evaluation_model: LLMModel,
+        judge_model: LLMModel | None,
+        run_options: dict[str, Any] = {},
+    ) -> None:
+        super().__init__(
+            task=task,
+            evolution_model=evolution_model,
+            evaluation_model=evaluation_model,
+            judge_model=judge_model,
+            run_options=run_options,
+        )
+        self.use_evolution_demo = run_options.get("use_evolution_demo", False)
+
+        self.population_size = population_size
+
+    @log_calls("Performing selection")
+    def select(self, prompts: list[Prompt]):
+        # In GA, two parent solutions are normally selected based on the roulette wheel
+        # selection method according to the fitness value (Lipowski & Lipowska, 2012).
+        # Similar to this, we utilize the roulette wheel selection method to select
+        # two parent prompts in the current population according to the scores evaluated
+        # on development sets. Specifically, let si denote the performance score on the
+        # development set of the i-th prompt in the population, which contains a total
+        # of N prompts. The probability of selecting the i-th prompt as a parent can be expressed as
+        # pi = si / Σj=1->N sj.
+        # add small value to avoid zero chance of selection for some prompts
+        scores = [prompt.score + 1e-6 for prompt in prompts]
+        selection_probabilities = [score / sum(scores) for score in scores]
+        return get_rng().choice(
+            prompts, size=2, replace=False, p=selection_probabilities
+        )
+
+    @abstractmethod
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ) -> tuple[str, list[Judgement], ModelUsage]:
+        pass
+
+    @abstractmethod
+    def update(self, *args, **kwargs):
+        pass
+
+    def run(self, num_iterations: int, debug: bool = False) -> None:
+        # debug mode for quick run
+        if debug:
+            self.population_size = 3
+            num_iterations = 2
+
+        self.init_run(self.population_size, num_iterations, debug=debug)
+
+        # Algorithm 1 Discrete prompt optimization: EVOPROMPT
+
+        # Line 2:
+        for t in self.iterations_pbar:
+            # Line 3: Selection: select a certain number of prompts from current population as parent prompts
+            # pr1,...,prk ∼ Pt−1
+            prompts_current_evolution = self.P[t - 1]
+
+            new_evolutions = []
+
+            for i in trange(self.population_size, desc="updates", leave=False):
+                # for both GA and DE we start with two parent prompts
+                pr1, pr2 = self.select(self.P[t - 1])
+
+                # Line 4: Evolution: generate a new prompt based on the selected parent prompts by leveraging LLM to perform evolutionary operators
+                # p′i ←Evo(pr1,...,prk)
+                (
+                    p_i,
+                    judgements,
+                    evolution_usage,
+                ) = self.evolve(
+                    pr1,
+                    pr2,
+                    prompts_current_evolution=prompts_current_evolution,
+                    current_iteration=i,
+                )
+                self.total_evolution_usage += evolution_usage
+
+                prompt_source = (
+                    "corrected" if not all(j.happy for j in judgements) else "generated"
+                )
+                evolved_prompt = self.add_prompt(
+                    p_i,
+                    parents=(pr1, pr2),
+                    meta={"gen": t, "source": prompt_source, "judgements": judgements},
+                )
+                self.total_evaluation_usage += evolved_prompt.usage
+
+                new_evolutions.append(evolved_prompt)
+                self.save_snapshot()
+            # Line 6: Update based on the evaluation scores
+            # Pt ← {Pt−1, p′i} and St ← {St−1, s′i}
+            new_population = self.update(new_evolutions, prompts_current_evolution)
+
+            # store new generation
+            self.P.append(new_population)
+            self.save_snapshot()
+
+        self.save_snapshot()
+        # Line 8: Return the best prompt, p∗, among the final population PT :
+        # p∗ ← argmaxp∈PT f(p, D)
+        p = max(self.P[-1], key=lambda prompt: self.all_prompts[prompt.id].score)
+        logger.info("Best prompt with score %.2f: %s", p.score, p)
+
+        # We pick the prompt with the highest score on the development set and report its score on the testset.
+        test_performance, _, _ = self.task.evaluate_test(p.content)
+        logger.info(
+            "Best prompt on test set: %s %s", test_performance, self.task.metric_name
+        )
+        logger.info(
+            "Usage (evolution model / evaluation model / total): %s / %s / %s",
+            self.total_evolution_usage,
+            self.total_evaluation_usage,
+            self.total_evolution_usage + self.total_evaluation_usage,
+        )
+
+        return self.total_evolution_usage, self.total_evaluation_usage
+
+
+class GeneticAlgorithm(EvolutionAlgorithm):
+    """The genetic algorithm (GA) implemented using LLMs."""
+
+    shorthand = "ga"
+
+    # kwargs is just there for convenience, as evolve function of other optimizers might have different inputs
+    # @register_action(ignore_args=["kwargs"])
+    @log_calls("Performing prompt evolution using GA")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        **kwargs,
+    ):
+        # Following the evolutionary operators in GA, a new candidate prompt is generated through
+        # a two-step process based on the selected two parents:
+        # 1) The parent prompts undergo crossover, resulting in a new prompt that
+        #   selectively combines components from both parents;
+        # 2) The newly generated prompt from the first step undergoes mutation,
+        #   in which random alterations are made to some of its content.
+        # Based on this two-step process, we design instructions, guiding LLMs to
+        # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
+
+        filled_prompt = self.get_prompt_template().format(
+            prompt1=prompt_1,
+            prompt2=prompt_2,
+        )
+        evolved_prompt, messages, usage = self.evolution_model.create_completion(
+            system_message=SYSTEM_MESSAGE,
+            prompt=filled_prompt,
+        )
+
+        judgement = self.judge_and_correct_step(filled_prompt, evolved_prompt, messages)
+        evolved_prompt = judgement.corrected_response
+
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+
+        logger.info(
+            "GA-evolved prompts '%s' and '%s' into '%s'",
+            prompt_1,
+            prompt_2,
+            evolved_prompt,
+        )
+
+        return evolved_prompt, [judgement], usage
+
+    @log_calls("Performing update for GA")
+    def update(
+        self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
+    ):
+        # EVOPROMPT iteratively generates new candidate prompts and assesses each prompt
+        # using a development set, denoted as D, to obtain a score that quantifies the
+        # quality of the prompt. We consider a straightforward selection strategy.
+        # Specifically, at each iteration, EVOPROMPT based on GA produces N new prompts,
+        # which are combined with the current population of N prompts.
+        # The updated population is then selected by retaining the N prompts with the highest scores.
+        retained_prompts: list[Prompt] = []
+        min_retained_score = 0
+        for prompt in prompts_current_evolution + new_evolutions:
+            if len(retained_prompts) < self.population_size:
+                retained_prompts.append(prompt)
+                min_retained_score = min(min_retained_score, prompt.score)
+            elif prompt.score > min_retained_score:
+                retained_prompts.sort(key=lambda p: p.score)
+                retained_prompts[0] = prompt
+
+        return retained_prompts
+
+    def get_prompt_template(self):
+        if self.use_evolution_demo:
+            if isinstance(self.task, (TextClassification, Summarization)):
+                return get_demonstration_prompt_template(
+                    GA_PROMPT, GA_DEMONSTRATION_DATA_SIM
+                )
+            elif isinstance(self.task, Simplification):
+                return get_demonstration_prompt_template(
+                    GA_PROMPT, GA_DEMONSTRATION_DATA_CLS
+                )
+            else:
+                raise NotImplementedError(
+                    f"Prompt with demonstration data is not implemented for task of type {type(task)}."
+                )
+        return GA_PROMPT
+
+
+class DifferentialEvolution(EvolutionAlgorithm):
+    """The differential algorithm (DE) implemented using LLMs."""
+
+    shorthand = "de"
+
+    @log_calls("Performing prompt evolution using DE")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        # TODO add description from paper
+
+        # DE needs best prompt for evolution
+        best_prompt_current_evolution = max(
+            prompts_current_evolution, key=lambda prompt: prompt.score
+        )
+
+        filled_prompt = self.get_prompt_template().format(
+            prompt1=prompt_1,
+            prompt2=prompt_2,
+            prompt3=best_prompt_current_evolution,
+            basic_prompt=prompts_current_evolution[current_iteration],
+        )
+        evolved_prompt, messages, usage = self.evolution_model.create_completion(
+            system_message=SYSTEM_MESSAGE,
+            prompt=filled_prompt,
+        )
+
+        judgement = self.judge_and_correct_step(filled_prompt, evolved_prompt, messages)
+        evolved_prompt = judgement.corrected_response
+
+        matches = re.findall(
+            # regex that matches any characters between last pair of <prompt></prompt>, also if </prompt> is missing
+            r"<prompt>(?!.*<prompt>)(?:(.*)</prompt>|(.*))",
+            evolved_prompt,
+            flags=(re.IGNORECASE | re.DOTALL),
+        )
+        if matches and any(matches[0]):
+            # there is always only a single match, and one group should be non-empty
+            if matches[0][0]:
+                evolved_prompt = matches[0][0]
+            else:
+                assert matches[0][1]
+                evolved_prompt = matches[0][1]
+        else:
+            # TODO what to do in this case? Discard generated prompt directly?
+            pass
+
+        logger.info(
+            "DE-evolved prompts '%s', '%s' and '%s' with basic prompt '%s' into '%s'",
+            prompt_1,
+            prompt_2,
+            best_prompt_current_evolution,
+            prompts_current_evolution[current_iteration],
+            evolved_prompt,
+        )
+
+        return evolved_prompt, [judgement], usage
+
+    @log_calls("Performing update for DE")
+    def update(
+        self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
+    ):
+        # for DE we keep the evolved prompt if it is better than the basic prompt, and use the basic prompt otherwise
+        assert len(prompts_current_evolution) == len(new_evolutions)
+        population = [
+            (new_prompt if new_prompt.score > current_prompt.score else current_prompt)
+            for current_prompt, new_prompt in zip(
+                prompts_current_evolution, new_evolutions
+            )
+        ]
+        return population
+
+    def get_prompt_template(self):
+        if self.use_evolution_demo:
+            if isinstance(self.task, (TextClassification, Summarization)):
+                return get_demonstration_prompt_template(
+                    DE_PROMPT, DE_DEMONSTRATION_DATA_SIM
+                )
+            elif isinstance(self.task, Simplification):
+                return get_demonstration_prompt_template(
+                    DE_PROMPT, DE_DEMONSTRATION_DATA_CLS
+                )
+            else:
+                raise NotImplementedError(
+                    f"Prompt with demonstration data is not implemented for task of type {type(self.task)}."
+                )
+        return DE_PROMPT
+
+
+class DifferentialEvolutionWithCot(DifferentialEvolution):
+    """The differential algorithm using Chain-of-Thought (DE-CoT) implemented using LLMs."""
+
+    shorthand = "de-cot"
+
+    @log_calls("Performing prompt evolution using DE-CoT")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        # TODO add description
+
+        # DE needs best prompt for evolution
+        best_prompt_current_evolution = max(
+            prompts_current_evolution, key=lambda prompt: prompt.score
+        )
+
+        messages = None
+        response: str = ""
+        judgements: list[Judgement] = []
+        usage: ModelUsage = ModelUsage()
+        for idx, prompt in enumerate(self.get_prompt_template()):
+            filled_prompt = prompt.format(
+                prompt1=prompt_1,
+                prompt2=prompt_2,
+                prompt3=best_prompt_current_evolution,
+                basic_prompt=prompts_current_evolution[current_iteration],
+            )
+            response, messages, usage = self.evolution_model.create_completion(
+                system_message=SYSTEM_MESSAGE,
+                prompt=filled_prompt,
+                history=messages,
+                stop="</prompt>" if idx == len(DE_COT_PROMPTS) - 1 else None,
+            )
+            logger.debug(
+                "Performed evolution (step %d) using DE-CoT:\n\tInputs: %s\n\tResponse: %s",
+                idx,
+                messages,
+                response,
+            )
+            judgement = self.judge_and_correct_step(
+                filled_prompt, response, history=messages
+            )
+            judgements.append(judgement)
+            # replace last message with corrected response
+            messages[-1]["content"] = judgement.corrected_response
+            response = judgement.corrected_response
+
+        # at this point we should get a new prompt
+        if "<prompt>" in response:
+            response = response.split("<prompt>")[1].split("</prompt>")[0]
+
+        logger.info(
+            "DE-CoT-evolved prompts '%s', '%s' and '%s' with basic prompt '%s' into '%s'",
+            prompt_1,
+            prompt_2,
+            best_prompt_current_evolution,
+            prompts_current_evolution[current_iteration],
+            response,
+        )
+
+        return response, judgements, usage
+
+    def get_prompt_template(self):
+        if self.use_evolution_demo:
+            if isinstance(self.task, (TextClassification, Summarization)):
+                demonstration_data = DE_COT_DEMONSTRATION_DATA_SIM
+            elif isinstance(self.task, Simplification):
+                demonstration_data = DE_COT_DEMONSTRATION_DATA_CLS
+            else:
+                raise NotImplementedError(
+                    f"Prompt with demonstration data is not implemented for task of type {type(self.task)}."
+                )
+
+            for prompt, demonstration_data_item in zip(
+                DE_COT_PROMPTS, demonstration_data
+            ):
+                yield get_demonstration_prompt_template(prompt, demonstration_data_item)
+        else:
+            yield from DE_COT_PROMPTS
diff --git a/evoprompt/evolution/template.py b/evoprompt/evolution/template.py
new file mode 100644
index 0000000..a84dbb3
--- /dev/null
+++ b/evoprompt/evolution/template.py
@@ -0,0 +1,6 @@
+def get_demonstration_prompt_template(prompt_template: str, demonstration_data: dict):
+    prompt_template_with_demo = prompt_template.format(**demonstration_data)
+    prompt_template_with_demo += "\n\n" + demonstration_data["response"]
+    prompt_template_with_demo += "\n\n" + prompt_template
+    prompt_template_with_demo += "\n\n" + demonstration_data["generation_prefix"]
+    return prompt_template_with_demo
diff --git a/evoprompt/evolution/template_de.py b/evoprompt/evolution/template_de.py
new file mode 100644
index 0000000..674f098
--- /dev/null
+++ b/evoprompt/evolution/template_de.py
@@ -0,0 +1,67 @@
+# adopted from https://github.com/beeevita/EvoPrompt/blob/bf43b0dcc63fb79b7c0007d4693b2c0721e9a1a7/data/template_de.py
+
+DE_PROMPT = """
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
+Prompt 3: {prompt3}
+4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: {basic_prompt}
+"""
+
+# NOTE that the phrase "Please follow ..." was removed since this is already part of our system message although model inputs will look slightly different though.
+DE_DEMONSTRATION_DATA_SIM = {
+    "prompt1": "Rewrite the input text into simpler text.",
+    "prompt2": "Rewrite my complex sentence in simpler terms, but keep the meaning.",
+    "prompt3": "Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.",
+    "basic_prompt": "Make the sentence easier for people who do not speak English fluently to comprehend.",
+    "generation_prefix": "1. ",
+    "response": (
+        "1. Identifying the different parts between Prompt 1 and Prompt 2:\n"
+        "Prompt 1: Rewrite the input text into simpler text.\n"
+        "Prompt 2: Rewrite my complex sentence in simpler terms, but keep the meaning.\n"
+        "Different parts:\n"
+        '"input text" vs "my complex sentence"\n'
+        '"simpler text" vs "simpler terms, but keep the meaning"\n\n'
+        "2. Randomly mutate the different parts:\n"
+        '"input text" -> "provided text"\n'
+        '"my complex sentence" -> "the difficult sentence"\n'
+        '"simpler text" -> "easier language"\n'
+        '"simpler terms, but keep the meaning" -> "simpler words while maintaining the meaning"\n\n'
+        "3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:\n"
+        "Prompt 3: Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.\n"
+        "New Prompt: Transform the provided text into easier language while maintaining the meaning, making it accessible for non-native English speakers.\n\n"
+        "4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\n"
+        "Basic Prompt: Make the sentence easier for people who do not speak English fluently to comprehend.\n"
+        "Final Prompt: <prompt>Convert the difficult sentence into simpler words while preserving the meaning, so it's easier for non-native English speakers to understand.</prompt>"
+    ),
+}
+
+DE_DEMONSTRATION_DATA_CLS = {
+    "prompt1": "Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.",
+    "prompt2": "In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.",
+    "prompt3": "Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.",
+    "basic_prompt": "You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.",
+    "generation_prefix": "1. ",
+    "response": (
+        "1. Identifying the different parts between Prompt 1 and Prompt 2:\n"
+        "Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.\n"
+        "Prompt 2: In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.\n"
+        "Different parts:\n"
+        '"classify the comment" vs "classify a sentence"\n'
+        '"Your task is to" vs "In this task, you are given sentences from movie reviews. The task is to"\n\n'
+        "2. Randomly mutate the different parts:\n"
+        '"classify the comment" -> "categorize the statement"\n'
+        '"classify a sentence" -> "evaluate the review"\n'
+        '"Your task is to" -> "Your mission is to"\n'
+        '"In this task, you are given sentences from movie reviews. The task is to" -> "In this assignment, you will receive movie review sentences. Your job is to"\n\n'
+        "3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:\n"
+        "Prompt 3: Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.\n"
+        "New Prompt: In this assignment, you will receive movie review sentences. Your job is to evaluate the review and determine the sentiment, choosing from terrible, bad, okay, good, and great to describe the movie.\n\n"
+        "4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\n"
+        "Basic Prompt: You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.\n"
+        "Final Prompt: <prompt>Your mission is to categorize the statement from a movie review by understanding its meaning and context, and then classify it as one of the following categories: terrible, bad, okay, good, or great.</prompt>"
+    ),
+}
diff --git a/evoprompt/evolution/template_de_cot.py b/evoprompt/evolution/template_de_cot.py
new file mode 100644
index 0000000..437d264
--- /dev/null
+++ b/evoprompt/evolution/template_de_cot.py
@@ -0,0 +1,97 @@
+# adopted from https://github.com/beeevita/EvoPrompt/blob/bf43b0dcc63fb79b7c0007d4693b2c0721e9a1a7/data/template_de.py
+
+DE_COT_PROMPTS = [
+    "Step 1: Identify the main different parts between the Prompt 1 and Prompt 2:\nPrompt 1: {prompt1}\nPrompt 2: {prompt2}",
+    "Step 2: Randomly mutate the different parts",
+    "Step 3: Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.\nPrompt 3: {prompt3}",
+    "Step 4: Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\nBasic Prompt: {basic_prompt}",
+]
+
+# NOTE that the phrase "Please follow ..." was removed since this is already part of our system message although model inputs will look slightly different though.
+DE_COT_DEMONSTRATION_DATA_SIM = [
+    {
+        "prompt1": "Rewrite the input text into simpler text.",
+        "prompt2": "Rewrite my complex sentence in simpler terms, but keep the meaning.",
+        "generation_prefix": "1. ",
+        "response": (
+            "1. Identifying the different parts between Prompt 1 and Prompt 2:\n"
+            "Prompt 1: Rewrite the input text into simpler text.\n"
+            "Prompt 2: Rewrite my complex sentence in simpler terms, but keep the meaning.\n"
+            "Different parts:\n"
+            '"input text" vs "my complex sentence"\n'
+            '"simpler text" vs "simpler terms, but keep the meaning"'
+        ),
+    },
+    {
+        "generation_prefix": "2. ",
+        "response": (
+            "2. Randomly mutate the different parts:\n"
+            '"input text" -> "provided text"\n'
+            '"my complex sentence" -> "the difficult sentence"\n'
+            '"simpler text" -> "easier language"\n'
+            '"simpler terms, but keep the meaning" -> "simpler words while maintaining the meaning"'
+        ),
+    },
+    {
+        "prompt3": "Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.",
+        "generation_prefix": "3. ",
+        "response": (
+            "3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:\n"
+            "Prompt 3: Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.\n"
+            "New Prompt: Transform the provided text into easier language while maintaining the meaning, making it accessible for non-native English speakers."
+        ),
+    },
+    {
+        "basic_prompt": "Make the sentence easier for people who do not speak English fluently to comprehend.",
+        "generation_prefix": "4. ",
+        "response": (
+            "4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\n"
+            "Basic Prompt: Make the sentence easier for people who do not speak English fluently to comprehend.\n"
+            "Final Prompt: <prompt>Convert the difficult sentence into simpler words while preserving the meaning, so it's easier for non-native English speakers to understand.</prompt>"
+        ),
+    },
+]
+
+DE_COT_DEMONSTRATION_DATA_CLS = [
+    {
+        "prompt1": "Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.",
+        "prompt2": "In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.",
+        "generation_prefix": "1. ",
+        "response": (
+            "1. Identifying the different parts between Prompt 1 and Prompt 2:\n"
+            "Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.\n"
+            "Prompt 2: In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.\n"
+            "Different parts:\n"
+            '"classify the comment" vs "classify a sentence"\n'
+            '"Your task is to" vs "In this task, you are given sentences from movie reviews. The task is to"'
+        ),
+    },
+    {
+        "generation_prefix": "2. ",
+        "response": (
+            "2. Randomly mutate the different parts:\n"
+            '"classify the comment" -> "categorize the statement"\n'
+            '"classify a sentence" -> "evaluate the review"\n'
+            '"Your task is to" -> "Your mission is to"\n'
+            '"In this task, you are given sentences from movie reviews. The task is to" -> "In this assignment, you will receive movie review sentences. Your job is to"'
+        ),
+    },
+    {
+        "prompt3": "Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.",
+        "generation_prefix": "3. ",
+        "response": (
+            "3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:\n"
+            "Prompt 3: Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.\n"
+            "New Prompt: In this assignment, you will receive movie review sentences. Your job is to evaluate the review and determine the sentiment, choosing from terrible, bad, okay, good, and great to describe the movie."
+        ),
+    },
+    {
+        "basic_prompt": "You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.",
+        "generation_prefix": "4. ",
+        "response": (
+            "4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\n"
+            "Basic Prompt: You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.\n"
+            "Final Prompt: <prompt>Your mission is to categorize the statement from a movie review by understanding its meaning and context, and then classify it as one of the following categories: terrible, bad, okay, good, or great.</prompt>"
+        ),
+    },
+]
diff --git a/evoprompt/evolution/template_ga.py b/evoprompt/evolution/template_ga.py
new file mode 100644
index 0000000..59c478d
--- /dev/null
+++ b/evoprompt/evolution/template_ga.py
@@ -0,0 +1,29 @@
+# adopted from https://github.com/beeevita/EvoPrompt/blob/bf43b0dcc63fb79b7c0007d4693b2c0721e9a1a7/data/template_ga.py
+
+GA_PROMPT = """
+1. Cross over the following prompts and generate a new prompt:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Mutate the prompt generated in Step 1 and generate a final prompt bracketed with <prompt> and </prompt>.
+"""
+
+# NOTE that the phrase "Please follow ..." was removed since this is already part of our system message although model inputs will look slightly different though.
+GA_DEMONSTRATION_DATA_SIM = {
+    "prompt1": "Rewrite the input text into simpler text.",
+    "prompt2": "Rewrite my complex sentence in simpler terms, but keep the meaning.",
+    "generation_prefix": "1. ",
+    "response": (
+        "1. Crossover Prompt: Rewrite the complex text into simpler text while keeping its meaning.\n"
+        "2. <prompt>Transform the provided text into simpler language, maintaining its essence.</prompt>"
+    ),
+}
+
+GA_DEMONSTRATION_DATA_CLS = {
+    "prompt1": "Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.",
+    "prompt2": "In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.",
+    "generation_prefix": "1. ",
+    "response": (
+        "1. Crossover Prompt: In this task, you are given comments from movie reviews. Your task is to classify each comment as one of the following categories: terrible, bad, okay, good, great."
+        "2. <prompt>Given a sentence from a movie review, classify it into one of the following categories: terrible, bad, okay, good, or great.</prompt>"
+    ),
+}
diff --git a/evoprompt/template_de.py b/evoprompt/template_de.py
deleted file mode 100644
index 1a8cb0e..0000000
--- a/evoprompt/template_de.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# adopted from https://github.com/beeevita/EvoPrompt/blob/bf43b0dcc63fb79b7c0007d4693b2c0721e9a1a7/data/template_de.py
-from evoprompt.task import Task, TextClassification, Summarization, Simplification
-
-
-DE_PROMPT = """
-1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
-Prompt 3: {prompt3}
-4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: {basic_prompt}
-"""
-
-# NOTE that the phrase "Please follow ..." was removed since this is already part of our system message although model inputs will look slightly different though.
-DE_PROMPT_WITH_DEMONSTRATION_SIM = """1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: Rewrite the input text into simpler text.
-Prompt 2: Rewrite my complex sentence in simpler terms, but keep the meaning.
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt.
-Prompt 3: Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.
-4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: Make the sentence easier for people who do not speak English fluently to comprehend.
-
-1. Identifying the different parts between Prompt 1 and Prompt 2:
-Prompt 1: Rewrite the input text into simpler text.
-Prompt 2: Rewrite my complex sentence in simpler terms, but keep the meaning.
-Different parts:
-"input text" vs "my complex sentence"
-"simpler text" vs "simpler terms, but keep the meaning"
-
-2. Randomly mutate the different parts:
-"input text" -> "provided text"
-"my complex sentence" -> "the difficult sentence"
-"simpler text" -> "easier language"
-"simpler terms, but keep the meaning" -> "simpler words while maintaining the meaning"
-
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:
-Prompt 3: Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.
-New Prompt: Transform the provided text into easier language while maintaining the meaning, making it accessible for non-native English speakers.
-
-4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: Make the sentence easier for people who do not speak English fluently to comprehend.
-Final Prompt: <prompt>Convert the difficult sentence into simpler words while preserving the meaning, so it's easier for non-native English speakers to understand.</prompt>
-
-
-1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step2 and generate a new prompt.
-Prompt 3: {prompt3}
-4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: {basic_prompt}
-
-1. """
-
-DE_PROMPT_WITH_DEMONSTRATION_CLS = """1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
-Prompt 2: In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt.
-Prompt 3: Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.
-4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.
-
-1. Identifying the different parts between Prompt 1 and Prompt 2:
-Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
-Prompt 2: In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.
-Different parts:
-"classify the comment" vs "classify a sentence"
-"Your task is to" vs "In this task, you are given sentences from movie reviews. The task is to"
-
-2. Randomly mutate the different parts:
-"classify the comment" -> "categorize the statement"
-"classify a sentence" -> "evaluate the review"
-"Your task is to" -> "Your mission is to"
-"In this task, you are given sentences from movie reviews. The task is to" -> "In this assignment, you will receive movie review sentences. Your job is to"
-
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:
-Prompt 3: Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.
-New Prompt: In this assignment, you will receive movie review sentences. Your job is to evaluate the review and determine the sentiment, choosing from terrible, bad, okay, good, and great to describe the movie.
-
-4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.
-Final Prompt: <prompt>Your mission is to categorize the statement from a movie review by understanding its meaning and context, and then classify it as one of the following categories: terrible, bad, okay, good, or great.</prompt>
-
-1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step2 and generate a new prompt.
-Prompt 3: {prompt3}
-4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: {basic_prompt}
-
-1. """
-
-
-def get_de_prompt_template(use_demonstration_example: bool, task: None | Task = None):
-    if use_demonstration_example:
-        assert (
-            task is not None
-        ), "Task cannot be None if demonstation data should be used."
-
-        if isinstance(task, (TextClassification, Summarization)):
-            return DE_PROMPT_WITH_DEMONSTRATION_SIM
-        elif isinstance(task, Simplification):
-            return DE_PROMPT_WITH_DEMONSTRATION_CLS
-        else:
-            raise NotImplementedError(
-                f"Prompt with demonstration data is not implemented for task of type {type(task)}."
-            )
-    return DE_PROMPT
-- 
GitLab