diff --git a/evolution.py b/evolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d0e5958fcabdaa9e032ae7c06d42226b2a04fd
--- /dev/null
+++ b/evolution.py
@@ -0,0 +1,258 @@
+from abc import abstractmethod
+
+from models import LLMModel
+from numpy.random import choice
+from opt_types import Prompt
+from optimization import PromptOptimization
+from task import Task
+from tqdm import trange
+from utils import initialize_run_directory, log_calls, logger, save_snapshot
+
+SYSTEM_MESSAGE = (
+    "Please follow the instruction step-by-step to generate a better prompt."
+)
+
+GA_PROMPT = """
+1. Cross over the following prompts and generate a new prompt:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Mutate the prompt generated in Step 1 and generate a final prompt bracketed with <prompt> and </prompt>.
+"""
+
+
+DE_PROMPT = """
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
+Prompt 3: {prompt3}
+4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: {basic_prompt}
+"""
+
+
+class EvolutionAlgorithm(PromptOptimization):
+    """Base class for LLM-driven evolutionary prompt optimizers, holding the parameters shared by GA and DE."""
+
+    def __init__(
+        self,
+        population_size: int,
+        *,
+        task: Task,
+        evolution_model: LLMModel,
+        evaluation_model: LLMModel,
+    ) -> None:
+        super().__init__(
+            task=task,
+            evolution_model=evolution_model,
+            evaluation_model=evaluation_model,
+        )
+
+        self.population_size = population_size
+
+    @log_calls("Performing selection")
+    def select(self, prompts: list[Prompt]):
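+        """Select two parent prompts via roulette-wheel (fitness-proportionate) selection over their scores."""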
+        # In GA, two parent solutions are normally selected based on the roulette wheel
+        # selection method according to the fitness value (Lipowski & Lipowska, 2012).
+        # Similar to this, we utilize the roulette wheel selection method to select
+        # two parent prompts in the current population according to the scores evaluated
+        # on development sets. Specifically, let si denote the performance score on the
+        # development set of the i-th prompt in the population, which contains a total
+        # of N prompts. The probability of selecting the i-th prompt as a parent can be expressed as
+        # pi = si / Σj=1->N sj.
+        scores = [prompt.score for prompt in prompts]
+        if sum(scores) == 0:
+            # sum of scores is 0 ==> each score is 0, draw with equal probability
+            selection_probabilities = len(scores) * [1 / len(scores)]
+        else:
+            selection_probabilities = [score / sum(scores) for score in scores]
+        return choice(prompts, size=2, replace=False, p=selection_probabilities)
+
+    @abstractmethod
+    def evolve(
+        self,
+        prompt_1: Prompt,
+        prompt_2: Prompt,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
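+        """Generate a new prompt from the selected parent prompts (Evo(·) in Algorithm 1)."""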
+        pass
+
+    @abstractmethod
+    def update(self, *args, **kwargs):
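+        """Build the next population from the current prompts and the newly evolved prompts."""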
+        pass
+
+    def run(
+        self, num_iterations: int, add_snapshot_dict: dict, debug: bool = False
+    ) -> None:
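+        """Run the evolutionary optimization loop (Algorithm 1, EVOPROMPT) for num_iterations generations."""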
+        # debug mode for quick run
+        if debug:
+            self.population_size = 3
+            num_iterations = 2
+
+        run_directory = initialize_run_directory(self.evolution_model)
+
+        initial_prompts, _ = self.init_run(self.population_size)
+
+        # Algorithm 1 Discrete prompt optimization: EVOPROMPT
+
+        # P keeps track of prompts in each generation
+        P = [initial_prompts]
+
+        # Line 2:
+        for t in trange(1, num_iterations + 1, desc="iterations", leave=True):
+            # Line 3: Selection: select a certain number of prompts from current population as parent prompts
+            # pr1,...,prk ∼ Pt−1
+            prompts_current_evolution = P[t - 1]
+
+            new_evolutions = []
+
+            for i in trange(self.population_size, desc="updates", leave=False):
+                # for both GA and DE we start with two parent prompts
+                pr1, pr2 = self.select(P[t - 1])
+
+                # Line 4: Evolution: generate a new prompt based on the selected parent prompts by leveraging LLM to perform evolutionary operators
+                # p′i ←Evo(pr1,...,prk)
+                p_i, _ = self.evolve(
+                    pr1,
+                    pr2,
+                    prompts_current_evolution=prompts_current_evolution,
+                    current_iteration=i,
+                )
+
+                evolved_prompt = self.add_prompt(p_i, (pr1, pr2), {"gen": t})
+
+                new_evolutions.append(evolved_prompt)
+            # Line 6: Update based on the evaluation scores
+            # Pt ← {Pt−1, p′i} and St ← {St−1, s′i}
+            new_population = self.update(prompts_current_evolution, new_evolutions)
+
+            # store new generation
+            P.append(new_population)
+
+        # TODO move to super class
+        save_snapshot(
+            run_directory,
+            self.all_prompts,
+            self.family_tree,
+            [[prompt.id for prompt in population] for population in P],
+            num_iterations,
+            self.population_size,
+            self.task,
+            self.evolution_model,
+            # model usage for evaluating prompts
+            self.evaluation_model.usage,
+            # model usage for evolution of prompts
+            self.evolution_model.usage,
+            add_snapshot_dict,
+        )
+        # Line 8: Return the best prompt, p∗, among the final population PT :
+        # p∗ ← argmaxp∈PT f(p, D)
+        p = max(P[-1], key=lambda prompt: self.all_prompts[prompt.id].score)
+        logger.info(f"Best prompt: {p}")
+
+        # We pick the prompt with the highest score on the development set and report its score on the test set.
+        test_performance = self.task.evaluate_test(p.content)
+        logger.info(f"Best prompt on test set: {test_performance}")
+
+
+class GeneticAlgorithm(EvolutionAlgorithm):
+    """The genetic algorithm implemented using LLMs."""
+
+    @log_calls("Performing prompt evolution using GA")
+    def evolve(
+        self,
+        prompt_1: Prompt,
+        prompt_2: Prompt,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
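+        """Generate a new prompt by crossing over the two parents and mutating the result via the LLM."""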
+        # Following the evolutionary operators in GA, a new candidate prompt is generated through
+        # a two-step process based on the selected two parents:
+        # 1) The parent prompts undergo crossover, resulting in a new prompt that
+        #   selectively combines components from both parents;
+        # 2) The newly generated prompt from the first step undergoes mutation,
+        #   in which random alterations are made to some of its content.
+        # Based on this two-step process, we design instructions, guiding LLMs to
+        # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
+
+        evolved_prompt, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=GA_PROMPT.format(prompt1=prompt_1, prompt2=prompt_2),
+        )
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+        return evolved_prompt, usage
+
+    @log_calls("Performing update for GA")
+    def update(
+        self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
+    ):
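+        """Retain the population_size highest-scoring prompts from the old population and the new evolutions."""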
+        # EVOPROMPT iteratively generates new candidate prompts and assesses each prompt
+        # using a development set, denoted as D, to obtain a score that quantifies the
+        # quality of the prompt. We consider a straightforward selection strategy.
+        # Specifically, at each iteration, EVOPROMPT based on GA produces N new prompts,
+        # which are combined with the current population of N prompts.
+        # The updated population is then selected by retaining the N prompts with the highest scores.
+        # retain the population_size highest-scoring prompts from the combined pool
+        retained_prompts: list[Prompt] = sorted(
+            prompts_current_evolution + new_evolutions,
+            key=lambda prompt: prompt.score,
+            reverse=True,
+        )[: self.population_size]
+
+        return retained_prompts
+
+
+class DifferentialEvolution(EvolutionAlgorithm):
+    """The genetic algorithm implemented using LLMs."""
+
+    @log_calls("Performing prompt evolution using GA")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
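+        """Generate a new prompt using the DE-style operators encoded in DE_PROMPT."""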
+        # Following the DE-style operators encoded in DE_PROMPT, the LLM is instructed to:
+        # 1) identify the parts in which the two parent prompts differ,
+        # 2) randomly mutate these differing parts,
+        # 3) combine the mutated parts with the best prompt of the current population (Prompt 3), and
+        # 4) cross the result over with the basic prompt at the current position to obtain the final prompt.
+
+        # DE needs best prompt for evolution
+        best_prompt_current_evolution = max(
+            prompts_current_evolution, key=lambda prompt: prompt.score
+        )
+
+        evolved_prompt, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_PROMPT.format(
+                prompt1=prompt_1,
+                prompt2=prompt_2,
+                prompt3=best_prompt_current_evolution,
+                basic_prompt=prompts_current_evolution[current_iteration],
+            ),
+        )
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+        return evolved_prompt, usage
+
+    @log_calls("Performing update for GA")
+    def update(
+        self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
+    ):
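+        """Replace each basic prompt with its evolved counterpart only if the evolved prompt scores higher."""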
+        # for DE we keep the evolved prompt if it is better than the basic prompt, and use the basic prompt otherwise
+        assert len(prompts_current_evolution) == len(new_evolutions)
+        population = [
+            (new_prompt if new_prompt.score > current_prompt.score else current_prompt)
+            for current_prompt, new_prompt in zip(
+                prompts_current_evolution, new_evolutions
+            )
+        ]
+        return population
diff --git a/main.py b/main.py
index 5524016966d5ba565f06e06131024660e2e0b0b3..2095276a34e462f8f4489e69a9d0983247964081 100644
--- a/main.py
+++ b/main.py
@@ -1,16 +1,14 @@
 import os
-from functools import lru_cache
 from typing import Any
 
-from dotenv import load_dotenv
-from numpy.random import choice
-from tqdm import trange
-
 from cli import argument_parser
-from evo_types import ModelUsage, Prompt
+from dotenv import load_dotenv
+from evolution import DifferentialEvolution, GeneticAlgorithm
 from models import Llama2, OpenAI
 from task import QuestionAnswering, SentimentAnalysis
-from utils import initialize_run_directory, log_calls, logger, save_snapshot
+from utils import logger
+
+load_dotenv()
 
 
 def conv2bool(_str: Any):
@@ -23,265 +21,6 @@ def conv2bool(_str: Any):
     return None
 
 
-load_dotenv()
-
-PARAPHRASE_PROMPT = """You are given an instruction that describes a task. Write a response that paraphrases the instruction. Only output the paraphrased instruction bracketed in <prompt> and </prompt>."""
-
-
-@log_calls("Paraphrasing prompts")
-def paraphrase_prompts(prompt: str, n: int):
-    total_usage = ModelUsage()
-    paraphrases = []
-    for _ in range(n):
-        paraphrase, usage = evolution_model(
-            system_message=PARAPHRASE_PROMPT,
-            prompt=prompt,
-            prompt_prefix=' Instruction: "',
-            prompt_suffix='"',
-        )
-        total_usage += usage
-        if "<prompt>" in paraphrase:
-            paraphrase = paraphrase.split("<prompt>")[1].split("</prompt>")[0]
-        paraphrases.append(paraphrase)
-    return paraphrases, usage
-
-
-@log_calls("Performing selection")
-def selection(prompts):
-    # In GA, two parent solutions are normally selected based on the roulette wheel
-    # selection method according to the fitness value (Lipowski & Lipowska, 2012).
-    # Similar to this, we utilize the roulette wheel selection method to select
-    # two parent prompts in the current population according to the scores evaluated
-    # on development sets. Specifically, let si denote the performance score on the
-    # development set of the i-th prompt in the population, which contains a total
-    # of N prompts. The probability of selecting the i-th prompt as a parent can be expressed as
-    # pi = si / Σj=1->N sj.
-    scores = [prompt.score for prompt in prompts]
-    if sum(scores) == 0:
-        # sum of scores is 0 ==> each score is 0, draw with equal probability
-        selection_probabilities = len(scores) * [1 / len(scores)]
-    else:
-        selection_probabilities = [score / sum(scores) for score in scores]
-    return choice(prompts, size=2, replace=False, p=selection_probabilities)
-
-
-SYSTEM_MESSAGE = (
-    "Please follow the instruction step-by-step to generate a better prompt."
-)
-
-GA_PROMPT = """
-1. Cross over the following prompts and generate a new prompt:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Mutate the prompt generated in Step 1 and generate a final prompt bracketed with <prompt> and </prompt>.
-"""
-
-
-DE_PROMPT = """
-1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
-Prompt 3: {prompt3}
-4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: {basic_prompt}
-"""
-
-
-@log_calls("Performing prompt evolution using GA")
-def evolution_ga(prompt1: str, prompt2: str):
-    # Following the evolutionary operators in GA, a new candidate prompt is generated through
-    # a two-step process based on the selected two parents:
-    # 1) The parent prompts undergo crossover, resulting in a new prompt that
-    #   selectively combines components from both parents;
-    # 2) The newly generated prompt from the first step undergoes mutation,
-    #   in which random alterations are made to some of its content.
-    # Based on this two-step process, we design instructions, guiding LLMs to
-    # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
-    evolved_prompt, usage = evolution_model(
-        system_message=SYSTEM_MESSAGE,
-        prompt=GA_PROMPT.format(prompt1=prompt1, prompt2=prompt2),
-    )
-    if "<prompt>" in evolved_prompt:
-        evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
-    return evolved_prompt, usage
-
-
-@log_calls("Performing prompt evolution using DE")
-def evolution_de(prompt1: str, prompt2: str, basic_prompt: str, best_prompt: str):
-    # TODO add comment from paper
-    evolved_prompt, usage = evolution_model(
-        system_message=SYSTEM_MESSAGE,
-        prompt=DE_PROMPT.format(
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prompt3=best_prompt,
-            basic_prompt=basic_prompt,
-        ),
-    )
-    if "<prompt>" in evolved_prompt:
-        evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
-    return evolved_prompt, usage
-
-
-@log_calls("Updating prompts")
-def update(prompts: list[str], N: int):
-    # EVOPROMPT iteratively generates new candidate prompts and assesses each prompt
-    # using a development set, denoted as D, to obtain a score that quantifies the
-    # quality of the prompt. We consider a straightforward selection strategy.
-    # Specifically, at each iteration, EVOPROMPT based on GA produces N new prompts,
-    # which are combined with the current population of N prompts.
-    # The updated population is then selected by retaining the N prompts with the highest scores.
-    retained_prompts: list[Prompt] = []
-    min_retained_score = 0
-    for prompt in prompts:
-        if len(retained_prompts) < N:
-            retained_prompts.append(prompt)
-            min_retained_score = min(min_retained_score, prompt.score)
-        elif prompt.score > min_retained_score:
-            retained_prompts.sort(key=lambda p: p.score)
-            retained_prompts[0] = prompt
-
-    return retained_prompts
-
-
-def run_episode(evo_alg_str: str, debug: bool = False):
-    # model usage for evolution of prompts
-    evolution_usage = ModelUsage()
-    # model usage for evaluating prompts
-    evaluation_usage = ModelUsage()
-    # Algorithm 1 Discrete prompt optimization: EVOPROMPT
-
-    # Require:
-    # - Size of population
-    N = 3 if debug else 10
-    # - Initial prompts P0 = {p1, p2, . . . , pN }
-    paraphrases, usage = paraphrase_prompts(task.base_prompt, n=N - 1)
-    evolution_usage += usage
-    # the initial population
-    initial_population = [task.base_prompt] + paraphrases
-
-    # - fD(·) denotes the score of a prompt on the desired LLM evaluated on D
-    f_D = lru_cache(maxsize=None)(task.evaluate_validation)
-
-    # - a pre-defined number of iterations T
-    T = 2 if debug else 10
-
-    # - carefully designed evolutionary operators to generate a new prompt Evo(·)
-
-    # Line 1: Initial evaluation scores: S0 ← {si = fD (pi )|i ∈ [1, N ]}
-    # the current population's scores
-    initial_population_scores: list[float] = [f_D(p) for p in initial_population]
-
-    # all_prompts contains a list of Prompt objects that took part in this run at some time
-    # converting prompts to Prompt object
-    all_prompts: dict[str, Prompt] = {
-        prompt.id: prompt
-        for prompt in [
-            Prompt(p, score=score, gen=0, usage=usage)
-            for (p, (score, usage)) in zip(
-                initial_population, initial_population_scores
-            )
-        ]
-    }
-
-    # P keeps track of prompts in each generation
-    P = [[prompt_id for prompt_id in all_prompts.keys()]]
-
-    # add initial prompts to family tree
-    # None marks that there is no parent
-    family_tree: dict[str, tuple[str, str] | None] = {
-        prompt_id: None for prompt_id in P[0]
-    }
-
-    # Line 2:
-    for t in trange(1, T + 1, desc="T", leave=True):
-        # Line 3: Selection: select a certain number of prompts from current population as parent prompts
-        # pr1,...,prk ∼ Pt−1
-        prompts_current_evolution = [all_prompts[prompt_id] for prompt_id in P[t - 1]]
-        if evo_alg_str == "de":
-            # DE needs best prompt for evolution
-            best_prompt_current_evolution = max(
-                prompts_current_evolution, key=lambda prompt: prompt.score
-            )
-
-        new_evolutions = []
-
-        for i in trange(N, desc="N", leave=False):
-            # for both GA and DE we start with two parent prompts
-            pr1, pr2 = selection([all_prompts[prompt_id] for prompt_id in P[t - 1]])
-
-            # Line 4: Evolution: generate a new prompt based on the selected parent prompts by leveraging LLM to perform evolutionary operators
-            # p′i ←Evo(pr1,...,prk)
-            if evo_alg_str == "ga":
-                p_i, usage = evolution_ga(pr1, pr2)
-            elif evo_alg_str == "de":
-                p_i, usage = evolution_de(
-                    pr1,
-                    pr2,
-                    prompts_current_evolution[i],
-                    best_prompt_current_evolution,
-                )
-            evolution_usage += usage
-
-            # Line 5: Evaluation
-            # s′_i ← f(p′i,D)
-            s_i, usage = f_D(p_i)
-            evaluation_usage += usage
-
-            evolved_prompt = Prompt(content=p_i, score=s_i, gen=t, usage=usage)
-
-            # keep track of genealogy
-            family_tree[evolved_prompt.id] = (pr1.id, pr2.id)
-
-            new_evolutions.append(evolved_prompt)
-        all_prompts |= {prompt.id: prompt for prompt in new_evolutions}
-        # Line 6: Update based on the evaluation scores
-        # Pt ← {Pt−1, p′i} and St ← {St−1, s′i}
-        if evo_alg_str == "ga":
-            # GA keeps N best prompts from current population and evolutions
-            population = update(new_evolutions + prompts_current_evolution, N)
-        elif evo_alg_str == "de":
-            # for DE we keep the evolved prompt if it is better than the basic prompt, and use the basic prompt otherwise
-            assert len(prompts_current_evolution) == len(new_evolutions)
-            population = [
-                (
-                    new_prompt
-                    if new_prompt.score > current_prompt.score
-                    else current_prompt
-                )
-                for current_prompt, new_prompt in zip(
-                    prompts_current_evolution, new_evolutions
-                )
-            ]
-
-        # store new generation
-        P.append([prompt.id for prompt in population])
-
-    save_snapshot(
-        run_directory,
-        all_prompts,
-        family_tree,
-        P,
-        T,
-        N,
-        task,
-        evolution_model,
-        evaluation_usage,
-        evolution_usage,
-        options.__dict__,
-    )
-    # Line 8: Return the best prompt, p∗, among the final population PT :
-    # p∗ ← argmaxp∈PT f(p, D)
-    p = all_prompts[max(P[-1], key=lambda prompt_id: all_prompts[prompt_id].score)]
-    logger.info(f"Best prompt: {p}")
-
-    # We pick the prompt with the highest score on the development set and report its score on the testset.
-    test_performance = task.evaluate_test(p.content)
-    logger.info(f"Best prompt on test set: {test_performance}")
-
-
 if __name__ == "__main__":
     options = argument_parser.parse_args()
 
@@ -309,8 +48,6 @@ if __name__ == "__main__":
                 chat=options.chat,
             )
 
-    run_directory = initialize_run_directory(evolution_model)
-
     # log cli arguments
     logger.info(
         "CLI arguments:\n\tPositional:%s\n\tKeyword:\n\t\t%s",
@@ -355,4 +92,16 @@ if __name__ == "__main__":
 
     logger.info("Using evolutionary algorithm '%s'", options.evolution_algorithm)
 
-    run_episode(evo_alg_str=options.evolution_algorithm, debug=debug)
+    # TODO allow to register algorithms and map to classes
+    if options.evolution_algorithm == "ga":
+        optimizer_class = GeneticAlgorithm
+    else:
+        optimizer_class = DifferentialEvolution
+
+    optimizer = optimizer_class(
+        population_size=10,
+        task=task,
+        evolution_model=evolution_model,
+        evaluation_model=evaluation_model,
+    )
+    optimizer.run(10, debug=debug, add_snapshot_dict=options.__dict__)
diff --git a/models.py b/models.py
index eaa8f208c27837df9e1dcf544d5f505001c37997..486801546244e0c28ce8de309a91a8f6ff866125 100644
--- a/models.py
+++ b/models.py
@@ -4,8 +4,7 @@ from typing import Any
 
 import openai
 from llama_cpp import Llama
-
-from evo_types import ModelUsage
+from opt_types import ModelUsage
 
 current_directory = Path(__file__).resolve().parent
 
diff --git a/evo_types.py b/opt_types.py
similarity index 84%
rename from evo_types.py
rename to opt_types.py
index 55b6a71ce043549ca528cdf65451515e87d5d13c..3d46de80719d2ff86025b92880b51e04044b113f 100644
--- a/evo_types.py
+++ b/opt_types.py
@@ -2,8 +2,6 @@ import json
 from dataclasses import dataclass, field, is_dataclass
 from uuid import uuid4
 
-from llama_cpp.llama_types import CompletionUsage
-
 
 @dataclass(frozen=True)
 class ModelUsage:
@@ -31,24 +29,20 @@ class ModelUsage:
 class Prompt:
     content: str
     score: float
-    gen: int
     usage: ModelUsage
-    id: str = field(default_factory=lambda: uuid4().hex)
     meta: dict = field(default_factory=dict)
+    id: str = field(default_factory=lambda: uuid4().hex)
 
     def __str__(self) -> str:
         return self.content
 
     def __hash__(self) -> int:
         return (
-            hash(self.content)
-            + hash(self.score)
-            + hash(self.gen)
-            + hash(frozenset(self.meta.items()))
+            hash(self.content) + hash(self.score) + hash(frozenset(self.meta.items()))
         )
 
 
-class EvoTypeEncoder(json.JSONEncoder):
+class OptTypeEncoder(json.JSONEncoder):
     def default(self, obj):
         if is_dataclass(obj):
             return obj.__dict__
diff --git a/optimization.py b/optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d707133b61a6fd4cbe8f4e9df72b476f899a56a
--- /dev/null
+++ b/optimization.py
@@ -0,0 +1,108 @@
+from itertools import zip_longest
+
+from models import LLMModel
+from opt_types import ModelUsage, Prompt
+from task import Task
+from utils import log_calls
+
+PARAPHRASE_PROMPT = """You are given an instruction that describes a task. Write a response that paraphrases the instruction. Only output the paraphrased instruction bracketed in <prompt> and </prompt>."""
+
+
+@log_calls("Paraphrasing prompts")
+def paraphrase_prompts(
+    model: LLMModel,
+    prompt: str,
+    n: int,
+    unique_prompts: bool = False,
+    num_tries: int = 10,
+    return_only_unique_prompts: bool = False,
+):
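+    """Generate n paraphrases of the given prompt with the provided model."""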
+    # TODO implement unique paraphrases
+    total_usage = ModelUsage()
+    paraphrases = []
+    for _ in range(n):
+        paraphrase, usage = model(
+            system_message=PARAPHRASE_PROMPT,
+            prompt=prompt,
+            prompt_prefix=' Instruction: "',
+            prompt_suffix='"',
+        )
+        total_usage += usage
+        if "<prompt>" in paraphrase:
+            paraphrase = paraphrase.split("<prompt>")[1].split("</prompt>")[0]
+        paraphrases.append(paraphrase)
+    if return_only_unique_prompts:
+        paraphrases = list(set(paraphrases))
+    return paraphrases, total_usage
+
+
+class PromptOptimization:
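+    """Base class for prompt optimizers; keeps track of the task, the models, and all generated prompts."""
+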
+    def __init__(
+        self, *, task: Task, evolution_model: LLMModel, evaluation_model: LLMModel
+    ) -> None:
+        self.task = task
+        self.evolution_model = evolution_model
+        self.evaluation_model = evaluation_model
+        self._init()
+
+    def _init(self):
+        # family_tree maps each prompt id to the ids of its parent prompts (None for initial prompts)
+        self.family_tree: dict[str, tuple[str, ...] | None] = {}
+        # all_prompts maps prompt ids to every Prompt object that took part in the optimization
+        self.all_prompts: dict[str, Prompt] = {}
+
+    def reset(self):
+        self._init()
+
+    def evaluate_prompt(self, prompt: str):
+        return self.task.evaluate_validation(prompt)
+
+    def add_prompt(
+        self, prompt: str, parents: tuple[Prompt, ...] | None = None, meta: dict | None = None
+    ) -> Prompt:
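+        """Evaluate a prompt on the validation set and register it, together with its parents, in the family tree."""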
+        score, usage = self.evaluate_prompt(prompt)
+        prompt = Prompt(content=prompt, score=score, meta=meta or {}, usage=usage)
+
+        # keep track of prompt
+        self.all_prompts[prompt.id] = prompt
+        self.family_tree[prompt.id] = (
+            tuple(p.id for p in parents) if parents is not None else None
+        )
+
+        return prompt
+
+    def add_prompts(
+        self,
+        prompts: list[str],
+        parents: list[tuple[Prompt]] = (),
+        metas: list[dict] = (),
+    ):
+        return [
+            self.add_prompt(prompt, _parents, meta)
+            for prompt, _parents, meta in zip_longest(prompts, parents, metas)
+        ]
+
+    def get_prompt(self, prompt_id: str):
+        return self.all_prompts[prompt_id]
+
+    def get_prompts(self, prompt_ids: list[str]):
+        return [self.get_prompt(p_id) for p_id in prompt_ids]
+
+    def init_run(self, num_initial_prompts: int) -> tuple[list[Prompt], ModelUsage]:
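+        """Create the initial population from the task's base prompt plus paraphrases of it."""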
+        # - Initial prompts P0 = {p1, p2, . . . , pN }
+        paraphrases, usage = paraphrase_prompts(
+            self.evolution_model, self.task.base_prompt, n=num_initial_prompts - 1
+        )
+
+        # the initial prompts
+        initial_prompts = [self.task.base_prompt] + paraphrases
+        initial_prompts = self.add_prompts(
+            initial_prompts, metas=[{"gen": 0} for _ in initial_prompts]
+        )
+
+        # accumulate usage
+        for prompt in initial_prompts:
+            usage += prompt.usage
+
+        return initial_prompts, usage
diff --git a/task.py b/task.py
index 08948ba9df927065dcc118da393d56578049a3d2..3c2dc0eecc2d065a6430c1290f1033a925c7206b 100644
--- a/task.py
+++ b/task.py
@@ -7,9 +7,8 @@ from typing import DefaultDict, Mapping, Union
 from datasets import Dataset, load_dataset
 from evaluate import load as load_metric
 from llama_cpp import LlamaGrammar
-from tqdm import tqdm
-
 from models import Llama2, OpenAI
+from tqdm import tqdm
 from utils import ModelUsage, log_calls, logger
 
 SYSTEM_MESSAGE = """
@@ -47,6 +46,7 @@ class Task:
         pass
 
     @log_calls("Evaluating validation dataset")
+    @lru_cache(maxsize=None)
     def evaluate_validation(self, prompt: str):
         return self._evaluate(prompt, self.validation_dataset)
 
@@ -262,9 +262,15 @@ class QuestionAnswering(Task):
                 "\u2013": "-",
                 "\u2014": "-",
             }
-            symbol_replacement_mapping = dict((re.escape(k), v) for k, v in symbol_replacement_mapping.items()) 
-            symbol_replacement_pattern = re.compile("|".join(symbol_replacement_mapping.keys()))
-            replace_fn = lambda text: symbol_replacement_pattern.sub(lambda m: symbol_replacement_mapping[re.escape(m.group(0))], text)
+            symbol_replacement_mapping = dict(
+                (re.escape(k), v) for k, v in symbol_replacement_mapping.items()
+            )
+            symbol_replacement_pattern = re.compile(
+                "|".join(symbol_replacement_mapping.keys())
+            )
+            replace_fn = lambda text: symbol_replacement_pattern.sub(
+                lambda m: symbol_replacement_mapping[re.escape(m.group(0))], text
+            )
             sample["context"] = replace_fn(sample["context"])
             sample["answers"]["text"] = [
                 replace_fn(text) for text in sample["answers"]["text"]
diff --git a/utils.py b/utils.py
index 167451ac702d54b3423e953206819f27fa39f0c2..3f4781de4f347fda537b1ceae42b5f7287405a59 100644
--- a/utils.py
+++ b/utils.py
@@ -10,8 +10,8 @@ from textwrap import dedent, indent
 from typing import Any, Callable
 from uuid import uuid4
 
-from evo_types import EvoTypeEncoder, ModelUsage, Prompt
 from models import Llama2, OpenAI
+from opt_types import ModelUsage, OptTypeEncoder, Prompt
 
 current_directory = Path(__file__).resolve().parent
 logger = logging.getLogger("test-classifier")
@@ -35,6 +35,7 @@ Only return the name without any text before or after.""".strip()
 
 RUNS_DIR = current_directory / "runs"
 
+
 def initialize_run_directory(model: OpenAI | Llama2):
     response, usage = model(None, run_name_prompt, chat=True)
     model.usage -= usage
@@ -163,7 +164,7 @@ def save_snapshot(
             },
             f,
             indent=4,
-            cls=EvoTypeEncoder,
+            cls=OptTypeEncoder,
         )