diff --git a/evoprompt/evolution.py b/evoprompt/evolution.py
index 492947684ed4c07de34a38f41bca9b00ae4675b0..a187a6724f6c04ac946bce69667a50a6c34e504e 100644
--- a/evoprompt/evolution.py
+++ b/evoprompt/evolution.py
@@ -16,9 +16,7 @@ from evoprompt.utils import get_all_subclasses, log_calls
 logger = logging.getLogger(__name__)
 
 
-SYSTEM_MESSAGE = (
-    "Please follow the instruction step-by-step to generate a better prompt."
-)
+SYSTEM_MESSAGE = "Please carefully follow the instruction step-by-step."
 
 GA_PROMPT = """
 1. Cross over the following prompts and generate a new prompt:
@@ -28,6 +26,14 @@ Prompt 2: {prompt2}
 """
 
 
+DE_COT_PROMPTS = [
+    "Step 1: Identify the different parts between the Prompt 1 and Prompt 2:\nPrompt 1: {prompt1}\nPrompt 2: {prompt2}",
+    "Step 2: Randomly mutate the different parts",
+    "Step 3: Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.\nPrompt 3: {prompt3}",
+    "Step 4: Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:\nBasic Prompt: {basic_prompt}",
+]
+
+
 class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
     shorthand: str
 
@@ -131,7 +137,7 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
         # Line 8: Return the best prompt, p∗, among the final population PT :
         # p∗ ← argmaxp∈PT f(p, D)
         p = max(self.P[-1], key=lambda prompt: self.all_prompts[prompt.id].score)
-        logger.info(f"Best prompt: {p}")
+        logger.info("Best prompt with score %.2f: %s", p.score, p)
 
         # We pick the prompt with the highest score on the development set and report its score on the testset.
         test_performance, _, _ = self.task.evaluate_test(p.content)
@@ -149,7 +155,7 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
 
 
 class GeneticAlgorithm(EvolutionAlgorithm):
-    """The genetic algorithm implemented using LLMs."""
+    """The genetic algorithm (GA) implemented using LLMs."""
 
     shorthand = "ga"
 
@@ -171,7 +177,7 @@ class GeneticAlgorithm(EvolutionAlgorithm):
         # Based on this two-step process, we design instructions, guiding LLMs to
         # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
 
-        evolved_prompt, usage = self.evolution_model(
+        evolved_prompt, _, usage = self.evolution_model(
             system_message=SYSTEM_MESSAGE,
             prompt=GA_PROMPT.format(prompt1=prompt_1, prompt2=prompt_2),
         )
@@ -211,11 +217,11 @@ class GeneticAlgorithm(EvolutionAlgorithm):
 
 
 class DifferentialEvolution(EvolutionAlgorithm):
-    """The genetic algorithm implemented using LLMs."""
+    """The differential algorithm (DE) implemented using LLMs."""
 
     shorthand = "de"
 
-    @log_calls("Performing prompt evolution using GA")
+    @log_calls("Performing prompt evolution using DE")
     def evolve(
         self,
         prompt_1: str,
@@ -231,7 +237,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
             prompts_current_evolution, key=lambda prompt: prompt.score
         )
 
-        evolved_prompt, usage = self.evolution_model(
+        evolved_prompt, _, usage = self.evolution_model(
             system_message=SYSTEM_MESSAGE,
             prompt=get_de_prompt_template(self.use_evolution_demo, self.task).format(
                 prompt1=prompt_1,
@@ -254,7 +260,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
 
         return evolved_prompt, usage
 
-    @log_calls("Performing update for GA")
+    @log_calls("Performing update for DE")
     def update(
         self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
     ):
@@ -269,13 +275,91 @@ class DifferentialEvolution(EvolutionAlgorithm):
         return population
 
 
+class DifferentialEvolutionWithCot(DifferentialEvolution):
+    """The differential algorithm using Chain-of-Thought (DE-CoT) implemented using LLMs."""
+
+    shorthand = "de-cot"
+
+    @log_calls("Performing prompt evolution using DE-CoT")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        # TODO add description from paper
+
+        # DE needs best prompt for evolution
+        best_prompt_current_evolution = max(
+            prompts_current_evolution, key=lambda prompt: prompt.score
+        )
+
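+        # step 1: have the model identify the differing parts of prompt 1 and prompt 2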
+        response, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[0].format(
+                prompt1=prompt_1,
+                prompt2=prompt_2,
+            ),
+        )
+        # input(messages)
+        # input(response)
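+        # step 2: have the model randomly mutate the differences identified in step 1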
+        response, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[1],
+            history=messages,
+        )
+        # input(messages)
+        # input(response)
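+        # step 3: combine the mutated differences with the best prompt of the current population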
+        response, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[2].format(
+                prompt3=best_prompt_current_evolution,
+                basic_prompt=prompts_current_evolution[current_iteration],
+            ),
+            history=messages,
+        )
+        # input(messages)
+        # input(response)
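+        # step 4: cross over with the basic prompt; generation stops at the closing </prompt> tag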
+        evolved_prompt, messages, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_COT_PROMPTS[3].format(
+                basic_prompt=prompts_current_evolution[current_iteration],
+            ),
+            history=messages,
+            stop="</prompt>",
+        )
+        # input(messages)
+        # input(evolved_prompt)
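+        # extract the final prompt from between the <prompt> and </prompt> markers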
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+
+        logger.info(
+            "DE-CoT-evolved prompts '%s', '%s' and '%s' with basic prompt '%s' into '%s'",
+            prompt_1,
+            prompt_2,
+            best_prompt_current_evolution,
+            prompts_current_evolution[current_iteration],
+            evolved_prompt,
+        )
+
+        return evolved_prompt, usage
+
+
 optimizers = {
     algorithm.shorthand: algorithm
     for algorithm in get_all_subclasses(EvolutionAlgorithm)
 }
 
 
-def get_optimizer_class(name: str):
+def get_optimizer_class(name: str) -> type[EvolutionAlgorithm]:
     if name not in optimizers:
         raise ValueError("Optimization Algorithm %s does not exist", name)
     return optimizers[name]
diff --git a/evoprompt/models.py b/evoprompt/models.py
index e29b212fb18fdcfbf85d212f9a9ecb8e4b64e3f1..43f39f54b0e0b9e4891aa1ec4f33c716c58476b4 100644
--- a/evoprompt/models.py
+++ b/evoprompt/models.py
@@ -114,8 +114,9 @@ class Llama(LLMModel):
         prompt_prefix: str = "",
         prompt_suffix: str = "",
         chat: bool | None = None,
-        stop: str = "</prompt>",
+        stop: str | None = None,
         max_tokens: int = 200,
+        history: list[dict] | None = None,
         **kwargs: Any,
-    ) -> tuple[str, ModelUsage]:
+    ) -> tuple[str, list[dict], ModelUsage]:
         if chat is None:
@@ -128,7 +129,11 @@ class Llama(LLMModel):
                     "content": prompt_prefix + prompt + prompt_suffix + prompt_appendix,
                 }
             ]
-            if system_message:
+            # if a history is given, it is prepended to the messages; we assume it
+            # already contains a system message, so no system message is added here
+            # TODO is it better to check for a system message?
+            if history is not None:
+                messages = history + messages
+            elif system_message:
                 messages.insert(
                     0,
                     {
@@ -144,20 +149,35 @@ class Llama(LLMModel):
             )
             response_text = response["choices"][0]["message"]["content"]
         else:
-            response = self.model.create_completion(
-                prompt=(system_message if system_message else "")
+            prompt = (
+                (system_message if system_message else "")
                 + prompt_prefix
                 + prompt
                 + prompt_suffix
-                + prompt_appendix,
+                + prompt_appendix
+            )
+            response = self.model.create_completion(
+                prompt=prompt,
                 stop=stop,
                 max_tokens=max_tokens,
                 **kwargs,
             )
             response_text = response["choices"][0]["text"]
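+            # build a single-turn message history for the completion API so the
+            # return value has the same shape as in the chat branch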
+            messages = [
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ]
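+        # record the assistant response so the messages can be passed back as
+        # history in follow-up calls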
+        messages.append(
+            {
+                "role": "assistant",
+                "content": response_text,
+            }
+        )
         # input(f"Response: {response_text}")
         usage = ModelUsage(**response["usage"])
-        return response_text, usage
+        return response_text, messages, usage
 
     @classmethod
     def register_arguments(cls, parser: ArgumentParser):
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index 1a8ef797acf6739dc6fb1521a8e91b82e4cd0f1a..5c0eda4b4fcbb118607c272b9f214945c67400fb 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -34,7 +34,7 @@ def paraphrase_prompts(
         if num_tries >= max_tries:
             break
         num_tries += 1
-        paraphrase, usage = model(
+        paraphrase, _, usage = model(
             system_message=PARAPHRASE_PROMPT,
             prompt=prompt,
             prompt_prefix=' Instruction: "',
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index 58d8c77c22bdf590451d04216b20091061444ca1..4d7450e47e12ac8d7f5b94251e8d1cd75c1e59c6 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -319,8 +319,7 @@ class Task(metaclass=ABCMeta):
     def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
         # run model for inference using grammar to constrain output
         # TODO grammar also depends on prompt and vice-versa -> what are good labels?
-
-        response, usage = self.model(
+        response, _, usage = self.model(
             system_message=SYSTEM_MESSAGE,
             prompt=prompt,
             prompt_appendix=self._get_prompt_text_for_datum(datum),
diff --git a/evoprompt/utils.py b/evoprompt/utils.py
index 3eefa2e9c709a75bff442f70880df3f1c0336347..16efd8886426ed7d18444a9c5786280b3cd85515 100644
--- a/evoprompt/utils.py
+++ b/evoprompt/utils.py
@@ -49,9 +49,9 @@ def initialize_run_directory(model: Callable):
     if file_handler is not None:
         logger.removeHandler(file_handler)
 
-    response, usage = model(None, run_name_prompt)
+    response, _, _ = model(None, run_name_prompt)
     run_name_match = re.search(r"^\w+$", response, re.MULTILINE)
-    existing_run_names = os.listdir(RUNS_DIR)
+    existing_run_names = os.listdir(RUNS_DIR) if RUNS_DIR.exists() else []
     if run_name_match is None or run_name_match.group(0) in existing_run_names:
         run_name = uuid4().hex
     else: