diff --git a/evoprompt/evolution/evolution.py b/evoprompt/evolution/evolution.py
index 57556f0c0b5716fc12390d01076a84377df03261..ce57de57845d5cd10021b174a90a5408896e6ac8 100644
--- a/evoprompt/evolution/evolution.py
+++ b/evoprompt/evolution/evolution.py
@@ -23,7 +23,7 @@ from evoprompt.evolution.template_ga import (
     GA_DEMONSTRATION_DATA_SIM,
     GA_PROMPT,
 )
-from evoprompt.models import LLMModel
+from evoprompt.models import ChatMessages, LLMModel
 from evoprompt.opt_types import ModelUsage, Prompt
 from evoprompt.optimization import Judgement, PromptOptimization
 from evoprompt.task import Task
@@ -50,14 +50,12 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
         *,
         task: Task,
         evolution_model: LLMModel,
-        evaluation_model: LLMModel,
         judge_model: LLMModel | None,
         run_options: dict[str, Any] = {},
     ) -> None:
         super().__init__(
             task=task,
             evolution_model=evolution_model,
-            evaluation_model=evaluation_model,
             judge_model=judge_model,
             run_options=run_options,
         )
@@ -163,7 +161,7 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
                     if p_i is not None:
                         prompt_source = (
                             "corrected"  # could also mean that user skipped the prompt
-                            if not all(j.happy for j in judgements)
+                            if any(j.happy is False for j in judgements)
                             else "generated"
                         )
                         evolved_prompt = self.add_prompt(
@@ -195,7 +193,13 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
             # Line 8: Return the best prompt, p∗, among the final population PT :
             # p∗ ← argmaxp∈PT f(p, D)
             p = max(self.P[-1], key=lambda prompt: self.all_prompts[prompt.id].score)
-            logger.info("Best prompt with score %.2f: %s", p.score, p)
+            logger.info(
+                "Best prompt with score %.2f: %s (Source: %s - Gen: %d)",
+                p.score,
+                p,
+                p.meta["source"],
+                p.meta["gen"],
+            )
 
             # We pick the prompt with the highest score on the development set and report its score on the testset.
             test_performance, _, _ = self.task.evaluate_test(p.content)
diff --git a/evoprompt/models.py b/evoprompt/models.py
index 4012ce478dee5ff99dec5eb20df61e346d0db243..57e219a40960fd6c2df14abcdcb765c02240b1e1 100644
--- a/evoprompt/models.py
+++ b/evoprompt/models.py
@@ -1,6 +1,7 @@
 from collections.abc import Iterable
 import hashlib
 import inspect
+from itertools import zip_longest
 import json
 import logging
 import random
@@ -122,25 +123,35 @@ class LLMModel(ABC):
     def build_demonstration_data(
         self,
         demonstrations: Iterable[tuple[str, str]],
-        instruction: str | None,
+        instruction: list[str] | str | None,
         **kwargs,
     ) -> ChatMessages:
         if not isinstance(self, ChatModel):
             raise ValueError(
                 f"Model {self} does not support building demonstration data"
             )
+
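+        # accept a single instruction (or None) as well as one instruction per demonstration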
+        if not isinstance(instruction, list):
+            instruction = [instruction]
         messages = []
-        for input_, output in demonstrations:
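+        # zip_longest reuses the last instruction when there are fewer instructions than demonstrations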
+        for (input_, output), _instruction in zip_longest(
+            demonstrations, instruction, fillvalue=instruction[-1]
+        ):
             messages.extend(
-                self.build_input_data(input_, instruction=instruction, **kwargs)[1]
+                self.build_input_data(input_, instruction=_instruction, **kwargs)[1]
             )
             messages.append(self._get_assistant_message(output))
         return messages
 
     def build_input_data(
-        self, prompt: str, instruction: str | None = None, **kwargs
+        self, input_: str, instruction: str | None = None, **kwargs
     ) -> ChatMessages:
-        return instruction, [self._get_user_message(prompt)]
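+        # fall back to the instruction as the user message content when no input is given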
+        return instruction, [
+            self._get_user_message(input_ if input_ is not None else instruction)
+        ]
 
     def _get_prediction_prefix(self):
         # some models use a special token prefix for the prediction
@@ -162,19 +170,19 @@ class LLMModel(ABC):
         use_randomness: bool = False,
     ): ...
 
-    def _get_user_message(self, content: Any):
+    def _get_user_message(self, content: Any) -> ChatMessage:
         return {
             "role": "user",
             "content": content,
         }
 
-    def _get_system_message(self, content: Any):
+    def _get_system_message(self, content: Any) -> ChatMessage:
         return {
             "role": "system",
             "content": content,
         }
 
-    def _get_assistant_message(self, content: Any):
+    def _get_assistant_message(self, content: Any) -> ChatMessage:
         return {
             "role": "assistant",
             "content": content,
@@ -400,10 +408,10 @@ class ChatModel:
 
         # we prepend the history to the messages
         # the chat format should take care of adding appropriate assistant messages for generating the completion
-        messages_for_model = messages
         if history is None:
-            history = []
-        messages_for_model = history + messages_for_model
+            messages_for_model = messages
+        else:
+            messages_for_model = history + messages
         # prepend system message if available
         if system_message is not None:
             if isinstance(system_message, str):
@@ -569,19 +577,20 @@ class AlpacaHfChat(HfChat):
         self.pipeline.tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ system_message }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '\\n\\n### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n'  + message['content'].strip()}}{% endif %}{% endfor %}"
 
     def build_input_data(
-        self, prompt: str, instruction: str = None, **kwargs
+        self, input_: str, instruction: str | None = None, **kwargs
     ) -> ChatMessages:
         # For Alpaca we add the instruction for each input (and assume that content is the context, otherwise the content should contain the instruction)
-        return self.SYSTEM_MESSAGE, [
-            self._get_user_message(
-                (
-                    (instruction + "\n\n" + self._get_input_prefix())
-                    if instruction is not None
-                    else ""
-                )
-                + prompt
-            )
-        ]
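+        # build the user turn: instruction alone, input alone, or instruction followed by the prefixed input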
+        if instruction is None:
+            assert (
+                input_ is not None
+            ), "Either instruction or input must be provided for Alpaca"
+            prompt_input = input_
+        else:
+            prompt_input = instruction
+            if input_ is not None:
+                prompt_input += "\n\n" + self._get_input_prefix() + input_
+        return self.SYSTEM_MESSAGE, [self._get_user_message(prompt_input)]
 
     @staticmethod
     def _get_input_prefix():
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index c1147f40f3b1a1c7c0df1ed0358012149c80acc6..ffeded99da45d76fd73728b9f9b9a174ef4a5e82 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -153,13 +153,11 @@ class PromptOptimization:
         *,
         task: Task,
         evolution_model: LLMModel,
-        evaluation_model: LLMModel,
         judge_model: LLMModel | None,
         run_options: dict[str, Any] = {},
     ) -> None:
         self.task = task
         self.evolution_model = evolution_model
-        self.evaluation_model = evaluation_model
         self.judge_model = judge_model
         self.run_options = run_options
 
@@ -174,7 +172,7 @@ class PromptOptimization:
         return self.task.evaluate_validation(prompt, parent_histories)
 
     def get_initial_prompts(self, num_initial_prompts: int, debug: bool = False):
-        # this implements the para_topk algorothm from https://github.com/beeevita/EvoPrompt
+        # this implements the para_topk algorithm from https://github.com/beeevita/EvoPrompt
         base_prompts = self.task.base_prompts
         if debug:
             base_prompts = base_prompts[:2]
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index ae260f5844cf89a1a9064a0780705371cc3b71ec..3b59db32d9920253f9deb9b953676c79b9299854 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -437,12 +437,12 @@ class Task(metaclass=ABCMeta):
         use_prediction_prefix: bool = False,
     ) -> tuple[ChatMessage, ChatMessages]:
         # the default is to use the prompt as is and concatenate the datum string
-        prompt = self._get_prompt_text_for_datum(
+        datum_input = self._get_prompt_text_for_datum(
             sample, use_prefix=self.force_task_input_prefix
         )
         if use_prediction_prefix:
-            prompt += f"\n{self._get_prediction_prefix().strip()}"
-        return self.model.build_input_data(prompt, instruction)
+            datum_input += f"\n{self._get_prediction_prefix().strip()}"
+        return self.model.build_input_data(datum_input, instruction)
 
     def build_demonstration_prompt(
         self,
diff --git a/main.py b/main.py
index f74fbea070a3078cd23d6ca903981808e2e05823..ae7c518cffbe090d83aa570d1861021692d461de 100644
--- a/main.py
+++ b/main.py
@@ -124,7 +124,6 @@ if __name__ == "__main__":
         population_size=10,
         task=task,
         evolution_model=evolution_model,
-        evaluation_model=evaluation_model,
         judge_model=judge_model,
         run_options=options.__dict__,
     )