diff --git a/eval_prompt.py b/eval_prompt.py
index dc54636209b15cd8a320090842433c0b1ed9788c..3730ab42b6ac51852630302864e3b8577e9026e2 100644
--- a/eval_prompt.py
+++ b/eval_prompt.py
@@ -6,32 +6,19 @@ import torch
 
 from evoprompt.models import LLMModel
 from evoprompt.task import get_task, tasks
-from evoprompt.utils import setup_console_logger
+from evoprompt.utils import init_rng, setup_console_logger
 
 logger = logging.getLogger(__name__)
 
 
-def evaluate_prompt(prompt: str, task_name: str, args: Namespace):
+def evaluate_prompt(prompt: str, task_args: Namespace, model_args: Namespace):
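+    """Evaluate a single prompt on the given task's validation split using the configured evaluation model."""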
     logger.info(f'Evaluating prompt "{prompt}"')
 
     evaluation_model = LLMModel.get_model(
-        name="hfllamachat",
-        # name="llamachat",
-        options=args,
-        model="PKU-Alignment/alpaca-7b-reproduced",
-        # model="chavinlo/alpaca-native",
-        model_kwargs=dict(
-            load_in_8bit=True,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            # use less randomness, i.e., more certain outputs
-            temperature=0.0,
-        ),
-        # torch_dtype is not JSON serializable therefore we ignore it
-        ignore_cache_kwargs=["model_kwargs.torch_dtype"],
+        **vars(model_args),
     )
 
-    task = get_task(task_name, evaluation_model, **vars(args))
+    task = get_task(evaluation_model=evaluation_model, **vars(task_args))
 
     eval_score, eval_usage, _ = task.evaluate_validation(prompt)
     logger.info(f"Score on evaluation set: {eval_score}")
@@ -40,28 +27,54 @@ def evaluate_prompt(prompt: str, task_name: str, args: Namespace):
 
 
 if __name__ == "__main__":
-    setup_console_logger()
 
-    argparser = argparse.ArgumentParser()
-    argparser.add_argument("-p", "--prompt", type=str, required=True)
-    argparser.add_argument(
-        "-t", "--task", type=str, choices=sorted(tasks.keys()), required=True
-    )
-    args = argparser.parse_args()
+    def main():
+        argparser = argparse.ArgumentParser()
+        argparser.add_argument("-p", "--prompt", type=str, required=True)
+        argparser.add_argument(
+            "-t", "--task", type=str, choices=sorted(tasks.keys()), required=True
+        )
+        argparser.add_argument(
+            "-v", "--verbose", action="count", default=0, help="Increase verbosity"
+        )
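+        # example invocation (task and prompt are illustrative):
+        #   python eval_prompt.py -t <task> -p "<prompt>" -v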
+        args = argparser.parse_args()
 
-    options = Namespace(
-        llama_path=None,
-        chat_format=None,
-        chat_handler=None,
-        verbose=False,
-        llama_verbose=False,
-        llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
-        llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
-        disable_cache=False,
-        use_grammar=False,
-        evaluation_strategy="simple",
-        max_tokens=None,
-        **vars(args),
-    )
+        init_rng(1)
+        setup_console_logger(verbosity_level=args.verbose)
+
+        task_options = Namespace(
+            name=args.task,
+            use_grammar=False,
+            evaluation_strategy="simple",
+            n_evaluation_demo=1,
+        )
+        model_options = Namespace(
+            # name="hfchat",
+            name="alpacahfchat",
+            # name="llamachat",
+            verbose=args.verbose,
+            # model="PKU-Alignment/alpaca-7b-reproduced",
+            # model="chavinlo/alpaca-native",
+            load_in_8bit=True,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            # temperature=0.0,
+            # torch_dtype is not JSON serializable therefore we ignore it
+            ignore_cache_kwargs=["torch_dtype"],
+            llama_path=None,
+            chat_format=None,
+            chat_handler=None,
+            llama_verbose=False,
+            llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
+            # llama_model="TheBloke/Llama-2-70B-Chat-GGUF",
+            llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
+            # llama_model_file="llama-2-70b-chat.Q4_K_M.gguf",
+            disable_cache=False,
+            max_tokens=None,
+        )
+        print(task_options)
+        print(model_options)
+
+        evaluate_prompt(args.prompt, task_options, model_options)
 
-    evaluate_prompt(args.prompt, args.task, options)
+    main()
diff --git a/evoprompt/evolution/evolution.py b/evoprompt/evolution/evolution.py
index e6b7d365237ad9221fd85c3890ceb4a192640e1c..57556f0c0b5716fc12390d01076a84377df03261 100644
--- a/evoprompt/evolution/evolution.py
+++ b/evoprompt/evolution/evolution.py
@@ -249,6 +249,7 @@ class GeneticAlgorithm(EvolutionAlgorithm):
             self.evolution_model.create_completion(
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
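+                # sample with randomness so repeated evolution steps yield diverse candidate prompts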
+                use_randomness=True,
             )
         )
 
@@ -351,6 +352,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
             self.evolution_model.create_completion(
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
+                use_randomness=True,
             )
         )
 
@@ -467,7 +469,9 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
                     system_message=SYSTEM_MESSAGE,
                     prompt=filled_prompt,
                     history=history,
-                    stop="</prompt>" if idx == len(DE_COT_PROMPTS) - 1 else None,
+                    # the models often repeat the instruction, which could also contain </prompt>, therefore we should not stop early
+                    stop=None,  # "</prompt>" if idx == len(DE_COT_PROMPTS) - 1 else None,
+                    use_randomness=True,
                 )
             )
             logger.debug(
diff --git a/evoprompt/models.py b/evoprompt/models.py
index cb6e1b3e6595f73b1c397c7204d2d9f1ca45ce9a..8653c0d5cbda4b9cebdbb07b556d50efdcbc08af 100644
--- a/evoprompt/models.py
+++ b/evoprompt/models.py
@@ -2,13 +2,15 @@ import hashlib
 import inspect
 import json
 import logging
+import random
 import warnings
 from abc import ABC, abstractmethod
-from argparse import ArgumentParser, Namespace
+from argparse import ArgumentParser
 from pathlib import Path
 from typing import Any, Callable, ClassVar
 
 import llama_cpp
+from llama_cpp.llama_chat_format import format_llama3
 import openai
 import weave
 from diskcache import Cache
@@ -36,11 +38,13 @@ class LLMModel(ABC):
         cls.register_arguments(argument_parser)
 
     @classmethod
-    def get_model(cls, name: str, options: Namespace, **kwargs):
+    def get_model(cls, name: str, **kwargs):
         if name not in LLMModel.registered_models:
-            raise ValueError("Model %s does not exist", name)
+            raise ValueError(
+                f"Model {name} does not exist; avilable models: {list(LLMModel.registered_models.keys())}"
+            )
 
-        key = cls.get_options_kwargs_hash(options, kwargs)
+        key = cls.get_hash_from_kwargs(**kwargs)
         # check if model is already loaded
         if cls in LLMModel.loaded_models:
             model, model_key = LLMModel.loaded_models[cls]
@@ -49,20 +53,20 @@ class LLMModel(ABC):
                     f"Model {model} is already loaded with different arguments"
                 )
         else:
-            model = LLMModel.registered_models[name](options=options, **kwargs)
+            model = LLMModel.registered_models[name](**kwargs)
             LLMModel.loaded_models[cls] = (model, key)
         return model
 
-    def __init__(self, options: Namespace, **kwargs):
+    def __init__(self, ignore_cache_kwargs: list[str] | None = None, **kwargs):
         self.usage = ModelUsage()
 
         # store kwargs for caching
-        self.options = options
-        self.kwargs = kwargs
+        self.kwargs = kwargs.copy()
+        self.kwargs["ignore_cache_kwargs"] = ignore_cache_kwargs
 
         # set up caching for model calls
         self._call_model_cached = None
-        if not options.disable_cache:
+        if not self.kwargs.get("disable_cache", False):
             cache = Cache(Path(".cache_dir", self.model_cache_key))
 
             @cache.memoize(typed=True, ignore=[0, "func"])
@@ -71,32 +75,66 @@ class LLMModel(ABC):
 
             self._call_model_cached = _call_function
 
-    @abstractmethod
     def create_completion(
         self,
         system_message: str | None,
         prompt: str,
         *,
         use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
         stop: str | None = None,
-        history: ChatMessages | None = None,
-        **kwargs: Any,
+        use_randomness: bool = False,
+        **kwargs,
     ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]:
-        pass
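+        # plain completion interface: the system message, if given, is simply prepended to the prompt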
+        messages = [self._get_user_message(prompt)]
+        if system_message is not None:
+            prompt = system_message + prompt
 
-    def _get_user_message(self, content: str):
+        response, usage = self._create_completion(
+            prompt=prompt,
+            grammar=grammar,
+            stop=stop,
+            use_cache=use_cache,
+            max_tokens=self.kwargs.get("max_tokens", None),
+            use_randomness=use_randomness,
+        )
+
+        messages.append(self._get_assistant_message(response))
+        return response, None, messages, usage
+
+    def _get_prediction_prefix(self):
+        # some models use a special token prefix for the prediction
+        return None
+
+    def _get_input_prefix(self):
+        # some models have a special token for the input
+        return None
+
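+    # backend-specific completion call; use_randomness switches between sampled and (near-)deterministic decoding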
+    @abstractmethod
+    def _create_completion(
+        self,
+        prompt: str | None = None,
+        *,
+        use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
+        stop: str | None = None,
+        max_tokens: int | None = None,
+        use_randomness: bool = False,
+    ): ...
+
+    def _get_user_message(self, content: Any):
         return {
             "role": "user",
             "content": content,
         }
 
-    def _get_system_message(self, content: str):
+    def _get_system_message(self, content: Any):
         return {
             "role": "system",
             "content": content,
         }
 
-    def _get_assistant_message(self, content: str):
+    def _get_assistant_message(self, content: Any):
         return {
             "role": "assistant",
             "content": content,
@@ -116,7 +154,7 @@ class LLMModel(ABC):
             return model_completion_fn(**kwargs)
 
     @staticmethod
-    def get_options_kwargs_hash(options: Namespace, kwargs):
+    def get_hash_from_kwargs(**kwargs):
         # sometimes we want to ignore certain kwargs from the hash, e.g., when they are not relevant for the model or if they are not serializable
         kwargs = kwargs.copy()
 
@@ -130,19 +168,12 @@ class LLMModel(ABC):
 
         ignore_cache_kwargs: list[str] | None = kwargs.pop("ignore_cache_kwargs", None)
         if ignore_cache_kwargs is not None:
-            options = Namespace(
-                **{
-                    k: v
-                    for k, v in iter_dict(vars(options))
-                    if k not in ignore_cache_kwargs
-                }
-            )
             kwargs = {
                 k: v for k, v in iter_dict(kwargs) if k not in ignore_cache_kwargs
             }
 
         unique_options_key = json.dumps(
-            (vars(options), kwargs),
+            kwargs,
             sort_keys=True,
         )
         return hashlib.sha1(unique_options_key.encode()).hexdigest()
@@ -152,11 +183,15 @@ class LLMModel(ABC):
         return (
             str(self.model_name).replace("/", "_")
             + "/"
-            + self.get_options_kwargs_hash(self.options, self.kwargs)
+            + self.get_hash_from_kwargs(**self.kwargs)
         )
 
-    @classmethod
+    @property
     @abstractmethod
+    def model_name(self):
+        pass
+
+    @classmethod
     def register_arguments(cls, parser: ArgumentParser):
         pass
 
@@ -165,96 +200,101 @@ class Llama(LLMModel):
 
     def __init__(
         self,
-        options: Namespace,
-        n_gpu_layers: int = 60,
+        *,
+        ignore_cache_kwargs: list[str] | None = None,
+        llama_path: Path | None = None,
+        llama_model: str = "QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
+        llama_model_file: str = "Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
+        chat_format: str | None = None,
+        chat_handler: str | None = None,
+        verbose: int | bool = False,
+        llama_verbose: bool = False,
+        n_gpu_layers: int = -1,
         n_threads: int = 8,
         n_ctx: int = 8192,
         **kwargs,
     ) -> None:
-        # initialize model
-        add_kwargs = {}
+        # we collect all arguments to make sure they are passed to the super constructor
+        hashed_model_kwargs = {
+            "n_ctx": n_ctx,
+        }
+        if chat_format is not None:
+            hashed_model_kwargs["chat_format"] = chat_format
+        if chat_handler is not None:
+            hashed_model_kwargs["chat_handler"] = chat_handler
         seed = get_seed()
         if seed is not None:
-            add_kwargs["seed"] = seed
+            hashed_model_kwargs["seed"] = seed
 
-        # TODO some options could be optional
+        model_kwargs = dict(
+            **hashed_model_kwargs,
+            verbose=(verbose > 1 or llama_verbose),
+            n_gpu_layers=n_gpu_layers,
+            n_threads=n_threads,
+        )
+
+        ignore_cache_kwargs = (
+            ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else []
+        )
 
-        if options.llama_path is not None:
+        if llama_path is not None:
             # use local file
             self.model = llama_cpp.Llama(
-                model_path=options.llama_path,
-                chat_format=options.chat_format,
-                chat_handler=options.chat_handler,
-                verbose=options.verbose > 1 or options.llama_verbose,
-                n_gpu_layers=n_gpu_layers,
-                n_threads=n_threads,
-                n_ctx=n_ctx,
-                **add_kwargs,
-                **kwargs,
+                model_path=llama_path,
+                **model_kwargs,
             )
-            self.model_name = Path(options.llama_path).stem
+            self._model_name = llama_path.stem
         else:
             # use pre-trained model from HF hub
             self.model = llama_cpp.Llama.from_pretrained(
-                repo_id=options.llama_model,
-                filename=options.llama_model_file,
-                chat_format=options.chat_format,
-                chat_handler=options.chat_handler,
-                verbose=options.verbose > 1 or options.llama_verbose,
-                n_gpu_layers=n_gpu_layers,
-                n_threads=n_threads,
-                n_ctx=n_ctx,
-                **add_kwargs,
-                **kwargs,
+                repo_id=llama_model,
+                filename=llama_model_file,
+                **model_kwargs,
             )
-            self.model_name = Path(
-                options.llama_model, options.llama_model_file
-            ).with_suffix("")
+            self._model_name = Path(llama_model, llama_model_file).with_suffix("")
 
         # pass all arguments to super constructor which should be taken into account for caching
         # needs to be called after model is initialized
-        super().__init__(options=options, n_ctx=n_ctx, **kwargs)
+        super().__init__(ignore_cache_kwargs=ignore_cache_kwargs, **hashed_model_kwargs)
 
-    def create_completion(
+    def _create_completion(
         self,
-        system_message: str | None,
-        prompt: str,
+        prompt: str | None = None,
         *,
         use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
         stop: str | None = None,
-        history: ChatMessages | None = None,
-        **kwargs: Any,
-    ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]:
-        messages = [self._get_user_message(prompt)]
-        if system_message is not None:
-            prompt = system_message + prompt
-
-        reponse, usage = self._create_completion(
-            prompt=prompt,
-            stop=stop,
-            use_cache=use_cache,
-            max_tokens=self.options.max_tokens,
-            **kwargs,
-        )
-
-        messages.append(self._get_assistant_message(reponse))
-        return reponse, None, messages, usage
-
-    def _create_completion(
-        self,
-        use_cache: bool = False,
-        **kwargs,
+        max_tokens: int | None = None,
+        use_randomness: bool = False,
     ):
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "prompt": prompt,
+            "grammar": grammar,
+            "stop": stop,
+            "max_tokens": max_tokens,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+            model_call_kwargs["seed"] = random.randint(0, 2**32 - 1)
+        else:
+            model_call_kwargs["temperature"] = 0.0
+
         response = self._call_model(
             self.model.create_completion,
             use_cache=use_cache,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response["choices"][0]["text"]
 
         usage = ModelUsage(**response["usage"])
         return response_text, usage
 
+    @property
+    def model_name(self):
+        return self._model_name
+
     @classmethod
     def register_arguments(cls, parser: ArgumentParser):
         group = parser.add_argument_group(f"{cls.__name__} model arguments")
@@ -303,9 +343,11 @@ class ChatModel:
         prompt: str,
         *,
         use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
         stop: str | None = None,
         history: ChatMessages | None = None,
-        **kwargs: Any,
+        use_randomness: bool = False,
+        **kwargs,
     ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]:
         # a history is prepended to the messages, and we assume that it also includes a system message, i.e., we never add a system message in this case
         # TODO is it better to check for a system message in the history?
@@ -316,13 +358,17 @@ class ChatModel:
                 history = [self._get_system_message(system_message)]
             else:
                 history = []
+        # we prepend the history to the messages
+        # the chat format should take care of adding appropriate assistant messages for generating the completion
+        messages_for_model = history + messages
 
         reponse, usage = self._create_completion(
-            messages=history + messages,
+            messages=messages_for_model,
+            grammar=grammar,
             stop=stop,
             use_cache=use_cache,
-            max_tokens=self.options.max_tokens,
-            **kwargs,
+            max_tokens=self.kwargs.get("max_tokens", None),
+            use_randomness=use_randomness,
         )
 
         messages.append(self._get_assistant_message(reponse))
@@ -334,13 +380,35 @@ class LlamaChat(ChatModel, Llama):
     @weave.op()
     def _create_completion(
         self,
-        use_cache: bool = False,
-        **kwargs,
+        messages: ChatMessages,
+        *,
+        use_cache: bool,
+        grammar: llama_cpp.LlamaGrammar | None = None,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
     ):
+        # input(
+        #     f"The input for the model will look like this:\n{format_llama3(messages).prompt}"
+        # )
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "messages": messages,
+            "grammar": grammar,
+            "stop": stop,
+            "max_tokens": max_tokens,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+            model_call_kwargs["seed"] = random.randint(0, 2**32 - 1)
+        else:
+            model_call_kwargs["temperature"] = 0.0
+
         response = self._call_model(
             self.model.create_chat_completion,
             use_cache=use_cache,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response["choices"][0]["message"]["content"]
 
@@ -348,82 +416,177 @@ class LlamaChat(ChatModel, Llama):
         return response_text, usage
 
 
-class HfLlamaChat(ChatModel, Llama):
+class HfChat(ChatModel, LLMModel):
 
     def __init__(
         self,
-        options: Namespace,
-        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        ignore_cache_kwargs: list[str] | None = None,
         **kwargs,
     ) -> None:
         import torch
         import transformers
 
-        super().__init__(options, **kwargs)
+        self._model_name = model
 
-        # set some default values
-        model_kwargs = kwargs.pop("model_kwargs", {})
-        if "torch_dtype" not in model_kwargs:
-            model_kwargs["torch_dtype"] = torch.bfloat16
-        if "device_map" not in model_kwargs:
-            model_kwargs["device_map"] = "auto"
+        # we collect all arguments to make sure they are passed to the super constructor
+        # also set some default values
+        model_kwargs = {
+            "task": "text-generation",
+            "model": model,
+            "torch_dtype": kwargs.pop("torch_dtype", torch.bfloat16),
+            "device_map": kwargs.pop("device_map", "auto"),
+        }
+        if "do_sample" in kwargs:
+            model_kwargs["do_sample"] = kwargs.pop("do_sample")
+        ignore_cache_kwargs = (
+            ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else []
+        )
+        # torch.dtype is not JSON serializable therefore we ignore it
+        ignore_cache_kwargs.extend(["torch_dtype"])
+
+        # pass all arguments to super constructor which should be taken into account for caching
+        super().__init__(**model_kwargs, ignore_cache_kwargs=ignore_cache_kwargs)
 
         # initialize model
         self.pipeline = transformers.pipeline(
-            "text-generation",
-            model=model,
-            model_kwargs=model_kwargs,
-            **kwargs,
+            **model_kwargs,
         )
         # Setting the pad token to the eos token to avoid stdout prints
-        self.pipeline.model.generation_config.pad_token_id = (
-            self.pipeline.model.generation_config.eos_token_id
-        )
+        # if there are multiple EOS tokens, we use the first one (similar to how the transformers library handles it)
+        if isinstance(
+            self.pipeline.model.generation_config.eos_token_id, (list, tuple)
+        ):
+            eos_token_id = self.pipeline.model.generation_config.eos_token_id[0]
+        else:
+            eos_token_id = self.pipeline.model.generation_config.eos_token_id
+
+        self.pipeline.model.generation_config.pad_token_id = eos_token_id
 
     def _create_completion(
         self,
         messages: ChatMessages,
-        use_cache: bool = False,
-        max_tokens: int = None,
+        *,
+        use_cache: bool,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
         **kwargs,
     ):
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "text_inputs": messages,
+            "stop": stop,
+            "max_length": max_tokens if max_tokens is not None else 2048,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+            model_call_kwargs["do_sample"] = True
+        else:
+            model_call_kwargs["do_sample"] = False
+
+        # input(
+        #     f"The input for the model will look like this:\n{self.pipeline.tokenizer.apply_chat_template(messages, tokenize=False)}"
+        # )
         response = self._call_model(
             self.pipeline,
-            text_inputs=messages,
             use_cache=use_cache,
-            max_length=max_tokens if max_tokens is not None else 1024,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response[0]["generated_text"][-1]["content"]
-        # TODO no usage supported by HF pipeline; manually compute usage?
+        # no usage supported by HF pipeline; TODO manually compute usage?
         usage = ModelUsage()
         return response_text, usage
 
+    @property
+    def model_name(self):
+        return self._model_name
 
-class OpenAiChat(ChatModel, LLMModel):
+
+# For Alpaca we build inputs following the fine-tuning chat format, see https://github.com/tatsu-lab/stanford_alpaca/blob/761dc5bfbdeeffa89b8bff5d038781a4055f796a/README.md?plain=1#L56-L66
+class AlpacaHfChat(HfChat):
+    def __init__(
+        self,
+        model: str = "chavinlo/alpaca-native",
+        ignore_cache_kwargs: list[str] | None = None,
+        **kwargs,
+    ):
+        super().__init__(model=model, ignore_cache_kwargs=ignore_cache_kwargs, **kwargs)
+
+        # chat template for Alpaca adapted from https://huggingface.co/Vezora/Mistral-22B-v0.1/blob/c15d70465e2fc46c3c4d7fec8fb62f533d4ef09b/tokenizer_config.json#L30
+        self.pipeline.tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ system_message + '\\n\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n'  + message['content'].strip()}}{% endif %}{% endfor %}"
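+        # the template renders the system message as a plain preamble, user turns as "### Instruction:" and assistant turns as "### Response:"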
+
+    def _create_completion(
+        self,
+        messages: list[dict[str, str]],
+        *,
+        use_cache: bool,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
+        **kwargs,
+    ):
+        # for some reason adding an empty assistant message yields different generations than adding it manually in the chat template
+        return super()._create_completion(
+            messages + [self._get_assistant_message("")],
+            use_cache=use_cache,
+            stop=stop,
+            max_tokens=max_tokens,
+            use_randomness=use_randomness,
+            **kwargs,
+        )
+
+    def _get_input_prefix(self):
+        return "### Input:\n"
+
+    def _get_prediction_prefix(self):
+        # Alpaca uses a special token prefix for the prediction
+        return "\n### Response:\n"
+
+
+class OpenAIChat(ChatModel, LLMModel):
     """Queries an OpenAI model using its API."""
 
     def __init__(
         self,
-        options: Namespace,
+        openai_model: str,
+        ignore_cache_kwargs: list[str] | None = None,
         **kwargs,
     ) -> None:
         # initialize client for API calls
-        self.openai_client = openai.OpenAI(**kwargs)
-        self.model_name = options.openai_model
+        # TODO set API key, project etc. via keyword arguments?
+        self.openai_client = openai.OpenAI()
+        self._model_name = openai_model
 
-        super().__init__(options, **kwargs)
+        super().__init__(model=openai_model, ignore_cache_kwargs=ignore_cache_kwargs)
 
     def _create_completion(
         self,
-        use_cache: bool = False,
-        **kwargs,
+        messages: ChatMessages,
+        *,
+        use_cache: bool,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
     ):
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "model": self.model_name,
+            "messages": messages,
+            "stop": stop,
+            "max_completion_tokens": max_tokens if max_tokens is not None else 1024,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+        else:
+            model_call_kwargs["temperature"] = 0.0
+
         response = self._call_model(
             self.openai_client.chat.completions.create,
-            model=self.model_name,
             use_cache=use_cache,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response.choices[0].message.content
         usage = ModelUsage(**response.usage.__dict__)
@@ -434,6 +597,10 @@ class OpenAiChat(ChatModel, LLMModel):
         group = parser.add_argument_group("OpenAI model arguments")
         group.add_argument("--openai-model", "-m", type=str, default="gpt-3.5-turbo")
 
+    @property
+    def model_name(self):
+        return self._model_name
+
 
 argument_group = argument_parser.add_argument_group("Model arguments")
 argument_group.add_argument(
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index 81538709c460da09134e5fbe3841e1595105986b..c1147f40f3b1a1c7c0df1ed0358012149c80acc6 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -93,8 +93,8 @@ class ResponseEditor(App):
         return self.text_area.text
 
 
-@log_calls("Paraphrasing prompts")
-def paraphrase_prompts(
+@log_calls("Paraphrasing prompt")
+def paraphrase_prompt(
     model: LLMModel,
     prompt: str,
     n: int,
@@ -112,7 +112,8 @@ def paraphrase_prompts(
         num_tries += 1
         paraphrase, _, _, usage = model.create_completion(
             system_message=PARAPHRASE_PROMPT,
-            prompt=f"Instruction: {prompt}",
+            prompt=prompt,
+            use_randomness=True,
         )
         total_usage += usage
         if "<prompt>" in paraphrase:
@@ -200,7 +201,7 @@ class PromptOptimization:
         # fill up the rest with paraphrases of the top prompts
         promptindex_to_paraphrase = 0
         while len(initial_population) < num_initial_prompts:
-            paraphrases, paraphrase_usage = paraphrase_prompts(
+            paraphrases, paraphrase_usage = paraphrase_prompt(
                 self.evolution_model,
                 top_prompts[promptindex_to_paraphrase],
                 n=1,
diff --git a/evoprompt/task/__init__.py b/evoprompt/task/__init__.py
index 5238a2cf8d9a5cf231b4de1d3b18fd161b3af55b..4387f7c24ea05e75720984adc94c2bb3fb2a311d 100644
--- a/evoprompt/task/__init__.py
+++ b/evoprompt/task/__init__.py
@@ -24,7 +24,9 @@ tasks = {
 
 def get_task(name: str, evaluation_model: LLMModel, **options) -> Task:
     if name not in tasks:
-        raise ValueError("Model %s does not exist", name)
+        raise ValueError(
+            f"Task %{name} does not exist; available tasks: {list(tasks.keys())}"
+        )
     return tasks[name](evaluation_model, **options)
 
 
diff --git a/evoprompt/helpers/prompts.py b/evoprompt/task/base_prompts_mixin.py
similarity index 78%
rename from evoprompt/helpers/prompts.py
rename to evoprompt/task/base_prompts_mixin.py
index 8e653f761ad035761586084f8eb53123c8361777..c4d00cd99d71788623ce4b6d1e67e5b5dfd14e60 100644
--- a/evoprompt/helpers/prompts.py
+++ b/evoprompt/task/base_prompts_mixin.py
@@ -5,6 +5,7 @@ import re
 from datasets import Dataset
 
 from evoprompt.models import LLMModel
+from evoprompt.utils import get_rng
 
 
 class BasePromptsFromJsonMixin:
@@ -35,18 +36,25 @@ class BasePromptsFromGeneration:
         self, num_prompts: int, patience: int = 10, allow_duplicates: bool = False
     ) -> str:
         self.validation_dataset: Dataset
-        samples = self.validation_dataset._select_contiguous(0, 5)
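+        # shuffle with a fixed seed, then draw five demonstration samples via the shared RNG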
+        samples = self.validation_dataset.shuffle(42).select(
+            get_rng().choice(len(self.validation_dataset), 5, replace=False)
+        )
         prompt = "I gave a friend an instruction and five inputs. The friend read the instruction and wrote an output for every one of the inputs. Here are the input-output pairs:\n"
-        for sample in samples:
-            prompt += f"\n\n{self._get_prompt_text_for_datum(sample)}\n{self._get_generation_prefix()}{self._get_gold_label_generation_for_datum(sample)}\n"
+        raise NotImplementedError(
+            "The prompt needs to be adapted for the model taking into account the correct format."
+        )
+        prompt = self.build_demonstration_prompt(samples, prompt=prompt)
         prompt += "\nThe instruction was "
+        system_message = "You are a helpful assistant. Please provide the instruction wrapped within tags <instruction> and </instruction> that belongs to the given input-output pairs."
+        input(prompt)
 
         generated_prompts = []
         while len(generated_prompts) < num_prompts:
             response, _, _, _ = self.evolution_model.create_completion(
-                system_message=f"You are a helpful assistant. Please provide the instruction wrapped within tags <instruction> and </instruction> that belongs to the given input-output pairs.",
+                system_message=system_message,
                 prompt=prompt,
             )
+            input(response)
             matches = re.findall(
                 # regex that extracts anything within tags <instruction> and optional </instruction>
                 rf"<instruction>(.+?)(?:(?=</instruction>)|$)",
diff --git a/evoprompt/task/question_answering.py b/evoprompt/task/question_answering.py
index 0634b3e9bcfb1d4127c00bd30d2dee32d8ff2b57..6266c86d03b879424276242f2534f77d1cbcf589 100644
--- a/evoprompt/task/question_answering.py
+++ b/evoprompt/task/question_answering.py
@@ -8,7 +8,7 @@ from datasets import Dataset
 from evaluate import load as load_metric
 from llama_cpp import LlamaGrammar
 
-from evoprompt.helpers.prompts import BasePromptsFromGeneration
+from evoprompt.task.base_prompts_mixin import BasePromptsFromGeneration
 from evoprompt.opt_types import ModelUsage
 from evoprompt.task.task import DatasetDatum, Task
 from evoprompt.utils import get_rng
@@ -70,10 +70,14 @@ class QuestionAnswering(Task):
             )
 
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Answer: "
 
-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str:
+    def _get_prompt_text_for_datum(
+        self, datum: DatasetDatum, use_prefix: bool = False
+    ) -> str:
+        # for QA we always use prefixes because there are multiple inputs
+        # TODO Shall we try omitting the prefix?
         context = self._get_context_for_datum(datum)
         question = self._get_question_for_datum(datum)
         return f"Context: {context}\nQuestion: {question}"
@@ -101,16 +105,16 @@ class QuestionAnswering(Task):
             return response
         # if we do not use a grammar, we need to extract the answer from the response
         # otherwise the answer is from the context as enforced by the grammar
-        prefix_to_match = self._get_generation_prefix().replace(" ", r"\s?")
+        prefix_to_match = self._get_prediction_prefix().replace(" ", r"\s?")
         matches = re.findall(
-            # regex that matches class labels after "Response: "
+            # regex that matches class labels after prediction prefix
             rf"(?:{prefix_to_match})?(.+)",
             response.splitlines()[-1],
             flags=re.IGNORECASE,
         )
         # look for an answer in the response, if not found, use whole response
         if matches:
-            return matches[-1]
+            return matches[0]
         else:
             return response
 
@@ -125,6 +129,9 @@ class QuestionAnswering(Task):
         )
         return result["f1"]
 
+    @abstractmethod
+    def _get_gold_label_for_datum(self, datum: DatasetDatum): ...
+
     def _aggregate_result(self, results: list[float]) -> float:
         return sum(results) / len(results)
 
@@ -190,6 +197,7 @@ class SQuAD(BasePromptsFromGeneration, QuestionAnswering):
         return datum["answers"]
 
     def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str:
+        # TODO use all answers -- does this actually happen in any QA dataset??
         return self._get_gold_label_for_datum(datum)["text"][0]
 
     @property
diff --git a/evoprompt/task/sentiment_analysis.py b/evoprompt/task/sentiment_analysis.py
index 82bbd89505d6849bce30a3e9a1f389b17a7a9f64..bf72c175ed7bf2f6e4ad763c5ef69f0c72ed125b 100644
--- a/evoprompt/task/sentiment_analysis.py
+++ b/evoprompt/task/sentiment_analysis.py
@@ -4,7 +4,7 @@ from typing import Mapping
 
 from datasets import load_dataset
 
-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextClassification
 from evoprompt.task.task import DatasetDatum
 
@@ -23,7 +23,7 @@ class SentimentAnalysis(TextClassification):
         return {"negative": 0, "positive": 1}
 
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Sentiment: "
 
 
diff --git a/evoprompt/task/simplification.py b/evoprompt/task/simplification.py
index 1d62bc40064207750148a5e386095103ca111c5a..37e0ec50f36f829bd01d3c8db290ac2db0c810de 100644
--- a/evoprompt/task/simplification.py
+++ b/evoprompt/task/simplification.py
@@ -2,7 +2,7 @@ import logging
 
 from evaluate import load as load_metric
 
-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextGeneration
 from evoprompt.task.task import DatasetDatum
 
@@ -16,7 +16,7 @@ class Simplification(TextGeneration):
         self.metric = load_metric("evaluate-metric/sari")
 
     def compute_metric(self, datum: DatasetDatum, prediction: str) -> float:
-        gold_label = self._get_gold_label_for_datum(datum)
+        gold_label = self._get_gold_label_generation_for_datum(datum)
         return self.metric.compute(
             sources=[self._get_text_for_datum(datum)],
             predictions=[prediction],
@@ -24,7 +24,7 @@ class Simplification(TextGeneration):
         )["sari"]
 
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Simplification: "
 
     @property
@@ -54,5 +54,5 @@ class ASSET(BasePromptsFromJsonMixin, Simplification):
         return datum["original"]
 
     @staticmethod
-    def _get_gold_label_for_datum(datum: DatasetDatum) -> str:
+    def _get_gold_label_generation_for_datum(datum: DatasetDatum) -> str:
         return datum["simplifications"]
diff --git a/evoprompt/task/subjectivity_classification.py b/evoprompt/task/subjectivity_classification.py
index 051294d432df9abcabec557f575601e0aca2919a..7c3882e0a96a9c89db131220df97e7aeecfa15ab 100644
--- a/evoprompt/task/subjectivity_classification.py
+++ b/evoprompt/task/subjectivity_classification.py
@@ -3,7 +3,7 @@ from typing import Mapping
 
 from datasets import load_dataset
 
-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextClassification
 from evoprompt.task.task import DatasetDatum
 
@@ -43,7 +43,7 @@ class Subj(BasePromptsFromJsonMixin, TextClassification):
         return datum["label"]
 
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Subjectivity: "
 
     @staticmethod
diff --git a/evoprompt/task/summarization.py b/evoprompt/task/summarization.py
index 9d485b85fc01c5fdf4597bf5f945baf317f88932..fed21c15fbe2a409bfb2f4866242b96b4e48b16c 100644
--- a/evoprompt/task/summarization.py
+++ b/evoprompt/task/summarization.py
@@ -2,7 +2,7 @@ import logging
 
 from evaluate import load as load_metric
 
-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextGeneration
 from evoprompt.task.task import DatasetDatum
 
@@ -16,7 +16,7 @@ class Summarization(TextGeneration):
         self.metric = load_metric("evaluate-metric/rouge")
 
     def compute_metric(self, datum: DatasetDatum, prediction: str) -> float:
-        gold_label = self._get_gold_label_for_datum(datum)
+        gold_label = self._get_gold_label_generation_for_datum(datum)
         return (
             self.metric.compute(predictions=[prediction], references=[gold_label])[
                 "rougeL"
@@ -25,7 +25,7 @@ class Summarization(TextGeneration):
         )
 
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Summary: "
 
     @property
@@ -55,5 +55,5 @@ class SAMSum(BasePromptsFromJsonMixin, Summarization):
         return datum["dialogue"]
 
     @staticmethod
-    def _get_gold_label_for_datum(datum: DatasetDatum) -> str:
+    def _get_gold_label_generation_for_datum(datum: DatasetDatum) -> str:
         return datum["summary"]
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index c5b2e6699a63a6033c39765963bf15f96d4d1cdd..6b2a3344d2f3bf8f30787fb97c247282090077e7 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -17,9 +17,7 @@ from evoprompt.utils import log_calls
 logger = logging.getLogger(__name__)
 
 
-SYSTEM_MESSAGE = """
-You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-"""
+SYSTEM_MESSAGE = "You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
 
 DatasetDatum = dict
 
@@ -148,7 +146,7 @@ def get_evaluation_strategy(evaluation_strategy_key: EvaluationStrategyKey):
 
 
 class EvaluationStrategy:
-    def __init__(self, task):
+    def __init__(self, task: "Task"):
         self.task = task
 
     @abstractmethod
@@ -197,7 +195,7 @@ class ShortestFirstStrategy(EarlyStoppingStrategy):
         self, dataset: Dataset, parent_histories: ParentHistories | None
     ):
         sorted_dataset = sorted(
-            dataset, key=lambda x: len(self.task._get_prompt_text_for_datum(x))
+            dataset, key=lambda x: len(self.task.build_prompt_input(x))
         )
         return super().get_dataset_iterator(sorted_dataset, parent_histories)
 
@@ -274,6 +272,10 @@ class Task(metaclass=ABCMeta):
         self.use_grammar = use_grammar
         self.n_evaluation_demo = n_evaluation_demo
 
+        # TODO do we want to be able to set these?
+        self.force_task_input_prefix = False
+        self.force_task_prediction_prefix = False
+
         self.evaluation_strategy = get_evaluation_strategy(evaluation_strategy)(self)
         logger.info(
             f"using evaluation strategy: {self.evaluation_strategy.__class__.__name__}"
@@ -342,59 +344,16 @@ class Task(metaclass=ABCMeta):
     ) -> Iterable[int]:
         pass
 
-    @weave.op()
-    def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
-        # run model for inference using grammar to constrain output
-        # TODO grammar also depends on prompt and vice-versa -> what are good labels?
-        response, _, _, usage = self.model.create_completion(
-            system_message=SYSTEM_MESSAGE,
-            prompt=prompt,
-            # grammar can be applied to constrain the model output
-            grammar=self._get_grammar(datum) if self.use_grammar else None,
-            # we use cached completions to speed up the process although we loose the non-deterministic behavior of LMs, but we're ok with a single result
-            use_cache=True,
-            # use less randomness, i.e., more certain outputs
-            temperature=0.0,
-        )
-
-        if not self.use_grammar:
-            # we postprocess the model output to return as answer
-            response = response.strip()
-
-        return response, usage
-
-    def _build_prompt(self, prompt: str, datum: DatasetDatum) -> str:
-        prompt = f"{prompt}\n\n{self._get_prompt_text_for_datum(datum)}\n{self._get_generation_prefix()}"
-        return prompt
-
-    @abstractmethod
-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str: ...
-
-    @abstractmethod
-    def _get_generation_prefix() -> str: ...
 
-    @abstractmethod
-    def _get_grammar(self, datum: DatasetDatum) -> LlamaGrammar:
-        pass
-
-    @abstractmethod
-    def _evaluate_sample(self, response: str, datum: DatasetDatum) -> Any: ...
-
-    @abstractmethod
-    def _parse_response(self, response: str) -> str: ...
-
-    @abstractmethod
-    def _get_gold_label_for_datum(self, datum: DatasetDatum) -> str:
-        pass
-
-    @abstractmethod
-    # This method is needed for the demonstration examples.
-    def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str:
-        pass
+    @log_calls("Evaluating validation dataset")
+    def evaluate_validation(
+        self, prompt: str, parent_histories: list[list[float]] | None = None
+    ):
+        return self.evaluate(prompt, self.validation_dataset, parent_histories)
 
-    @abstractmethod
-    def _aggregate_result(self, results: list) -> float:
-        pass
+    @log_calls("Evaluating test dataset")
+    def evaluate_test(self, prompt: str):
+        return self.evaluate(prompt, self.test_dataset, no_early_stopping=True)
 
     @weave.op()
     def evaluate(
@@ -412,22 +371,20 @@ class Task(metaclass=ABCMeta):
         evaluation_history = []
 
         # augment prompt with demonstration samples
-        prompt += "".join(
-            [
-                f"\n\n{self._get_prompt_text_for_datum(datum)}\n{self._get_generation_prefix()}{self._get_gold_label_generation_for_datum(datum)}"
-                for datum in self.demonstration_samples
-            ]
-        )
+        prompt_with_examples = self.build_demonstration_prompt(self.demonstration_samples, prompt=prompt)
 
         for datum in dataset_iterator:
-            # build prompt for current sample
-            prompt_for_datum = self._build_prompt(prompt, datum)
             # run prediction
-            response, usage = self.predict(prompt=prompt_for_datum, datum=datum)
+            response, usage = self.predict(prompt=prompt_with_examples, datum=datum)
+            logger.debug(f"Response: '{response}'")
             # parse response
             response = self._parse_response(response=response)
+            logger.debug(f"Parsed response: '{response}'")
             # evaluate response
             result = self._evaluate_sample(response=response, datum=datum)
+            logger.debug(
+                f"Prediction: '{response}', Gold label: '{self._get_gold_label_generation_for_datum(datum)}', Result: {result}"
+            )
             results.append(result)
             current_metric = self._aggregate_result(results)
             dataset_iterator.set_postfix({self.metric_name: f"{current_metric:.2f}"})
@@ -442,23 +399,76 @@ class Task(metaclass=ABCMeta):
                 break
 
         return self._aggregate_result(results), evaluation_usage, evaluation_history
+
+    @weave.op()
+    def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
+        # run model for inference using grammar to constrain output
+        # TODO grammar also depends on prompt and vice-versa -> what are good labels?
+        # build prompt for current sample
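+        # the task-level prediction prefix is only appended when the model does not define its own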
+        prompt_for_datum = self.build_prompt_input(datum, prompt=prompt, use_prediction_prefix=self.model._get_prediction_prefix() is None)
+        logger.debug(f"Prompt for datum:\n{prompt_for_datum}")
+        response, _, _, usage = self.model.create_completion(
+            system_message=SYSTEM_MESSAGE,
+            prompt=prompt_for_datum,
+            # grammar can be applied to constrain the model output
+            grammar=self._get_grammar(datum) if self.use_grammar else None,
+            # we use cached completions to speed up the process; although we lose the non-deterministic behavior of LMs, we're ok with a single result
+            use_cache=True,
+            # use less randomness, i.e., more certain outputs
+            use_randomness=False,
+        )
 
-    @log_calls("Evaluating validation dataset")
-    def evaluate_validation(
-        self, prompt: str, parent_histories: list[list[float]] | None = None
-    ):
-        return self.evaluate(prompt, self.validation_dataset, parent_histories)
+        if not self.use_grammar:
+            # we postprocess the model output to return as answer
+            response = response.strip()
 
-    @log_calls("Evaluating test dataset")
-    def evaluate_test(self, prompt: str):
-        return self.evaluate(prompt, self.test_dataset, no_early_stopping=True)
+        return response, usage
+
+    def build_prompt_input(
+        self, sample, prompt: str = "", use_prediction_prefix: bool = False,
+    ) -> str:
+        # the default is to use the prompt as is and concatenate the datum string
+        prompt += f"\n\n{self.model._get_input_prefix() if self.model._get_input_prefix() is not None else ""}{self._get_prompt_text_for_datum(sample, use_prefix=self.force_task_input_prefix or not self.model._get_input_prefix())}"
+        if use_prediction_prefix:
+            prompt += f"\n{self._get_prediction_prefix().strip()} "
+        return prompt.strip()
+
+    def build_demonstration_prompt(
+        self,
+        demonstration_samples: list[dict],
+        prompt: str = "",
+    ) -> str:
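+        # each demonstration is rendered as its input followed by the gold output so the model sees the expected answer format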
+        for sample in demonstration_samples:
+            prompt += "\n\n" + self.build_prompt_input(sample)
+            prompt += f"\n{self.model._get_prediction_prefix() if self.model._get_prediction_prefix() is not None else self._get_prediction_prefix()}{self._get_gold_label_generation_for_datum(sample)}"
+        return prompt.strip()
+
+    @abstractmethod
+    def _get_prompt_text_for_datum(self, datum: DatasetDatum, use_prefix: bool = False) -> str: ...
+
+    @abstractmethod
+    def _get_prediction_prefix() -> str: ...
+
+    @abstractmethod
+    def _get_grammar(self, datum: DatasetDatum) -> LlamaGrammar: ...
+
+    @abstractmethod
+    def _evaluate_sample(self, response: str, datum: DatasetDatum) -> Any: ...
+
+    @abstractmethod
+    def _parse_response(self, response: str) -> str: ...
+
+    @abstractmethod
+    # This method is needed for the demonstration examples.
+    def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str: ...
+
+    @abstractmethod
+    def _aggregate_result(self, results: list) -> float: ...
 
     @property
     @abstractmethod
-    def metric_name(self) -> str:
-        pass
+    def metric_name(self) -> str: ...
 
     @property
     @abstractmethod
-    def base_prompts(self) -> list[str]:
-        pass
+    def base_prompts(self) -> list[str]: ...
diff --git a/evoprompt/task/text_classification.py b/evoprompt/task/text_classification.py
index 74e8dbcb2d8d6ab1ae8cfb262df9c9a86848f491..af30f1545b21f5e43ff674c89a3305abb0d9153f 100644
--- a/evoprompt/task/text_classification.py
+++ b/evoprompt/task/text_classification.py
@@ -23,12 +23,12 @@ class TextClassification(Task):
         else:
             matches = re.findall(
                 # regex that matches class labels after the generation prefix
-                rf"{self._get_generation_prefix()}({'|'.join(class_mapping.keys())})",
+                rf"{self._get_prediction_prefix()}({'|'.join(class_mapping.keys())})",
                 response,
                 flags=re.IGNORECASE,
             )
             if matches:
-                return matches[-1]
+                return matches[0]
             else:
                 # look for a label in the response, if not found, return failed
                 matches = re.findall(
@@ -64,12 +64,12 @@ class TextClassification(Task):
         ).shuffle(42)
         sample_ids = []
 
-        def check_sample_label(sample, label, gold_label_for_datum_fn):
+        def filter_sample_for_label(sample, label, gold_label_for_datum_fn):
             return gold_label_for_datum_fn(sample) == label
 
         for label in self._get_label_mapping().values():
             sample_ids_for_label = dataset_with_row_indices.filter(
-                check_sample_label,
+                filter_sample_for_label,
                 fn_kwargs={
                     "label": label,
                     "gold_label_for_datum_fn": self._get_gold_label_for_datum,
@@ -78,13 +78,6 @@ class TextClassification(Task):
             sample_ids += sample_ids_for_label
         return sample_ids
 
-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str:
-        # NOTE it seems that quotes in the prompt make it worse
-        return f"{self._get_input_prefix()}{self._get_text_for_datum(datum)}"
-
-    @abstractmethod
-    def _get_text_for_datum(self, datum: DatasetDatum) -> str: ...
-
     # NOTE cannot be cached since grammar is not picklable
     def _get_grammar(self, datum: DatasetDatum, verbose: bool = False):
         return LlamaGrammar.from_string(
@@ -94,17 +87,22 @@ class TextClassification(Task):
             verbose=verbose,
         )
 
-    @staticmethod
-    def _get_input_prefix():
-        return "Text: "
+    def _get_prompt_text_for_datum(
+        self, datum: DatasetDatum, use_prefix: bool = False
+    ) -> str:
+        text = self._get_text_for_datum(datum)
+        if use_prefix:
+            return f"Text: {text}"
+        return text
+
+    @abstractmethod
+    def _get_text_for_datum(self, datum: DatasetDatum) -> str: ...
 
     @abstractmethod
-    def _get_label_mapping(self) -> Mapping:
-        pass
+    def _get_label_mapping(self) -> Mapping: ...
 
     @abstractmethod
-    def _get_gold_label_for_datum(self, datum: DatasetDatum) -> str:
-        pass
+    def _get_gold_label_for_datum(self, datum: DatasetDatum) -> str: ...
 
     @lru_cache
     def _get_inverse_label_mapping(self):
diff --git a/evoprompt/task/text_generation.py b/evoprompt/task/text_generation.py
index 81a81f6866ef1b4fdd9af732852645ee5a648b16..1826d78ffd26dceaef8b15945399bed7ebbd708b 100644
--- a/evoprompt/task/text_generation.py
+++ b/evoprompt/task/text_generation.py
@@ -25,10 +25,14 @@ class TextGeneration(Task):
     def _evaluate_sample(self, response: str, datum: DatasetDatum) -> float:
         return self.compute_metric(datum, response.lower())
 
-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str:
-        # NOTE it seems that quotes in the prompt make it worse
-        return f"{self._get_input_prefix()}{self._get_text_for_datum(datum)}"
-
+    def _get_prompt_text_for_datum(
+        self, datum: DatasetDatum, use_prefix: bool = False
+    ) -> str:
+        text = self._get_text_for_datum(datum)
+        if use_prefix:
+            return f"Text: {text}"
+        return text
+
     @abstractmethod
     def _get_text_for_datum(self, datum: DatasetDatum) -> str: ...
 
@@ -36,22 +40,11 @@ class TextGeneration(Task):
         # there is no grammar for open text generation
         return None
 
-    @staticmethod
-    def _get_input_prefix():
-        return "Text: "
-
     def _get_demonstration_sample_ids(
         self, dataset: Dataset, n_evaluation_demo: int
     ) -> Iterable[int]:
         # select demonstration samples uniformly at random
         return get_rng().choice(len(dataset), n_evaluation_demo, replace=False)
 
-    def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str:
-        return self._get_gold_label_for_datum(datum)
-
-    @abstractmethod
-    def _get_text_for_datum(self, datum: DatasetDatum) -> str:
-        pass
-
     def _aggregate_result(self, results: list[str]) -> float:
         return sum(results) / len(results)
diff --git a/evoprompt/task/topic_classification.py b/evoprompt/task/topic_classification.py
index c4048a7b2f42f9bcbce080c2526f51cc10ee293c..dd1905f1d46ee0ff1d773aaba36a78c12a4cd7d4 100644
--- a/evoprompt/task/topic_classification.py
+++ b/evoprompt/task/topic_classification.py
@@ -3,14 +3,14 @@ from typing import Mapping
 
 from datasets import load_dataset
 
-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextClassification
 from evoprompt.task.task import DatasetDatum
 
 
 class TopicClassification(TextClassification):
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Topic: "
 
 
diff --git a/evoprompt/utils.py b/evoprompt/utils.py
index 4ca878d4b5064d048370abea0bc359289557d6a5..701ae38dc558f789e2ae0eef23e62aac398a1bd1 100644
--- a/evoprompt/utils.py
+++ b/evoprompt/utils.py
@@ -79,7 +79,9 @@ def initialize_run_directory(model):
 
     # make sure that we use high randomness for generating the run name even if a seed is set for the model
     response, _, _, _ = model.create_completion(
-        None, run_name_prompt, temperature=2.0, seed=random.randint(0, 2**32 - 1)
+        system_message=None,
+        prompt=run_name_prompt,
+        use_randomness=True,
     )
     run_name_match = re.search(r"^\w+$", response, re.MULTILINE)
     existing_run_names = os.listdir(RUNS_DIR) if RUNS_DIR.exists() else []
diff --git a/main.py b/main.py
index 2256754fe0d6b7cd11f27674302743d46556b63a..40c8d25dd7e2126e8f5acc83aa5548391e9b532f 100644
--- a/main.py
+++ b/main.py
@@ -9,7 +9,7 @@ from weave.trace.settings import UserSettings
 
 from evoprompt.cli import argument_parser
 from evoprompt.evolution import get_optimizer_class
-from evoprompt.models import HfLlamaChat, Llama, LlamaChat, LLMModel
+from evoprompt.models import HfChat, Llama, LlamaChat, LLMModel
 from evoprompt.task import get_task
 from evoprompt.utils import init_rng, setup_console_logger
 
@@ -86,26 +86,26 @@ if __name__ == "__main__":
         logger.info("DEBUG mode: Do a quick run")
 
     # set up evolution model
-    evolution_model = LLMModel.get_model(options.evolution_engine, options=options)
+    evolution_model = LLMModel.get_model(options.evolution_engine, **vars(options))
     logger.info(
         f"Using {evolution_model.__class__.__name__.lower()} as the evolution engine"
     )
 
     judge_model: LLMModel | None = None
     if options.judge_engine is not None:
-        judge_model = LLMModel.get_model(options.judge_engine, options=options)
+        judge_model = LLMModel.get_model(options.judge_engine, **vars(options))
         logger.info(f"Using {options.judge_engine} as the judge engine")
 
     # set up evaluation model
     # NOTE currenty we always stick to Llama (Llama or LlamaChat depending on evolution engine) as evaluation engine
     # TODO allow to set separate engine and model for evaluation?
-    if isinstance(evolution_model, (Llama, LlamaChat, HfLlamaChat)):
+    if isinstance(evolution_model, (Llama, LlamaChat, HfChat)):
         evaluation_model_name = evolution_model.__class__.__name__.lower()
     elif judge_model is not None and isinstance(judge_model, (Llama, LlamaChat)):
         evaluation_model_name = judge_model.__class__.__name__.lower()
     else:
         evaluation_model_name = "llamachat"
-    evaluation_model = LLMModel.get_model(name=evaluation_model_name, options=options)
+    evaluation_model = LLMModel.get_model(name=evaluation_model_name, **vars(options))
     logger.info(f"Using {evaluation_model_name} as the evaluation engine")
 
     task = get_task(
diff --git a/requirements.txt b/requirements.txt
index c3c7115e17085a9d044d9f4441a92dbb7445581f..f4e563dea0de12c90cd418306673510f333d7656 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,6 @@ sacrebleu
 sacremoses
 textual
 wandb
-weave
\ No newline at end of file
+weave
+bitsandbytes
+sentencepiece
\ No newline at end of file
diff --git a/test_chat_format.py b/test_chat_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6aeed329e7cf261b790fea3f6a7543d964f8868
--- /dev/null
+++ b/test_chat_format.py
@@ -0,0 +1,20 @@
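+# manual sanity check of how llama_cpp's format_alpaca renders a system/user/assistant message list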
+from llama_cpp.llama_chat_format import format_alpaca
+
+if __name__ == "__main__":
+    messages = [
+        {
+            "role": "system",
+            "content": "You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
+        },
+        {
+            "role": "user",
+            "content": "Have your friend evaluate the movie they had just seen and provide a summary opinion (e.g. terrible, bad, okay, good, or great) to determine the sentiment of the movie review.\n\nText: ( a ) slummer .\nSentiment: "
+        },
+        {
+            "role": "assistant",
+            "content": None,
+        },
+    ]
+
+    formatted_prompt = format_alpaca(messages)
+    print(formatted_prompt)
\ No newline at end of file