From 4e6a46c0bf91791bad51155bb8c876f2f1e78980 Mon Sep 17 00:00:00 2001 From: Maximilian Kimmich <maximilian.kimmich@ims.uni-stuttgart.de> Date: Tue, 1 Oct 2024 16:20:51 +0200 Subject: [PATCH] Adapt all models to common interface and allow to control randomness --- evoprompt/models.py | 230 +++++++++++++++++++++++++++++++------------- evoprompt/utils.py | 4 +- 2 files changed, 164 insertions(+), 70 deletions(-) diff --git a/evoprompt/models.py b/evoprompt/models.py index 9146776..3d64e89 100644 --- a/evoprompt/models.py +++ b/evoprompt/models.py @@ -2,13 +2,15 @@ import hashlib import inspect import json import logging +import random import warnings from abc import ABC, abstractmethod -from argparse import ArgumentParser, Namespace +from argparse import ArgumentParser from pathlib import Path from typing import Any, Callable, ClassVar import llama_cpp +from llama_cpp.llama_chat_format import format_llama3 import openai import weave from diskcache import Cache @@ -42,7 +44,7 @@ class LLMModel(ABC): f"Model {name} does not exist; avilable models: {list(LLMModel.registered_models.keys())}" ) - key = cls.get_options_kwargs_hash(**kwargs) + key = cls.get_hash_from_kwargs(**kwargs) # check if model is already loaded if cls in LLMModel.loaded_models: model, model_key = LLMModel.loaded_models[cls] @@ -79,9 +81,10 @@ class LLMModel(ABC): prompt: str, *, use_cache: bool = False, + grammar: llama_cpp.LlamaGrammar | None = None, stop: str | None = None, - history: ChatMessages | None = None, - **kwargs: Any, + enforce_randomness: bool = False, + **kwargs, ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]: messages = [self._get_user_message(prompt)] if system_message is not None: @@ -89,10 +92,11 @@ class LLMModel(ABC): reponse, usage = self._create_completion( prompt=prompt, + grammar=grammar, stop=stop, use_cache=use_cache, max_tokens=self.kwargs.get("max_tokens", None), - **kwargs, + enforce_randomness=enforce_randomness, ) messages.append(self._get_assistant_message(reponse)) @@ -109,9 +113,14 @@ class LLMModel(ABC): @abstractmethod def _create_completion( self, - **kwargs, - ): - pass + prompt: str | None = None, + *, + use_cache: bool = False, + grammar: llama_cpp.LlamaGrammar | None = None, + stop: str | None = None, + max_tokens: int | None = None, + enforce_randomness: bool = False, + ): ... 
def _get_user_message(self, content: Any): return { @@ -145,7 +154,7 @@ class LLMModel(ABC): return model_completion_fn(**kwargs) @staticmethod - def get_options_kwargs_hash(**kwargs): + def get_hash_from_kwargs(**kwargs): # sometimes we want to ignore certain kwargs from the hash, e.g., when they are not relevant for the model or if they are not serializable kwargs = kwargs.copy() @@ -174,7 +183,7 @@ class LLMModel(ABC): return ( str(self.model_name).replace("/", "_") + "/" - + self.get_options_kwargs_hash(**self.kwargs) + + self.get_hash_from_kwargs(**self.kwargs) ) @property @@ -191,65 +200,90 @@ class Llama(LLMModel): def __init__( self, - options: Namespace, + *, + ignore_cache_kwargs: list[str] | None = None, + llama_path: Path | None = None, + llama_model: str = "QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF", + llama_model_file: str = "Meta-Llama-3.1-8B-Instruct.Q8_0.gguf", + chat_format: str | None = None, + chat_handler: str | None = None, + verbose: int | bool = False, + llama_verbose: bool = False, n_gpu_layers: int = 60, n_threads: int = 8, n_ctx: int = 8192, **kwargs, ) -> None: - # initialize model - add_kwargs = {} + # we collect all arguments to make sure they are passed to the super constructor + hashed_model_kwargs = { + "n_ctx": n_ctx, + } + if chat_format is not None: + hashed_model_kwargs["chat_format"] = chat_format + if chat_handler is not None: + hashed_model_kwargs["chat_handler"] = chat_handler seed = get_seed() if seed is not None: - add_kwargs["seed"] = seed + hashed_model_kwargs["seed"] = seed + + model_kwargs = dict( + **hashed_model_kwargs, + verbose=(verbose > 1 or llama_verbose), + n_gpu_layers=n_gpu_layers, + n_threads=n_threads, + ) - # TODO some options could be optional + ignore_cache_kwargs = ( + ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else [] + ) - if options.llama_path is not None: + if llama_path is not None: # use local file self.model = llama_cpp.Llama( - model_path=options.llama_path, - chat_format=options.chat_format, - chat_handler=options.chat_handler, - verbose=options.verbose > 1 or options.llama_verbose, - n_gpu_layers=n_gpu_layers, - n_threads=n_threads, - n_ctx=n_ctx, - **add_kwargs, - **kwargs, + model_path=llama_path, + **model_kwargs, ) - self._model_name = Path(options.llama_path).stem + self._model_name = llama_path.stem else: # use pre-trained model from HF hub self.model = llama_cpp.Llama.from_pretrained( - repo_id=options.llama_model, - filename=options.llama_model_file, - chat_format=options.chat_format, - chat_handler=options.chat_handler, - verbose=options.verbose > 1 or options.llama_verbose, - n_gpu_layers=n_gpu_layers, - n_threads=n_threads, - n_ctx=n_ctx, - **add_kwargs, - **kwargs, + repo_id=llama_model, + filename=llama_model_file, + **model_kwargs, ) - self._model_name = Path( - options.llama_model, options.llama_model_file - ).with_suffix("") + self._model_name = Path(llama_model, llama_model_file).with_suffix("") # pass all arguments to super constructor which should be taken into account for caching # needs to be called after model is initialized - super().__init__(options=options, n_ctx=n_ctx, **kwargs) + super().__init__(ignore_cache_kwargs=ignore_cache_kwargs, **hashed_model_kwargs) def _create_completion( self, + prompt: str | None = None, + *, use_cache: bool = False, - **kwargs, + grammar: llama_cpp.LlamaGrammar | None = None, + stop: str | None = None, + max_tokens: int | None = None, + enforce_randomness: bool = False, ): + # setup kwargs for model call + model_call_kwargs = { + "prompt": 
prompt, + "grammar": grammar, + "stop": stop, + "max_tokens": max_tokens, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + model_call_kwargs["seed"] = random.randint(0, 2**32 - 1) + else: + model_call_kwargs["temperature"] = 0.0 + response = self._call_model( self.model.create_completion, use_cache=use_cache, - **kwargs, + **model_call_kwargs, ) response_text = response["choices"][0]["text"] @@ -306,13 +340,13 @@ class ChatModel: self, system_message: str | None, prompt: str, - prompt_context_separator: str | None = None, - prompt_context: str | None = None, *, use_cache: bool = False, + grammar: llama_cpp.LlamaGrammar | None = None, stop: str | None = None, history: ChatMessages | None = None, - **kwargs: Any, + enforce_randomness: bool = False, + **kwargs, ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]: # a history is prepended to the messages, and we assume that it also includes a system message, i.e., we never add a system message in this case # TODO is it better to check for a system message in the history? @@ -323,15 +357,17 @@ class ChatModel: history = [self._get_system_message(system_message)] else: history = [] - # we prepend the history to the messages, and append an empty assistant message to add the generation token - messages_for_model = history + messages + [self._get_assistant_message("")] + # we prepend the history to the messages + # the chat format should take care of adding appropriate assistant messages for generating the completion + messages_for_model = history + messages reponse, usage = self._create_completion( messages=messages_for_model, + grammar=grammar, stop=stop, use_cache=use_cache, max_tokens=self.kwargs.get("max_tokens", None), - **kwargs, + enforce_randomness=enforce_randomness, ) messages.append(self._get_assistant_message(reponse)) @@ -343,15 +379,37 @@ class LlamaChat(ChatModel, Llama): @weave.op() def _create_completion( self, - use_cache: bool = False, - **kwargs, + messages: ChatMessages, + *, + use_cache: bool, + grammar: llama_cpp.LlamaGrammar | None = None, + stop: str | None, + max_tokens: int | None, + enforce_randomness: bool, ): + # input( + # f"The input for the model will look like this:\n{format_llama3(messages).prompt}" + # ) + # setup kwargs for model call + model_call_kwargs = { + "messages": messages, + "grammar": grammar, + "stop": stop, + "max_tokens": max_tokens, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + model_call_kwargs["seed"] = random.randint(0, 2**32 - 1) + else: + model_call_kwargs["temperature"] = 0.0 + response = self._call_model( self.model.create_chat_completion, use_cache=use_cache, - **kwargs, + **model_call_kwargs, ) response_text = response["choices"][0]["message"]["content"] + # input(response_text) usage = ModelUsage(**response["usage"]) return response_text, usage @@ -361,7 +419,7 @@ class HfChat(ChatModel, LLMModel): def __init__( self, - model="meta-llama/Meta-Llama-3.1-8B-Instruct", + model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct", ignore_cache_kwargs: list[str] | None = None, **kwargs, ) -> None: @@ -380,6 +438,11 @@ class HfChat(ChatModel, LLMModel): } if "do_sample" in kwargs: model_kwargs["do_sample"] = kwargs.pop("do_sample") + ignore_cache_kwargs = ( + ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else [] + ) + # torch.dtype is not JSON serializable therefore we ignore it + ignore_cache_kwargs.extend(["torch_dtype"]) # pass all arguments to super constructor which should be taken into account for caching 
super().__init__(**model_kwargs, ignore_cache_kwargs=ignore_cache_kwargs) @@ -400,19 +463,31 @@ class HfChat(ChatModel, LLMModel): def _create_completion( self, messages: ChatMessages, - use_cache: bool = False, - max_tokens: int = None, - **kwargs, + *, + use_cache: bool, + stop: str | None, + max_tokens: int | None, + enforce_randomness: bool, ): + # setup kwargs for model call + model_call_kwargs = { + "text_inputs": messages, + "stop": stop, + "max_length": max_tokens if max_tokens is not None else 2048, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + model_call_kwargs["do_sample"] = True + else: + model_call_kwargs["do_sample"] = False + # input( # f"The input for the model will look like this:\n{self.pipeline.tokenizer.apply_chat_template(messages, tokenize=False)}" # ) response = self._call_model( self.pipeline, - text_inputs=messages, use_cache=use_cache, - max_length=max_tokens if max_tokens is not None else 1024, - **kwargs, + **model_call_kwargs, ) response_text = response[0]["generated_text"][-1]["content"] # no usage supported by HF pipeline; TODO manually compute usage? @@ -428,11 +503,11 @@ class HfChat(ChatModel, LLMModel): class AlpacaHfChat(HfChat): def __init__( self, - *args, - model="chavinlo/alpaca-native", + model: str = "chavinlo/alpaca-native", + ignore_cache_kwargs: list[str] | None = None, **kwargs, ): - super().__init__(*args, **kwargs, model=model) + super().__init__(model=model, ignore_cache_kwargs=ignore_cache_kwargs, **kwargs) # chat template for Alpaca adapted from https://huggingface.co/Vezora/Mistral-22B-v0.1/blob/c15d70465e2fc46c3c4d7fec8fb62f533d4ef09b/tokenizer_config.json#L30 self.pipeline.tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ system_message + '\\n\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n' + message['content'].strip()}}{% endif %}{% endfor %}" @@ -450,25 +525,42 @@ class OpenAIChat(ChatModel, LLMModel): def __init__( self, - model_kwargs: dict, + openai_model: str, + ignore_cache_kwargs: list[str] | None = None, **kwargs, ) -> None: # initialize client for API calls - self.openai_client = openai.OpenAI(**kwargs) - self._model_name = model_kwargs["openai_model"] + # TODO set API key, project etc. via keyword arguments? 
+ self.openai_client = openai.OpenAI() + self._model_name = openai_model - super().__init__(model_kwargs, **kwargs) + super().__init__(model=openai_model, ignore_cache_kwargs=ignore_cache_kwargs) def _create_completion( self, - use_cache: bool = False, - **kwargs, + messages: ChatMessages, + *, + use_cache: bool, + stop: str | None, + max_tokens: int | None, + enforce_randomness: bool, ): + # setup kwargs for model call + model_call_kwargs = { + "model": self.model_name, + "messages": messages, + "stop": stop, + "max_completion_tokens": max_tokens if max_tokens is not None else 1024, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + else: + model_call_kwargs["temperature"] = 0.0 + response = self._call_model( self.openai_client.chat.completions.create, - model=self.model_name, use_cache=use_cache, - **kwargs, + **model_call_kwargs, ) response_text = response.choices[0].message.content usage = ModelUsage(**response.usage.__dict__) diff --git a/evoprompt/utils.py b/evoprompt/utils.py index 4ca878d..6bed5b8 100644 --- a/evoprompt/utils.py +++ b/evoprompt/utils.py @@ -79,7 +79,9 @@ def initialize_run_directory(model): # make sure that we use high randomness for generating the run name even if a seed is set for the model response, _, _, _ = model.create_completion( - None, run_name_prompt, temperature=2.0, seed=random.randint(0, 2**32 - 1) + system_message=None, + prompt=run_name_prompt, + enforce_randomness=True, ) run_name_match = re.search(r"^\w+$", response, re.MULTILINE) existing_run_names = os.listdir(RUNS_DIR) if RUNS_DIR.exists() else [] -- GitLab
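
Usage sketch for the unified interface introduced by this patch. The `Llama` constructor keywords and default model files below mirror the defaults in the diff, and `enforce_randomness` maps onto the sampling parameters each backend sets in `_create_completion` (temperature 0.0 when off; temperature 2.0, a fresh per-call seed for llama.cpp, and `do_sample=True` for the HF pipeline when on). The prompts and the exact unpacking of the returned tuple are illustrative assumptions, not part of the patch.

    # Illustrative only: the prompts and the tuple unpacking here are assumptions.
    from evoprompt.models import Llama

    model = Llama(
        llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
        llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
        n_ctx=8192,
    )

    # Deterministic call: enforce_randomness defaults to False, so the backend
    # pins temperature to 0.0 and keeps any globally configured seed.
    response, _, messages, usage = model.create_completion(
        system_message="You are a helpful assistant.",
        prompt="Summarize the change in one sentence.",
    )

    # Randomized call: enforce_randomness=True raises the temperature to 2.0 and,
    # for the llama.cpp backend, draws a fresh per-call seed, so repeated calls
    # differ even when a global seed is set (as used in initialize_run_directory).
    run_name, _, _, _ = model.create_completion(
        system_message=None,
        prompt="Propose a short run name.",
        enforce_randomness=True,
    )

The same two calls work unchanged against the chat-based backends (`LlamaChat`, `HfChat`, `OpenAIChat`), since they all implement the shared `create_completion` signature and interpret `enforce_randomness` with their own sampling settings.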