From 4e6a46c0bf91791bad51155bb8c876f2f1e78980 Mon Sep 17 00:00:00 2001 From: Maximilian Kimmich <maximilian.kimmich@ims.uni-stuttgart.de> Date: Tue, 1 Oct 2024 16:20:51 +0200 Subject: [PATCH] Adapt all models to common interface and allow to control randomness --- evoprompt/models.py | 230 +++++++++++++++++++++++++++++++------------- evoprompt/utils.py | 4 +- 2 files changed, 164 insertions(+), 70 deletions(-) diff --git a/evoprompt/models.py b/evoprompt/models.py index 9146776..3d64e89 100644 --- a/evoprompt/models.py +++ b/evoprompt/models.py @@ -2,13 +2,15 @@ import hashlib import inspect import json import logging +import random import warnings from abc import ABC, abstractmethod -from argparse import ArgumentParser, Namespace +from argparse import ArgumentParser from pathlib import Path from typing import Any, Callable, ClassVar import llama_cpp +from llama_cpp.llama_chat_format import format_llama3 import openai import weave from diskcache import Cache @@ -42,7 +44,7 @@ class LLMModel(ABC): f"Model {name} does not exist; avilable models: {list(LLMModel.registered_models.keys())}" ) - key = cls.get_options_kwargs_hash(**kwargs) + key = cls.get_hash_from_kwargs(**kwargs) # check if model is already loaded if cls in LLMModel.loaded_models: model, model_key = LLMModel.loaded_models[cls] @@ -79,9 +81,10 @@ class LLMModel(ABC): prompt: str, *, use_cache: bool = False, + grammar: llama_cpp.LlamaGrammar | None = None, stop: str | None = None, - history: ChatMessages | None = None, - **kwargs: Any, + enforce_randomness: bool = False, + **kwargs, ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]: messages = [self._get_user_message(prompt)] if system_message is not None: @@ -89,10 +92,11 @@ class LLMModel(ABC): reponse, usage = self._create_completion( prompt=prompt, + grammar=grammar, stop=stop, use_cache=use_cache, max_tokens=self.kwargs.get("max_tokens", None), - **kwargs, + enforce_randomness=enforce_randomness, ) messages.append(self._get_assistant_message(reponse)) @@ -109,9 +113,14 @@ class LLMModel(ABC): @abstractmethod def _create_completion( self, - **kwargs, - ): - pass + prompt: str | None = None, + *, + use_cache: bool = False, + grammar: llama_cpp.LlamaGrammar | None = None, + stop: str | None = None, + max_tokens: int | None = None, + enforce_randomness: bool = False, + ): ... 
def _get_user_message(self, content: Any): return { @@ -145,7 +154,7 @@ class LLMModel(ABC): return model_completion_fn(**kwargs) @staticmethod - def get_options_kwargs_hash(**kwargs): + def get_hash_from_kwargs(**kwargs): # sometimes we want to ignore certain kwargs from the hash, e.g., when they are not relevant for the model or if they are not serializable kwargs = kwargs.copy() @@ -174,7 +183,7 @@ class LLMModel(ABC): return ( str(self.model_name).replace("/", "_") + "/" - + self.get_options_kwargs_hash(**self.kwargs) + + self.get_hash_from_kwargs(**self.kwargs) ) @property @@ -191,65 +200,90 @@ class Llama(LLMModel): def __init__( self, - options: Namespace, + *, + ignore_cache_kwargs: list[str] | None = None, + llama_path: Path | None = None, + llama_model: str = "QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF", + llama_model_file: str = "Meta-Llama-3.1-8B-Instruct.Q8_0.gguf", + chat_format: str | None = None, + chat_handler: str | None = None, + verbose: int | bool = False, + llama_verbose: bool = False, n_gpu_layers: int = 60, n_threads: int = 8, n_ctx: int = 8192, **kwargs, ) -> None: - # initialize model - add_kwargs = {} + # we collect all arguments to make sure they are passed to the super constructor + hashed_model_kwargs = { + "n_ctx": n_ctx, + } + if chat_format is not None: + hashed_model_kwargs["chat_format"] = chat_format + if chat_handler is not None: + hashed_model_kwargs["chat_handler"] = chat_handler seed = get_seed() if seed is not None: - add_kwargs["seed"] = seed + hashed_model_kwargs["seed"] = seed + + model_kwargs = dict( + **hashed_model_kwargs, + verbose=(verbose > 1 or llama_verbose), + n_gpu_layers=n_gpu_layers, + n_threads=n_threads, + ) - # TODO some options could be optional + ignore_cache_kwargs = ( + ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else [] + ) - if options.llama_path is not None: + if llama_path is not None: # use local file self.model = llama_cpp.Llama( - model_path=options.llama_path, - chat_format=options.chat_format, - chat_handler=options.chat_handler, - verbose=options.verbose > 1 or options.llama_verbose, - n_gpu_layers=n_gpu_layers, - n_threads=n_threads, - n_ctx=n_ctx, - **add_kwargs, - **kwargs, + model_path=llama_path, + **model_kwargs, ) - self._model_name = Path(options.llama_path).stem + self._model_name = llama_path.stem else: # use pre-trained model from HF hub self.model = llama_cpp.Llama.from_pretrained( - repo_id=options.llama_model, - filename=options.llama_model_file, - chat_format=options.chat_format, - chat_handler=options.chat_handler, - verbose=options.verbose > 1 or options.llama_verbose, - n_gpu_layers=n_gpu_layers, - n_threads=n_threads, - n_ctx=n_ctx, - **add_kwargs, - **kwargs, + repo_id=llama_model, + filename=llama_model_file, + **model_kwargs, ) - self._model_name = Path( - options.llama_model, options.llama_model_file - ).with_suffix("") + self._model_name = Path(llama_model, llama_model_file).with_suffix("") # pass all arguments to super constructor which should be taken into account for caching # needs to be called after model is initialized - super().__init__(options=options, n_ctx=n_ctx, **kwargs) + super().__init__(ignore_cache_kwargs=ignore_cache_kwargs, **hashed_model_kwargs) def _create_completion( self, + prompt: str | None = None, + *, use_cache: bool = False, - **kwargs, + grammar: llama_cpp.LlamaGrammar | None = None, + stop: str | None = None, + max_tokens: int | None = None, + enforce_randomness: bool = False, ): + # setup kwargs for model call + model_call_kwargs = { + "prompt": 
prompt, + "grammar": grammar, + "stop": stop, + "max_tokens": max_tokens, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + model_call_kwargs["seed"] = random.randint(0, 2**32 - 1) + else: + model_call_kwargs["temperature"] = 0.0 + response = self._call_model( self.model.create_completion, use_cache=use_cache, - **kwargs, + **model_call_kwargs, ) response_text = response["choices"][0]["text"] @@ -306,13 +340,13 @@ class ChatModel: self, system_message: str | None, prompt: str, - prompt_context_separator: str | None = None, - prompt_context: str | None = None, *, use_cache: bool = False, + grammar: llama_cpp.LlamaGrammar | None = None, stop: str | None = None, history: ChatMessages | None = None, - **kwargs: Any, + enforce_randomness: bool = False, + **kwargs, ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]: # a history is prepended to the messages, and we assume that it also includes a system message, i.e., we never add a system message in this case # TODO is it better to check for a system message in the history? @@ -323,15 +357,17 @@ class ChatModel: history = [self._get_system_message(system_message)] else: history = [] - # we prepend the history to the messages, and append an empty assistant message to add the generation token - messages_for_model = history + messages + [self._get_assistant_message("")] + # we prepend the history to the messages + # the chat format should take care of adding appropriate assistant messages for generating the completion + messages_for_model = history + messages reponse, usage = self._create_completion( messages=messages_for_model, + grammar=grammar, stop=stop, use_cache=use_cache, max_tokens=self.kwargs.get("max_tokens", None), - **kwargs, + enforce_randomness=enforce_randomness, ) messages.append(self._get_assistant_message(reponse)) @@ -343,15 +379,37 @@ class LlamaChat(ChatModel, Llama): @weave.op() def _create_completion( self, - use_cache: bool = False, - **kwargs, + messages: ChatMessages, + *, + use_cache: bool, + grammar: llama_cpp.LlamaGrammar | None = None, + stop: str | None, + max_tokens: int | None, + enforce_randomness: bool, ): + # input( + # f"The input for the model will look like this:\n{format_llama3(messages).prompt}" + # ) + # setup kwargs for model call + model_call_kwargs = { + "messages": messages, + "grammar": grammar, + "stop": stop, + "max_tokens": max_tokens, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + model_call_kwargs["seed"] = random.randint(0, 2**32 - 1) + else: + model_call_kwargs["temperature"] = 0.0 + response = self._call_model( self.model.create_chat_completion, use_cache=use_cache, - **kwargs, + **model_call_kwargs, ) response_text = response["choices"][0]["message"]["content"] + # input(response_text) usage = ModelUsage(**response["usage"]) return response_text, usage @@ -361,7 +419,7 @@ class HfChat(ChatModel, LLMModel): def __init__( self, - model="meta-llama/Meta-Llama-3.1-8B-Instruct", + model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct", ignore_cache_kwargs: list[str] | None = None, **kwargs, ) -> None: @@ -380,6 +438,11 @@ class HfChat(ChatModel, LLMModel): } if "do_sample" in kwargs: model_kwargs["do_sample"] = kwargs.pop("do_sample") + ignore_cache_kwargs = ( + ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else [] + ) + # torch.dtype is not JSON serializable therefore we ignore it + ignore_cache_kwargs.extend(["torch_dtype"]) # pass all arguments to super constructor which should be taken into account for caching 
super().__init__(**model_kwargs, ignore_cache_kwargs=ignore_cache_kwargs) @@ -400,19 +463,31 @@ class HfChat(ChatModel, LLMModel): def _create_completion( self, messages: ChatMessages, - use_cache: bool = False, - max_tokens: int = None, - **kwargs, + *, + use_cache: bool, + stop: str | None, + max_tokens: int | None, + enforce_randomness: bool, ): + # setup kwargs for model call + model_call_kwargs = { + "text_inputs": messages, + "stop": stop, + "max_length": max_tokens if max_tokens is not None else 2048, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + model_call_kwargs["do_sample"] = True + else: + model_call_kwargs["do_sample"] = False + # input( # f"The input for the model will look like this:\n{self.pipeline.tokenizer.apply_chat_template(messages, tokenize=False)}" # ) response = self._call_model( self.pipeline, - text_inputs=messages, use_cache=use_cache, - max_length=max_tokens if max_tokens is not None else 1024, - **kwargs, + **model_call_kwargs, ) response_text = response[0]["generated_text"][-1]["content"] # no usage supported by HF pipeline; TODO manually compute usage? @@ -428,11 +503,11 @@ class HfChat(ChatModel, LLMModel): class AlpacaHfChat(HfChat): def __init__( self, - *args, - model="chavinlo/alpaca-native", + model: str = "chavinlo/alpaca-native", + ignore_cache_kwargs: list[str] | None = None, **kwargs, ): - super().__init__(*args, **kwargs, model=model) + super().__init__(model=model, ignore_cache_kwargs=ignore_cache_kwargs, **kwargs) # chat template for Alpaca adapted from https://huggingface.co/Vezora/Mistral-22B-v0.1/blob/c15d70465e2fc46c3c4d7fec8fb62f533d4ef09b/tokenizer_config.json#L30 self.pipeline.tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ system_message + '\\n\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n' + message['content'].strip()}}{% endif %}{% endfor %}" @@ -450,25 +525,42 @@ class OpenAIChat(ChatModel, LLMModel): def __init__( self, - model_kwargs: dict, + openai_model: str, + ignore_cache_kwargs: list[str] | None = None, **kwargs, ) -> None: # initialize client for API calls - self.openai_client = openai.OpenAI(**kwargs) - self._model_name = model_kwargs["openai_model"] + # TODO set API key, project etc. via keyword arguments? 
+ self.openai_client = openai.OpenAI() + self._model_name = openai_model - super().__init__(model_kwargs, **kwargs) + super().__init__(model=openai_model, ignore_cache_kwargs=ignore_cache_kwargs) def _create_completion( self, - use_cache: bool = False, - **kwargs, + messages: ChatMessages, + *, + use_cache: bool, + stop: str | None, + max_tokens: int | None, + enforce_randomness: bool, ): + # setup kwargs for model call + model_call_kwargs = { + "model": self.model_name, + "messages": messages, + "stop": stop, + "max_completion_tokens": max_tokens if max_tokens is not None else 1024, + } + if enforce_randomness: + model_call_kwargs["temperature"] = 2.0 + else: + model_call_kwargs["temperature"] = 0.0 + response = self._call_model( self.openai_client.chat.completions.create, - model=self.model_name, use_cache=use_cache, - **kwargs, + **model_call_kwargs, ) response_text = response.choices[0].message.content usage = ModelUsage(**response.usage.__dict__) diff --git a/evoprompt/utils.py b/evoprompt/utils.py index 4ca878d..6bed5b8 100644 --- a/evoprompt/utils.py +++ b/evoprompt/utils.py @@ -79,7 +79,9 @@ def initialize_run_directory(model): # make sure that we use high randomness for generating the run name even if a seed is set for the model response, _, _, _ = model.create_completion( - None, run_name_prompt, temperature=2.0, seed=random.randint(0, 2**32 - 1) + system_message=None, + prompt=run_name_prompt, + enforce_randomness=True, ) run_name_match = re.search(r"^\w+$", response, re.MULTILINE) existing_run_names = os.listdir(RUNS_DIR) if RUNS_DIR.exists() else [] -- GitLab
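
Usage sketch for the unified interface introduced by this patch. The `Llama` constructor keywords and default model files below mirror the defaults in the diff, and `enforce_randomness` maps onto the sampling parameters each backend sets in `_create_completion` (temperature 0.0 when off; temperature 2.0, a fresh per-call seed for llama.cpp, and `do_sample=True` for the HF pipeline when on). The prompts and the exact unpacking of the returned tuple are illustrative assumptions, not part of the patch.

    # Illustrative only: the prompts and the tuple unpacking here are assumptions.
    from evoprompt.models import Llama

    model = Llama(
        llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
        llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
        n_ctx=8192,
    )

    # Deterministic call: enforce_randomness defaults to False, so the backend
    # pins temperature to 0.0 and keeps any globally configured seed.
    response, _, messages, usage = model.create_completion(
        system_message="You are a helpful assistant.",
        prompt="Summarize the change in one sentence.",
    )

    # Randomized call: enforce_randomness=True raises the temperature to 2.0 and,
    # for the llama.cpp backend, draws a fresh per-call seed, so repeated calls
    # differ even when a global seed is set (as used in initialize_run_directory).
    run_name, _, _, _ = model.create_completion(
        system_message=None,
        prompt="Propose a short run name.",
        enforce_randomness=True,
    )

The same two calls work unchanged against the chat-based backends (`LlamaChat`, `HfChat`, `OpenAIChat`), since they all implement the shared `create_completion` signature and interpret `enforce_randomness` with their own sampling settings.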