diff --git a/eval_prompt.py b/eval_prompt.py
index dc54636209b15cd8a320090842433c0b1ed9788c..3730ab42b6ac51852630302864e3b8577e9026e2 100644
--- a/eval_prompt.py
+++ b/eval_prompt.py
@@ -6,32 +6,19 @@ import torch

 from evoprompt.models import LLMModel
 from evoprompt.task import get_task, tasks
-from evoprompt.utils import setup_console_logger
+from evoprompt.utils import init_rng, setup_console_logger

 logger = logging.getLogger(__name__)


-def evaluate_prompt(prompt: str, task_name: str, args: Namespace):
+def evaluate_prompt(prompt: str, task_args: Namespace, model_args: Namespace):
     logger.info(f'Evaluating prompt "{prompt}"')

     evaluation_model = LLMModel.get_model(
-        name="hfllamachat",
-        # name="llamachat",
-        options=args,
-        model="PKU-Alignment/alpaca-7b-reproduced",
-        # model="chavinlo/alpaca-native",
-        model_kwargs=dict(
-            load_in_8bit=True,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            # use less randomness, i.e., more certain outputs
-            temperature=0.0,
-        ),
-        # torch_dtype is not JSON serializable therefore we ignore it
-        ignore_cache_kwargs=["model_kwargs.torch_dtype"],
+        **vars(model_args),
     )

-    task = get_task(task_name, evaluation_model, **vars(args))
+    task = get_task(evaluation_model=evaluation_model, **vars(task_args))

     eval_score, eval_usage, _ = task.evaluate_validation(prompt)
     logger.info(f"Score on evaluation set: {eval_score}")
@@ -40,28 +27,54 @@ def evaluate_prompt(prompt: str, task_name: str, args: Namespace):


 if __name__ == "__main__":
-    setup_console_logger()
-    argparser = argparse.ArgumentParser()
-    argparser.add_argument("-p", "--prompt", type=str, required=True)
-    argparser.add_argument(
-        "-t", "--task", type=str, choices=sorted(tasks.keys()), required=True
-    )
-    args = argparser.parse_args()
+    def main():
+        argparser = argparse.ArgumentParser()
+        argparser.add_argument("-p", "--prompt", type=str, required=True)
+        argparser.add_argument(
+            "-t", "--task", type=str, choices=sorted(tasks.keys()), required=True
+        )
+        argparser.add_argument(
+            "-v", "--verbose", action="count", default=0, help="Increase verbosity"
+        )
+        args = argparser.parse_args()

-    options = Namespace(
-        llama_path=None,
-        chat_format=None,
-        chat_handler=None,
-        verbose=False,
-        llama_verbose=False,
-        llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
-        llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
-        disable_cache=False,
-        use_grammar=False,
-        evaluation_strategy="simple",
-        max_tokens=None,
-        **vars(args),
-    )
+        init_rng(1)
+        setup_console_logger(verbosity_level=args.verbose)
+
+        task_options = Namespace(
+            name=args.task,
+            use_grammar=False,
+            evaluation_strategy="simple",
+            n_evaluation_demo=1,
+        )
+        model_options = Namespace(
+            # name="hfchat",
+            name="alpacahfchat",
+            # name="llamachat",
+            verbose=args.verbose,
+            # model="PKU-Alignment/alpaca-7b-reproduced",
+            # model="chavinlo/alpaca-native",
+            load_in_8bit=True,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            # temperature=0.0,
+            # torch_dtype is not JSON serializable therefore we ignore it
+            ignore_cache_kwargs=["torch_dtype"],
+            llama_path=None,
+            chat_format=None,
+            chat_handler=None,
+            llama_verbose=False,
+            llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
+            # llama_model="TheBloke/Llama-2-70B-Chat-GGUF",
+            llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
+            # llama_model_file="llama-2-70b-chat.Q4_K_M.gguf",
+            disable_cache=False,
+            max_tokens=None,
+        )
+        print(task_options)
+        print(model_options)
+
+        evaluate_prompt(args.prompt, task_options, model_options)

-    evaluate_prompt(args.prompt, args.task, options)
+    main()
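For reference, a minimal usage sketch of the refactored entry point above: the two namespaces are unpacked directly into LLMModel.get_model and get_task. The model name, task key, and prompt below are illustrative placeholders, not values prescribed by this patch.

from evoprompt.models import LLMModel
from evoprompt.task import get_task

model = LLMModel.get_model(
    name="alpacahfchat",  # any key registered in LLMModel.registered_models
    load_in_8bit=True,
    device_map="auto",
    ignore_cache_kwargs=["torch_dtype"],
    disable_cache=False,
    max_tokens=None,
)
task = get_task(
    name="sst2",  # placeholder: any key of evoprompt.task.tasks
    evaluation_model=model,
    use_grammar=False,
    evaluation_strategy="simple",
    n_evaluation_demo=1,
)
score, usage, _ = task.evaluate_validation("Classify the sentiment of the given text.")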
diff --git a/evoprompt/evolution/evolution.py b/evoprompt/evolution/evolution.py
index e6b7d365237ad9221fd85c3890ceb4a192640e1c..57556f0c0b5716fc12390d01076a84377df03261 100644
--- a/evoprompt/evolution/evolution.py
+++ b/evoprompt/evolution/evolution.py
@@ -249,6 +249,7 @@ class GeneticAlgorithm(EvolutionAlgorithm):
             self.evolution_model.create_completion(
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
+                use_randomness=True,
             )
         )

@@ -351,6 +352,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
             self.evolution_model.create_completion(
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
+                use_randomness=True,
             )
         )

@@ -467,7 +469,9 @@ class DifferentialEvolutionWithCot(DifferentialEvolution):
                 system_message=SYSTEM_MESSAGE,
                 prompt=filled_prompt,
                 history=history,
-                stop="</prompt>" if idx == len(DE_COT_PROMPTS) - 1 else None,
+                # the models often repeat the instruction, which could also contain </prompt>, therefore we should not stop early
+                stop=None,  # "</prompt>" if idx == len(DE_COT_PROMPTS) - 1 else None,
+                use_randomness=True,
             )
         )
         logger.debug(
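The use_randomness flag threaded through these calls replaces per-call temperature arguments. A hedged summary of how the backends below translate it (sampling_kwargs is a made-up helper sketching this patch's behavior, not a function in the codebase):

import random

def sampling_kwargs(use_randomness: bool, backend: str) -> dict:
    if not use_randomness:
        # evaluation path: deterministic decoding
        return {"do_sample": False} if backend == "hf" else {"temperature": 0.0}
    # evolution/paraphrasing path: temperature 0.5, as in the EvoPrompt reference implementation
    if backend == "llama_cpp":
        return {"temperature": 0.5, "seed": random.randint(0, 2**32 - 1)}
    if backend == "hf":
        return {"temperature": 0.5, "do_sample": True}
    return {"temperature": 0.5}  # e.g., OpenAI chat completions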
diff --git a/evoprompt/models.py b/evoprompt/models.py
index cb6e1b3e6595f73b1c397c7204d2d9f1ca45ce9a..8653c0d5cbda4b9cebdbb07b556d50efdcbc08af 100644
--- a/evoprompt/models.py
+++ b/evoprompt/models.py
@@ -2,13 +2,15 @@ import hashlib
 import inspect
 import json
 import logging
+import random
 import warnings
 from abc import ABC, abstractmethod
-from argparse import ArgumentParser, Namespace
+from argparse import ArgumentParser
 from pathlib import Path
 from typing import Any, Callable, ClassVar

 import llama_cpp
+from llama_cpp.llama_chat_format import format_llama3
 import openai
 import weave
 from diskcache import Cache
@@ -36,11 +38,13 @@ class LLMModel(ABC):
         cls.register_arguments(argument_parser)

     @classmethod
-    def get_model(cls, name: str, options: Namespace, **kwargs):
+    def get_model(cls, name: str, **kwargs):
         if name not in LLMModel.registered_models:
-            raise ValueError("Model %s does not exist", name)
+            raise ValueError(
+                f"Model {name} does not exist; available models: {list(LLMModel.registered_models.keys())}"
+            )

-        key = cls.get_options_kwargs_hash(options, kwargs)
+        key = cls.get_hash_from_kwargs(**kwargs)
         # check if model is already loaded
         if cls in LLMModel.loaded_models:
             model, model_key = LLMModel.loaded_models[cls]
@@ -49,20 +53,20 @@
                     f"Model {model} is already loaded with different arguments"
                 )
         else:
-            model = LLMModel.registered_models[name](options=options, **kwargs)
+            model = LLMModel.registered_models[name](**kwargs)
             LLMModel.loaded_models[cls] = (model, key)
         return model

-    def __init__(self, options: Namespace, **kwargs):
+    def __init__(self, ignore_cache_kwargs: list[str] | None = None, **kwargs):
         self.usage = ModelUsage()

         # store kwargs for caching
-        self.options = options
-        self.kwargs = kwargs
+        self.kwargs = kwargs.copy()
+        self.kwargs["ignore_cache_kwargs"] = ignore_cache_kwargs

         # set up caching for model calls
         self._call_model_cached = None
-        if not options.disable_cache:
+        if not self.kwargs.get("disable_cache", False):
             cache = Cache(Path(".cache_dir", self.model_cache_key))

             @cache.memoize(typed=True, ignore=[0, "func"])
@@ -71,32 +75,66 @@ class LLMModel(ABC):

             self._call_model_cached = _call_function

-    @abstractmethod
     def create_completion(
         self,
         system_message: str | None,
         prompt: str,
         *,
         use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
         stop: str | None = None,
-        history: ChatMessages | None = None,
-        **kwargs: Any,
+        use_randomness: bool = False,
+        **kwargs,
     ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]:
-        pass
+        messages = [self._get_user_message(prompt)]
+        if system_message is not None:
+            prompt = system_message + prompt

-    def _get_user_message(self, content: str):
+        response, usage = self._create_completion(
+            prompt=prompt,
+            grammar=grammar,
+            stop=stop,
+            use_cache=use_cache,
+            max_tokens=self.kwargs.get("max_tokens", None),
+            use_randomness=use_randomness,
+        )
+
+        messages.append(self._get_assistant_message(response))
+        return response, None, messages, usage
+
+    def _get_prediction_prefix(self):
+        # some models use a special token prefix for the prediction
+        return None
+
+    def _get_input_prefix(self):
+        # some models have a special token for the input
+        return None
+
+    @abstractmethod
+    def _create_completion(
+        self,
+        prompt: str | None = None,
+        *,
+        use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
+        stop: str | None = None,
+        max_tokens: int | None = None,
+        use_randomness: bool = False,
+    ): ...
+
+    def _get_user_message(self, content: Any):
         return {
             "role": "user",
             "content": content,
         }

-    def _get_system_message(self, content: str):
+    def _get_system_message(self, content: Any):
         return {
             "role": "system",
             "content": content,
         }

-    def _get_assistant_message(self, content: str):
+    def _get_assistant_message(self, content: Any):
         return {
             "role": "assistant",
             "content": content,
@@ -116,7 +154,7 @@ class LLMModel(ABC):
         return model_completion_fn(**kwargs)

     @staticmethod
-    def get_options_kwargs_hash(options: Namespace, kwargs):
+    def get_hash_from_kwargs(**kwargs):
         # sometimes we want to ignore certain kwargs from the hash, e.g., when they are not relevant for the model or if they are not serializable
         kwargs = kwargs.copy()

@@ -130,19 +168,12 @@

         ignore_cache_kwargs: list[str] | None = kwargs.pop("ignore_cache_kwargs", None)
         if ignore_cache_kwargs is not None:
-            options = Namespace(
-                **{
-                    k: v
-                    for k, v in iter_dict(vars(options))
-                    if k not in ignore_cache_kwargs
-                }
-            )
             kwargs = {
                 k: v for k, v in iter_dict(kwargs) if k not in ignore_cache_kwargs
             }

         unique_options_key = json.dumps(
-            (vars(options), kwargs),
+            kwargs,
             sort_keys=True,
         )
         return hashlib.sha1(unique_options_key.encode()).hexdigest()
@@ -152,11 +183,15 @@
         return (
             str(self.model_name).replace("/", "_")
             + "/"
-            + self.get_options_kwargs_hash(self.options, self.kwargs)
+            + self.get_hash_from_kwargs(**self.kwargs)
         )

-    @classmethod
+    @property
     @abstractmethod
+    def model_name(self):
+        pass
+
+    @classmethod
     def register_arguments(cls, parser: ArgumentParser):
         pass
@@ -165,96 +200,101 @@ class Llama(LLMModel):

     def __init__(
         self,
-        options: Namespace,
-        n_gpu_layers: int = 60,
+        *,
+        ignore_cache_kwargs: list[str] | None = None,
+        llama_path: Path | None = None,
+        llama_model: str = "QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
+        llama_model_file: str = "Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
+        chat_format: str | None = None,
+        chat_handler: str | None = None,
+        verbose: int | bool = False,
+        llama_verbose: bool = False,
+        n_gpu_layers: int = -1,
         n_threads: int = 8,
         n_ctx: int = 8192,
         **kwargs,
     ) -> None:
-        # initialize model
-        add_kwargs = {}
+        # we collect all arguments to make sure they are passed to the super constructor
+        hashed_model_kwargs = {
+            "n_ctx": n_ctx,
+        }
+        if chat_format is not None:
+            hashed_model_kwargs["chat_format"] = chat_format
+        if chat_handler is not None:
+            hashed_model_kwargs["chat_handler"] = chat_handler
         seed = get_seed()
         if seed is not None:
-            add_kwargs["seed"] = seed
+            hashed_model_kwargs["seed"] = seed

-        # TODO some options could be optional
+        model_kwargs = dict(
+            **hashed_model_kwargs,
+            verbose=(verbose > 1 or llama_verbose),
+            n_gpu_layers=n_gpu_layers,
+            n_threads=n_threads,
+        )
+
+        ignore_cache_kwargs = (
+            ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else []
+        )

-        if options.llama_path is not None:
+        if llama_path is not None:
             # use local file
             self.model = llama_cpp.Llama(
-                model_path=options.llama_path,
-                chat_format=options.chat_format,
-                chat_handler=options.chat_handler,
-                verbose=options.verbose > 1 or options.llama_verbose,
-                n_gpu_layers=n_gpu_layers,
-                n_threads=n_threads,
-                n_ctx=n_ctx,
-                **add_kwargs,
-                **kwargs,
+                model_path=llama_path,
+                **model_kwargs,
             )
-            self.model_name = Path(options.llama_path).stem
+            self._model_name = llama_path.stem
         else:
             # use pre-trained model from HF hub
             self.model = llama_cpp.Llama.from_pretrained(
-                repo_id=options.llama_model,
-                filename=options.llama_model_file,
-                chat_format=options.chat_format,
-                chat_handler=options.chat_handler,
-                verbose=options.verbose > 1 or options.llama_verbose,
-                n_gpu_layers=n_gpu_layers,
-                n_threads=n_threads,
-                n_ctx=n_ctx,
-                **add_kwargs,
-                **kwargs,
+                repo_id=llama_model,
+                filename=llama_model_file,
+                **model_kwargs,
             )
-            self.model_name = Path(
-                options.llama_model, options.llama_model_file
-            ).with_suffix("")
+            self._model_name = Path(llama_model, llama_model_file).with_suffix("")

         # pass all arguments to super constructor which should be taken into account for caching
         # needs to be called after model is initialized
-        super().__init__(options=options, n_ctx=n_ctx, **kwargs)
+        super().__init__(ignore_cache_kwargs=ignore_cache_kwargs, **hashed_model_kwargs)

-    def create_completion(
+    def _create_completion(
         self,
-        system_message: str | None,
-        prompt: str,
+        prompt: str | None = None,
         *,
         use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
         stop: str | None = None,
-        history: ChatMessages | None = None,
-        **kwargs: Any,
-    ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]:
-        messages = [self._get_user_message(prompt)]
-        if system_message is not None:
-            prompt = system_message + prompt
-
-        reponse, usage = self._create_completion(
-            prompt=prompt,
-            stop=stop,
-            use_cache=use_cache,
-            max_tokens=self.options.max_tokens,
-            **kwargs,
-        )
-
-        messages.append(self._get_assistant_message(reponse))
-        return reponse, None, messages, usage
-
-    def _create_completion(
-        self,
-        use_cache: bool = False,
-        **kwargs,
+        max_tokens: int | None = None,
+        use_randomness: bool = False,
     ):
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "prompt": prompt,
+            "grammar": grammar,
+            "stop": stop,
+            "max_tokens": max_tokens,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+            model_call_kwargs["seed"] = random.randint(0, 2**32 - 1)
+        else:
+            model_call_kwargs["temperature"] = 0.0
+
         response = self._call_model(
             self.model.create_completion,
             use_cache=use_cache,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response["choices"][0]["text"]
         usage = ModelUsage(**response["usage"])
         return response_text, usage

+    @property
+    def model_name(self):
+        return self._model_name
+
     @classmethod
     def register_arguments(cls, parser: ArgumentParser):
         group = parser.add_argument_group(f"{cls.__name__} model arguments")
@@ -303,9 +343,11 @@ class ChatModel:
         prompt: str,
         *,
         use_cache: bool = False,
+        grammar: llama_cpp.LlamaGrammar | None = None,
         stop: str | None = None,
         history: ChatMessages | None = None,
-        **kwargs: Any,
+        use_randomness: bool = False,
+        **kwargs,
     ) -> tuple[str, ChatMessages | None, ChatMessages, ModelUsage]:
         # a history is prepended to the messages, and we assume that it also includes a system message, i.e., we never add a system message in this case
         # TODO is it better to check for a system message in the history?
@@ -316,13 +358,17 @@ class ChatModel:
             history = [self._get_system_message(system_message)]
         else:
             history = []
+        # we prepend the history to the messages
+        # the chat format should take care of adding appropriate assistant messages for generating the completion
+        messages_for_model = history + messages

         reponse, usage = self._create_completion(
-            messages=history + messages,
+            messages=messages_for_model,
+            grammar=grammar,
             stop=stop,
             use_cache=use_cache,
-            max_tokens=self.options.max_tokens,
-            **kwargs,
+            max_tokens=self.kwargs.get("max_tokens", None),
+            use_randomness=use_randomness,
         )

         messages.append(self._get_assistant_message(reponse))
@@ -334,13 +380,35 @@ class LlamaChat(ChatModel, Llama):
     @weave.op()
     def _create_completion(
         self,
-        use_cache: bool = False,
-        **kwargs,
+        messages: ChatMessages,
+        *,
+        use_cache: bool,
+        grammar: llama_cpp.LlamaGrammar | None = None,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
     ):
+        # input(
+        #     f"The input for the model will look like this:\n{format_llama3(messages).prompt}"
+        # )
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "messages": messages,
+            "grammar": grammar,
+            "stop": stop,
+            "max_tokens": max_tokens,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+            model_call_kwargs["seed"] = random.randint(0, 2**32 - 1)
+        else:
+            model_call_kwargs["temperature"] = 0.0
+
         response = self._call_model(
             self.model.create_chat_completion,
             use_cache=use_cache,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response["choices"][0]["message"]["content"]
@@ -348,82 +416,177 @@ class LlamaChat(ChatModel, Llama):
         return response_text, usage


-class HfLlamaChat(ChatModel, Llama):
+class HfChat(ChatModel, LLMModel):
     def __init__(
         self,
-        options: Namespace,
-        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
+        ignore_cache_kwargs: list[str] | None = None,
         **kwargs,
     ) -> None:
         import torch
         import transformers

-        super().__init__(options, **kwargs)
+        self._model_name = model

-        # set some default values
-        model_kwargs = kwargs.pop("model_kwargs", {})
-        if "torch_dtype" not in model_kwargs:
-            model_kwargs["torch_dtype"] = torch.bfloat16
-        if "device_map" not in model_kwargs:
-            model_kwargs["device_map"] = "auto"
+        # we collect all arguments to make sure they are passed to the super constructor
+        # also set some default values
+        model_kwargs = {
+            "task": "text-generation",
+            "model": model,
+            "torch_dtype": kwargs.pop("torch_dtype", torch.bfloat16),
+            "device_map": kwargs.pop("device_map", "auto"),
+        }
+        if "do_sample" in kwargs:
+            model_kwargs["do_sample"] = kwargs.pop("do_sample")
+        ignore_cache_kwargs = (
+            ignore_cache_kwargs.copy() if ignore_cache_kwargs is not None else []
+        )
+        # torch.dtype is not JSON serializable therefore we ignore it
+        ignore_cache_kwargs.extend(["torch_dtype"])
+
+        # pass all arguments to super constructor which should be taken into account for caching
+        super().__init__(**model_kwargs, ignore_cache_kwargs=ignore_cache_kwargs)

         # initialize model
         self.pipeline = transformers.pipeline(
-            "text-generation",
-            model=model,
-            model_kwargs=model_kwargs,
-            **kwargs,
+            **model_kwargs,
         )
         # Setting the pad token to the eos token to avoid stdout prints
-        self.pipeline.model.generation_config.pad_token_id = (
-            self.pipeline.model.generation_config.eos_token_id
-        )
+        # if there are multiple eos tokens, we use the first one (similarly to how it is done in the TF library)
+        if isinstance(
+            self.pipeline.model.generation_config.eos_token_id, (list, tuple)
+        ):
+            eos_token_id = self.pipeline.model.generation_config.eos_token_id[0]
+        else:
+            eos_token_id = self.pipeline.model.generation_config.eos_token_id
+
+        self.pipeline.model.generation_config.pad_token_id = eos_token_id

     def _create_completion(
         self,
         messages: ChatMessages,
-        use_cache: bool = False,
-        max_tokens: int = None,
+        *,
+        use_cache: bool,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
         **kwargs,
     ):
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "text_inputs": messages,
+            "stop": stop,
+            "max_length": max_tokens if max_tokens is not None else 2048,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+            model_call_kwargs["do_sample"] = True
+        else:
+            model_call_kwargs["do_sample"] = False
+
+        # input(
+        #     f"The input for the model will look like this:\n{self.pipeline.tokenizer.apply_chat_template(messages, tokenize=False)}"
+        # )
         response = self._call_model(
             self.pipeline,
-            text_inputs=messages,
             use_cache=use_cache,
-            max_length=max_tokens if max_tokens is not None else 1024,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response[0]["generated_text"][-1]["content"]

-        # TODO no usage supported by HF pipeline; manually compute usage?
+        # no usage supported by HF pipeline; TODO manually compute usage?
         usage = ModelUsage()
         return response_text, usage

+    @property
+    def model_name(self):
+        return self._model_name

-class OpenAiChat(ChatModel, LLMModel):
+
+# For Alpaca we build inputs to follow the fine-tuning chat format like https://github.com/tatsu-lab/stanford_alpaca/blob/761dc5bfbdeeffa89b8bff5d038781a4055f796a/README.md?plain=1#L56-L66
+class AlpacaHfChat(HfChat):
+    def __init__(
+        self,
+        model: str = "chavinlo/alpaca-native",
+        ignore_cache_kwargs: list[str] | None = None,
+        **kwargs,
+    ):
+        super().__init__(model=model, ignore_cache_kwargs=ignore_cache_kwargs, **kwargs)
+
+        # chat template for Alpaca adapted from https://huggingface.co/Vezora/Mistral-22B-v0.1/blob/c15d70465e2fc46c3c4d7fec8fb62f533d4ef09b/tokenizer_config.json#L30
+        self.pipeline.tokenizer.chat_template = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ system_message + '\\n\\n' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '### Instruction:\\n' + message['content'].strip() + '\\n\\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response:\\n' + message['content'].strip()}}{% endif %}{% endfor %}"
+
+    def _create_completion(
+        self,
+        messages: list[dict[str, str]],
+        *,
+        use_cache: bool,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
+        **kwargs,
+    ):
+        # for some reason adding an empty assistant message yields different generations than adding it manually in the chat template
+        return super()._create_completion(
+            messages + [self._get_assistant_message("")],
+            use_cache=use_cache,
+            stop=stop,
+            max_tokens=max_tokens,
+            use_randomness=use_randomness,
+            **kwargs,
+        )
+
+    def _get_input_prefix(self):
+        return "### Input:\n"
+
+    def _get_prediction_prefix(self):
+        # Alpaca uses a special token prefix for the prediction
+        return "\n### Response:\n"
+
+
+class OpenAIChat(ChatModel, LLMModel):
     """Queries an OpenAI model using its API."""

     def __init__(
         self,
-        options: Namespace,
+        openai_model: str,
+        ignore_cache_kwargs: list[str] | None = None,
         **kwargs,
     ) -> None:
         # initialize client for API calls
-        self.openai_client = openai.OpenAI(**kwargs)
-        self.model_name = options.openai_model
+        # TODO set API key, project etc. via keyword arguments?
+        self.openai_client = openai.OpenAI()
+        self._model_name = openai_model

-        super().__init__(options, **kwargs)
+        super().__init__(model=openai_model, ignore_cache_kwargs=ignore_cache_kwargs)

     def _create_completion(
         self,
-        use_cache: bool = False,
-        **kwargs,
+        messages: ChatMessages,
+        *,
+        use_cache: bool,
+        stop: str | None,
+        max_tokens: int | None,
+        use_randomness: bool,
     ):
+        # setup kwargs for model call
+        model_call_kwargs = {
+            "model": self.model_name,
+            "messages": messages,
+            "stop": stop,
+            "max_completion_tokens": max_tokens if max_tokens is not None else 1024,
+        }
+        if use_randomness:
+            # same temperature as in evoprompt paper reference implementation
+            model_call_kwargs["temperature"] = 0.5
+        else:
+            model_call_kwargs["temperature"] = 0.0
+
         response = self._call_model(
             self.openai_client.chat.completions.create,
-            model=self.model_name,
             use_cache=use_cache,
-            **kwargs,
+            **model_call_kwargs,
         )
         response_text = response.choices[0].message.content
         usage = ModelUsage(**response.usage.__dict__)
@@ -434,6 +597,10 @@ class OpenAiChat(ChatModel, LLMModel):
         group = parser.add_argument_group("OpenAI model arguments")
         group.add_argument("--openai-model", "-m", type=str, default="gpt-3.5-turbo")

+    @property
+    def model_name(self):
+        return self._model_name
+

 argument_group = argument_parser.add_argument_group("Model arguments")
 argument_group.add_argument(
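A simplified sketch of the kwargs-only cache key introduced above (flat keys only; the real get_hash_from_kwargs also flattens nested dicts via iter_dict, and cache_key is a made-up name for illustration):

import hashlib
import json

def cache_key(model_name: str, **kwargs) -> str:
    kwargs = dict(kwargs)
    ignored = kwargs.pop("ignore_cache_kwargs", None) or []
    hashable = {k: v for k, v in kwargs.items() if k not in ignored}
    digest = hashlib.sha1(json.dumps(hashable, sort_keys=True).encode()).hexdigest()
    return f"{model_name.replace('/', '_')}/{digest}"

# e.g., a disk-cache directory name for an HfChat-style configuration;
# torch_dtype is listed as ignored because a torch.dtype value cannot be JSON-serialized
key = cache_key("chavinlo/alpaca-native", task="text-generation", device_map="auto",
                ignore_cache_kwargs=["torch_dtype"])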
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index 81538709c460da09134e5fbe3841e1595105986b..c1147f40f3b1a1c7c0df1ed0358012149c80acc6 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -93,8 +93,8 @@ class ResponseEditor(App):
         return self.text_area.text


-@log_calls("Paraphrasing prompts")
-def paraphrase_prompts(
+@log_calls("Paraphrasing prompt")
+def paraphrase_prompt(
     model: LLMModel,
     prompt: str,
     n: int,
@@ -112,7 +112,8 @@ def paraphrase_prompts(
         num_tries += 1
         paraphrase, _, _, usage = model.create_completion(
             system_message=PARAPHRASE_PROMPT,
-            prompt=f"Instruction: {prompt}",
+            prompt=prompt,
+            use_randomness=True,
         )
         total_usage += usage
         if "<prompt>" in paraphrase:
@@ -200,7 +201,7 @@ class PromptOptimization:
         # fill up the rest with paraphrases of the top prompts
         promptindex_to_paraphrase = 0
         while len(initial_population) < num_initial_prompts:
-            paraphrases, paraphrase_usage = paraphrase_prompts(
+            paraphrases, paraphrase_usage = paraphrase_prompt(
                 self.evolution_model,
                 top_prompts[promptindex_to_paraphrase],
                 n=1,
diff --git a/evoprompt/task/__init__.py b/evoprompt/task/__init__.py
index 5238a2cf8d9a5cf231b4de1d3b18fd161b3af55b..4387f7c24ea05e75720984adc94c2bb3fb2a311d 100644
--- a/evoprompt/task/__init__.py
+++ b/evoprompt/task/__init__.py
@@ -24,7 +24,9 @@ tasks = {

 def get_task(name: str, evaluation_model: LLMModel, **options) -> Task:
     if name not in tasks:
-        raise ValueError("Model %s does not exist", name)
+        raise ValueError(
+            f"Task {name} does not exist; available tasks: {list(tasks.keys())}"
+        )
     return tasks[name](evaluation_model, **options)
diff --git a/evoprompt/helpers/prompts.py b/evoprompt/task/base_prompts_mixin.py
similarity index 78%
rename from evoprompt/helpers/prompts.py
rename to evoprompt/task/base_prompts_mixin.py
index 8e653f761ad035761586084f8eb53123c8361777..c4d00cd99d71788623ce4b6d1e67e5b5dfd14e60 100644
--- a/evoprompt/helpers/prompts.py
+++ b/evoprompt/task/base_prompts_mixin.py
@@ -5,6 +5,7 @@ import re
 from datasets import Dataset

 from evoprompt.models import LLMModel
+from evoprompt.utils import get_rng


 class BasePromptsFromJsonMixin:
@@ -35,18 +36,25 @@ class BasePromptsFromGeneration:
         self, num_prompts: int, patience: int = 10, allow_duplicates: bool = False
     ) -> str:
         self.validation_dataset: Dataset
-        samples = self.validation_dataset._select_contiguous(0, 5)
+        samples = self.validation_dataset.shuffle(42).select(
+            get_rng().choice(len(self.validation_dataset), 5, replace=False)
+        )
         prompt = "I gave a friend an instruction and five inputs. The friend read the instruction and wrote an output for every one of the inputs. Here are the input-output pairs:\n"
-        for sample in samples:
-            prompt += f"\n\n{self._get_prompt_text_for_datum(sample)}\n{self._get_generation_prefix()}{self._get_gold_label_generation_for_datum(sample)}\n"
+        raise NotImplementedError(
+            "The prompt needs to be adapted for the model taking into account the correct format."
+        )
+        prompt = self.build_demonstration_prompt(samples, prompt=prompt)
         prompt += "\nThe instruction was "
+        system_message = "You are a helpful assistant. Please provide the instruction wrapped within tags <instruction> and </instruction> that belongs to the given input-output pairs."
+        input(prompt)

         generated_prompts = []
         while len(generated_prompts) < num_prompts:
             response, _, _, _ = self.evolution_model.create_completion(
-                system_message=f"You are a helpful assistant. Please provide the instruction wrapped within tags <instruction> and </instruction> that belongs to the given input-output pairs.",
+                system_message=system_message,
                 prompt=prompt,
             )
+            input(response)
             matches = re.findall(
                 # regex that extracts anything within tags <instruction> and optional </instruction>
                 rf"<instruction>(.+?)(?:(?=</instruction>)|$)",
diff --git a/evoprompt/task/question_answering.py b/evoprompt/task/question_answering.py
index 0634b3e9bcfb1d4127c00bd30d2dee32d8ff2b57..6266c86d03b879424276242f2534f77d1cbcf589 100644
--- a/evoprompt/task/question_answering.py
+++ b/evoprompt/task/question_answering.py
@@ -8,7 +8,7 @@ from datasets import Dataset
 from evaluate import load as load_metric
 from llama_cpp import LlamaGrammar

-from evoprompt.helpers.prompts import BasePromptsFromGeneration
+from evoprompt.task.base_prompts_mixin import BasePromptsFromGeneration
 from evoprompt.opt_types import ModelUsage
 from evoprompt.task.task import DatasetDatum, Task
 from evoprompt.utils import get_rng
@@ -70,10 +70,14 @@ class QuestionAnswering(Task):
         )

     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Answer: "

-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str:
+    def _get_prompt_text_for_datum(
+        self, datum: DatasetDatum, use_prefix: bool = False
+    ) -> str:
+        # for qa we always use prefixes because there are multiple inputs
+        # TODO Shall we try omitting the prefix?
         context = self._get_context_for_datum(datum)
         question = self._get_question_for_datum(datum)
         return f"Context: {context}\nQuestion: {question}"
@@ -101,16 +105,16 @@ class QuestionAnswering(Task):
             return response
         # if we do not use a grammar, we need to extract the answer from the response
         # otherwise the answer is from the context as enforced by the grammar
-        prefix_to_match = self._get_generation_prefix().replace(" ", r"\s?")
+        prefix_to_match = self._get_prediction_prefix().replace(" ", r"\s?")
         matches = re.findall(
-            # regex that matches class labels after "Response: "
+            # regex that matches class labels after prediction prefix
             rf"(?:{prefix_to_match})?(.+)",
             response.splitlines()[-1],
             flags=re.IGNORECASE,
         )
         # look for an answer in the response, if not found, use whole response
         if matches:
-            return matches[-1]
+            return matches[0]
         else:
             return response

@@ -125,6 +129,9 @@ class QuestionAnswering(Task):
         )
         return result["f1"]

+    @abstractmethod
+    def _get_gold_label_for_datum(self, datum: DatasetDatum): ...
+
     def _aggregate_result(self, results: list[float]) -> float:
         return sum(results) / len(results)

@@ -190,6 +197,7 @@ class SQuAD(BasePromptsFromGeneration, QuestionAnswering):
         return datum["answers"]

     def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str:
+        # TODO use all answers -- does this actually happen in any QA dataset??
         return self._get_gold_label_for_datum(datum)["text"][0]

     @property
diff --git a/evoprompt/task/sentiment_analysis.py b/evoprompt/task/sentiment_analysis.py
index 82bbd89505d6849bce30a3e9a1f389b17a7a9f64..bf72c175ed7bf2f6e4ad763c5ef69f0c72ed125b 100644
--- a/evoprompt/task/sentiment_analysis.py
+++ b/evoprompt/task/sentiment_analysis.py
@@ -4,7 +4,7 @@ from typing import Mapping

 from datasets import load_dataset

-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextClassification
 from evoprompt.task.task import DatasetDatum

@@ -23,7 +23,7 @@ class SentimentAnalysis(TextClassification):
         return {"negative": 0, "positive": 1}

     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Sentiment: "
diff --git a/evoprompt/task/simplification.py b/evoprompt/task/simplification.py
index 1d62bc40064207750148a5e386095103ca111c5a..37e0ec50f36f829bd01d3c8db290ac2db0c810de 100644
--- a/evoprompt/task/simplification.py
+++ b/evoprompt/task/simplification.py
@@ -2,7 +2,7 @@ import logging

 from evaluate import load as load_metric

-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextGeneration
 from evoprompt.task.task import DatasetDatum

@@ -16,7 +16,7 @@ class Simplification(TextGeneration):
         self.metric = load_metric("evaluate-metric/sari")

     def compute_metric(self, datum: DatasetDatum, prediction: str) -> float:
-        gold_label = self._get_gold_label_for_datum(datum)
+        gold_label = self._get_gold_label_generation_for_datum(datum)
         return self.metric.compute(
             sources=[self._get_text_for_datum(datum)],
             predictions=[prediction],
@@ -24,7 +24,7 @@ class Simplification(TextGeneration):
         )["sari"]

     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Simplification: "

     @property
@@ -54,5 +54,5 @@ class ASSET(BasePromptsFromJsonMixin, Simplification):
         return datum["original"]

     @staticmethod
-    def _get_gold_label_for_datum(datum: DatasetDatum) -> str:
+    def _get_gold_label_generation_for_datum(datum: DatasetDatum) -> str:
         return datum["simplifications"]
diff --git a/evoprompt/task/subjectivity_classification.py b/evoprompt/task/subjectivity_classification.py
index 051294d432df9abcabec557f575601e0aca2919a..7c3882e0a96a9c89db131220df97e7aeecfa15ab 100644
--- a/evoprompt/task/subjectivity_classification.py
+++ b/evoprompt/task/subjectivity_classification.py
@@ -3,7 +3,7 @@ from typing import Mapping

 from datasets import load_dataset

-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextClassification
 from evoprompt.task.task import DatasetDatum

@@ -43,7 +43,7 @@ class Subj(BasePromptsFromJsonMixin, TextClassification):
         return datum["label"]

     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Subjectivity: "

     @staticmethod
diff --git a/evoprompt/task/summarization.py b/evoprompt/task/summarization.py
index 9d485b85fc01c5fdf4597bf5f945baf317f88932..fed21c15fbe2a409bfb2f4866242b96b4e48b16c 100644
--- a/evoprompt/task/summarization.py
+++ b/evoprompt/task/summarization.py
@@ -2,7 +2,7 @@ import logging

 from evaluate import load as load_metric

-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextGeneration
 from evoprompt.task.task import DatasetDatum

@@ -16,7 +16,7 @@ class Summarization(TextGeneration):
         self.metric = load_metric("evaluate-metric/rouge")

     def compute_metric(self, datum: DatasetDatum, prediction: str) -> float:
-        gold_label = self._get_gold_label_for_datum(datum)
+        gold_label = self._get_gold_label_generation_for_datum(datum)
         return (
             self.metric.compute(predictions=[prediction], references=[gold_label])[
                 "rougeL"
@@ -25,7 +25,7 @@ class Summarization(TextGeneration):
         )

     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Summary: "

     @property
@@ -55,5 +55,5 @@ class SAMSum(BasePromptsFromJsonMixin, Summarization):
         return datum["dialogue"]

     @staticmethod
-    def _get_gold_label_for_datum(datum: DatasetDatum) -> str:
+    def _get_gold_label_generation_for_datum(datum: DatasetDatum) -> str:
         return datum["summary"]
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index c5b2e6699a63a6033c39765963bf15f96d4d1cdd..6b2a3344d2f3bf8f30787fb97c247282090077e7 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -17,9 +17,7 @@ from evoprompt.utils import log_calls

 logger = logging.getLogger(__name__)

-SYSTEM_MESSAGE = """
-You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-"""
+SYSTEM_MESSAGE = "You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."

 DatasetDatum = dict

@@ -148,7 +146,7 @@ def get_evaluation_strategy(evaluation_strategy_key: EvaluationStrategyKey):


 class EvaluationStrategy:
-    def __init__(self, task):
+    def __init__(self, task: "Task"):
         self.task = task

     @abstractmethod
@@ -197,7 +195,7 @@ class ShortestFirstStrategy(EarlyStoppingStrategy):
         self, dataset: Dataset, parent_histories: ParentHistories | None
     ):
         sorted_dataset = sorted(
-            dataset, key=lambda x: len(self.task._get_prompt_text_for_datum(x))
+            dataset, key=lambda x: len(self.task.build_prompt_input(x))
         )
         return super().get_dataset_iterator(sorted_dataset, parent_histories)

@@ -274,6 +272,10 @@ class Task(metaclass=ABCMeta):
         self.use_grammar = use_grammar
         self.n_evaluation_demo = n_evaluation_demo

+        # TODO do we want to be able to set these?
+        self.force_task_input_prefix = False
+        self.force_task_prediction_prefix = False
+
         self.evaluation_strategy = get_evaluation_strategy(evaluation_strategy)(self)
         logger.info(
             f"using evaluation strategy: {self.evaluation_strategy.__class__.__name__}"
@@ -342,59 +344,16 @@ class Task(metaclass=ABCMeta):
     ) -> Iterable[int]:
         pass

-    @weave.op()
-    def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
-        # run model for inference using grammar to constrain output
-        # TODO grammar also depends on prompt and vice-versa -> what are good labels?
-        response, _, _, usage = self.model.create_completion(
-            system_message=SYSTEM_MESSAGE,
-            prompt=prompt,
-            # grammar can be applied to constrain the model output
-            grammar=self._get_grammar(datum) if self.use_grammar else None,
-            # we use cached completions to speed up the process although we loose the non-deterministic behavior of LMs, but we're ok with a single result
-            use_cache=True,
-            # use less randomness, i.e., more certain outputs
-            temperature=0.0,
-        )
-
-        if not self.use_grammar:
-            # we postprocess the model output to return as answer
-            response = response.strip()
-
-        return response, usage
-
-    def _build_prompt(self, prompt: str, datum: DatasetDatum) -> str:
-        prompt = f"{prompt}\n\n{self._get_prompt_text_for_datum(datum)}\n{self._get_generation_prefix()}"
-        return prompt
-
-    @abstractmethod
-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str: ...
-
-    @abstractmethod
-    def _get_generation_prefix() -> str: ...
-
-    @abstractmethod
-    def _get_grammar(self, datum: DatasetDatum) -> LlamaGrammar:
-        pass
-
-    @abstractmethod
-    def _evaluate_sample(self, response: str, datum: DatasetDatum) -> Any: ...
-
-    @abstractmethod
-    def _parse_response(self, response: str) -> str: ...
-
-    @abstractmethod
-    def _get_gold_label_for_datum(self, datum: DatasetDatum) -> str:
-        pass
-
-    @abstractmethod
-    # This method is needed for the demonstration examples.
-    def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str:
-        pass
+    @log_calls("Evaluating validation dataset")
+    def evaluate_validation(
+        self, prompt: str, parent_histories: list[list[float]] | None = None
+    ):
+        return self.evaluate(prompt, self.validation_dataset, parent_histories)

-    @abstractmethod
-    def _aggregate_result(self, results: list) -> float:
-        pass
+    @log_calls("Evaluating test dataset")
+    def evaluate_test(self, prompt: str):
+        return self.evaluate(prompt, self.test_dataset, no_early_stopping=True)

     @weave.op()
     def evaluate(
@@ -412,22 +371,20 @@ class Task(metaclass=ABCMeta):
         evaluation_history = []

         # augment prompt with demonstration samples
-        prompt += "".join(
-            [
-                f"\n\n{self._get_prompt_text_for_datum(datum)}\n{self._get_generation_prefix()}{self._get_gold_label_generation_for_datum(datum)}"
-                for datum in self.demonstration_samples
-            ]
-        )
+        prompt_with_examples = self.build_demonstration_prompt(self.demonstration_samples, prompt=prompt)

         for datum in dataset_iterator:
-            # build prompt for current sample
-            prompt_for_datum = self._build_prompt(prompt, datum)
             # run prediction
-            response, usage = self.predict(prompt=prompt_for_datum, datum=datum)
+            response, usage = self.predict(prompt=prompt_with_examples, datum=datum)
+            logger.debug(f"Response: '{response}'")
             # parse response
             response = self._parse_response(response=response)
+            logger.debug(f"Parsed response: '{response}'")
             # evaluate response
             result = self._evaluate_sample(response=response, datum=datum)
+            logger.debug(
+                f"Prediction: '{response}', Gold label: '{self._get_gold_label_generation_for_datum(datum)}', Result: {result}"
+            )
             results.append(result)
             current_metric = self._aggregate_result(results)
             dataset_iterator.set_postfix({self.metric_name: f"{current_metric:.2f}"})
@@ -442,23 +399,76 @@ class Task(metaclass=ABCMeta):
                 break

         return self._aggregate_result(results), evaluation_usage, evaluation_history
+
+    @weave.op()
+    def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
+        # run model for inference using grammar to constrain output
+        # TODO grammar also depends on prompt and vice-versa -> what are good labels?
+        # build prompt for current sample
+        prompt_for_datum = self.build_prompt_input(datum, prompt=prompt, use_prediction_prefix=self.model._get_prediction_prefix() is None)
+        logger.debug(f"Prompt for datum:\n{prompt_for_datum}")
+        response, _, _, usage = self.model.create_completion(
+            system_message=SYSTEM_MESSAGE,
+            prompt=prompt_for_datum,
+            # grammar can be applied to constrain the model output
+            grammar=self._get_grammar(datum) if self.use_grammar else None,
+            # we use cached completions to speed up the process although we lose the non-deterministic behavior of LMs, but we're ok with a single result
+            use_cache=True,
+            # use less randomness, i.e., more certain outputs
+            use_randomness=False,
+        )

-    @log_calls("Evaluating validation dataset")
-    def evaluate_validation(
-        self, prompt: str, parent_histories: list[list[float]] | None = None
-    ):
-        return self.evaluate(prompt, self.validation_dataset, parent_histories)
+        if not self.use_grammar:
+            # we postprocess the model output to return as answer
+            response = response.strip()

-    @log_calls("Evaluating test dataset")
-    def evaluate_test(self, prompt: str):
-        return self.evaluate(prompt, self.test_dataset, no_early_stopping=True)
+        return response, usage
+
+    def build_prompt_input(
+        self, sample, prompt: str = "", use_prediction_prefix: bool = False,
+    ) -> str:
+        # the default is to use the prompt as is and concatenate the datum string
+        prompt += f"\n\n{self.model._get_input_prefix() if self.model._get_input_prefix() is not None else ""}{self._get_prompt_text_for_datum(sample, use_prefix=self.force_task_input_prefix or not self.model._get_input_prefix())}"
+        if use_prediction_prefix:
+            prompt += f"\n{self._get_prediction_prefix().strip()} "
+        return prompt.strip()
+
+    def build_demonstration_prompt(
+        self,
+        demonstration_samples: list[dict],
+        prompt: str = "",
+    ) -> str:
+        for sample in demonstration_samples:
+            prompt += "\n\n" + self.build_prompt_input(sample)
+            prompt += f"\n{self.model._get_prediction_prefix() if self.model._get_prediction_prefix() is not None else self._get_prediction_prefix()}{self._get_gold_label_generation_for_datum(sample)}"
+        return prompt.strip()
+
+    @abstractmethod
+    def _get_prompt_text_for_datum(self, datum: DatasetDatum, use_prefix: bool = False) -> str: ...
+
+    @abstractmethod
+    def _get_prediction_prefix() -> str: ...
+
+    @abstractmethod
+    def _get_grammar(self, datum: DatasetDatum) -> LlamaGrammar: ...
+
+    @abstractmethod
+    def _evaluate_sample(self, response: str, datum: DatasetDatum) -> Any: ...
+
+    @abstractmethod
+    def _parse_response(self, response: str) -> str: ...
+
+    @abstractmethod
+    # This method is needed for the demonstration examples.
+    def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str: ...
+
+    @abstractmethod
+    def _aggregate_result(self, results: list) -> float: ...

     @property
     @abstractmethod
-    def metric_name(self) -> str:
-        pass
+    def metric_name(self) -> str: ...

     @property
     @abstractmethod
-    def base_prompts(self) -> list[str]:
-        pass
+    def base_prompts(self) -> list[str]: ...
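To make the new prompt assembly concrete, a hedged, runnable illustration of roughly what build_demonstration_prompt plus build_prompt_input produce for a sentiment task when the model defines no special input/prediction prefixes (so the task-level "Text: "/"Sentiment: " prefixes apply); the example texts are invented:

demo_input, demo_gold = "Text: a charming little film", "positive"
datum_input = "Text: ( a ) slummer ."
prompt = "Classify the sentiment of the given text."
prompt += f"\n\n{demo_input}\nSentiment: {demo_gold}"  # one demonstration sample
prompt += f"\n\n{datum_input}\nSentiment:"             # datum to predict, prediction prefix appended
print(prompt)

Models that do define prefixes, such as AlpacaHfChat with "### Input:\n" and "\n### Response:\n", have those substituted instead of the task-level ones.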
diff --git a/evoprompt/task/text_classification.py b/evoprompt/task/text_classification.py
index 74e8dbcb2d8d6ab1ae8cfb262df9c9a86848f491..af30f1545b21f5e43ff674c89a3305abb0d9153f 100644
--- a/evoprompt/task/text_classification.py
+++ b/evoprompt/task/text_classification.py
@@ -23,12 +23,12 @@ class TextClassification(Task):
         else:
             matches = re.findall(
                 # regex that matches class labels after the generation prefix
-                rf"{self._get_generation_prefix()}({'|'.join(class_mapping.keys())})",
+                rf"{self._get_prediction_prefix()}({'|'.join(class_mapping.keys())})",
                 response,
                 flags=re.IGNORECASE,
             )
             if matches:
-                return matches[-1]
+                return matches[0]
             else:
                 # look for a label in the response, if not found, return failed
                 matches = re.findall(
@@ -64,12 +64,12 @@ class TextClassification(Task):
         ).shuffle(42)
         sample_ids = []

-        def check_sample_label(sample, label, gold_label_for_datum_fn):
+        def filter_sample_for_label(sample, label, gold_label_for_datum_fn):
             return gold_label_for_datum_fn(sample) == label

         for label in self._get_label_mapping().values():
             sample_ids_for_label = dataset_with_row_indices.filter(
-                check_sample_label,
+                filter_sample_for_label,
                 fn_kwargs={
                     "label": label,
                     "gold_label_for_datum_fn": self._get_gold_label_for_datum,
@@ -78,13 +78,6 @@ class TextClassification(Task):
             sample_ids += sample_ids_for_label
         return sample_ids

-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str:
-        # NOTE it seems that quotes in the prompt make it worse
-        return f"{self._get_input_prefix()}{self._get_text_for_datum(datum)}"
-
-    @abstractmethod
-    def _get_text_for_datum(self, datum: DatasetDatum) -> str: ...
-
     # NOTE cannot be cached since grammar is not picklable
     def _get_grammar(self, datum: DatasetDatum, verbose: bool = False):
         return LlamaGrammar.from_string(
@@ -94,17 +87,22 @@ class TextClassification(Task):
             verbose=verbose,
         )

-    @staticmethod
-    def _get_input_prefix():
-        return "Text: "
+    def _get_prompt_text_for_datum(
+        self, datum: DatasetDatum, use_prefix: bool = False
+    ) -> str:
+        text = self._get_text_for_datum(datum)
+        if use_prefix:
+            return f"Text: {text}"
+        return text
+
+    @abstractmethod
+    def _get_text_for_datum(self, datum: DatasetDatum) -> str: ...

     @abstractmethod
-    def _get_label_mapping(self) -> Mapping:
-        pass
+    def _get_label_mapping(self) -> Mapping: ...

     @abstractmethod
-    def _get_gold_label_for_datum(self, datum: DatasetDatum) -> str:
-        pass
+    def _get_gold_label_for_datum(self, datum: DatasetDatum) -> str: ...

     @lru_cache
     def _get_inverse_label_mapping(self):
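A small, self-contained illustration of the matches[0] choice above; the label set and response text are made up:

import re

response = "Sentiment: positive\nI think the review is positive.\nSentiment: negative"
matches = re.findall(r"Sentiment: (positive|negative)", response, flags=re.IGNORECASE)
print(matches[0])   # "positive" -- the label right after the prediction prefix
print(matches[-1])  # "negative" -- what the previous matches[-1] logic would have returned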
diff --git a/evoprompt/task/text_generation.py b/evoprompt/task/text_generation.py
index 81a81f6866ef1b4fdd9af732852645ee5a648b16..1826d78ffd26dceaef8b15945399bed7ebbd708b 100644
--- a/evoprompt/task/text_generation.py
+++ b/evoprompt/task/text_generation.py
@@ -25,10 +25,14 @@ class TextGeneration(Task):
     def _evaluate_sample(self, response: str, datum: DatasetDatum) -> float:
         return self.compute_metric(datum, response.lower())

-    def _get_prompt_text_for_datum(self, datum: DatasetDatum) -> str:
-        # NOTE it seems that quotes in the prompt make it worse
-        return f"{self._get_input_prefix()}{self._get_text_for_datum(datum)}"
-
+    def _get_prompt_text_for_datum(
+        self, datum: DatasetDatum, use_prefix: bool = False
+    ) -> str:
+        text = self._get_text_for_datum(datum)
+        if use_prefix:
+            return f"Text: {text}"
+        return text
+
     @abstractmethod
     def _get_text_for_datum(self, datum: DatasetDatum) -> str: ...

@@ -36,22 +40,11 @@ class TextGeneration(Task):
         # there is no grammar for open text generation
         return None

-    @staticmethod
-    def _get_input_prefix():
-        return "Text: "
-
     def _get_demonstration_sample_ids(
         self, dataset: Dataset, n_evaluation_demo: int
     ) -> Iterable[int]:
         # select demonstration samples uniformly at random
         return get_rng().choice(len(dataset), n_evaluation_demo, replace=False)

-    def _get_gold_label_generation_for_datum(self, datum: DatasetDatum) -> str:
-        return self._get_gold_label_for_datum(datum)
-
-    @abstractmethod
-    def _get_text_for_datum(self, datum: DatasetDatum) -> str:
-        pass
-
     def _aggregate_result(self, results: list[str]) -> float:
         return sum(results) / len(results)
diff --git a/evoprompt/task/topic_classification.py b/evoprompt/task/topic_classification.py
index c4048a7b2f42f9bcbce080c2526f51cc10ee293c..dd1905f1d46ee0ff1d773aaba36a78c12a4cd7d4 100644
--- a/evoprompt/task/topic_classification.py
+++ b/evoprompt/task/topic_classification.py
@@ -3,14 +3,14 @@ from typing import Mapping

 from datasets import load_dataset

-from evoprompt.helpers.prompts import BasePromptsFromJsonMixin
+from evoprompt.task.base_prompts_mixin import BasePromptsFromJsonMixin
 from evoprompt.task import TextClassification
 from evoprompt.task.task import DatasetDatum


 class TopicClassification(TextClassification):
     @staticmethod
-    def _get_generation_prefix():
+    def _get_prediction_prefix():
         return "Topic: "
diff --git a/evoprompt/utils.py b/evoprompt/utils.py
index 4ca878d4b5064d048370abea0bc359289557d6a5..701ae38dc558f789e2ae0eef23e62aac398a1bd1 100644
--- a/evoprompt/utils.py
+++ b/evoprompt/utils.py
@@ -79,7 +79,9 @@ def initialize_run_directory(model):
     # make sure that we use high randomness for generating the run name even if a seed is set for the model
     response, _, _, _ = model.create_completion(
-        None, run_name_prompt, temperature=2.0, seed=random.randint(0, 2**32 - 1)
+        system_message=None,
+        prompt=run_name_prompt,
+        use_randomness=True,
     )
     run_name_match = re.search(r"^\w+$", response, re.MULTILINE)
     existing_run_names = os.listdir(RUNS_DIR) if RUNS_DIR.exists() else []
diff --git a/main.py b/main.py
index 2256754fe0d6b7cd11f27674302743d46556b63a..40c8d25dd7e2126e8f5acc83aa5548391e9b532f 100644
--- a/main.py
+++ b/main.py
@@ -9,7 +9,7 @@ from weave.trace.settings import UserSettings

 from evoprompt.cli import argument_parser
 from evoprompt.evolution import get_optimizer_class
-from evoprompt.models import HfLlamaChat, Llama, LlamaChat, LLMModel
+from evoprompt.models import HfChat, Llama, LlamaChat, LLMModel
 from evoprompt.task import get_task
 from evoprompt.utils import init_rng, setup_console_logger

@@ -86,26 +86,26 @@ if __name__ == "__main__":
         logger.info("DEBUG mode: Do a quick run")

     # set up evolution model
-    evolution_model = LLMModel.get_model(options.evolution_engine, options=options)
+    evolution_model = LLMModel.get_model(options.evolution_engine, **vars(options))

     logger.info(
         f"Using {evolution_model.__class__.__name__.lower()} as the evolution engine"
     )

     judge_model: LLMModel | None = None
     if options.judge_engine is not None:
-        judge_model = LLMModel.get_model(options.judge_engine, options=options)
+        judge_model = LLMModel.get_model(options.judge_engine, **vars(options))
         logger.info(f"Using {options.judge_engine} as the judge engine")

     # set up evaluation model
     # NOTE currenty we always stick to Llama (Llama or LlamaChat depending on evolution engine) as evaluation engine
     # TODO allow to set separate engine and model for evaluation?
-    if isinstance(evolution_model, (Llama, LlamaChat, HfLlamaChat)):
+    if isinstance(evolution_model, (Llama, LlamaChat, HfChat)):
         evaluation_model_name = evolution_model.__class__.__name__.lower()
     elif judge_model is not None and isinstance(judge_model, (Llama, LlamaChat)):
         evaluation_model_name = judge_model.__class__.__name__.lower()
     else:
         evaluation_model_name = "llamachat"

-    evaluation_model = LLMModel.get_model(name=evaluation_model_name, options=options)
+    evaluation_model = LLMModel.get_model(name=evaluation_model_name, **vars(options))
     logger.info(f"Using {evaluation_model_name} as the evaluation engine")

     task = get_task(
diff --git a/requirements.txt b/requirements.txt
index c3c7115e17085a9d044d9f4441a92dbb7445581f..f4e563dea0de12c90cd418306673510f333d7656 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,4 +15,6 @@ sacrebleu
 sacremoses
 textual
 wandb
-weave
\ No newline at end of file
+weave
+bitsandbytes
+sentencepiece
\ No newline at end of file
diff --git a/test_chat_format.py b/test_chat_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6aeed329e7cf261b790fea3f6a7543d964f8868
--- /dev/null
+++ b/test_chat_format.py
@@ -0,0 +1,20 @@
+from llama_cpp.llama_chat_format import format_alpaca
+
+if __name__ == "__main__":
+    messages = [
+        {
+            "role": "system",
+            "content": "You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
+        },
+        {
+            "role": "user",
+            "content": "Have your friend evaluate the movie they had just seen and provide a summary opinion (e.g. terrible, bad, okay, good, or great) to determine the sentiment of the movie review.\n\nText: ( a ) slummer .\nSentiment: "
+        },
+        {
+            "role": "assistant",
+            "content": None,
+        },
+    ]
+
+    formatted_prompt = format_alpaca(messages)
+    print(formatted_prompt)
\ No newline at end of file
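For comparison, the chat template assigned in AlpacaHfChat above should render the same system/user pair (with the empty assistant message that _create_completion appends) roughly as sketched below; the exact output of llama_cpp's format_alpaca used in this script may differ in whitespace and separators.

rendered = (
    "You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    "### Instruction:\nHave your friend evaluate the movie they had just seen and provide a summary opinion (e.g. terrible, bad, okay, good, or great) to determine the sentiment of the movie review.\n\nText: ( a ) slummer .\nSentiment:\n\n"
    "### Response:\n"
)
print(rendered)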