diff --git a/cli.py b/cli.py
index 071c220d14580eeed8cbf0263a894ddd80790ee8..89a875c5f5edad08ab53fba75eefc3ea3d771d19 100644
--- a/cli.py
+++ b/cli.py
@@ -25,10 +25,13 @@ argument_parser.add_argument(
 argument_parser.add_argument(
     "--evolution-algorithm", "-a", type=str, choices=["ga", "de"], default="ga"
 )
-argument_parser.add_argument("--model", "-m", type=str)
 argument_parser.add_argument(
     "--task", "-t", type=str, required=True, choices=["sa", "qa"]
 )
 argument_parser.add_argument("--use-grammar", "-g", action="store_true")
 argument_parser.add_argument("--debug", "-d", action="store_true", default=None)
 argument_parser.add_argument("--chat", "-c", action="store_true")
+argument_parser.add_argument("--openai-model", "-m", type=str, default="gpt-3.5-turbo")
+argument_parser.add_argument(
+    "--llama-path", default="models/llama-2-13b-chat.Q5_K_M.gguf"
+)
diff --git a/main.py b/main.py
index 4a139592384d88bce4fb0da3786aff2718a9143d..674d80c7a617c83b1390a96ff3d1d79918622537 100644
--- a/main.py
+++ b/main.py
@@ -1,8 +1,9 @@
 import os
 from typing import Any
 
-from cli import argument_parser
 from dotenv import load_dotenv
+
+from cli import argument_parser
 from evolution import DifferentialEvolution, GeneticAlgorithm
 from models import Llama2, get_model_init
 from task import QuestionAnswering, SentimentAnalysis
@@ -26,30 +27,23 @@ if __name__ == "__main__":
 
     # set up evolution model
     model_init_fn = get_model_init(options.evolution_engine)
-    evolution_model = model_init_fn(
-        options.model,
-        chat=options.chat,
-    )
+    evolution_model = model_init_fn(options)
 
     match options.evolution_engine:
         case "llama2":
-            logger.info("Using Llama2 client as the evolution engine")
+            logger.info("Using Llama2 as the evolution engine")
         case "openai":
-            logger.info("Using OpenAI client as the evolution engine")
-
+            logger.info(f"Using {options.openai_model} as the evolution engine")
 
     # set up evaluation model
     # NOTE currenty we always stick to Llama2 as evaluation engine
     # TODO allow to set separate engine and model for evaluation?
- logger.info("Using Llama2 client as the evaluation engine") + logger.info("Using Llama2 as the evaluation engine") match options.evolution_engine: case "llama2": evaluation_model = evolution_model case "openai": - evaluation_model = Llama2( - model_path=options.model, - chat=options.chat, - ) + evaluation_model = Llama2(options) # log cli arguments logger.info( diff --git a/models.py b/models.py index f6f9fd6afa94004914f98e4db8f003df25f044a6..56d50164aad09a93ed67e734f9cdd33fc379a264 100644 --- a/models.py +++ b/models.py @@ -1,19 +1,45 @@ from abc import abstractmethod +from argparse import Namespace from pathlib import Path from typing import Any, Callable import openai from llama_cpp import Llama + from opt_types import ModelUsage current_directory = Path(__file__).resolve().parent -MODELS = {} +class LLMModel: + chat: bool + + def __init__(self, options: Namespace): + self.usage = ModelUsage() + self.chat = options.chat + + @abstractmethod + def __call__( + self, + system_message: str | None, + prompt: str, + *, + prompt_appendix: str, + prompt_prefix: str, + prompt_suffix: str, + chat: bool | None, + stop: str, + max_tokens: int, + **kwargs: Any, + ) -> Any: + pass + + +MODELS: dict[str, type[LLMModel]] = {} def register_model(name: str): - def wrapper(f: Callable): + def wrapper(f: type[LLMModel]): global MODELS if name in MODELS: raise ValueError("Cannot register model class %s: already exists", name) @@ -34,40 +60,13 @@ def get_model_init(name: str): return MODELS[name] -class LLMModel: - chat: bool - model: Any - - def __init__(self, chat: bool, model: Any): - self.usage = ModelUsage() - self.chat = chat - self.model = model - - @abstractmethod - def __call__( - self, - system_message: str | None, - prompt: str, - *, - prompt_appendix: str, - prompt_prefix: str, - prompt_suffix: str, - chat: bool | None, - stop: str, - max_tokens: int, - **kwargs: Any, - ) -> Any: - pass - - @register_model("llama2") class Llama2(LLMModel): """Loads and queries a Llama2 model.""" def __init__( self, - model_path: str, - chat: bool = False, + options: Namespace, n_gpu_layers: int = 60, n_threads: int = 8, n_ctx: int = 4096, @@ -76,8 +75,8 @@ class Llama2(LLMModel): ) -> None: # initialize model - model = Llama( - model_path, + self.model = Llama( + options.llama_path, chat_format="llama-2", verbose=verbose, n_gpu_layers=n_gpu_layers, @@ -85,7 +84,8 @@ class Llama2(LLMModel): n_ctx=n_ctx, **kwargs, ) - super().__init__(chat, model) + + super().__init__(options) def __call__( self, @@ -149,12 +149,12 @@ class OpenAI(LLMModel): def __init__( self, - model: str = "gpt-3.5-turbo", - chat: bool = False, + options: Namespace, verbose: bool = False, **kwargs, ) -> None: - super().__init__(chat, model) + self.model_name = options.openai_model + super().__init__(options) # initialize client for API calls self.openai_client = openai.OpenAI(**kwargs) @@ -191,7 +191,7 @@ class OpenAI(LLMModel): }, ) response = self.openai_client.chat.completions.create( - model=self.model, + model=self.model_name, messages=messages, stop=stop, max_tokens=max_tokens, diff --git a/optimization.py b/optimization.py index 9f18365c07f93fedd8753f5a082bb6dff17e11d0..90d6e2ea7e20b6330492f480e38b699a46d74ce2 100644 --- a/optimization.py +++ b/optimization.py @@ -87,7 +87,7 @@ class PromptOptimization: ] def get_prompt(self, prompt_id: str): - return self.all_prompt[prompt_id] + return self.all_prompts[prompt_id] def get_prompts(self, prompt_ids: list[str]): return [self.get_prompt(p_id) for p_id in prompt_ids] diff --git a/task.py 
index 461348d35be3b41dbb79986ca5df87e47640056b..abbc554dd575297fd192e083691f22b9884ff2b6 100644
--- a/task.py
+++ b/task.py
@@ -1,23 +1,28 @@
 import re
 from abc import abstractmethod
-from collections import defaultdict
 from functools import lru_cache
-from typing import DefaultDict, Mapping, Union
+from typing import Union
 
 from datasets import Dataset, load_dataset
 from evaluate import load as load_metric
 from llama_cpp import LlamaGrammar
+from tqdm import tqdm
+
 from models import Llama2, OpenAI
 from opt_types import ModelUsage
-from tqdm import tqdm
 from utils import log_calls, logger
 
 SYSTEM_MESSAGE = """
 You are given an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
 """
 
+DatasetDatum = dict
+
 
 class Task:
+    validation_dataset: Dataset
+    test_dataset: Dataset
+
     def __init__(
         self,
         model: Union[Llama2, OpenAI],
@@ -43,26 +48,50 @@ class Task:
         pass
 
     @abstractmethod
-    def _evaluate(self, prompt: str, dataset) -> tuple[float, ModelUsage]:
+    def _evaluate_sample(
+        self, prompt: str, datum: DatasetDatum
+    ) -> tuple[str, ModelUsage]:
+        pass
+
+    @abstractmethod
+    def _aggregate_result(self, results: list) -> float:
         pass
 
+    def evaluate(self, prompt: str, dataset: Dataset) -> tuple[float, ModelUsage]:
+        results: list = []
+        dataset_iterator: tqdm[DatasetDatum] = tqdm(
+            dataset, desc="evaluating prompt", leave=False
+        )
+        evaluation_usage = ModelUsage()
+
+        for datum in dataset_iterator:
+            result, usage = self._evaluate_sample(prompt, datum)
+            results.append(result)
+            current_metrics = self._aggregate_result(results)
+            dataset_iterator.set_postfix(
+                {self.metric_name: f"{current_metrics*100:.1f}%"}
+            )
+            evaluation_usage += usage
+
+        return self._aggregate_result(results), evaluation_usage
+
     @log_calls("Evaluating validation dataset")
     @lru_cache(maxsize=None)
     def evaluate_validation(self, prompt: str):
-        return self._evaluate(prompt, self.validation_dataset)
+        return self.evaluate(prompt, self.validation_dataset)
 
     @log_calls("Evaluating test dataset")
     def evaluate_test(self, prompt: str):
-        return self._evaluate(prompt, self.test_dataset)
+        return self.evaluate(prompt, self.test_dataset)
 
     @property
     @abstractmethod
-    def metric_name(self):
+    def metric_name(self) -> str:
         pass
 
     @property
     @abstractmethod
-    def base_prompt(self):
+    def base_prompt(self) -> str:
         pass
 
 
@@ -117,39 +146,33 @@ class SentimentAnalysis(Task):
 
         return response, usage
 
-    def _evaluate(self, prompt: str, dataset: Dataset):
+    def _evaluate_sample(self, prompt: str, datum: DatasetDatum):
         sst2_labels = {"negative": 0, "positive": 1}
 
-        results: DefaultDict[str, int] = defaultdict(int)
-        dataset_iterator = tqdm(dataset, desc="evaluating prompt", leave=False)
-        evaluation_usage = ModelUsage()
-
-        for datum in dataset_iterator:
-            response, usage = self.predict(prompt=prompt, text=datum["text"])
-            response = response.lower()
-            evaluation_usage += usage
-            if self.use_grammar:
-                # model output is from label space
-                answer_label = sst2_labels[response]
+        response, usage = self.predict(prompt=prompt, text=datum["text"])
+        response = response.lower()
+        if self.use_grammar:
+            # model output is from label space
+            answer_label = sst2_labels[response]
+        else:
+            answer_label = None
+            for label in sst2_labels.keys():
+                if label in response:
+                    answer_label = sst2_labels[label]
+                    break
             else:
-                answer_label = None
-                for label in sst2_labels.keys():
-                    if label in response:
-                        answer_label = sst2_labels[label]
-                        break
-                else:
logger.warning(f"Invalid answer: {response}") - results["failed"] += 1 - continue - - classification_result = ( - "incorrect" if answer_label != datum["label"] else "correct" - ) - results[classification_result] += 1 - dataset_iterator.set_postfix(results) + logger.warning(f"Invalid answer: {response}") + return "failed" - accuracy = results["correct"] / sum(results.values()) - return accuracy, evaluation_usage + classification_result = ( + "incorrect" if answer_label != datum["label"] else "correct" + ) + return classification_result, usage + + def _aggregate_result(self, results: list[str]) -> float: + num_correct_results = sum(1 for result in results if result == "correct") + accuracy = num_correct_results / len(results) + return accuracy @property def metric_name(self): @@ -255,10 +278,26 @@ class QuestionAnswering(Task): return response, usage - def _evaluate(self, prompt: str, dataset: Dataset): - evaluation_usage = ModelUsage() + def _evaluate_sample(self, prompt: str, datum: DatasetDatum): + answer, usage = self.predict( + prompt, + context=datum["context"], + question=datum["question"], + ) + # TODO check if answer is lower-cased in metric computation + + result = self.metric.compute( + predictions=[{"prediction_text": answer, "id": datum["id"]}], + references=[{"answers": datum["answers"], "id": datum["id"]}], + ) - def replace_symbol_for_grammar(sample: Mapping): + return result["f1"] / 100, usage + + def _aggregate_result(self, results: list[float]) -> float: + return sum(results) / len(results) + + def evaluate(self, prompt: str, dataset: Dataset) -> tuple[float, ModelUsage]: + def replace_symbol_for_grammar(sample: DatasetDatum): symbol_replacement_mapping = { "\u2013": "-", "\u2014": "-", @@ -281,35 +320,7 @@ class QuestionAnswering(Task): if self.use_grammar: # NOTE: the LlamaGrammar has issues with symbol '–' therefore we replace all occurences with '-' (hyphen) dataset = dataset.map(replace_symbol_for_grammar, desc="Replacing symbols") - - dataset_iterator = tqdm(dataset, desc="evaluating prompt", leave=False) - - num_samples = 0 - f1 = 0.0 - em = 0 - for datum in dataset_iterator: - answer, usage = self.predict( - prompt, - context=datum["context"], - question=datum["question"], - ) - # TODO check if answer is lower-cased in metric computation - - evaluation_usage += usage - - num_samples += 1 - result = self.metric.compute( - predictions=[{"prediction_text": answer, "id": datum["id"]}], - references=[{"answers": datum["answers"], "id": datum["id"]}], - ) - f1 += result["f1"] - em += result["exact_match"] - - dataset_iterator.set_postfix( - {"f1": f1 / num_samples, "em": em / num_samples} - ) - - return f1 / num_samples, evaluation_usage + return super().evaluate(prompt, dataset) @property def metric_name(self):