From 464b3c31bf7b72dc560f7441dec90929f17d73a6 Mon Sep 17 00:00:00 2001
From: Maximilian Schmidt <maximilian.schmidt@ims.uni-stuttgart.de>
Date: Thu, 1 Feb 2024 18:12:43 +0100
Subject: [PATCH] Add QA task

---
 cli.py           |   3 +-
 main.py          |  87 ++++++++----------------
 models.py        |  13 ++--
 requirements.txt |   1 +
 task.py          | 173 +++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 213 insertions(+), 64 deletions(-)
 create mode 100644 task.py

diff --git a/cli.py b/cli.py
index 47d3949..29c3780 100644
--- a/cli.py
+++ b/cli.py
@@ -8,6 +8,7 @@ argument_parser.add_argument(
 argument_parser.add_argument(
     "--evolution-algorithm", "-a", type=str, choices=["ga", "de"], default="ga"
 )
+argument_parser.add_argument("--model-path", "-m", type=str, required=True)
 argument_parser.add_argument(
-    "--model-path", "-m", type=str, required=True
+    "--task", "-t", type=str, required=True, choices=["sa", "qa"]
 )
diff --git a/main.py b/main.py
index 955a0f1..bf58276 100644
--- a/main.py
+++ b/main.py
@@ -1,14 +1,16 @@
+from functools import lru_cache
 from functools import lru_cache, partial
 from pathlib import Path
 from typing import DefaultDict, get_type_hints
 
 from datasets import Dataset, load_dataset
+from evaluate import load as load_metric
 from dotenv import load_dotenv
-from llama_cpp import Callable, Llama
 from numpy.random import choice
-from tqdm import tqdm, trange
+from tqdm import trange
 
 from cli import argument_parser
+from task import QuestionAnswering, SentimentAnalysis
 from models import Llama2, OpenAI
 from utils import (
     Prompt,
@@ -25,18 +27,6 @@ load_dotenv()
 
 current_directory = Path(__file__).resolve().parent
 
-CLASSIFICATION_PROMPT = """
-Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
-
-### Instruction:
-{instruction}.
-
-### Input:
-{input}
-
-### Response:
-"""
-
 PARAPHRASE_PROMPT = """
 Below is an instruction that describes a task. Write a response that paraphrases
 the instruction. Only output the paraphrased instruction bracketed in <prompt> and </prompt>.
@@ -48,38 +38,6 @@ Below is an instruction that describes a task. Write a response that paraphrases
 """
 
 
-@log_calls("Evaluating dataset")
-def evaluate_prompt(prompt: str, dataset: Dataset):
-    sst2_labels = {"negative": 0, "positive": 1}
-
-    results: DefaultDict[str, int] = DefaultDict(int)
-    dataset_iterator = tqdm(dataset, desc="evaluating prompt", leave=False)
-
-    for datum in dataset_iterator:
-        response = evaluation_model(
-            prompt=CLASSIFICATION_PROMPT.format(instruction=prompt, input=datum["text"])
-        )
-        answer = response.lower()
-        answer_label = None
-        for label in sst2_labels.keys():
-            if label in answer:
-                answer_label = sst2_labels[label]
-                break
-        else:
-            logger.warning(f"Invalid answer: {answer}")
-            results["failed"] += 1
-            continue
-
-        classification_result = (
-            "incorrect" if answer_label != datum["label"] else "correct"
-        )
-        results[classification_result] += 1
-        dataset_iterator.set_postfix(results)
-
-    accuracy = results["correct"] / sum(results.values())
-    return accuracy
-
-
 @log_calls("Paraphrasing prompts")
 def paraphrase_prompts(prompt: str, n: int):
     paraphrases = []
@@ -195,17 +153,12 @@ def run_episode(evo_alg_str: str):
     # - Size of population
     N = 10
     # - Initial prompts P0 = {p1, p2, . . . , pN }
-    sst2_base_prompt = """In this task, you are given sentences from movie reviews. The task is to classify a sentence as "’positive’" if the sentiment of the sentence is positive or as "’negative’" if the sentiment of the sentence is negative. Return label only without any other text."""  # from the paper: RLPROMPT: Optimizing Discrete Text Prompts with Reinforcement Learning
-    paraphrases = paraphrase_prompts(sst2_base_prompt, n=N - 1)
+    paraphrases = paraphrase_prompts(task.base_prompt, n=N - 1)
     # the initial population
-    initial_population = [sst2_base_prompt] + paraphrases
-
-    # - A dev set D
-    # The size of the development set is 200.
-    D = load_dataset("SetFit/sst2", split="validation[:200]")
+    initial_population = [task.base_prompt] + paraphrases
 
     # - fD(·) denotes the score of a prompt on the desired LLM evaluated on D
-    f_D = lru_cache(maxsize=None)(partial(evaluate_prompt, dataset=D))
+    f_D = lru_cache(maxsize=None)(task.evaluate_validation)
 
     # - a pre-defined number of iterations T
     T = 10
@@ -310,11 +263,9 @@ def run_episode(evo_alg_str: str):
     logger.info(f"Best prompt: {population[p]}")
 
     # We pick the prompt with the highest score on the development set and report its score on the testset.
-    test_D = load_dataset("SetFit/sst2", split="test")
-    evaluate_prompt(P[p], test_D)
+    task.evaluate_test(P[p])
 
 
-family_tree = {}
 
 if __name__ == "__main__":
     options = argument_parser.parse_args()
@@ -342,4 +293,26 @@ if __name__ == "__main__":
         chat=USE_CHAT,
     )
 
+    match options.task:
+        case "sa":
+            task = SentimentAnalysis(
+                evaluation_model,
+                "SetFit/sst2",
+                "SetFit/sst2",
+                validation_split="validation",
+                test_split="test",
+            )
+        case "qa":
+            task = QuestionAnswering(
+                evaluation_model,
+                "squad",
+                "squad",
+                validation_split=f"train[:{5 if DEBUG else 200}]",
+                test_split="validation[:20]" if DEBUG else "validation",
+            )
+        case _:
+            raise ValueError(
+                f"Task {options.task} does not exist. Choose from 'sa', 'qa'."
+            )
+
     run_episode(evo_alg_str=options.evolution_algorithm)
diff --git a/models.py b/models.py
index a23e183..fc9949d 100644
--- a/models.py
+++ b/models.py
@@ -2,17 +2,18 @@
 from abc import abstractmethod
 from pathlib import Path
 from typing import Any
 
-# TODO support for other libraries?
 from llama_cpp import Llama
 import openai
 
+from utils import log_calls
+
 current_directory = Path(__file__).resolve().parent
 
 
 class Llama2:
-    """Loads and queries a Llama2 model.
-    """
+    """Loads and queries a Llama2 model."""
+
     def __init__(
         self,
         model_path: str,
@@ -38,6 +39,7 @@ class Llama2:
             **kwargs,
         )
 
+    # @log_calls("Running Llama model")
     def __call__(
         self,
         prompt: str,
@@ -69,8 +71,8 @@
 
 
 class OpenAI:
-    """Loads and queries an OpenAI model.
- """ + """Loads and queries an OpenAI model.""" + def __init__( self, model: str, chat: bool = False, verbose: bool = False, **kwargs ) -> None: @@ -122,4 +124,3 @@ class OpenAI: .choices[0] .message.content ) - \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4cb42cb..724d6df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ numpy datasets +evaluate llama-cpp-python tqdm graphviz diff --git a/task.py b/task.py new file mode 100644 index 0000000..8737451 --- /dev/null +++ b/task.py @@ -0,0 +1,173 @@ +from abc import abstractmethod +from collections import defaultdict +from typing import DefaultDict + +from datasets import Dataset, load_dataset +from evaluate import load as load_metric +from tqdm import tqdm + +from utils import log_calls, logger + + +CLASSIFICATION_PROMPT = """ +Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + +### Instruction: +{instruction}. + +### Input: +{input} + +### Response: +""" + + +class Task: + def __init__( + self, + model, + validation_dataset: str, + test_dataset: str, + validation_split: str = None, + test_split: str = None, + ) -> None: + self.model = model + + self.validation_dataset = load_dataset( + validation_dataset, split=validation_split + ) + self.test_dataset = load_dataset(test_dataset, split=test_split) + + @abstractmethod + def _evaluate(self, prompt: str, dataset): + pass + + @log_calls("Evaluating validation dataset") + def evaluate_validation(self, prompt: str): + return self._evaluate(prompt, self.validation_dataset) + + @log_calls("Evaluating test dataset") + def evaluate_test(self, prompt: str): + return self._evaluate(prompt, self.test_dataset) + + @property + @abstractmethod + def base_prompt(self): + pass + + +class SentimentAnalysis(Task): + def __init__( + self, + model, + validation_dataset: str, + test_dataset: str, + validation_split: str = None, + test_split: str = None, + ) -> None: + super().__init__( + model, validation_dataset, test_dataset, validation_split, test_split + ) + + def _evaluate(self, prompt: str, dataset: Dataset): + sst2_labels = {"negative": 0, "positive": 1} + + results: DefaultDict[str, int] = defaultdict(int) + dataset_iterator = tqdm(dataset, desc="evaluating prompt", leave=False) + + for datum in dataset_iterator: + response = self.model( + prompt=CLASSIFICATION_PROMPT.format( + instruction=prompt, input=datum["text"] + ) + ) + answer = response.lower() + answer_label = None + for label in sst2_labels.keys(): + if label in answer: + answer_label = sst2_labels[label] + break + else: + logger.warning(f"Invalid answer: {answer}") + results["failed"] += 1 + continue + + classification_result = ( + "incorrect" if answer_label != datum["label"] else "correct" + ) + results[classification_result] += 1 + dataset_iterator.set_postfix(results) + + accuracy = results["correct"] / sum(results.values()) + return accuracy + + @property + def base_prompt(self): + # from the paper: RLPROMPT: Optimizing Discrete Text Prompts with Reinforcement Learning + return """In this task, you are given sentences from movie reviews. The task is to classify a sentence as "’positive’" if the sentiment of the sentence is positive or as "’negative’" if the sentiment of the sentence is negative. 
Return label only without any other text.""" + + +class QuestionAnswering(Task): + def __init__( + self, + model, + validation_dataset: str, + test_dataset: str, + validation_split: str = None, + test_split: str = None, + ) -> None: + self.evaluation_prompt = """ + Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. + + ### Instruction: + {instruction} + + ### Context: + {context} + + ### Question: + {question} + + ### Response: + """ + + self.metric = load_metric("squad") + + super().__init__( + model, validation_dataset, test_dataset, validation_split, test_split + ) + + def _evaluate(self, prompt: str, dataset: Dataset): + dataset_iterator = tqdm(dataset, desc="evaluating prompt", leave=False) + + num_samples = 0 + f1 = 0.0 + em = 0 + for datum in dataset_iterator: + response = self.model( + prompt=self.evaluation_prompt.format( + instruction=prompt, + context=datum["context"], + question=datum["question"], + ) + ) + answer = response.lower().strip() + + num_samples += 1 + result = self.metric.compute( + predictions=[{"prediction_text": answer, "id": datum["id"]}], + references=[{"answers": datum["answers"], "id": datum["id"]}], + ) + f1 += result["f1"] + em += result["exact_match"] + + dataset_iterator.set_postfix( + {"f1": f1 / num_samples, "em": em / num_samples} + ) + + return f1 / num_samples # , em/num_samples + + @property + def base_prompt(self): + # TODO find good prompt + return """In this task, you are given contexts with questions. The task is to answer the question given the context. Return only the answer without any other text. Make sure that the answer is taken directly from the context.""" -- GitLab