From dad152859395664958ff272075b48c03c9bb3e34 Mon Sep 17 00:00:00 2001
From: Maximilian Schmidt <maximilian.schmidt@ims.uni-stuttgart.de>
Date: Thu, 8 Aug 2024 17:17:06 +0200
Subject: [PATCH] Add demonstration for DE

---
 evoprompt/evolution.py               |  26 +++---
 evoprompt/task/__init__.py           |  11 ++-
 evoprompt/task/sentiment_analysis.py |   4 +-
 evoprompt/task/task.py               |  16 ++--
 evoprompt/template_de.py             | 118 +++++++++++++++++++++++++++
 5 files changed, 148 insertions(+), 27 deletions(-)
 create mode 100644 evoprompt/template_de.py

diff --git a/evoprompt/evolution.py b/evoprompt/evolution.py
index e96e0ee..168798c 100644
--- a/evoprompt/evolution.py
+++ b/evoprompt/evolution.py
@@ -10,7 +10,8 @@ from evoprompt.models import LLMModel
 from evoprompt.opt_types import ModelUsage, Prompt
 from evoprompt.optimization import PromptOptimization
 from evoprompt.task import Task
-from evoprompt.utils import log_calls, get_all_subclasses
+from evoprompt.template_de import get_de_prompt_template
+from evoprompt.utils import get_all_subclasses, log_calls
 
 logger = logging.getLogger(__name__)
 
@@ -27,18 +28,6 @@ Prompt 2: {prompt2}
 """
 
 
-DE_PROMPT = """
-1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
-Prompt 3: {prompt3}
-4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: {basic_prompt}
-"""
-
-
 class EvolutionAlgorithm(PromptOptimization):
     shorthand: str
 
@@ -59,6 +48,7 @@ class EvolutionAlgorithm(PromptOptimization):
             evaluation_model=evaluation_model,
             run_options=run_options,
         )
+        self.use_evolution_demo = run_options.get("use_evolution_demo", False)
 
         self.population_size = population_size
 
@@ -243,7 +233,7 @@ class DifferentialEvolution(EvolutionAlgorithm):
 
         evolved_prompt, usage = self.evolution_model(
             system_message=SYSTEM_MESSAGE,
-            prompt=DE_PROMPT.format(
+            prompt=get_de_prompt_template(self.use_evolution_demo, self.task).format(
                 prompt1=prompt_1,
                 prompt2=prompt_2,
                 prompt3=best_prompt_current_evolution,
@@ -280,7 +270,8 @@ class DifferentialEvolution(EvolutionAlgorithm):
 
 
 optimizers = {
-    algorithm.shorthand: algorithm for algorithm in EvolutionAlgorithm.__subclasses__()
+    algorithm.shorthand: algorithm
+    for algorithm in get_all_subclasses(EvolutionAlgorithm)
 }
 
 
@@ -293,3 +284,8 @@ def get_optimizer_class(name: str):
 argument_parser.add_argument(
     "--evolution-algorithm", "-a", type=str, choices=optimizers.keys(), default="ga"
 )
+argument_parser.add_argument(
+    "--use-evolution-demo",
+    action="store_true",
+    help="Whether to prepend a single demonstration example for evolution or not",
+)
diff --git a/evoprompt/task/__init__.py b/evoprompt/task/__init__.py
index 260a269..0bfbdf2 100644
--- a/evoprompt/task/__init__.py
+++ b/evoprompt/task/__init__.py
@@ -11,8 +11,8 @@ from evoprompt.task.text_classification import TextClassification
 from evoprompt.task.sentiment_analysis import SentimentAnalysis
 from evoprompt.task.topic_classification import AGNews, TREC
 from evoprompt.task.subjectivity_classification import Subj
-from evoprompt.task.summarization import SAMSum
-from evoprompt.task.simplification import ASSET
+from evoprompt.task.summarization import Summarization, SAMSum
+from evoprompt.task.simplification import Simplification, ASSET
 
 from evoprompt.utils import get_all_subclasses
 
@@ -32,6 +32,9 @@ def get_task(name: str, evaluation_model: LLMModel, **options):
 
 argument_parser.add_argument("--debug", "-d", action="store_true", default=None)
 argument_group = argument_parser.add_argument_group("Task arguments")
+argument_group.add_argument(
+    "--task", "-t", type=str, required=True, choices=tasks.keys()
+)
 argument_group.add_argument("--use-grammar", "-g", action="store_true")
 argument_group.add_argument(
     "--evaluation-strategy",
@@ -39,5 +42,7 @@ argument_group.add_argument(
     default="simple",
 )
 argument_group.add_argument(
-    "--task", "-t", type=str, required=True, choices=tasks.keys()
+    "--n-evaluation-demo",
+    type=int,
+    help="Number of demonstration examples per class used for evaluation",
 )
diff --git a/evoprompt/task/sentiment_analysis.py b/evoprompt/task/sentiment_analysis.py
index 3fa073d..6f33c43 100644
--- a/evoprompt/task/sentiment_analysis.py
+++ b/evoprompt/task/sentiment_analysis.py
@@ -35,7 +35,7 @@ class HfSST2(SentimentAnalysis):
     def load_validation_set(
         self, validation_dataset: str | None, validation_split: str | None
     ):
-        return super().load_validation_set("stanfordnlp/sst2", "validation[:200]")
+        return super().load_validation_set("stanfordnlp/sst2", "validation")
 
     def load_test_set(self, test_dataset: str, test_split: str | None):
         return super().load_test_set("stanfordnlp/sst2", "test")
@@ -128,7 +128,7 @@ class HfMovieReviews(SentimentAnalysis):
         self, validation_dataset: str | None, validation_split: str | None
     ):
         return super().load_validation_set(
-            "cornell-movie-review-data/rotten_tomatoes", "validation[:200]"
+            "cornell-movie-review-data/rotten_tomatoes", "validation"
         )
 
     def load_test_set(self, test_dataset: str, test_split: str | None):
diff --git a/evoprompt/task/task.py b/evoprompt/task/task.py
index 1a741bd..7d6016f 100644
--- a/evoprompt/task/task.py
+++ b/evoprompt/task/task.py
@@ -262,15 +262,16 @@ class Task(metaclass=ABCMeta):
         use_grammar: bool,
         evaluation_strategy: EvaluationStrategyKey,
         validation_split: str | None = None,
+        use_evolution_demo: bool = False,
         test_split: str | None = None,
         debug: bool = False,
         **kwargs,
     ) -> None:
         self.model = model
         self.debug = debug
-        # whether we use the grammar to constrain the model output or not
         self.use_grammar = use_grammar
         self.evaluation_strategy = get_evaluation_strategy(evaluation_strategy)(self)
+        self.use_evolution_demo = use_evolution_demo
 
         self.validation_dataset = self.load_validation_set(
             validation_dataset, validation_split
@@ -279,12 +280,11 @@ class Task(metaclass=ABCMeta):
             self.validation_dataset = self.validation_dataset.shuffle(42).select(
                 range(10)
             )
-        else:
-            # NOTE currently we select a subset for validation
-            if len(self.validation_dataset) > 200:
-                self.validation_dataset = self.validation_dataset.shuffle(42).select(
-                    range(200)
-                )
+        elif len(self.validation_dataset) > 200:
+            # NOTE currently we select a subset as validation set
+            self.validation_dataset = self.validation_dataset.shuffle(42).select(
+                range(200)
+            )
 
         self.test_dataset = self.load_test_set(test_dataset, test_split)
         if self.debug and len(self.test_dataset) > 5:
@@ -301,10 +301,12 @@ class Task(metaclass=ABCMeta):
     def predict(self, prompt: str, datum: DatasetDatum) -> tuple[str, ModelUsage]:
         # run model for inference using grammar to constrain output
         # TODO grammar also depends on prompt and vice-versa -> what are good labels?
+
         response, usage = self.model(
             system_message=SYSTEM_MESSAGE,
             prompt=prompt,
             prompt_appendix=self._get_prompt_text_for_datum(datum),
+            # grammar can be applied to constrain the model output
             grammar=self._get_grammar(datum) if self.use_grammar else None,
         )
 
diff --git a/evoprompt/template_de.py b/evoprompt/template_de.py
new file mode 100644
index 0000000..60f6bfc
--- /dev/null
+++ b/evoprompt/template_de.py
@@ -0,0 +1,118 @@
+# adapted from https://github.com/beeevita/EvoPrompt/blob/bf43b0dcc63fb79b7c0007d4693b2c0721e9a1a7/data/template_de.py
+from evoprompt.task import Task, TextClassification, Summarization, Simplification
+
+
+DE_PROMPT = """
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
+Prompt 3: {prompt3}
+4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: {basic_prompt}
+"""  # demonstration-free DE template; the four {...} fields are filled via str.format by the caller
+
+DE_PROMPT_WITH_DEMONSTRATION_SIM = """Please follow the instruction step-by-step to generate a better prompt.
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: Rewrite the input text into simpler text.
+Prompt 2: Rewrite my complex sentence in simpler terms, but keep the meaning.
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt.
+Prompt 3: Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.
+4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: Make the sentence easier for people who do not speak English fluently to comprehend.
+
+1. Identifying the different parts between Prompt 1 and Prompt 2:
+Prompt 1: Rewrite the input text into simpler text.
+Prompt 2: Rewrite my complex sentence in simpler terms, but keep the meaning.
+Different parts:
+"input text" vs "my complex sentence"
+"simpler text" vs "simpler terms, but keep the meaning"
+
+2. Randomly mutate the different parts:
+"input text" -> "provided text"
+"my complex sentence" -> "the difficult sentence"
+"simpler text" -> "easier language"
+"simpler terms, but keep the meaning" -> "simpler words while maintaining the meaning"
+
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:
+Prompt 3: Rewrite the given input text into simpler English sentences while preserving the same meaning, so it can be understood by non-native English speakers.
+New Prompt: Transform the provided text into easier language while maintaining the meaning, making it accessible for non-native English speakers.
+
+4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: Make the sentence easier for people who do not speak English fluently to comprehend.
+Final Prompt: <prompt>Convert the difficult sentence into simpler words while preserving the meaning, so it's easier for non-native English speakers to understand.</prompt>
+
+
+Please follow the instruction step-by-step to generate a better prompt.
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step2 and generate a new prompt.
+Prompt 3: {prompt3}
+4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: {basic_prompt}
+
+1. """  # one worked simplification example, then the real task; trailing "1. " presumably cues the model to answer from step 1
+
+DE_PROMPT_WITH_DEMONSTRATION_CLS = """Please follow the instruction step-by-step to generate a better prompt.
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
+Prompt 2: In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt.
+Prompt 3: Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.
+4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.
+
+1. Identifying the different parts between Prompt 1 and Prompt 2:
+Prompt 1: Your task is to classify the comment as one of the following categories: terrible, bad, okay, good, great.
+Prompt 2: In this task, you are given sentences from movie reviews. The task is to classify a sentence as one of the following categories: terrible, bad, okay, good, great.
+Different parts:
+"classify the comment" vs "classify a sentence"
+"Your task is to" vs "In this task, you are given sentences from movie reviews. The task is to"
+
+2. Randomly mutate the different parts:
+"classify the comment" -> "categorize the statement"
+"classify a sentence" -> "evaluate the review"
+"Your task is to" -> "Your mission is to"
+"In this task, you are given sentences from movie reviews. The task is to" -> "In this assignment, you will receive movie review sentences. Your job is to"
+
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step 2 and generate a new prompt:
+Prompt 3: Assess a movie or a book based on its explanation and determine the sentiment of the movie review. Have your colleague's evaluation of the movie they watched be expressed in a concise remark (e.g. awesome, all right, terrible, or horrendous) following the narrative synopsis they were provided, and choose from terrible, bad, okay, good and great to describe the movie.
+New Prompt: In this assignment, you will receive movie review sentences. Your job is to evaluate the review and determine the sentiment, choosing from terrible, bad, okay, good, and great to describe the movie.
+
+4. Crossover the prompt in step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: You are a sentiment classifier. To do this, you must first understand the meaning of the sentence and any relevant context. And then you should classify it as one of the following categories: terrible, bad, okay, good, great.
+Final Prompt: <prompt>Your mission is to categorize the statement from a movie review by understanding its meaning and context, and then classify it as one of the following categories: terrible, bad, okay, good, or great.</prompt>
+
+Please follow the instruction step-by-step to generate a better prompt.
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in step2 and generate a new prompt.
+Prompt 3: {prompt3}
+4. Crossover the prompt in the step3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: {basic_prompt}
+
+1. """  # one worked sentiment-classification example, then the real task; trailing "1. " presumably cues the model to answer from step 1
+
+
+def get_de_prompt_template(use_demonstration_example: bool, task: None | Task = None):
+    """Return the DE prompt template, optionally with a demonstration matching ``task``."""
+    if not use_demonstration_example:
+        return DE_PROMPT
+    if task is None:
+        raise ValueError("Task cannot be None if demonstration data should be used.")
+    # classification tasks get the worked sentiment-classification demonstration
+    if isinstance(task, TextClassification):
+        return DE_PROMPT_WITH_DEMONSTRATION_CLS
+    # text-generation tasks share the simplification demo (no summarization demo exists)
+    if isinstance(task, (Summarization, Simplification)):
+        return DE_PROMPT_WITH_DEMONSTRATION_SIM
+    raise NotImplementedError(
+        f"Prompt with demonstration data is not implemented for task of type {type(task)}."
+    )
-- 
GitLab