diff --git a/evolution.py b/evolution.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d0e5958fcabdaa9e032ae7c06d42226b2a04fd
--- /dev/null
+++ b/evolution.py
@@ -0,0 +1,258 @@
+from abc import abstractmethod
+
+from models import LLMModel
+from numpy.random import choice
+from opt_types import Prompt
+from optimization import PromptOptimization
+from task import Task
+from tqdm import trange
+from utils import initialize_run_directory, log_calls, logger, save_snapshot
+
+SYSTEM_MESSAGE = (
+    "Please follow the instruction step-by-step to generate a better prompt."
+)
+
+GA_PROMPT = """
+1. Cross over the following prompts and generate a new prompt:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Mutate the prompt generated in Step 1 and generate a final prompt bracketed with <prompt> and </prompt>.
+"""
+
+
+DE_PROMPT = """
+1. Identify the different parts between the Prompt 1 and Prompt 2:
+Prompt 1: {prompt1}
+Prompt 2: {prompt2}
+2. Randomly mutate the different parts
+3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
+Prompt 3: {prompt3}
+4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
+Basic Prompt: {basic_prompt}
+"""
+
+
+class EvolutionAlgorithm(PromptOptimization):
+    # TODO add docstrings
+    """The super class for all evolution algorithms containing shared parameters."""
+
+    def __init__(
+        self,
+        population_size: int,
+        *,
+        task: Task,
+        evolution_model: LLMModel,
+        evaluation_model: LLMModel,
+    ) -> None:
+        super().__init__(
+            task=task,
+            evolution_model=evolution_model,
+            evaluation_model=evaluation_model,
+        )
+
+        self.population_size = population_size
+
+    @log_calls("Performing selection")
+    def select(self, prompts: list[Prompt]):
+        # In GA, two parent solutions are normally selected based on the roulette wheel
+        # selection method according to the fitness value (Lipowski & Lipowska, 2012).
+        # Similar to this, we utilize the roulette wheel selection method to select
+        # two parent prompts in the current population according to the scores evaluated
+        # on development sets. Specifically, let s_i denote the performance score on the
+        # development set of the i-th prompt in the population, which contains a total
+        # of N prompts. The probability of selecting the i-th prompt as a parent can be
+        # expressed as p_i = s_i / Σ_{j=1..N} s_j.
+        scores = [prompt.score for prompt in prompts]
+        if sum(scores) == 0:
+            # sum of scores is 0 ==> each score is 0, draw with equal probability
+            selection_probabilities = len(scores) * [1 / len(scores)]
+        else:
+            selection_probabilities = [score / sum(scores) for score in scores]
+        return choice(prompts, size=2, replace=False, p=selection_probabilities)
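
The roulette-wheel rule above is easiest to see with concrete numbers. A minimal, self-contained sketch (not part of the patch; the scores are invented) of how the selection probabilities come out:

```python
# Illustration only: mirrors the probability computation in select().
scores = [0.2, 0.3, 0.5]  # development-set scores s_i of three prompts
total = sum(scores)
selection_probabilities = [s / total for s in scores]
print(selection_probabilities)  # [0.2, 0.3, 0.5] -> the best prompt is drawn half the time
```

A prompt with score 0 is never drawn unless every score is 0, which is exactly the degenerate case the `if sum(scores) == 0` branch guards against.
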
+
+    @abstractmethod
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        pass
+
+    @abstractmethod
+    def update(self, *args, **kwargs):
+        pass
+
+    def run(
+        self, num_iterations: int, add_snapshot_dict: dict, debug: bool = False
+    ) -> None:
+        # debug mode for quick run
+        if debug:
+            self.population_size = 3
+            num_iterations = 2
+
+        run_directory = initialize_run_directory(self.evolution_model)
+
+        initial_prompts, _ = self.init_run(self.population_size)
+
+        # Algorithm 1 Discrete prompt optimization: EVOPROMPT
+
+        # P keeps track of prompts in each generation
+        P = [initial_prompts]
+
+        # Line 2:
+        for t in trange(1, num_iterations + 1, desc="iterations", leave=True):
+            # Line 3: Selection: select a certain number of prompts from the current population as parent prompts
+            # pr1,...,prk ∼ Pt−1
+            prompts_current_evolution = P[t - 1]
+
+            new_evolutions = []
+
+            for i in trange(self.population_size, desc="updates", leave=False):
+                # for both GA and DE we start with two parent prompts
+                pr1, pr2 = self.select(P[t - 1])
+
+                # Line 4: Evolution: generate a new prompt based on the selected parent prompts by leveraging the LLM to perform evolutionary operators
+                # p′i ← Evo(pr1,...,prk)
+                p_i, _ = self.evolve(
+                    pr1,
+                    pr2,
+                    prompts_current_evolution=prompts_current_evolution,
+                    current_iteration=i,
+                )
+
+                evolved_prompt = self.add_prompt(p_i, (pr1, pr2), {"gen": t})
+
+                new_evolutions.append(evolved_prompt)
+            # Line 6: Update based on the evaluation scores
+            # Pt ← {Pt−1, p′i} and St ← {St−1, s′i}
+            new_population = self.update(prompts_current_evolution, new_evolutions)
+
+            # store new generation
+            P.append(new_population)
+
+        # TODO move to super class
+        save_snapshot(
+            run_directory,
+            self.all_prompts,
+            self.family_tree,
+            [[prompt.id for prompt in population] for population in P],
+            num_iterations,
+            self.population_size,
+            self.task,
+            self.evolution_model,
+            # model usage for evaluating prompts
+            self.evaluation_model.usage,
+            # model usage for evolution of prompts
+            self.evolution_model.usage,
+            add_snapshot_dict,
+        )
+        # Line 8: Return the best prompt, p∗, among the final population PT:
+        # p∗ ← argmax_{p∈PT} f(p, D)
+        p = max(P[-1], key=lambda prompt: self.all_prompts[prompt.id].score)
+        logger.info(f"Best prompt: {p}")
+
+        # We pick the prompt with the highest score on the development set and report its score on the test set.
+        test_performance = self.task.evaluate_test(p.content)
+        logger.info(f"Best prompt on test set: {test_performance}")
+
+
+class GeneticAlgorithm(EvolutionAlgorithm):
+    """The genetic algorithm implemented using LLMs."""
+
+    @log_calls("Performing prompt evolution using GA")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        # Following the evolutionary operators in GA, a new candidate prompt is generated through
+        # a two-step process based on the selected two parents:
+        # 1) The parent prompts undergo crossover, resulting in a new prompt that
+        # selectively combines components from both parents;
+        # 2) The newly generated prompt from the first step undergoes mutation,
+        # in which random alterations are made to some of its content.
+        # Based on this two-step process, we design instructions, guiding LLMs to
+        # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
+
+        evolved_prompt, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=GA_PROMPT.format(prompt1=prompt_1, prompt2=prompt_2),
+        )
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+        return evolved_prompt, usage
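
To make the two-step crossover-and-mutate instruction concrete, this is roughly what the evolution model receives for two parents; a sketch only, the two parent prompts are invented and not taken from the repository:

```python
# Illustration only: the filled-in GA instruction that evolve() sends as the user prompt.
GA_PROMPT = """
1. Cross over the following prompts and generate a new prompt:
Prompt 1: {prompt1}
Prompt 2: {prompt2}
2. Mutate the prompt generated in Step 1 and generate a final prompt bracketed with <prompt> and </prompt>.
"""

print(
    GA_PROMPT.format(
        prompt1="Classify the sentiment of the following review as positive or negative.",
        prompt2="Decide whether the review below expresses a positive or a negative opinion.",
    )
)
# The model is expected to answer with the new prompt wrapped in <prompt>...</prompt>,
# which evolve() then extracts via str.split.
```
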
+
+    @log_calls("Performing update for GA")
+    def update(
+        self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
+    ):
+        # EVOPROMPT iteratively generates new candidate prompts and assesses each prompt
+        # using a development set, denoted as D, to obtain a score that quantifies the
+        # quality of the prompt. We consider a straightforward selection strategy.
+        # Specifically, at each iteration, EVOPROMPT based on GA produces N new prompts,
+        # which are combined with the current population of N prompts.
+        # The updated population is then selected by retaining the N prompts with the highest scores.
+        retained_prompts: list[Prompt] = []
+        min_retained_score = float("inf")
+        for prompt in prompts_current_evolution + new_evolutions:
+            if len(retained_prompts) < self.population_size:
+                retained_prompts.append(prompt)
+                min_retained_score = min(min_retained_score, prompt.score)
+            elif prompt.score > min_retained_score:
+                # replace the lowest-scoring retained prompt and update the threshold
+                retained_prompts.sort(key=lambda p: p.score)
+                retained_prompts[0] = prompt
+                min_retained_score = min(p.score for p in retained_prompts)
+
+        return retained_prompts
+
+
+class DifferentialEvolution(EvolutionAlgorithm):
+    """The differential evolution algorithm implemented using LLMs."""
+
+    @log_calls("Performing prompt evolution using DE")
+    def evolve(
+        self,
+        prompt_1: str,
+        prompt_2: str,
+        *,
+        prompts_current_evolution: list[Prompt],
+        current_iteration: int,
+    ):
+        # TODO add description from paper
+
+        # DE needs the best prompt of the current population for evolution
+        best_prompt_current_evolution = max(
+            prompts_current_evolution, key=lambda prompt: prompt.score
+        )
+
+        evolved_prompt, usage = self.evolution_model(
+            system_message=SYSTEM_MESSAGE,
+            prompt=DE_PROMPT.format(
+                prompt1=prompt_1,
+                prompt2=prompt_2,
+                prompt3=best_prompt_current_evolution,
+                basic_prompt=prompts_current_evolution[current_iteration],
+            ),
+        )
+        if "<prompt>" in evolved_prompt:
+            evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
+        return evolved_prompt, usage
+
+    @log_calls("Performing update for DE")
+    def update(
+        self, prompts_current_evolution: list[Prompt], new_evolutions: list[Prompt]
+    ):
+        # for DE we keep the evolved prompt if it is better than the basic prompt, and use the basic prompt otherwise
+        assert len(prompts_current_evolution) == len(new_evolutions)
+        population = [
+            (new_prompt if new_prompt.score > current_prompt.score else current_prompt)
+            for current_prompt, new_prompt in zip(
+                prompts_current_evolution, new_evolutions
+            )
+        ]
+        return population
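
Before the main.py changes, it helps to see how differently the two update rules behave on the same scores. A standalone sketch (illustration only; the scores are invented and plain floats stand in for Prompt objects):

```python
# Illustration only: GA retains the N best of old + new, DE compares pairwise.
old_scores = [0.70, 0.55, 0.60]  # current population (N = 3), aligned by index for DE
new_scores = [0.65, 0.50, 0.80]  # evolved prompts of this generation

# GA-style update: pool both generations and keep the N highest scores.
ga_population = sorted(old_scores + new_scores, reverse=True)[:3]
print(ga_population)  # [0.8, 0.7, 0.65]

# DE-style update: the i-th evolved prompt replaces the i-th basic prompt only if strictly better.
de_population = [new if new > old else old for old, new in zip(old_scores, new_scores)]
print(de_population)  # [0.7, 0.55, 0.8]
```

GA can therefore drop several incumbents in one generation, while DE never loses a slot's best-so-far prompt, which is the usual elitism argument for the pairwise rule.
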
diff --git a/main.py b/main.py
index 5524016966d5ba565f06e06131024660e2e0b0b3..2095276a34e462f8f4489e69a9d0983247964081 100644
--- a/main.py
+++ b/main.py
@@ -1,16 +1,14 @@
 import os
-from functools import lru_cache
 from typing import Any
 
-from dotenv import load_dotenv
-from numpy.random import choice
-from tqdm import trange
-
 from cli import argument_parser
-from evo_types import ModelUsage, Prompt
+from dotenv import load_dotenv
+from evolution import DifferentialEvolution, GeneticAlgorithm
 from models import Llama2, OpenAI
 from task import QuestionAnswering, SentimentAnalysis
-from utils import initialize_run_directory, log_calls, logger, save_snapshot
+from utils import logger
+
+load_dotenv()
 
 
 def conv2bool(_str: Any):
@@ -23,265 +21,6 @@ def conv2bool(_str: Any):
     return None
 
 
-load_dotenv()
-
-PARAPHRASE_PROMPT = """You are given an instruction that describes a task. Write a response that paraphrases the instruction.
-Only output the paraphrased instruction bracketed in <prompt> and </prompt>."""
-
-
-@log_calls("Paraphrasing prompts")
-def paraphrase_prompts(prompt: str, n: int):
-    total_usage = ModelUsage()
-    paraphrases = []
-    for _ in range(n):
-        paraphrase, usage = evolution_model(
-            system_message=PARAPHRASE_PROMPT,
-            prompt=prompt,
-            prompt_prefix=' Instruction: "',
-            prompt_suffix='"',
-        )
-        total_usage += usage
-        if "<prompt>" in paraphrase:
-            paraphrase = paraphrase.split("<prompt>")[1].split("</prompt>")[0]
-        paraphrases.append(paraphrase)
-    return paraphrases, usage
-
-
-@log_calls("Performing selection")
-def selection(prompts):
-    # In GA, two parent solutions are normally selected based on the roulette wheel
-    # selection method according to the fitness value (Lipowski & Lipowska, 2012).
-    # Similar to this, we utilize the roulette wheel selection method to select
-    # two parent prompts in the current population according to the scores evaluated
-    # on development sets. Specifically, let s_i denote the performance score on the
-    # development set of the i-th prompt in the population, which contains a total
-    # of N prompts. The probability of selecting the i-th prompt as a parent can be
-    # expressed as p_i = s_i / Σ_{j=1..N} s_j.
-    scores = [prompt.score for prompt in prompts]
-    if sum(scores) == 0:
-        # sum of scores is 0 ==> each score is 0, draw with equal probability
-        selection_probabilities = len(scores) * [1 / len(scores)]
-    else:
-        selection_probabilities = [score / sum(scores) for score in scores]
-    return choice(prompts, size=2, replace=False, p=selection_probabilities)
-
-
-SYSTEM_MESSAGE = (
-    "Please follow the instruction step-by-step to generate a better prompt."
-)
-
-GA_PROMPT = """
-1. Cross over the following prompts and generate a new prompt:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Mutate the prompt generated in Step 1 and generate a final prompt bracketed with <prompt> and </prompt>.
-"""
-
-
-DE_PROMPT = """
-1. Identify the different parts between the Prompt 1 and Prompt 2:
-Prompt 1: {prompt1}
-Prompt 2: {prompt2}
-2. Randomly mutate the different parts
-3. Combine the different parts with Prompt 3, selectively replace it with the different parts in Step 2 and generate a new prompt.
-Prompt 3: {prompt3}
-4. Cross over the prompt in the Step 3 with the following basic prompt and generate a final prompt bracketed with <prompt> and </prompt>:
-Basic Prompt: {basic_prompt}
-"""
-
-
-@log_calls("Performing prompt evolution using GA")
-def evolution_ga(prompt1: str, prompt2: str):
-    # Following the evolutionary operators in GA, a new candidate prompt is generated through
-    # a two-step process based on the selected two parents:
-    # 1) The parent prompts undergo crossover, resulting in a new prompt that
-    # selectively combines components from both parents;
-    # 2) The newly generated prompt from the first step undergoes mutation,
-    # in which random alterations are made to some of its content.
-    # Based on this two-step process, we design instructions, guiding LLMs to
-    # generate a new prompt based on these steps to perform Evo(·) in Algorithm 1.
-    evolved_prompt, usage = evolution_model(
-        system_message=SYSTEM_MESSAGE,
-        prompt=GA_PROMPT.format(prompt1=prompt1, prompt2=prompt2),
-    )
-    if "<prompt>" in evolved_prompt:
-        evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
-    return evolved_prompt, usage
-
-
-@log_calls("Performing prompt evolution using DE")
-def evolution_de(prompt1: str, prompt2: str, basic_prompt: str, best_prompt: str):
-    # TODO add comment from paper
-    evolved_prompt, usage = evolution_model(
-        system_message=SYSTEM_MESSAGE,
-        prompt=DE_PROMPT.format(
-            prompt1=prompt1,
-            prompt2=prompt2,
-            prompt3=best_prompt,
-            basic_prompt=basic_prompt,
-        ),
-    )
-    if "<prompt>" in evolved_prompt:
-        evolved_prompt = evolved_prompt.split("<prompt>")[1].split("</prompt>")[0]
-    return evolved_prompt, usage
-
-
-@log_calls("Updating prompts")
-def update(prompts: list[str], N: int):
-    # EVOPROMPT iteratively generates new candidate prompts and assesses each prompt
-    # using a development set, denoted as D, to obtain a score that quantifies the
-    # quality of the prompt. We consider a straightforward selection strategy.
-    # Specifically, at each iteration, EVOPROMPT based on GA produces N new prompts,
-    # which are combined with the current population of N prompts.
-    # The updated population is then selected by retaining the N prompts with the highest scores.
-    retained_prompts: list[Prompt] = []
-    min_retained_score = 0
-    for prompt in prompts:
-        if len(retained_prompts) < N:
-            retained_prompts.append(prompt)
-            min_retained_score = min(min_retained_score, prompt.score)
-        elif prompt.score > min_retained_score:
-            retained_prompts.sort(key=lambda p: p.score)
-            retained_prompts[0] = prompt
-
-    return retained_prompts
-
-
-def run_episode(evo_alg_str: str, debug: bool = False):
-    # model usage for evolution of prompts
-    evolution_usage = ModelUsage()
-    # model usage for evaluating prompts
-    evaluation_usage = ModelUsage()
-    # Algorithm 1 Discrete prompt optimization: EVOPROMPT
-
-    # Require:
-    # - Size of population
-    N = 3 if debug else 10
-    # - Initial prompts P0 = {p1, p2, . . . , pN}
-    paraphrases, usage = paraphrase_prompts(task.base_prompt, n=N - 1)
-    evolution_usage += usage
-    # the initial population
-    initial_population = [task.base_prompt] + paraphrases
-
-    # - fD(·) denotes the score of a prompt on the desired LLM evaluated on D
-    f_D = lru_cache(maxsize=None)(task.evaluate_validation)
-
-    # - a pre-defined number of iterations T
-    T = 2 if debug else 10
-
-    # - carefully designed evolutionary operators to generate a new prompt Evo(·)
-
-    # Line 1: Initial evaluation scores: S0 ← {si = fD(pi) | i ∈ [1, N]}
-    # the current population's scores
-    initial_population_scores: list[float] = [f_D(p) for p in initial_population]
-
-    # all_prompts contains a list of Prompt objects that took part in this run at some time
-    # converting prompts to Prompt object
-    all_prompts: dict[str, Prompt] = {
-        prompt.id: prompt
-        for prompt in [
-            Prompt(p, score=score, gen=0, usage=usage)
-            for (p, (score, usage)) in zip(
-                initial_population, initial_population_scores
-            )
-        ]
-    }
-
-    # P keeps track of prompts in each generation
-    P = [[prompt_id for prompt_id in all_prompts.keys()]]
-
-    # add initial prompts to family tree
-    # None marks that there is no parent
-    family_tree: dict[str, tuple[str, str] | None] = {
-        prompt_id: None for prompt_id in P[0]
-    }
-
-    # Line 2:
-    for t in trange(1, T + 1, desc="T", leave=True):
-        # Line 3: Selection: select a certain number of prompts from current population as parent prompts
-        # pr1,...,prk ∼ Pt−1
-        prompts_current_evolution = [all_prompts[prompt_id] for prompt_id in P[t - 1]]
-        if evo_alg_str == "de":
-            # DE needs best prompt for evolution
-            best_prompt_current_evolution = max(
-                prompts_current_evolution, key=lambda prompt: prompt.score
-            )
-
-        new_evolutions = []
-
-        for i in trange(N, desc="N", leave=False):
-            # for both GA and DE we start with two parent prompts
-            pr1, pr2 = selection([all_prompts[prompt_id] for prompt_id in P[t - 1]])
-
-            # Line 4: Evolution: generate a new prompt based on the selected parent prompts by leveraging LLM to perform evolutionary operators
-            # p′i ← Evo(pr1,...,prk)
-            if evo_alg_str == "ga":
-                p_i, usage = evolution_ga(pr1, pr2)
-            elif evo_alg_str == "de":
-                p_i, usage = evolution_de(
-                    pr1,
-                    pr2,
-                    prompts_current_evolution[i],
-                    best_prompt_current_evolution,
-                )
-            evolution_usage += usage
-
-            # Line 5: Evaluation
-            # s′_i ← f(p′i, D)
-            s_i, usage = f_D(p_i)
-            evaluation_usage += usage
-
-            evolved_prompt = Prompt(content=p_i, score=s_i, gen=t, usage=usage)
-
-            # keep track of genealogy
-            family_tree[evolved_prompt.id] = (pr1.id, pr2.id)
-
-            new_evolutions.append(evolved_prompt)
-        all_prompts |= {prompt.id: prompt for prompt in new_evolutions}
-        # Line 6: Update based on the evaluation scores
-        # Pt ← {Pt−1, p′i} and St ← {St−1, s′i}
-        if evo_alg_str == "ga":
-            # GA keeps N best prompts from current population and evolutions
-            population = update(new_evolutions + prompts_current_evolution, N)
-        elif evo_alg_str == "de":
-            # for DE we keep the evolved prompt if it is better than the basic prompt, and use the basic prompt otherwise
-            assert len(prompts_current_evolution) == len(new_evolutions)
-            population = [
-                (
-                    new_prompt
-                    if new_prompt.score > current_prompt.score
-                    else current_prompt
-                )
-                for current_prompt, new_prompt in zip(
-                    prompts_current_evolution, new_evolutions
-                )
-            ]
-
-        # store new generation
-        P.append([prompt.id for prompt in population])
-
-    save_snapshot(
-        run_directory,
-        all_prompts,
-        family_tree,
-        P,
-        T,
-        N,
-        task,
-        evolution_model,
-        evaluation_usage,
-        evolution_usage,
-        options.__dict__,
-    )
-    # Line 8: Return the best prompt, p∗, among the final population PT:
-    # p∗ ← argmax_{p∈PT} f(p, D)
-    p = all_prompts[max(P[-1], key=lambda prompt_id: all_prompts[prompt_id].score)]
-    logger.info(f"Best prompt: {p}")
-
-    # We pick the prompt with the highest score on the development set and report its score on the testset.
-    test_performance = task.evaluate_test(p.content)
-    logger.info(f"Best prompt on test set: {test_performance}")
-
-
 if __name__ == "__main__":
     options = argument_parser.parse_args()
 
@@ -309,8 +48,6 @@ if __name__ == "__main__":
         chat=options.chat,
     )
 
-    run_directory = initialize_run_directory(evolution_model)
-
     # log cli arguments
     logger.info(
         "CLI arguments:\n\tPositional:%s\n\tKeyword:\n\t\t%s",
@@ -355,4 +92,16 @@ if __name__ == "__main__":
 
     logger.info("Using evolutionary algorithm '%s'", options.evolution_algorithm)
 
-    run_episode(evo_alg_str=options.evolution_algorithm, debug=debug)
+    # TODO allow to register algorithms and map to classes
+    if options.evolution_algorithm == "ga":
+        optimizer_class = GeneticAlgorithm
+    else:
+        optimizer_class = DifferentialEvolution
+
+    optimizer = optimizer_class(
+        population_size=10,
+        task=task,
+        evolution_model=evolution_model,
+        evaluation_model=evaluation_model,
+    )
+    optimizer.run(10, debug=debug, add_snapshot_dict=options.__dict__)
diff --git a/models.py b/models.py
index eaa8f208c27837df9e1dcf544d5f505001c37997..486801546244e0c28ce8de309a91a8f6ff866125 100644
--- a/models.py
+++ b/models.py
@@ -4,8 +4,7 @@
 from typing import Any
 
 import openai
 from llama_cpp import Llama
-
-from evo_types import ModelUsage
+from opt_types import ModelUsage
 
 current_directory = Path(__file__).resolve().parent
diff --git a/evo_types.py b/opt_types.py
similarity index 84%
rename from evo_types.py
rename to opt_types.py
index 55b6a71ce043549ca528cdf65451515e87d5d13c..3d46de80719d2ff86025b92880b51e04044b113f 100644
--- a/evo_types.py
+++ b/opt_types.py
@@ -2,8 +2,6 @@ import json
 from dataclasses import dataclass, field, is_dataclass
 from uuid import uuid4
 
-from llama_cpp.llama_types import CompletionUsage
-
 
 @dataclass(frozen=True)
 class ModelUsage:
@@ -31,24 +29,20 @@
 class Prompt:
     content: str
     score: float
-    gen: int
     usage: ModelUsage
-    id: str = field(default_factory=lambda: uuid4().hex)
     meta: dict = field(default_factory=dict)
+    id: str = field(default_factory=lambda: uuid4().hex)
 
     def __str__(self) -> str:
         return self.content
 
     def __hash__(self) -> int:
         return (
-            hash(self.content)
-            + hash(self.score)
-            + hash(self.gen)
-            + hash(frozenset(self.meta.items()))
+            hash(self.content) + hash(self.score) + hash(frozenset(self.meta.items()))
         )
 
 
-class EvoTypeEncoder(json.JSONEncoder):
+class OptTypeEncoder(json.JSONEncoder):
     def default(self, obj):
         if is_dataclass(obj):
             return obj.__dict__
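
The rename from evo_types to opt_types keeps the same dataclass surface (content, score, usage, meta and an auto-generated id, with gen now carried in meta instead of a dedicated field). A small sketch of how a prompt serializes with the renamed encoder, assuming only the fields visible in the hunk above:

```python
# Illustration only: Prompt and OptTypeEncoder as defined in opt_types.py above.
import json

from opt_types import ModelUsage, OptTypeEncoder, Prompt

prompt = Prompt(
    content="Classify the sentiment of the review.",
    score=0.73,
    usage=ModelUsage(),
    meta={"gen": 0},
)
print(prompt.id)  # random uuid4 hex, e.g. "3f2c9a..."
print(json.dumps(prompt, cls=OptTypeEncoder))  # dataclasses are serialized via __dict__
```
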
diff --git a/optimization.py b/optimization.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d707133b61a6fd4cbe8f4e9df72b476f899a56a
--- /dev/null
+++ b/optimization.py
@@ -0,0 +1,108 @@
+from itertools import zip_longest
+
+from models import LLMModel
+from opt_types import ModelUsage, Prompt
+from task import Task
+from utils import log_calls
+
+PARAPHRASE_PROMPT = """You are given an instruction that describes a task. Write a response that paraphrases the instruction.
+Only output the paraphrased instruction bracketed in <prompt> and </prompt>."""
+
+
+@log_calls("Paraphrasing prompts")
+def paraphrase_prompts(
+    model: LLMModel,
+    prompt: str,
+    n: int,
+    unique_prompts: bool = False,
+    num_tries: int = 10,
+    return_only_unique_prompts: bool = False,
+):
+    # TODO implement unique paraphrases
+    total_usage = ModelUsage()
+    paraphrases = []
+    for _ in range(n):
+        paraphrase, usage = model(
+            system_message=PARAPHRASE_PROMPT,
+            prompt=prompt,
+            prompt_prefix=' Instruction: "',
+            prompt_suffix='"',
+        )
+        total_usage += usage
+        if "<prompt>" in paraphrase:
+            paraphrase = paraphrase.split("<prompt>")[1].split("</prompt>")[0]
+        paraphrases.append(paraphrase)
+    if return_only_unique_prompts:
+        paraphrases = list(set(paraphrases))
+    return paraphrases, total_usage
+
+
+class PromptOptimization:
+    def __init__(
+        self, *, task: Task, evolution_model: LLMModel, evaluation_model: LLMModel
+    ) -> None:
+        self.task = task
+        self.evolution_model = evolution_model
+        self.evaluation_model = evaluation_model
+        self._init()
+
+    def _init(self):
+        # use caching for evaluation
+        self.family_tree: dict[str, tuple[str, ...] | None] = {}
+        # all_prompts contains a list of Prompt objects that took part in the optimization
+        # converting prompts to Prompt object
+        self.all_prompts: dict[str, Prompt] = {}
+
+    def reset(self):
+        self._init()
+
+    def evaluate_prompt(self, prompt: str):
+        return self.task.evaluate_validation(prompt)
+
+    def add_prompt(
+        self,
+        prompt: str,
+        parents: tuple[Prompt, ...] | None = None,
+        meta: dict | None = None,
+    ) -> Prompt:
+        score, usage = self.evaluate_prompt(prompt)
+        prompt = Prompt(content=prompt, score=score, meta=meta or {}, usage=usage)
+
+        # keep track of prompt
+        self.all_prompts[prompt.id] = prompt
+        self.family_tree[prompt.id] = (
+            tuple(p.id for p in parents) if parents is not None else None
+        )
+
+        return prompt
+
+    def add_prompts(
+        self,
+        prompts: list[str],
+        parents: list[tuple[Prompt]] = iter(()),
+        metas: list[dict] = iter(()),
+    ):
+        return [
+            self.add_prompt(prompt, _parents, meta)
+            for prompt, _parents, meta in zip_longest(prompts, parents, metas)
+        ]
+
+    def get_prompt(self, prompt_id: str):
+        return self.all_prompts[prompt_id]
+
+    def get_prompts(self, prompt_ids: list[str]):
+        return [self.get_prompt(p_id) for p_id in prompt_ids]
+
+    def init_run(self, num_initial_prompts: int) -> tuple[list[Prompt], ModelUsage]:
+        # - Initial prompts P0 = {p1, p2, . . . , pN}
+        paraphrases, usage = paraphrase_prompts(
+            self.evolution_model, self.task.base_prompt, n=num_initial_prompts - 1
+        )
+
+        # the initial prompts
+        initial_prompts = [self.task.base_prompt] + paraphrases
+        initial_prompts = self.add_prompts(
+            initial_prompts, metas=[{"gen": 0} for _ in initial_prompts]
+        )
+
+        # accumulate usage
+        for prompt in initial_prompts:
+            usage += prompt.usage
+
+        return initial_prompts, usage
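
The same <prompt>-tag extraction idiom appears in paraphrase_prompts above and in both evolve implementations. A self-contained illustration (the response text is invented) of what it does to a typical model reply:

```python
# Illustration only: extracting the bracketed prompt from a model response.
response = "Sure, here it is: <prompt>Label the review as positive or negative.</prompt> Hope that helps!"
if "<prompt>" in response:
    response = response.split("<prompt>")[1].split("</prompt>")[0]
print(response)  # Label the review as positive or negative.
```

If the model omits the closing tag, the second split simply returns everything after <prompt>, so the worst case is an unpruned suffix rather than an exception.
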
diff --git a/task.py b/task.py
index 08948ba9df927065dcc118da393d56578049a3d2..3c2dc0eecc2d065a6430c1290f1033a925c7206b 100644
--- a/task.py
+++ b/task.py
@@ -7,9 +7,8 @@ from typing import DefaultDict, Mapping, Union
 from datasets import Dataset, load_dataset
 from evaluate import load as load_metric
 from llama_cpp import LlamaGrammar
-from tqdm import tqdm
-
 from models import Llama2, OpenAI
+from tqdm import tqdm
 from utils import ModelUsage, log_calls, logger
 
 SYSTEM_MESSAGE = """
@@ -47,6 +46,7 @@ class Task:
         pass
 
     @log_calls("Evaluating validation dataset")
+    @lru_cache(maxsize=None)
     def evaluate_validation(self, prompt: str):
         return self._evaluate(prompt, self.validation_dataset)
 
@@ -262,9 +262,15 @@ class QuestionAnswering(Task):
             "\u2013": "-",
             "\u2014": "-",
         }
-        symbol_replacement_mapping = dict((re.escape(k), v) for k, v in symbol_replacement_mapping.items())
-        symbol_replacement_pattern = re.compile("|".join(symbol_replacement_mapping.keys()))
-        replace_fn = lambda text: symbol_replacement_pattern.sub(lambda m: symbol_replacement_mapping[re.escape(m.group(0))], text)
+        symbol_replacement_mapping = dict(
+            (re.escape(k), v) for k, v in symbol_replacement_mapping.items()
+        )
+        symbol_replacement_pattern = re.compile(
+            "|".join(symbol_replacement_mapping.keys())
+        )
+        replace_fn = lambda text: symbol_replacement_pattern.sub(
+            lambda m: symbol_replacement_mapping[re.escape(m.group(0))], text
+        )
         sample["context"] = replace_fn(sample["context"])
         sample["answers"]["text"] = [
             replace_fn(text) for text in sample["answers"]["text"]
         ]
diff --git a/utils.py b/utils.py
index 167451ac702d54b3423e953206819f27fa39f0c2..3f4781de4f347fda537b1ceae42b5f7287405a59 100644
--- a/utils.py
+++ b/utils.py
@@ -10,8 +10,8 @@ from textwrap import dedent, indent
 from typing import Any, Callable
 from uuid import uuid4
 
-from evo_types import EvoTypeEncoder, ModelUsage, Prompt
 from models import Llama2, OpenAI
+from opt_types import ModelUsage, OptTypeEncoder, Prompt
 
 current_directory = Path(__file__).resolve().parent
 logger = logging.getLogger("test-classifier")
@@ -35,6 +35,7 @@ Only return the name without any text before or after.""".strip()
 
 RUNS_DIR = current_directory / "runs"
 
+
 def initialize_run_directory(model: OpenAI | Llama2):
     response, usage = model(None, run_name_prompt, chat=True)
     model.usage -= usage
@@ -163,7 +164,7 @@ def save_snapshot(
         },
         f,
         indent=4,
-        cls=EvoTypeEncoder,
+        cls=OptTypeEncoder,
     )
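
Taken together, the patch also moves prompt-score caching out of main.py (the old lru_cache wrapper around task.evaluate_validation) and onto Task.evaluate_validation itself, so every optimizer that calls PromptOptimization.evaluate_prompt reuses scores for repeated prompts. A minimal standalone analogue of the effect (illustration only, with a stand-in scoring function):

```python
# Illustration only: stand-in for the lru_cache now applied to Task.evaluate_validation.
from functools import lru_cache

calls = 0

@lru_cache(maxsize=None)
def evaluate_validation(prompt: str) -> float:
    global calls
    calls += 1  # pretend this is an expensive pass over the validation set
    return len(prompt) % 7 / 7

evaluate_validation("Classify the sentiment of the review.")
evaluate_validation("Classify the sentiment of the review.")  # served from the cache
print(calls)  # 1
```

One review note on the design choice: applying lru_cache to an instance method keeps a reference to the Task instance (and its datasets) alive in the cache for the lifetime of the process, which is acceptable for these one-shot runs but worth keeping in mind if tasks ever become short-lived objects.
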