diff --git a/evoprompt/evolution.py b/evoprompt/evolution.py
index 2b4e6c7050701ea3273a4cbb4e1235442201af8c..9e1a9f3fb7ef44651de315f6f9d68c25b7ceddce 100644
--- a/evoprompt/evolution.py
+++ b/evoprompt/evolution.py
@@ -46,7 +46,7 @@ class EvolutionAlgorithm(PromptOptimization, metaclass=ABCMeta):
         task: Task,
         evolution_model: LLMModel,
         evaluation_model: LLMModel,
-        judge_model: LLMModel,
+        judge_model: LLMModel | None,
         run_options: dict[str, Any] = {},
     ) -> None:
         super().__init__(
diff --git a/evoprompt/optimization.py b/evoprompt/optimization.py
index 4f91aba0b162a6f22de8d83099536e596c69573e..94ef34013d2010563048db45962f9f8fa8563dff 100644
--- a/evoprompt/optimization.py
+++ b/evoprompt/optimization.py
@@ -24,7 +24,7 @@ PromptSource = Literal["baseprompt", "paraphrase", "evolution", "corrected"]
 class Judgement(NamedTuple):
     original_response: str
     corrected_response: str
-    happy: bool
+    happy: bool | None
 
 
 class PromptMeta(TypedDict):
@@ -122,7 +122,7 @@ class PromptOptimization:
         task: Task,
         evolution_model: LLMModel,
         evaluation_model: LLMModel,
-        judge_model: LLMModel,
+        judge_model: LLMModel | None,
         run_options: dict[str, Any] = {},
     ) -> None:
         self.task = task
@@ -278,6 +278,9 @@ class PromptOptimization:
     def judge_and_correct_step(
         self, instruction: str, response: str, history: ChatMessages
     ) -> Judgement:
+        if self.judge_model is None:
+            return Judgement(response, response, happy=None)
+
         # TODO: judge the actual response
         judge_happy = False
 
diff --git a/main.py b/main.py
index a6474b2408f0ab397e98a56bef5e9d4f1cf894f3..c3a2fc3becefa175841dc22402c405bfdbcbb8c2 100644
--- a/main.py
+++ b/main.py
@@ -69,13 +69,10 @@ if __name__ == "__main__":
         case "openai":
             logger.info(f"Using {options.openai_model} as the evolution engine")
 
-    judge_model: LLMModel
+    judge_model: LLMModel | None
    if options.judge_engine is not None:
        judge_model = LLMModel.get_model(options.judge_engine, options=options)
        logger.info(f"Using {options.judge_engine} as the judge engine")
-    else:
-        judge_model = evolution_model
-        logger.info("Using the same model for judging as for evolution")
 
     # set up evaluation model
     # NOTE currenty we always stick to Llama as evaluation engine
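
A minimal sketch (not part of the diff; the describe helper is hypothetical) of how calling code might handle the now three-valued happy field: None signals that no judge model was configured and the response was passed through unchanged, while True and False keep their previous meaning.

from typing import NamedTuple


class Judgement(NamedTuple):
    # Mirrors the NamedTuple from evoprompt/optimization.py after this change;
    # happy is None when judging was skipped because no judge model is set.
    original_response: str
    corrected_response: str
    happy: bool | None


def describe(judgement: Judgement) -> str:
    # Hypothetical helper: distinguishes "judging skipped" from "judge unhappy".
    if judgement.happy is None:
        return "no judge configured; response passed through unchanged"
    if judgement.happy:
        return "judge accepted the original response"
    return "judge corrected the response"


# The pass-through case introduced in judge_and_correct_step returns the
# response unchanged in both fields with happy=None.
print(describe(Judgement("some response", "some response", happy=None)))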