diff --git a/eval_prompt.py b/eval_prompt.py
index 3092c163dba937815478eb8118cac6ac4372e11c..06a6375b4e683bfbb0d7bb0f14a7ac8ad71a83b7 100644
--- a/eval_prompt.py
+++ b/eval_prompt.py
@@ -15,9 +15,9 @@ def evaluate_prompt(prompt: str, task_name: str, args: Namespace):
     logger.info(f'Evaluating prompt "{prompt}"')
 
     evaluation_model = LLMModel.get_model(
-        name="hfchat",
+        # name="hfchat",
         # name="alpacahfchat",
-        # name="llamachat",
+        name="llamachat",
         # model="PKU-Alignment/alpaca-7b-reproduced",
         # model="chavinlo/alpaca-native",
         load_in_8bit=True,
@@ -42,24 +42,28 @@ def evaluate_prompt(prompt: str, task_name: str, args: Namespace):
 if __name__ == "__main__":
 
     def main():
-        init_rng(1)
-        setup_console_logger()
-
         argparser = argparse.ArgumentParser()
         argparser.add_argument("-p", "--prompt", type=str, required=True)
         argparser.add_argument(
             "-t", "--task", type=str, choices=sorted(tasks.keys()), required=True
         )
+        argparser.add_argument(
+            "-v", "--verbose", action="count", default=0, help="Increase verbosity"
+        )
         args = argparser.parse_args()
 
+        init_rng(1)
+        setup_console_logger(verbosity_level=args.verbose)
+
         options = Namespace(
             llama_path=None,
             chat_format=None,
             chat_handler=None,
-            verbose=False,
             llama_verbose=False,
             llama_model="QuantFactory/Meta-Llama-3.1-8B-Instruct-GGUF",
+            # llama_model="TheBloke/Llama-2-70B-Chat-GGUF",
             llama_model_file="Meta-Llama-3.1-8B-Instruct.Q8_0.gguf",
+            # llama_model_file="llama-2-70b-chat.Q4_K_M.gguf",
             disable_cache=False,
             use_grammar=False,
             evaluation_strategy="simple",
diff --git a/main.py b/main.py
index 2256754fe0d6b7cd11f27674302743d46556b63a..40c8d25dd7e2126e8f5acc83aa5548391e9b532f 100644
--- a/main.py
+++ b/main.py
@@ -9,7 +9,7 @@ from weave.trace.settings import UserSettings
 
 from evoprompt.cli import argument_parser
 from evoprompt.evolution import get_optimizer_class
-from evoprompt.models import HfLlamaChat, Llama, LlamaChat, LLMModel
+from evoprompt.models import HfChat, Llama, LlamaChat, LLMModel
 from evoprompt.task import get_task
 from evoprompt.utils import init_rng, setup_console_logger
 
@@ -86,26 +86,26 @@ if __name__ == "__main__":
         logger.info("DEBUG mode: Do a quick run")
 
     # set up evolution model
-    evolution_model = LLMModel.get_model(options.evolution_engine, options=options)
+    evolution_model = LLMModel.get_model(options.evolution_engine, **vars(options))
     logger.info(
         f"Using {evolution_model.__class__.__name__.lower()} as the evolution engine"
     )
 
     judge_model: LLMModel | None = None
     if options.judge_engine is not None:
-        judge_model = LLMModel.get_model(options.judge_engine, options=options)
+        judge_model = LLMModel.get_model(options.judge_engine, **vars(options))
         logger.info(f"Using {options.judge_engine} as the judge engine")
 
     # set up evaluation model
     # NOTE currenty we always stick to Llama (Llama or LlamaChat depending on evolution engine) as evaluation engine
     # TODO allow to set separate engine and model for evaluation?
-    if isinstance(evolution_model, (Llama, LlamaChat, HfLlamaChat)):
+    if isinstance(evolution_model, (Llama, LlamaChat, HfChat)):
         evaluation_model_name = evolution_model.__class__.__name__.lower()
     elif judge_model is not None and isinstance(judge_model, (Llama, LlamaChat)):
         evaluation_model_name = judge_model.__class__.__name__.lower()
     else:
         evaluation_model_name = "llamachat"
-    evaluation_model = LLMModel.get_model(name=evaluation_model_name, options=options)
+    evaluation_model = LLMModel.get_model(name=evaluation_model_name, **vars(options))
     logger.info(f"Using {evaluation_model_name} as the evaluation engine")
 
     task = get_task(